diff --git a/.bumpversion.toml b/.bumpversion.toml index ee405e7d689..012ea06e89f 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.1" +current_version = "9.0.0-beta.2" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", @@ -55,6 +55,11 @@ filename = "Cargo.toml" search = 'lance-datagen = {{ version = "={current_version}"' replace = 'lance-datagen = {{ version = "={new_version}"' +[[tool.bumpversion.files]] +filename = "Cargo.toml" +search = 'lance-derive = {{ version = "={current_version}"' +replace = 'lance-derive = {{ version = "={new_version}"' + [[tool.bumpversion.files]] filename = "Cargo.toml" search = 'lance-encoding = {{ version = "={current_version}"' diff --git a/.cargo/config.toml b/.cargo/config.toml index 1d9c9ecc9da..c455c4a978d 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,6 +9,13 @@ debug = true codegen-units = 16 lto = "thin" +[profile.release-no-lto] +inherits = "release" +debug = true +lto = false +# Prioritize compile time when LTO is not relevant to the measurement. 
+codegen-units = 16 + [profile.bench] inherits = "release" lto = "thin" diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..8a46f198419 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,30 @@ +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "cargo" + directory: "/python" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "cargo" + directory: "/java/lance-jni" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "uv" + directory: "/python" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + diff --git a/.github/workflows/build_linux_wheel/action.yml b/.github/workflows/build_linux_wheel/action.yml index fbfcff687ce..d6e6e0f1ada 100644 --- a/.github/workflows/build_linux_wheel/action.yml +++ b/.github/workflows/build_linux_wheel/action.yml @@ -3,7 +3,7 @@ name: build-linux-wheel description: "Build a manylinux wheel for lance" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" @@ -41,11 +41,9 @@ runs: args: ${{ inputs.args }} maturin-version: "1.10.2" before-script-linux: | - set -e - yum install -y openssl-devel \ - && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \ - && unzip /tmp/protoc.zip -d /usr/local \ - && rm /tmp/protoc.zip + set -euo pipefail + yum install -y openssl-devel + bash "${GITHUB_WORKSPACE}/.github/workflows/build_linux_wheel/install-protoc.sh" - name: Build x86_64 Manylinux {manylinux} wheel if: ${{ inputs.arm-build == 'false' && inputs.manylinux != '2_17' }} uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 @@ 
-60,11 +58,9 @@ runs: args: ${{ inputs.args }} maturin-version: "1.10.2" before-script-linux: | - set -e - yum install -y openssl-devel clang \ - && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \ - && unzip /tmp/protoc.zip -d /usr/local \ - && rm /tmp/protoc.zip + set -euo pipefail + yum install -y openssl-devel clang + bash "${GITHUB_WORKSPACE}/.github/workflows/build_linux_wheel/install-protoc.sh" - name: Build Arm Manylinux Wheel if: ${{ inputs.arm-build == 'true' }} uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 @@ -76,8 +72,6 @@ runs: args: ${{ inputs.args }} maturin-version: "1.10.2" before-script-linux: | - set -e - yum install -y openssl-devel clang \ - && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-aarch_64.zip > /tmp/protoc.zip \ - && unzip /tmp/protoc.zip -d /usr/local \ - && rm /tmp/protoc.zip + set -euo pipefail + yum install -y openssl-devel clang + bash "${GITHUB_WORKSPACE}/.github/workflows/build_linux_wheel/install-protoc.sh" diff --git a/.github/workflows/build_linux_wheel/install-protoc.sh b/.github/workflows/build_linux_wheel/install-protoc.sh new file mode 100755 index 00000000000..2d5bf4ced40 --- /dev/null +++ b/.github/workflows/build_linux_wheel/install-protoc.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +version="${PROTOC_VERSION:-24.4}" +install_dir="${PROTOC_INSTALL_DIR:-/usr/local}" +machine="${1:-$(uname -m)}" + +case "${machine}" in + aarch64 | arm64) + asset_arch="aarch_64" + ;; + x86_64) + asset_arch="x86_64" + ;; + *) + echo "Unsupported protoc architecture: ${machine}" >&2 + exit 1 + ;; +esac + +zip_path="/tmp/protoc-${version}-linux-${asset_arch}.zip" +url="https://github.com/protocolbuffers/protobuf/releases/download/v${version}/protoc-${version}-linux-${asset_arch}.zip" + +for attempt in 1 2 3 4 5; do + rm -f "${zip_path}" + + if curl -fsSL 
--connect-timeout 15 --max-time 120 -o "${zip_path}" "${url}" \ + && unzip -tq "${zip_path}" >/dev/null; then + break + fi + + if [[ "${attempt}" == "5" ]]; then + echo "Failed to download a valid protoc archive from ${url}" >&2 + exit 1 + fi + + sleep "$((attempt * 2))" +done + +unzip -q -o "${zip_path}" -d "${install_dir}" +rm -f "${zip_path}" + +if [[ "$(uname -s)" == "Linux" ]]; then + "${install_dir}/bin/protoc" --version +else + test -f "${install_dir}/bin/protoc" +fi diff --git a/.github/workflows/build_mac_wheel/action.yml b/.github/workflows/build_mac_wheel/action.yml index 9d45bde42aa..0cac76c49cf 100644 --- a/.github/workflows/build_mac_wheel/action.yml +++ b/.github/workflows/build_mac_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/build_windows_wheel/action.yml b/.github/workflows/build_windows_wheel/action.yml index 03b601db019..94475059c75 100644 --- a/.github/workflows/build_windows_wheel/action.yml +++ b/.github/workflows/build_windows_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/compat-pair.yml b/.github/workflows/compat-pair.yml new file mode 100644 index 00000000000..b7f1f44535b --- /dev/null +++ b/.github/workflows/compat-pair.yml @@ -0,0 +1,78 @@ +# On-demand cross-version index compatibility run between two arbitrary refs. +# +# The PR path lives in python.yml (compat-sequence job), which ages the two previous +# majors into the PR head reusing the prebuilt wheel. 
This workflow is the manual escape +# hatch for any other pairing: each ref (version, sha, branch, or tag) is provisioned by +# the framework -- a published release installs a wheel, anything else is built from a +# worktree via maturin -- so two arbitrary refs can be compared even when neither has a +# published wheel. The suite ages an index under the writer ref and exercises it under the +# reader, searching maintenance-op sequences for panics or correctness divergence. +name: Compat (index ref pair) + +on: + workflow_dispatch: + inputs: + from_ref: + description: "Writer ref (version / sha / branch). Blank = 2nd most recent release." + required: false + default: "" + to_ref: + description: "Reader ref (version / sha / branch). Blank = most recent release." + required: false + default: "" + kinds: + description: "Comma-separated index kinds (INVERTED,BTREE,...) or 'all'." + required: false + default: "all" + max_length: + description: "Max maintenance-op sequence length to search (deeper = slower)." + required: false + default: "5" + +jobs: + compat-pair: + runs-on: ubuntu-latest + timeout-minutes: 120 + defaults: + run: + working-directory: python + env: + COMPAT_FROM_REF: ${{ inputs.from_ref }} + COMPAT_TO_REF: ${{ inputs.to_ref }} + COMPAT_MAX_LENGTH: ${{ inputs.max_length || '5' }} + steps: + # Full history so arbitrary refs can be checked out for builds. + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.11" + # Toolchain for the build-from-source provisioning path (refs without a wheel). 
+ - uses: actions-rust-lang/setup-rust-toolchain@a0b538fa0b742a6aa35d6e2c169b4bd06d225a98 # v1 + - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + - name: Install build deps + run: | + sudo apt update + sudo apt install -y protobuf-compiler libssl-dev + - name: Install host deps + run: pip install pytest pytest-xdist pyarrow packaging maturin + - name: Resolve kinds + id: kinds + env: + KINDS_IN: ${{ inputs.kinds }} + run: | + if [ -z "$KINDS_IN" ] || [ "$KINDS_IN" = "all" ]; then + echo "value=INVERTED,BTREE,BITMAP,LABEL_LIST,NGRAM,ZONEMAP,BLOOMFILTER" >> "$GITHUB_OUTPUT" + else + echo "value=$KINDS_IN" >> "$GITHUB_OUTPUT" + fi + # Oversubscribe (4x cores): each scenario writes a small dataset to disk and the + # reader spends most of its time in short subprocess round-trips, so a worker waiting + # on disk or the sub-venv pipe overlaps another's compute. + - name: Run compat suite (${{ inputs.from_ref }} -> ${{ inputs.to_ref }}) + env: + COMPAT_KINDS: ${{ steps.kinds.outputs.value }} + run: | + python -m pytest python/tests/compat/test_index_sequence.py \ + --run-compat -n "$(( $(nproc) * 4 ))" -v --no-header diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml index a51cf969a87..2b22b60dc92 100644 --- a/.github/workflows/java-publish.yml +++ b/.github/workflows/java-publish.yml @@ -28,10 +28,24 @@ permissions: contents: read jobs: - linux-arm64: - name: Build on Linux Arm64 - runs-on: ubuntu-24.04-arm64-8x + build-linux: + name: Build on Linux ${{ matrix.arch }} + runs-on: ${{ matrix.runner }} timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - arch: x86-64 + runner: ubuntu-24.04 + docker_platform: linux/amd64 + protoc_arch: x86_64 + artifact: liblance_jni_linux_x86_64.zip + - arch: arm64 + runner: ubuntu-24.04-arm64-8x + docker_platform: linux/arm64 + protoc_arch: aarch_64 + artifact: liblance_jni_linux_arm_64.zip steps: - name: Checkout repository uses: 
actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -41,9 +55,9 @@ jobs: uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - name: Check glibc version outside docker run: ldd --version - - name: Build and run in Debian 10 Arm64 container + - name: Build and run in Debian 10 container run: | - docker run --platform linux/arm64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " + docker run --platform ${{ matrix.docker_platform }} -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " set -ex # Update sources.list to use archive repositories for Debian 10 (EOL) @@ -81,7 +95,7 @@ jobs: unzip # https://github.com/databendlabs/databend/issues/8035 - PROTOC_ZIP=protoc-3.15.0-linux-aarch_64.zip + PROTOC_ZIP=protoc-3.15.0-linux-${{ matrix.protoc_arch }}.zip curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP unzip -o \$PROTOC_ZIP -d /usr/local rm -f \$PROTOC_ZIP @@ -102,101 +116,44 @@ jobs: " - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_arm_64.zip + name: ${{ matrix.artifact }} path: java/lance-jni/target/release/liblance_jni.so retention-days: 1 if-no-files-found: error - linux-x86: - name: Build on Linux x86-64 - runs-on: ubuntu-24.04 + build-macos: + name: Build on MacOS Arm64 + runs-on: warp-macos-14-arm64-6x timeout-minutes: 60 steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - - name: Check glibc version outside docker - run: ldd --version - - name: Build and run in Debian 10 X86-64 container - run: | - docker run --platform linux/amd64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " - - set -ex - # Update sources.list to use archive repositories for 
Debian 10 (EOL) - echo 'deb http://archive.debian.org/debian/ buster main' > /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian-security buster/updates main' >> /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian/ buster-updates main' >> /etc/apt/sources.list - apt-get update - - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --assume-yes \ - apt-transport-https \ - ca-certificates \ - curl \ - gpg \ - bash \ - less \ - openssl \ - libssl-dev \ - pkg-config \ - libsqlite3-dev \ - libsqlite3-0 \ - libreadline-dev \ - git \ - cmake \ - dh-autoreconf \ - clang \ - g++ \ - libc++-dev \ - libc++abi-dev \ - libprotobuf-dev \ - libncurses5-dev \ - libncursesw5-dev \ - libudev-dev \ - libhidapi-dev \ - zip \ - unzip - - # https://github.com/databendlabs/databend/issues/8035 - PROTOC_ZIP=protoc-3.15.0-linux-x86_64.zip - curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP - unzip -o \$PROTOC_ZIP -d /usr/local - rm -f \$PROTOC_ZIP - protoc --version - - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable - source \$HOME/.cargo/env - cargo --version - - cd java/lance-jni - - # https://github.com/rustls/rustls/issues/1967 - export CC=clang - export CXX=clang++ - ldd --version - - cargo build --release - " + - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 + - name: Install dependencies + run: brew install protobuf + - name: Build native lib + working-directory: java/lance-jni + run: cargo build --release - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_x86_64.zip - path: java/lance-jni/target/release/liblance_jni.so + name: liblance_jni_darwin_aarch64.zip + path: java/lance-jni/target/release/liblance_jni.dylib retention-days: 1 if-no-files-found: error - macos-arm64: - name: 
Build on MacOS Arm64 and release - runs-on: warp-macos-14-arm64-6x - timeout-minutes: 60 + publish: + name: Publish Java packages + runs-on: ubuntu-latest + timeout-minutes: 30 needs: - - linux-arm64 - - linux-x86 + - build-linux + - build-macos steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 - name: Set up Java 11 uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4 with: @@ -208,18 +165,16 @@ jobs: server-password: SONATYPE_TOKEN gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }} - - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 - - name: Install dependencies - run: | - brew install protobuf - brew install gpg - - name: Download artifact + - name: Download artifacts uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 - name: Copy native libs run: | - mkdir -p ./java/target/classes/nativelib/linux-x86-64 ./java/target/classes/nativelib/linux-aarch64 + mkdir -p ./java/target/classes/nativelib/linux-x86-64 \ + ./java/target/classes/nativelib/linux-aarch64 \ + ./java/target/classes/nativelib/darwin-aarch64 cp ./liblance_jni_linux_x86_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-x86-64/liblance_jni.so cp ./liblance_jni_linux_arm_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-aarch64/liblance_jni.so + cp ./liblance_jni_darwin_aarch64.zip/liblance_jni.dylib ./java/target/classes/nativelib/darwin-aarch64/liblance_jni.dylib - name: Set github run: | git config --global user.email "Lance Github Runner" @@ -230,7 +185,7 @@ jobs: inputs.mode == 'dry_run' working-directory: java run: | - mvn --batch-mode -DskipTests -Drust.release.build=true package + mvn --batch-mode -DskipTests -Dskip.build.jni=true package - name: Publish with Java 11 
if: | github.event_name == 'release' || @@ -240,14 +195,14 @@ jobs: echo "use-agent" >> ~/.gnupg/gpg.conf echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf export GPG_TTY=$(tty) - mvn --batch-mode -DskipTests -Drust.release.build=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh -P shade-jar + mvn --batch-mode -DskipTests -Dskip.build.jni=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh env: SONATYPE_USER: ${{ secrets.SONATYPE_USER }} SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }} report-failure: name: Report Workflow Failure runs-on: ubuntu-latest - needs: [linux-arm64, linux-x86, macos-arm64] + needs: [build-linux, build-macos, publish] if: always() && (github.event_name == 'release' || github.event_name == 'workflow_dispatch') permissions: contents: read diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index b2bfe284fb5..77c76d6fc69 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -35,7 +35,7 @@ jobs: name: Python Linux 3.${{ matrix.python-minor-version }} ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - platform: x86_64 manylinux: "2_17" @@ -101,7 +101,7 @@ jobs: runs-on: ${{ matrix.config.runner }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - target: aarch64-apple-darwin runner: warp-macos-14-arm64-6x @@ -152,7 +152,7 @@ jobs: runs-on: windows-latest-4x strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 973d9a632a7..cce465807e3 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -97,7 +97,7 @@ jobs: timeout-minutes: 45 
strategy: matrix: - python-minor-version: ["9", "13"] + python-minor-version: ["10", "13"] name: "Python Linux 3.${{ matrix.python-minor-version }} x86_64" runs-on: "ubuntu-24.04-4x" defaults: @@ -162,6 +162,65 @@ jobs: env: COMPAT_TEMP_VENV: 1 + # Cross-version index maintenance-sequence search (see tests/compat/compat_sequence.py). + # Ages an index under the latest release of each of the two previous majors and exercises + # it under this commit, searching op sequences for panics or correctness divergence. The + # reader (HEAD) reuses the wheel the `linux` job already built rather than recompiling. + # Post-merge only: the search is slower than the rest of the suite, so it runs on pushes + # to main/release rather than blocking every PR (same gating as rust-benchmark). + compat-sequence: + needs: linux + if: github.event_name != 'pull_request' + timeout-minutes: 60 + runs-on: ubuntu-24.04 + name: Index Sequence Compat + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: 3.13 + - name: Download wheels + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + with: + name: linux-wheels + path: python/wheels + - name: Install host deps + run: pip install pytest pytest-xdist pyarrow packaging + # Age under each of the two previous majors (writer wheels from PyPI) and read under + # this commit's prebuilt wheel. Length 5 is the shallowest depth that reaches the + # ENT-1662 sequence; -n oversubscribes cores since the work is I/O-bound. 
+ - name: Run sequence search (previous two majors -> HEAD) + env: + COMPAT_MAX_LENGTH: "5" + COMPAT_TO_REF: HEAD + COMPAT_PREBUILT_REF: HEAD + run: | + set -euo pipefail + wheel=$(ls "$PWD"/wheels/pylance-*.whl | head -1) + refs=$(PYTHONPATH=python/tests python -c " + from compat.compat_decorator import pylance_stable_versions + latest = {} + for v in pylance_stable_versions(): + latest[v.major] = v + print(' '.join(str(latest[m]) for m in sorted(latest)[-2:])) + ") + echo "reader wheel: $wheel" + echo "writer refs : $refs" + status=0 + for from_ref in $refs; do + echo "::group::$from_ref -> HEAD" + COMPAT_FROM_REF="$from_ref" COMPAT_PREBUILT_WHEEL="$wheel" \ + python -m pytest python/tests/compat/test_index_sequence.py \ + --run-compat -n "$(( $(nproc) * 4 ))" -v --no-header || status=1 + echo "::endgroup::" + done + exit $status + linux-arm: timeout-minutes: 45 runs-on: ubuntu-24.04-arm64-4x diff --git a/AGENTS.md b/AGENTS.md index 2003d6dba10..30c0abea1a7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,9 @@ Key technical traits: async-first (tokio), Arrow-native, versioned writes with m * Coverage: `cargo +nightly llvm-cov -q -p --branch` * Coverage HTML: `cargo +nightly llvm-cov -q -p --branch --html` * Coverage for file: `python ci/coverage.py -p -f ` +* Use repository-defined Cargo profiles instead of ad hoc LTO overrides. +* Use `release-with-debug` for benchmarks and profiling so optimized builds keep debug symbols without a rebuild. +* Use `release-no-lto` only for local debugging, IO-bound benchmarks, or compile-time-sensitive performance investigation where LTO would not affect the measured bottleneck. ### Python / Java diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cf332215e49..8f3ec285f31 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ If you have any questions, please join our [Discord](https://discord.gg/zMM32dvN Currently Lance is implemented in Rust and comes with a Python wrapper. 
So you'll want to make sure you setup both. 1. Install Rust: https://www.rust-lang.org/tools/install -2. Install Python 3.9+: https://www.python.org/downloads/ +2. Install Python 3.10+: https://www.python.org/downloads/ 3. Install protoctol buffers: https://grpc.io/docs/protoc-installation/ (make sure you have version 3.20 or higher) 4. Install commit hooks: a. Install pre-commit: https://pre-commit.com/#install diff --git a/Cargo.lock b/Cargo.lock index dc8146a8e01..60899b5c69c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,21 +66,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514ce16346f9fc96702fd52f2ae7e383b185516ee6f556efd7c3176be8fe7bea" -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "alloca" version = "0.4.0" @@ -147,7 +132,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -158,7 +143,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -386,7 +371,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "serde_core", "serde_json", ] @@ -475,7 +460,7 @@ checksum = 
"3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -486,7 +471,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -540,7 +525,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -602,7 +587,7 @@ dependencies = [ "bytes-utils", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "percent-encoding", @@ -630,7 +615,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -660,7 +645,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "lru", "percent-encoding", @@ -689,7 +674,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -713,7 +698,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -738,7 +723,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -759,7 +744,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -788,7 +773,7 @@ dependencies = [ "bytes", "crc-fast", "hex", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "md-5 0.10.6", @@ -822,7 +807,7 @@ dependencies = [ "bytes-utils", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -841,7 +826,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -898,7 +883,7 @@ dependencies = [ "bytes", 
"fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -918,7 +903,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -936,7 +921,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -984,7 +969,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -1017,7 +1002,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -1119,9 +1104,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bitpacking" @@ -1134,9 +1119,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -1178,9 +1163,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1194,27 +1179,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" 
-version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -1261,7 +1225,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1284,9 +1248,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1315,9 +1279,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1365,9 +1329,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "js-sys", @@ -1455,7 +1419,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - 
"syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1491,7 +1455,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -1944,7 +1908,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1957,7 +1921,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1979,7 +1943,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1990,7 +1954,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2026,7 +1990,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2048,7 +2011,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -2123,7 +2085,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -2241,36 +2202,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - 
"datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -2479,7 +2410,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2702,26 +2633,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -2782,7 +2693,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2802,7 +2713,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core 0.20.2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2829,7 +2740,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2853,7 +2764,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2864,7 +2775,7 @@ checksum = 
"1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2989,7 +2900,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3005,7 +2916,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -3086,7 +2997,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "rustc_version", ] @@ -3098,7 +3009,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -3166,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3245,7 +3155,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3468,17 +3378,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3505,7 +3413,7 @@ checksum = 
"53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3526,18 +3434,42 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3646,7 +3578,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", "futures", - "http 1.4.1", + "http 1.4.2", "indicatif", "libc", "log", @@ -3670,7 +3602,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3702,6 +3634,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3715,9 +3658,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3741,7 +3684,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3752,7 +3695,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "pin-project-lite", ] @@ -3795,7 +3738,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ -3812,7 +3755,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3820,7 +3763,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -3849,7 +3805,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -4067,12 +4023,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -4176,7 +4126,7 @@ version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "libc", ] @@ -4195,7 +4145,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4294,7 +4244,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4339,7 +4289,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4358,7 +4308,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4373,13 +4323,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.99" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", - "once_cell", "wasm-bindgen", ] @@ -4431,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "all_asserts", "approx", @@ -4468,7 +4417,6 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-substrait", - "deepsize", "either", 
"env_logger", "fst", @@ -4503,6 +4451,7 @@ dependencies = [ "parquet", "permutation", "pin-project", + "pprof", "pretty_assertions", "prost", "prost-build", @@ -4534,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4582,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4591,21 +4540,23 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", + "libm", "log", "moka", "num_cpus", @@ -4623,12 +4574,13 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4661,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4676,12 +4628,20 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", +] + +[[package]] +name = "lance-derive" +version = "9.0.0-beta.2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4726,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "all_asserts", "arrow", @@ -4752,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", 
"arrow-array", @@ -4766,7 +4726,6 @@ dependencies = [ "bytes", "criterion", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4792,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4806,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "approx", "arc-swap", @@ -4829,7 +4788,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "env_logger", "fst", @@ -4856,7 +4814,7 @@ dependencies = [ "lance-table", "lance-testing", "lance-tokenizer", - "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -4869,6 +4827,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "rstest", "serde", @@ -4878,13 +4837,12 @@ dependencies = [ "test-log", "tokio", "tracing", - "twox-hash", "uuid", ] [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4902,9 +4860,8 @@ dependencies = [ "bytes", "chrono", "criterion", - "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4933,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "approx", "arrow-array", @@ -4941,7 +4898,6 @@ dependencies = [ "arrow-schema", "cc", "criterion", - "deepsize", "half", "lance-arrow", "lance-core", @@ -4949,11 +4905,12 @@ dependencies = [ "num-traits", "proptest", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4965,7 +4922,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-schema", @@ 
-4981,9 +4938,10 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", + "arrow-array", "arrow-ipc", "arrow-schema", "async-trait", @@ -4993,9 +4951,12 @@ dependencies = [ "base64 0.22.1", "bytes", "chrono", + "datafusion-common", + "datafusion-physical-plan", "futures", "hmac 0.12.1", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", @@ -5004,28 +4965,32 @@ dependencies = [ "lance-table", "log", "object_store", + "opendal", "quick-xml 0.38.4", "rand 0.9.4", "reqwest 0.12.28", "ring", + "roaring", "rstest", "rustls-pki-types", "serde", "serde_json", "sha2 0.10.9", "tempfile", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", "wiremock", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -5037,7 +5002,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -5045,7 +5010,6 @@ dependencies = [ "byteorder", "bytes", "criterion", - "deepsize", "itertools 0.13.0", "lance-core", "proptest", @@ -5056,7 +5020,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -5070,7 +5034,6 @@ dependencies = [ "bytes", "chrono", "criterion", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -5103,16 +5066,16 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-testing" -version = "8.0.0-beta.1" +version 
= "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-schema", @@ -5125,19 +5088,20 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", "lindera", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] [[package]] name = "lance-tools" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "clap", "lance-core", @@ -5157,12 +5121,6 @@ dependencies = [ "spin 0.9.8", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -5241,6 +5199,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "lindera" version = "3.0.7" @@ -5297,9 +5264,9 @@ dependencies = [ [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = "c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -5330,9 +5297,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -5476,9 +5443,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -5534,9 +5501,9 @@ dependencies = [ [[package]] name = "mock_instant" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" +checksum = "9bb517913cfcfb9eeda59f36020269075a152701a01606c612f547e4890be399" [[package]] name = "mockall" @@ -5561,7 +5528,7 @@ dependencies = [ "cfg-if 1.0.4", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5603,7 +5570,7 @@ checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5635,7 +5602,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5715,7 +5682,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5828,7 +5795,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5843,7 +5810,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -5888,7 +5855,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ -5955,7 +5922,7 @@ version = "6.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "libc", "once_cell", "onig_sys", @@ -5993,9 +5960,11 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -6008,7 +5977,7 @@ dependencies = [ "base64 0.22.1", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -6033,7 +6002,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -6077,7 +6046,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64 0.22.1", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -6098,7 +6067,7 @@ checksum = "6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64 0.22.1", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -6116,7 +6085,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -6127,7 +6096,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -6145,7 +6114,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ 
-6158,6 +6127,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -6166,7 +6149,7 @@ checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -6182,7 +6165,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -6201,7 +6184,7 @@ dependencies = [ "base64 0.22.1", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -6213,13 +6196,30 @@ dependencies = [ "url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.2", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl" -version = "0.10.80" +version = "0.10.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +checksum = "77823a27f0babb03091cb9ed9ef80af3b39dbc82f97e8fa530374b7dafd87a45" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "foreign-types", "libc", @@ -6235,7 +6235,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = 
[ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6246,9 +6246,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "openssl-sys" -version = "0.9.116" +version = "0.9.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +checksum = "b47e7e6bb2c38cd930d25a23b40fa52e068c10e85f3e03a7f5ba5aaca5713695" dependencies = [ "cc", "libc", @@ -6358,26 +6358,19 @@ dependencies = [ "arrow-schema", "arrow-select", "base64 0.22.1", - "brotli", "bytes", "chrono", - "flate2", "futures", "half", "hashbrown 0.17.1", - "lz4_flex", "num-bigint", "num-integer", "num-traits", - "object_store", "paste", "seq-macro", - "simdutf8", - "snap", "thrift", "tokio", "twox-hash", - "zstd", ] [[package]] @@ -6562,7 +6555,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6755,7 +6748,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6784,7 +6777,7 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set", "bit-vec", - "bitflags 2.12.1", + "bitflags 2.13.0", "num-traits", "rand 0.9.4", "rand_chacha 0.9.0", @@ -6797,9 +6790,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -6807,9 +6800,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools 0.14.0", @@ -6820,28 +6813,28 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -6872,7 +6865,7 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6972,7 +6965,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -7039,7 +7032,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -7115,19 +7108,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - 
"brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -7186,7 +7166,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -7217,14 +7197,14 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "regex" -version = "1.12.3" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -7251,9 +7231,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "regress" @@ -7288,7 +7268,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -7307,7 +7287,7 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -7329,7 +7309,7 @@ dependencies = [ "base64 0.22.1", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -7353,7 +7333,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -7383,7 +7363,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -7401,7 +7381,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -7409,6 +7389,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" +dependencies = [ + "anyhow", + "http 1.4.2", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" @@ -7421,7 +7414,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -7455,7 +7448,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] @@ -7468,7 +7461,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -7506,7 +7499,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -7562,7 +7555,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7638,7 +7631,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.117", + "syn 2.0.118", "unicode-ident", ] @@ -7689,11 +7682,11 @@ version = "1.1.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -7752,7 +7745,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -7875,7 +7868,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7907,7 +7900,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -7967,7 +7960,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7978,7 +7971,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8013,7 +8006,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8025,7 +8018,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8042,9 +8035,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = "76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64 0.22.1", "bs58", @@ -8062,14 +8055,14 @@ 
dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8120,7 +8113,7 @@ checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8259,9 +8252,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -8281,15 +8274,9 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = "0.6.4" @@ -8297,7 +8284,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -8378,7 +8365,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8415,6 +8402,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "str_stack" version = "0.1.1" @@ -8461,7 +8457,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8473,7 +8469,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8496,7 +8492,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -8549,9 +8545,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -8575,7 +8571,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8598,7 +8594,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -8632,10 +8628,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -8663,7 +8659,7 @@ checksum = 
"c26ef8b00e4d382e59f6a8ddb3cd790b3a5bb29f21a358a9a69ea2f29f13f27b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8672,7 +8668,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "944ad38adcbb71eaa682c56bceeb079e4ca82b4b3edc2a0fde5cb297b77dac8d" dependencies = [ - "syn 2.0.117", + "syn 2.0.118", "test-log-core", ] @@ -8702,7 +8698,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8713,7 +8709,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8879,7 +8875,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8894,9 +8890,9 @@ dependencies = [ [[package]] name = "tokio-retry" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" +checksum = "4a129d95275ebf4c493ec53bf0f8cd95f5ac161bc4f381700809a54f595d4470" dependencies = [ "pin-project-lite", "rand 0.10.1", @@ -8968,6 +8964,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "h2", + "http 1.4.2", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = 
"0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -8976,9 +9011,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -8990,9 +9028,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -9008,11 +9046,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "async-compression", - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -9069,7 +9107,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9207,7 +9245,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -9225,7 +9263,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 2.0.118", "typify-impl", ] @@ -9283,12 +9321,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unicode_categories" version = "0.1.1" @@ -9365,11 +9397,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.2" +version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d258b83ceec21034727ecee8c382cfa6c3e133699b0742c64571814fb420c9f7" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -9444,20 +9476,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -9471,9 +9494,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -9484,9 +9507,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.72" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -9494,9 +9517,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -9504,48 +9527,26 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ 
-9572,23 +9573,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.12.1", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.99" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -9606,9 +9595,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] @@ -9619,14 +9608,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -9666,7 +9655,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -9728,7 +9717,7 @@ checksum = 
"053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9739,7 +9728,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9980,7 +9969,7 @@ dependencies = [ "base64 0.22.1", "deadpool", "futures", - "http 1.4.1", + "http 1.4.2", "http-body-util", "hyper", "hyper-util", @@ -9993,100 +9982,12 @@ dependencies = [ "url", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = 
"wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.12.1", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -10140,7 +10041,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -10181,7 +10082,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -10214,7 +10115,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", "lazy_static", "more-asserts", @@ -10294,9 +10195,9 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -10311,28 +10212,28 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.50" +version = "0.8.52" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.50" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -10352,15 +10253,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -10394,15 +10295,9 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 5044498dd41..a457d35ea7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ members = [ "rust/lance-select", "rust/lance-tokenizer", "rust/lance-table", + "rust/lance-derive", "rust/lance-test-macros", "rust/lance-testing", "rust/lance-tools", @@ -31,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" 
authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.1", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.1", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.1", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.1", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.1", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.1", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.1", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.1", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.1", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.1", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.1", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.1", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.1", path = "./rust/lance-namespace-impls" } +lance = { version = "=9.0.0-beta.2", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=9.0.0-beta.2", path = "./rust/lance-arrow" } +lance-core = { version = "=9.0.0-beta.2", path = "./rust/lance-core" } +lance-datafusion = { version = "=9.0.0-beta.2", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=9.0.0-beta.2", path = "./rust/lance-datagen" } +lance-derive = { version = "=9.0.0-beta.2", path = "./rust/lance-derive" } +lance-encoding = { version = "=9.0.0-beta.2", path = "./rust/lance-encoding" } +lance-file = { version = "=9.0.0-beta.2", path = "./rust/lance-file" } +lance-geo = { version = "=9.0.0-beta.2", path = "./rust/lance-geo" } +lance-index = { version = "=9.0.0-beta.2", path = "./rust/lance-index" } +lance-io = { version = "=9.0.0-beta.2", 
path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=9.0.0-beta.2", path = "./rust/lance-linalg" } +lance-namespace = { version = "=9.0.0-beta.2", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=9.0.0-beta.2", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } -lance-namespace-reqwest-client = "0.8.0" -lance-select = { version = "=8.0.0-beta.1", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.1", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.1", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.1", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.1", path = "./rust/lance-testing" } +lance-namespace-reqwest-client = "0.8.6" +lance-select = { version = "=9.0.0-beta.2", path = "./rust/lance-select" } +lance-tokenizer = { version = "=9.0.0-beta.2", path = "./rust/lance-tokenizer" } +lance-table = { version = "=9.0.0-beta.2", path = "./rust/lance-table" } +lance-test-macros = { version = "=9.0.0-beta.2", path = "./rust/lance-test-macros" } +lance-testing = { version = "=9.0.0-beta.2", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.1", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=9.0.0-beta.2", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -131,18 +133,17 @@ datafusion = { version = "53.0.0", default-features = false, features = [ "unicode_expressions", ] } datafusion-common = "53.0.0" -datafusion-functions = { version = "53.0.0", features = ["regex_expressions"] } 
+datafusion-functions = { version = "53.0.0", default-features = false, features = ["regex_expressions"] } datafusion-sql = "53.0.0" datafusion-expr = "53.0.0" datafusion-ffi = "53.0.0" datafusion-physical-expr = "53.0.0" datafusion-physical-plan = "53.0.0" -datafusion-substrait = "53.0.0" -deepsize = "0.2.0" +datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.1", path = "./rust/compression/fsst" } +fsst = { version = "=9.0.0-beta.2", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" @@ -179,6 +180,7 @@ rand_distr = { version = "0.5.1" } rand_xoshiro = "0.7.0" rangemap = { version = "1.0" } rayon = "1.10" +regex-syntax = "0.8.10" roaring = "0.11.4" rstest = "0.26.1" serde = { version = "^1" } @@ -202,6 +204,7 @@ tower = "0.5" tower-http = "0.5" tracing = "0.1" tracing-mock = { version = "=0.1.0-beta.3" } +twox-hash = "2.0" url = "2.5.7" uuid = { version = "1.2", features = ["v4", "serde"] } wiremock = "0.6" diff --git a/README.md b/README.md index 2f0b2bca18f..886fd70425e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ pip install pylance To install a preview release: ```shell -pip install --pre --extra-index-url https://pypi.fury.io/lance-format/pylance +pip install --pre --extra-index-url https://pypi.fury.io/lance-format pylance ``` > [!TIP] diff --git a/ci/create_release_branch.sh b/ci/create_release_branch.sh index 9c7d9d3e58a..db88f5b6b24 100755 --- a/ci/create_release_branch.sh +++ b/ci/create_release_branch.sh @@ -229,9 +229,9 @@ else bump-my-version bump -vv --new-version "${RC_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) # Commit the RC 
version git add -A @@ -259,9 +259,9 @@ else bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "chore: bump main to ${NEXT_VERSION} diff --git a/ci/publish_beta.sh b/ci/publish_beta.sh index f50798a52e0..06fa5c16a91 100644 --- a/ci/publish_beta.sh +++ b/ci/publish_beta.sh @@ -93,9 +93,9 @@ if [[ "${BRANCH}" == "main" ]] && [[ "${CURRENT_VERSION}" =~ -beta\.[0-9]+$ ]]; bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "chore: bump to ${NEXT_VERSION} based on breaking change detection" @@ -133,9 +133,9 @@ echo "Bumping beta version" bump-my-version bump -vv prerelease_num # Update Cargo.lock files after version bump -cargo update -(cd python && cargo update) -(cd java/lance-jni && cargo update) +cargo update --workspace +(cd python && cargo update --workspace) +(cd java/lance-jni && cargo update --workspace) # Get new version NEW_VERSION=$(grep '^version = ' Cargo.toml | head -n1 | cut -d'"' -f2) diff --git a/ci/release_common.sh b/ci/release_common.sh index cd653212aae..573202d1689 100644 --- a/ci/release_common.sh +++ b/ci/release_common.sh @@ -29,9 +29,9 @@ bump_and_commit_version() { bump-my-version bump -vv --new-version "${NEW_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update 
--workspace) git add -A git commit -m "${COMMIT_MESSAGE}" diff --git a/docs/src/community/maintainers.md b/docs/src/community/maintainers.md index 755201f069e..f3ba6e70304 100644 --- a/docs/src/community/maintainers.md +++ b/docs/src/community/maintainers.md @@ -40,38 +40,38 @@ Maintainers with GitHub write access are additionally encouraged to: ## Roster -| Name | GitHub Handle | Affiliation | GitHub Write Access | Ecosystem Roles | -|------------------------|----------------------|-------------------|---------------------|-------------------------------------------------| -| Wyatt Alt | wkalt | LanceDB | ✓ | | -| Matt Basta | mattbasta | Runway AI | | | -| Giuseppe Battista | giusedroid | AWS | | | -| Timothy Carambat | timothycarambat | Anything LLM | | | -| Ayush Chaurasia | AyushExel | LanceDB | | -| Chongchen Chen | chenkovsky | MiraclePlus | | | -| Akela Drissner-Schmid | akelad | dltHub | | | -| Ty Dunn | TyDunn | Continue | | | -| Enwei Jiao | jiaoew1991 | Luma.ai | ✓ | Milvus Maintainer | -| Bryan Keller | bryanck | Netflix | | Apache Iceberg Committer | -| Aman Kishore | AmanKishore | Harvey.ai | | | -| Sangwu Lee | RE-N-Y | Krea.ai | | | -| Jeremy Leibs | jleibs | Rerun.io | | | -| Haocheng Liu | HaochengLIU | Seven Research | ✓ | | -| Nathan Ma | majin1102 | ByteDance | ✓ | Apache Amoro (incubating) PPMC Member | -| ChanChan Mao | ccmao1130 | LanceDB | | | -| Lu Qiu | LuQQiu | LanceDB | ✓ | Alluxio PMC Member | -| Dan Rammer | hamersaw | LanceDB | ✓ | | -| Rong Rong | walterddr | Google DeepMind | | Apache Pinot PMC Member, Apache Flink Committer | -| Nat Roth | nrothGIT | Meta AI | | | -| Kevin Shaffer-Morrison | kevinshaffermorrison | AWS | | | -| Noah Shpak | noahshpak | Thinking Machines | | | -| Chunxu Tang | ChunxuTang | Google | | PrestoDB Committer | -| Ankit Vij | ankitvij-db | Databricks | | | -| Beinan Wang | beinan | Microsoft AI | | Alluxio PMC Member, Presto TSC Member | -| Jiacheng Yang | jiachengdb | Google AI | | | -| Yang Jie | 
LuciferYang | Baidu Inc. | | Apache Spark PMC Member, Apache Uniffle PMC Member | -| Jianjian Xie | jja725 | Uber | | | -| Zhang Yue | zhangyue19921010 | ByteDance | | | -| Jinglun | wojiaodoubao | ByteDance | | Apache Hadoop Committer | +| Name | GitHub Handle | Affiliation | GitHub Write Access | Ecosystem Roles | +|------------------------|----------------------|-------------------|---------------------|----------------------------------------------------| +| Wyatt Alt | wkalt | LanceDB | ✓ | | +| Matt Basta | mattbasta | Runway AI | | | +| Giuseppe Battista | giusedroid | AWS | | | +| Timothy Carambat | timothycarambat | Anything LLM | | | +| Ayush Chaurasia | AyushExel | LanceDB | | | +| Chongchen Chen | chenkovsky | MiraclePlus | | | +| Akela Drissner-Schmid | akelad | dltHub | | | +| Ty Dunn | TyDunn | Continue | | | +| Enwei Jiao | jiaoew1991 | Luma.ai | ✓ | Milvus Maintainer | +| Bryan Keller | bryanck | Netflix | | Apache Iceberg Committer | +| Aman Kishore | AmanKishore | Harvey.ai | | | +| Sangwu Lee | RE-N-Y | Krea.ai | | | +| Jeremy Leibs | jleibs | Rerun.io | | | +| Haocheng Liu | HaochengLIU | Seven Research | ✓ | | +| Nathan Ma | majin1102 | ByteDance | ✓ | Apache Amoro (incubating) PPMC Member | +| ChanChan Mao | ccmao1130 | LanceDB | | | +| Lu Qiu | LuQQiu | LanceDB | ✓ | Alluxio PMC Member | +| Dan Rammer | hamersaw | LanceDB | ✓ | | +| Rong Rong | walterddr | Google DeepMind | | Apache Pinot PMC Member, Apache Flink Committer | +| Nat Roth | nrothGIT | Meta AI | | | +| Kevin Shaffer-Morrison | kevinshaffermorrison | AWS | | | +| Noah Shpak | noahshpak | Thinking Machines | | | +| Chunxu Tang | ChunxuTang | Google | | PrestoDB Committer | +| Ankit Vij | ankitvij-db | Databricks | | | +| Beinan Wang | beinan | Microsoft AI | | Alluxio PMC Member, Presto TSC Member | +| Jiacheng Yang | jiachengdb | Google AI | | | +| Yang Jie | LuciferYang | Baidu Inc. 
| | Apache Spark PMC Member, Apache Uniffle PMC Member | +| Jianjian Xie | jja725 | Uber | | | +| Zhang Yue | zhangyue19921010 | ByteDance | | Apache Hudi PMC, Apache Druid Committer, Kafka Contributor | +| Jinglun | wojiaodoubao | ByteDance | | Apache Hadoop Committer | ## Becoming a Maintainer diff --git a/docs/src/format/file/encoding.md b/docs/src/format/file/encoding.md index a3d99ef39cb..4ca053d4fa6 100644 --- a/docs/src/format/file/encoding.md +++ b/docs/src/format/file/encoding.md @@ -683,9 +683,10 @@ the default mini-block size is negligible. You should only consider changing thi confirmed — through profiling — that mini-block read amplification is saturating your available bandwidth (for example, accessing a remote object store over a constrained network link). -The maximum number of values per mini-block can be lowered via an environment variable: +The maximum number of values per mini-block can be tuned via an environment variable: -- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`): upper bound on the number of values in a single mini-block chunk. +- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`, maximum `32768`): upper bound on the number of values in a single mini-block chunk. Reducing this value produces smaller mini-blocks, which reduces the amount of data fetched per read at the -cost of more mini-blocks and slightly more metadata overhead. +cost of more mini-blocks and slightly more metadata overhead. Increasing it can reduce metadata overhead and +improve throughput for highly compressible data, but it may increase random-read amplification. diff --git a/docs/src/format/index/scalar/fmindex.md b/docs/src/format/index/scalar/fmindex.md new file mode 100644 index 00000000000..1138024896f --- /dev/null +++ b/docs/src/format/index/scalar/fmindex.md @@ -0,0 +1,64 @@ +# FM-Index (Full-text / Substring / Regex Search) + +The FM-Index (Ferragina-Manzini Index) is a compressed substring index based on the Burrows-Wheeler Transform (BWT). 
Unlike traditional inverted indexes (Full-Text Search) which index distinct words, the FM-Index enables efficient **arbitrary substring search**, **prefix match**, and **suffix/regular-expression search** directly on raw bytes. + +In Lance, the FM-Index is designed to scale dynamically across millions of documents or large-scale datasets, and is partitioned using Lance's **Segmented Index** architecture to support incremental appends, disjoint fragment tracking, and segment merging. + +## High-Level Architecture + +The FM-Index indexes raw text by treating columns of strings or binary payloads as raw byte arrays. + +``` + +----------------------------------------+ + | Lance Dataset | + | (Disjoint groups of Fragments 0..N) | + +----------------------------------------+ + | + Divide fragments into num_segments + | + v + +----------------------------------------+ + | Segmented Index | + | +-----------+ +-----------+ +-------+ | + | | Segment 1 | | Segment 2 | | ... | | + | | (FM-Idx) | | (FM-Idx) | | | | + | +-----------+ +-----------+ +-------+ | + +----------------------------------------+ +``` + +Each segment contains its own self-contained physical FM-Index mapping byte sub-sequences to Lance global row IDs. + +## Data Normalization & Sanitization + +The FM-Index is **normalization-independent by design** because it operates entirely on raw bytes. + +### Byte Sanitization vs. Text Normalization + +1. **Byte Sanitization (Core Index Layer)**: + The physical FM-Index uses specific sentinel bytes internally to mark boundaries: + - `\x00` is reserved as the global Burrows-Wheeler Transform (BWT) terminator character. + - `\xFF` is reserved as the document/row separator character. + + To avoid breaking the indexing structures, any incoming occurrences of `\x00` or `\xFF` are sanitized by remapping them to space (`\x20`) characters at index-build time. No other bytes are changed in this layer. + +2. 
**Text Normalization (User/Application Layer)**: + Because the index faithfully maps raw bytes, any semantic normalization (such as case folding `Hello` -> `hello`, Unicode NFKC normalization, stemming, or whitespace collapsing) is fully decoupled from the core index engine: + - To build a case-insensitive search index, users apply a lowercase transform to the column *prior* to indexing. + - When querying, the user's query text must undergo the exact same normalization pipeline. + +## Configurable Segment Partitioning + +Merging or appending to BWT-based indexes cannot be done via simple concatenation; the BWT suffix array must be reconstructed by re-reading the text and rebuilding. To balance build cost and search performance, Lance allows configuring how fragments map to index segments. + +- **`num_segments` parameter**: Configured at index-creation time. If `num_segments` is specified (e.g. `num_segments = 4`), Lance splits the target dataset fragments into disjoint subsets and builds independent FM-Index segments over each chunk. +- **Unindexed Appends**: When new fragments are appended to the dataset, a subsequent `create_index` execution with unindexed fragment coverage will construct a new separate segment representing only those new fragments, keeping existing segments fully intact. +- **Segment Merging**: Multiple existing index segments can be merged into a single segment under Lance's `merge_segments` protocol. Lance unions the fragment coverage bitmaps of the selected segments, re-reads the raw text from those covered fragments, and constructs a fresh unified FM-Index. + +## Query Evaluation + +When a substring query is submitted (e.g., `CONTAINS(column, "query_string")`): +1. The search string is sanitized (remapping any `\x00` or `\xFF` to spaces) and optionally normalized if the target index is normalized. +2. The query is dispatched across all active segments in the logical index in parallel. +3. 
Each segment performs a BWT backward-search to locate occurrences of the pattern. +4. Matching offsets are mapped back to absolute dataset Row IDs. +5. Results from all segments are unioned to produce the final selection. diff --git a/docs/src/format/index/scalar/ngram.md b/docs/src/format/index/scalar/ngram.md index bdf78474d50..d437363d264 100644 --- a/docs/src/format/index/scalar/ngram.md +++ b/docs/src/format/index/scalar/ngram.md @@ -29,4 +29,10 @@ The N-gram index provides inexact results for the following query types: | Query Type | Description | Operation | Result Type | |----------------|--------------------------|-------------------------------------------------------|-------------| -| **contains** | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost | \ No newline at end of file +| **contains** | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost | +| **regexp_like** / **regexp_match** | Regular-expression match | Derives a necessary trigram condition from the pattern (AND of intersections, OR of unions), then rechecks the true regex | AtMost | +| **LIKE** (infix) | Wildcard match such as `%foo%bar%` | Uses the literal segments of the pattern as a trigram condition, then rechecks the LIKE | AtMost | + +Patterns from which no trigram can be derived - for example `a.b`, `.*`, +case-insensitive matches, or literal runs shorter than three characters - fall +back to rechecking every row. This is always correct, just not accelerated. 
diff --git a/docs/src/format/index/vector/index.md b/docs/src/format/index/vector/index.md index e565bd737f0..7aaf9b55996 100644 --- a/docs/src/format/index/vector/index.md +++ b/docs/src/format/index/vector/index.md @@ -192,12 +192,16 @@ Compresses vectors using scalar quantization for moderate memory savings: Compresses vectors using RabitQ with random rotation and binary quantization for extreme compression: -| Column | Type | Nullable | Description | -| ----------------- | -------------------------- | -------- | --------------------------------------------------------------- | -| `_rowid` | uint64 | false | Row identifier | -| `_rabit_codes` | list<uint8>[dimension / 8] | false | Binary quantized codes (1 bit per dimension, packed into bytes) | -| `__add_factors` | float32 | false | Additive correction factors for distance computation | -| `__scale_factors` | float32 | false | Scale correction factors for distance computation | +| Column | Type | Nullable | Description | +| -------------------- | ------------------------------------------------ | ------------------------ | --------------------------------------------------------------- | +| `_rowid` | uint64 | false | Row identifier | +| `_rabit_codes` | list<uint8>[dimension / 8] | false | Binary quantized codes (1 bit per dimension, packed into bytes) | +| `__add_factors` | float32 | false | Additive correction factors for distance computation | +| `__scale_factors` | float32 | false | Scale correction factors for distance computation | +| `__error_factors` | float32 | false for `raw_query` | Error factors for raw-query lower-bound pruning | +| `__ex_codes` | list<uint8>[ceil(dimension * (num_bits - 1) / 8)] | false for `num_bits > 1` | Extra RabitQ code bits for multi-bit RQ | +| `__add_factors_ex` | float32 | false for `num_bits > 1` | Additive correction factors for ex-code distance computation | +| `__scale_factors_ex` | float32 | false for `num_bits > 1` | Scale correction factors for ex-code distance computation | ####
Arrow Schema Metadata @@ -248,9 +252,10 @@ For **RabitQ (RQ)**: | JSON Key | Type | Description | | --------------------- | ---- | ---------------------------------------------------- | | `rotate_mat_position` | u32 | Position of the rotation matrix in the global buffer | -| `num_bits` | u8 | Number of bits per dimension (currently always 1) | +| `num_bits` | u8 | Number of bits per dimension, in the range 1..=9 | | `code_dim` | u32 | Rotated vector dimension for the 1-bit binary code | | `packed` | bool | Whether codes are packed for optimized computation | +| `query_estimator` | string | Distance estimator layout: `residual_query` or `raw_query`. Missing values are read as `residual_query` for compatibility with released 1-bit IVF_RQ indexes. | #### Lance File Global Buffer @@ -274,9 +279,11 @@ to rotate vectors before binary quantization: ``` The rotation matrix has shape `[code_dim, code_dim]` where `code_dim` is the rotated vector -dimension. Current IVF_RQ stores the 1-bit binary code in `_rabit_codes`; future multi-bit support -will store the remaining `num_bits - 1` ex-code bits separately instead of widening this binary -code path. +dimension. IVF_RQ always stores the 1-bit binary sign code in `_rabit_codes`; for `num_bits > 1`, +the remaining `num_bits - 1` ex-code bits are stored in `__ex_codes` instead of widening the +binary code path. New IVF_RQ indexes store raw-query estimator factors. `num_bits=1` indexes only +store the binary-code factor columns; multi-bit indexes also store separate ex-code additive and +scale factors. ## Appendices @@ -319,7 +326,8 @@ pa.schema([ ### Appendix 2: Example IVF_RQ Format This example shows how an `IVF_RQ` index is physically laid out. Assume vectors have dimension 128, -RQ uses 1 bit per dimension (num_bits=1), and distance type is "l2". +RQ uses 1 bit per dimension (`num_bits=1`), and distance type is "l2". 
For `num_bits > 1`, the +auxiliary schema also includes `__ex_codes`, `__add_factors_ex`, and `__scale_factors_ex`. #### Index File @@ -340,7 +348,7 @@ RQ uses 1 bit per dimension (num_bits=1), and distance type is "l2". - Arrow Schema Metadata: - `"distance_type"` → `"l2"` - `"lance:ivf"` → tracks per-partition `offsets` and `lengths` (no centroids here) - - `"lance:rabit"` → `"{"rotate_mat_position":1,"num_bits":1,"packed":true}"` + - `"lance:rabit"` → `"{"rotate_mat_position":1,"num_bits":1,"packed":true,"query_estimator":"raw_query"}"` - Lance File Global buffer: - `Tensor` rotation matrix with shape `[code_dim, code_dim]` = `[128, 128]` (float32) - Rows with Arrow schema: @@ -351,6 +359,7 @@ pa.schema([ pa.field("_rabit_codes", pa.list(pa.uint8(), list_size=16)), # dimension/8 = 128/8 = 16 bytes pa.field("__add_factors", pa.float32()), pa.field("__scale_factors", pa.float32()), + pa.field("__error_factors", pa.float32()), ]) ``` diff --git a/docs/src/format/table/transaction.md b/docs/src/format/table/transaction.md index d1a5191bf54..436d857c36a 100644 --- a/docs/src/format/table/transaction.md +++ b/docs/src/format/table/transaction.md @@ -457,6 +457,7 @@ The following operations are retryable conflicts with DataReplacement: - CreateIndex (only if the field being replaced is being indexed) - Rewrite (only if overlapping fragments) - Update (only if overlapping fragments) +- Delete (only if overlapping fragments) - Merge (always) ### UpdateMemWalState diff --git a/docs/src/guide/blob.md b/docs/src/guide/blob.md index b1f956a19e7..dd13fcaab34 100644 --- a/docs/src/guide/blob.md +++ b/docs/src/guide/blob.md @@ -95,6 +95,16 @@ Note: - By default, external blob URIs must map to a registered non-dataset-root base path. - If you need to reference external objects outside those bases, set `allow_external_blob_outside_bases=True` when writing. 
+- Blob v2 storage layout thresholds can be configured per column with + `blob_field(..., inline_size_threshold=..., dedicated_size_threshold=...)`. + The inline threshold controls when values move from the data file to packed + `.blob` sidecar storage. The dedicated threshold controls when values move + from packed sidecar storage to a dedicated `.blob` file. The dedicated + threshold is checked first. For existing columns, these thresholds are stored + in the dataset schema; appends that explicitly provide different threshold + metadata for the same column are rejected. +- `blob_pack_file_size_threshold` is a write option for rolling packed `.blob` + sidecar files. It does not control inline-vs-packed placement. ### Example: packed external blobs (single container file) diff --git a/docs/src/guide/distributed_indexing.md b/docs/src/guide/distributed_indexing.md index ae17b9bb0f2..389e5a1bc09 100644 --- a/docs/src/guide/distributed_indexing.md +++ b/docs/src/guide/distributed_indexing.md @@ -18,9 +18,10 @@ write: 3. Lance plans and builds index artifacts from the worker outputs supplied by the caller 4. the built artifacts are committed into the dataset manifest -For vector indices, the worker outputs are segments stored directly -under `indices/<segment_uuid>/`. Lance can turn these outputs into one or more -physical segments and then commit them as one logical index. +For vector indices and segment-native scalar indices, the worker outputs are +segments stored directly under `indices/<segment_uuid>/`. Lance can turn these +outputs into one or more physical segments and then commit them as one logical +index. ![Distributed Vector Segment Build](../images/distributed_vector_segment_build.svg) @@ -81,7 +82,7 @@ launching workers and driving the overall workflow. ## Current Model -The current model for distributed vector indexing has two layers of parallelism.
### Worker Build @@ -105,9 +106,46 @@ or merged into larger segments: Within a single commit, built segments must have disjoint fragment coverage. +`merge_existing_index_segments(...)` currently supports vector, inverted, +bitmap, BTree, and zone map segments. Other scalar index families can still +commit multiple compatible segments directly when their build path supports +fragment-scoped segments, but cannot be merged into a larger physical segment +until they add a merge implementation. + +### Vector Model Scope + +Distributed vector builds support two model scopes. + +**Shared model artifacts**: the caller trains or provides IVF centroids once and +passes the same artifacts to every worker. For IVF-PQ segments that should be +physically mergeable, workers should also use the same PQ codebook. This makes +partition ids and quantizer state have the same meaning across segments. + +**Independent segment models**: each worker trains the IVF/PQ model for its own +`fragment_ids`. The resulting segments can be committed together as one logical +index without sharing centroids or codebooks. + +At query time, Lance searches each physical segment independently: + +1. Lance opens each segment by index UUID +2. each segment ranks IVF partitions using its own centroids +3. each segment searches the selected partitions using its own quantizer storage +4. Lance merges the candidate rows from all segments by `_distance` + +Because partition ids are interpreted only within a segment during this fanout +query path, independently trained committed segments can return valid results. +For L2 and cosine IVF-PQ, each segment computes residuals against its own IVF +centroid during both build and query, so distances remain estimates of the +original query-to-vector metric. + +Physical merge is a separate operation. It rewrites several segment artifacts +into one artifact with one model metadata scope. 
Use shared compatible model +artifacts for segments you plan to merge physically, or keep independently +trained segments as separate physical segments. + ## Internal Finalize Model -Internally, Lance models distributed vector segment build as: +Internally, Lance models distributed segment build as: 1. **build** one uncommitted segment per worker 2. **optionally merge** caller-defined groups of existing segments diff --git a/docs/src/guide/object_store.md b/docs/src/guide/object_store.md index 1710e3b5100..f901d2c2411 100644 --- a/docs/src/guide/object_store.md +++ b/docs/src/guide/object_store.md @@ -218,3 +218,238 @@ ds = lance.dataset( | `oss_secret_access_key` | Access key secret used for OSS authentication. Optional if credentials are provided by environment. | | `oss_region` | OSS region (for example, `cn-hangzhou`). Optional. | | `oss_security_token` | Security token for temporary credentials (STS). Optional. | + +## Volcengine TOS Configuration + +TOS credentials can be set in the environment variables `TOS_ACCESS_KEY_ID`, +`TOS_SECRET_ACCESS_KEY`, `TOS_ENDPOINT`, `TOS_REGION`, and `TOS_SECURITY_TOKEN`. +Lance also accepts the corresponding `VOLCENGINE_` environment variable prefix. +Alternatively, credentials can be passed as parameters to the `storage_options` +parameter; explicit `storage_options` override environment variables: + +```python +import lance +ds = lance.dataset( + "tos://bucket/path", + storage_options={ + "tos_endpoint": "https://tos-cn-beijing.volces.com", + "tos_region": "cn-beijing", + "tos_access_key_id": "my-access-key", + "tos_secret_access_key": "my-secret-key", + "tos_security_token": "my-session-token", + } +) +``` + +| Key | Description | +|-----|-------------| +| `tos_endpoint` | TOS endpoint. Required (for example, `https://tos-cn-beijing.volces.com`). | +| `tos_region` | TOS signing region (for example, `cn-beijing`). Optional. | +| `tos_access_key_id` | Access key ID used for TOS authentication. 
Optional if credentials are provided by environment. | +| `tos_secret_access_key` | Secret access key used for TOS authentication. Optional if credentials are provided by environment. | +| `tos_security_token` | Security token for temporary credentials. Optional. | + +## Tencent Cloud COS Configuration + +[COS (Cloud Object Storage)](https://cloud.tencent.com/product/cos) credentials can be set in environment variables prefixed +with `COS_` or `TENCENTCLOUD_` (for example, `COS_ENDPOINT`, `COS_SECRET_ID`, +`COS_SECRET_KEY`, `TENCENTCLOUD_REGION`, `TENCENTCLOUD_SECURITY_TOKEN`). +Alternatively, credentials can be passed as parameters to the `storage_options` +parameter; explicit `storage_options` override environment variables: + +=== "Python" + + ```python + import lance + ds = lance.dataset( + "cos://bucket/path", + storage_options={ + "cos_endpoint": "https://cos.ap-guangzhou.myqcloud.com", + "cos_secret_id": "my-secret-id", + "cos_secret_key": "my-secret-key", + } + ) + ``` + +=== "Rust" + + In this Lance distribution, `tencent` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `tencent` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["tencent", ...] }`. + - You depend on `lance-io` directly (without `lance`); `tencent` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["tencent"] }`. + +| Key | Description | +|-----|-------------| +| `cos_endpoint` | COS endpoint. Required (for example, `https://cos.ap-guangzhou.myqcloud.com`). Can also be set via the `COS_ENDPOINT` environment variable. | +| `cos_secret_id` | Secret ID used for COS authentication. Optional if credentials are provided by environment. | +| `cos_secret_key` | Secret key used for COS authentication. 
Optional if credentials are provided by environment. | +| `cos_enable_versioning` | Whether to enable object versioning on the bucket. Optional. | + +!!! note + + The OpenDAL `CosConfig` currently exposes a limited set of options. Additional + settings such as the security token (`TENCENTCLOUD_SECURITY_TOKEN`) and region + (`TENCENTCLOUD_REGION`) must be configured via environment variables. + +## GooseFS Configuration + +[GooseFS](https://cloud.tencent.com/product/goosefs) is a distributed caching +filesystem. Lance accesses GooseFS through its Master gRPC service. The URL format +is `goosefs://host:port/path`, where `host:port` is the GooseFS Master address +(default port: `9200`, may be omitted, e.g. `goosefs://10.0.0.1/path`) and +`/path` is the filesystem path within GooseFS. + +!!! note "About the dataset path" + + `/path` is just an arbitrary directory inside GooseFS — Lance does **not** + require the path to end with a `.lance` suffix. Any valid GooseFS directory + works, for example: + + - `goosefs://10.0.0.1:9200/data/my-dataset` + - `goosefs://10.0.0.1:9200/data/my-dataset.lance` + - `goosefs://10.0.0.1:9200/lance-test/lance-io` + + The `.lance` suffix used in the examples below is only a naming convention + that makes it easy to recognize a Lance dataset directory at a glance; it + has no special meaning to Lance itself. The only requirement is that the + same path is used consistently for reads and writes of a given dataset. 
+ +=== "Python" + + ```python + import lance + + ds = lance.dataset( + "goosefs://10.0.0.1:9200/data/my-dataset.lance", + storage_options={ + "goosefs_auth_type": "simple", + "goosefs_auth_username": "lance", + }, + ) + ``` + +=== "Rust" + + In this Lance distribution, `goosefs` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `goosefs` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["goosefs", ...] }`. + - You depend on `lance-io` directly (without `lance`); `goosefs` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["goosefs"] }`. + + Open the underlying `lance_io::object_store::ObjectStore` directly (mirrors + the integration test in `rust/lance-io/tests/goosefs_integration.rs`): + + ```rust + use lance_io::object_store::ObjectStore; + + let uri = "goosefs://10.0.0.1:9200/lance-test/lance-io"; + let (store, path) = ObjectStore::from_uri(uri).await?; + + // Read / write through the underlying `object_store::ObjectStore` API + store.inner.put(&path, (&b"hello"[..]).into()).await?; + let result = store.inner.get(&path).await?; + let bytes = result.bytes().await?; + ``` + + Open a Lance dataset with custom storage options: + + ```rust + use std::collections::HashMap; + use lance::dataset::DatasetBuilder; + + let mut storage_options = HashMap::new(); + storage_options.insert("goosefs_master_addr".to_string(), "10.0.0.1:9200".to_string()); + storage_options.insert("goosefs_auth_type".to_string(), "simple".to_string()); + storage_options.insert("goosefs_auth_username".to_string(), "lance".to_string()); + + let dataset = DatasetBuilder::from_uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .with_storage_options(storage_options) + .load() + .await?; + ``` + +=== "Java" + + Pass 
the GooseFS configuration through `ReadOptions.setStorageOptions` + when opening the dataset: + + ```java + import org.lance.Dataset; + import org.lance.ReadOptions; + + import java.util.HashMap; + import java.util.Map; + + Map<String, String> storageOptions = new HashMap<>(); + storageOptions.put("goosefs_master_addr", "10.0.0.1:9200"); + storageOptions.put("goosefs_auth_type", "simple"); + storageOptions.put("goosefs_auth_username", "lance"); + + ReadOptions options = new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .build(); + + try (Dataset dataset = Dataset.open() + .uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .readOptions(options) + .build()) { + // ... use the dataset + } + ``` + + For writes, the same `storageOptions(...)` setter is available on + `WriteDatasetBuilder` and `WriteFragmentBuilder`. + +The Master address can be resolved from (in priority order): + +1. The `goosefs_master_addr` storage option (supports HA: `"addr1:port,addr2:port"`). +2. The `GOOSEFS_MASTER_ADDR` environment variable. +3. The host and port from the URL authority. + +The following keys can be used either as environment variables or as keys in the +`storage_options` parameter: + +| Key | Description | +|-----|-------------| +| `goosefs_master_addr` / `GOOSEFS_MASTER_ADDR` | GooseFS Master address. Supports a single address (`host:port`) or comma-separated HA addresses (`addr1:port,addr2:port`). Optional if the address is provided in the URL. | +| `goosefs_write_type` / `GOOSEFS_WRITE_TYPE` | Write type, e.g. `MUST_CACHE`, `CACHE_THROUGH`, `THROUGH`, `ASYNC_THROUGH`. Optional. | +| `goosefs_block_size` / `GOOSEFS_BLOCK_SIZE` | GooseFS block size in bytes (this is the GooseFS-side block size, not Lance's I/O block size). Optional. | +| `goosefs_chunk_size` / `GOOSEFS_CHUNK_SIZE` | Chunk size in bytes used when reading or writing files. Optional. | +| `goosefs_auth_type` / `GOOSEFS_AUTH_TYPE` | Authentication type.
Either `nosasl` or `simple` (case-insensitive; the value is passed through to OpenDAL). Optional. | +| `goosefs_auth_username` / `GOOSEFS_AUTH_USERNAME` | Username used in `simple` authentication mode. Optional. | + +!!! note "Running the GooseFS integration tests" + + The Rust integration tests for GooseFS live at + `rust/lance-io/tests/goosefs_integration.rs` and are gated behind feature + flags. They require a reachable GooseFS cluster (configured via the + `GOOSEFS_MASTER_ADDR` and `GOOSEFS_AUTH_TYPE` environment variables) and + can be run with: + + ```bash + cargo test -p lance-io --features "goosefs goosefs-test" \ + --test goosefs_integration -- --ignored --nocapture --test-threads=1 + ``` diff --git a/docs/src/quickstart/index.md b/docs/src/quickstart/index.md index 606948263c4..34367c7177f 100644 --- a/docs/src/quickstart/index.md +++ b/docs/src/quickstart/index.md @@ -22,17 +22,17 @@ For the latest features and bug fixes, you can install the preview version: === "pip" ```bash - pip install --pre --extra-index-url https://pypi.fury.io/lance-format/ pylance + pip install --pre --extra-index-url https://pypi.fury.io/lance-format pylance ``` === "uv" ```bash uv venv - uv pip install --prerelease allow --index https://pypi.fury.io/lance-format/ pylance + uv pip install --prerelease allow --index https://pypi.fury.io/lance-format pylance # To add to pyproject.toml, just do: - uv add --prerelease allow --index https://pypi.fury.io/lance-format/ pylance + uv add --prerelease allow --index https://pypi.fury.io/lance-format pylance ``` !!! 
note diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 79d554807bd..d8131377563 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -42,21 +42,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "allocator-api2" version = "0.2.21" @@ -426,7 +411,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -437,7 +422,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -491,7 +476,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -551,7 +536,7 @@ dependencies = [ "bytes", "bytes-utils", "fastrand", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "percent-encoding", "pin-project-lite", @@ -578,7 +563,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -602,7 +587,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -627,7 +612,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -647,7 +632,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -677,7 +662,7 @@ dependencies = [ "bytes-utils", "futures-core", 
"futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -696,7 +681,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -753,7 +738,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -773,7 +758,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -790,7 +775,7 @@ dependencies = [ "bytes", "bytes-utils", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -836,7 +821,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -869,7 +854,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -929,9 +914,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bitpacking" @@ -944,9 +929,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -988,9 +973,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1004,27 +989,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -1065,9 +1029,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1090,9 +1054,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1137,9 +1101,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", 
"js-sys", @@ -1200,7 +1164,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1563,7 +1527,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn", ] [[package]] @@ -1574,7 +1538,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1610,7 +1574,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1632,7 +1595,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1707,7 +1669,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -1825,36 +1786,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -2063,7 +1994,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2259,26 +2190,6 @@ dependencies = [ "url", ] -[[package]] -name = 
"deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -2318,7 +2229,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2353,7 +2264,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2525,7 +2436,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -2569,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2646,7 +2556,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2863,17 +2773,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", 
"rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -2894,7 +2802,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2915,18 +2823,42 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3035,7 +2967,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3067,6 +2999,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3080,9 +3023,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3106,7 +3049,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3117,7 +3060,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "pin-project-lite", ] @@ -3160,7 +3103,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ -3177,7 +3120,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3185,6 +3128,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -3197,7 +3154,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -3415,12 +3372,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -3481,12 +3432,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - [[package]] name = "io-uring" version = "0.7.12" @@ -3568,7 +3513,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3629,7 +3574,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn", ] [[package]] @@ -3657,7 +3602,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3672,13 +3617,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.99" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", - "once_cell", "wasm-bindgen", ] @@ -3695,7 +3639,7 @@ dependencies = [ "jiff", "nom", "num-traits", - "ordered-float 5.3.0", + "ordered-float", "rand 0.9.4", "serde", "serde_json", @@ -3721,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -3750,7 +3694,6 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-plan", - "deepsize", "either", "fst", "futures", @@ -3795,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = 
"8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -3837,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -3846,21 +3789,23 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", + "libm", "log", "moka", "num_cpus", @@ -3876,12 +3821,13 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3913,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3926,12 +3872,20 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", +] + +[[package]] +name = "lance-derive" +version = "9.0.0-beta.2" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3966,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3979,7 +3933,6 @@ dependencies = [ "byteorder", "bytes", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -3997,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4011,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" 
+version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +3985,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "fst", "futures", @@ -4055,7 +4007,7 @@ dependencies = [ "lance-select", "lance-table", "lance-tokenizer", - "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -4067,6 +4019,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", @@ -4074,13 +4027,12 @@ dependencies = [ "tempfile", "tokio", "tracing", - "twox-hash", "uuid", ] [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4097,9 +4049,8 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4122,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4158,23 +4109,23 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", "cc", - "deepsize", "half", "lance-arrow", "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4186,7 +4137,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4194,6 +4145,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ -4206,19 +4159,22 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] name = 
"lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4230,14 +4186,13 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", "byteorder", "bytes", - "deepsize", "itertools 0.13.0", "lance-core", "roaring", @@ -4246,7 +4201,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4257,7 +4212,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4284,11 +4238,12 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -4301,12 +4256,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -4385,11 +4334,20 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = 
"c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -4420,9 +4378,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -4530,9 +4488,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "mime" @@ -4745,7 +4703,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4790,7 +4748,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ -4867,9 +4825,11 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -4882,7 +4842,7 @@ dependencies = [ "base64", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -4907,7 +4867,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -4951,7 +4911,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -4972,7 +4932,7 @@ checksum = 
"6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -4990,7 +4950,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -5001,7 +4961,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5019,7 +4979,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5032,6 +4992,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -5040,7 +5014,7 @@ checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5056,7 +5030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5075,7 +5049,7 @@ dependencies = [ "base64", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -5087,6 +5061,23 @@ dependencies = [ 
"url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.2", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -5099,15 +5090,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - [[package]] name = "ordered-float" version = "5.3.0" @@ -5171,42 +5153,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.17.1", - "lz4_flex", - "num-bigint", - "num-integer", - "num-traits", - "object_store", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - [[package]] name = "paste" version = "1.0.15" @@ -5350,7 +5296,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -5457,7 +5403,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn", ] [[package]] @@ -5480,9 +5426,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -5490,9 +5436,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools 0.14.0", @@ -5503,28 +5449,28 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn", "tempfile", ] [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -5659,7 +5605,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -5726,19 
+5672,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -5817,14 +5750,14 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] name = "regex" -version = "1.12.3" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -5851,9 +5784,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "regress" @@ -5873,7 +5806,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -5892,7 +5825,7 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -5914,7 +5847,7 @@ dependencies = [ "base64", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -5938,7 +5871,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -5968,7 +5901,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -5986,7 +5919,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -5994,6 +5927,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" +dependencies = [ + "anyhow", + "http 1.4.2", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" @@ -6006,7 +5952,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6037,6 +5983,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", + "webpki-roots", ] [[package]] @@ -6049,7 +5996,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6087,7 +6034,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -6369,7 +6316,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn", ] [[package]] @@ -6461,7 +6408,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6472,7 +6419,7 @@ checksum = 
"18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6507,7 +6454,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6519,7 +6466,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn", ] [[package]] @@ -6536,9 +6483,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = "76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64", "bs58", @@ -6556,14 +6503,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6704,9 +6651,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -6726,15 +6673,9 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = 
"0.6.4" @@ -6791,7 +6732,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6828,6 +6769,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" @@ -6853,7 +6803,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn", ] [[package]] @@ -6876,7 +6826,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn", "typify", "walkdir", ] @@ -6895,20 +6845,9 @@ checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" [[package]] name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -6932,7 +6871,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6989,7 +6928,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + 
"getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -7021,7 +6960,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7032,7 +6971,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7053,17 +6992,6 @@ dependencies = [ "cfg-if 1.0.4", ] -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float 2.10.1", -] - [[package]] name = "time" version = "0.3.47" @@ -7155,14 +7083,14 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] name = "tokio-retry" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" +checksum = "4a129d95275ebf4c493ec53bf0f8cd95f5ac161bc4f381700809a54f595d4470" dependencies = [ "pin-project-lite", "rand 0.10.1", @@ -7234,6 +7162,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "h2", + "http 1.4.2", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -7242,9 +7209,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -7258,7 +7228,7 @@ checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "bitflags", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -7278,7 +7248,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -7335,7 +7305,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7442,7 +7412,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn", "thiserror 2.0.18", "unicode-ident", ] @@ -7460,7 +7430,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn", "typify-impl", ] @@ -7497,12 +7467,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -7553,11 +7517,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.2" +version = "1.23.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d258b83ceec21034727ecee8c382cfa6c3e133699b0742c64571814fb420c9f7" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -7617,20 +7581,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -7644,9 +7599,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -7657,9 +7612,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.72" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -7667,9 +7622,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7677,48 +7632,26 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -7745,23 +7678,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.99" 
+version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7779,9 +7700,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "webpki-roots" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -7883,7 +7813,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7894,7 +7824,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -8182,100 +8112,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - 
"anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -8329,7 +8171,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -8370,7 +8212,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -8403,7 +8245,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", 
"lazy_static", "more-asserts", @@ -8477,9 +8319,9 @@ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -8494,28 +8336,28 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.50" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.50" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -8535,15 +8377,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -8577,15 +8419,9 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.117", + "syn", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index e1478b27aeb..3626d7aad3e 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" @@ -23,7 +23,7 @@ lance-linalg = { path = "../../rust/lance-linalg" } lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } -lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "dir-goosefs"] } lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } lance-table = { path = "../../rust/lance-table" } diff --git a/java/lance-jni/src/async_scanner.rs b/java/lance-jni/src/async_scanner.rs index 7cb71c37086..6da10479266 100644 --- a/java/lance-jni/src/async_scanner.rs +++ b/java/lance-jni/src/async_scanner.rs @@ -193,6 +193,9 @@ pub extern "system" fn Java_org_lance_ipc_AsyncScanner_createAsyncScanner<'local use_scalar_index: jboolean, fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, + include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> JObject<'local> { crate::ok_or_throw!( env, @@ -216,6 +219,9 @@ pub extern "system" fn Java_org_lance_ipc_AsyncScanner_createAsyncScanner<'local use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + 
disable_scoring_autoprojection, ) ) } @@ -241,6 +247,9 @@ fn inner_create_async_scanner<'local>( use_scalar_index: jboolean, fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, + include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> Result> { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) }?; @@ -265,6 +274,9 @@ fn inner_create_async_scanner<'local>( use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, }; let scanner = build_scanner_with_options(env, &dataset, options)?; diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index cd43c69d61a..1d06f3eed87 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -61,6 +61,7 @@ use std::iter::empty; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; +use uuid::Uuid; pub const NATIVE_DATASET: &str = "nativeDatasetHandle"; @@ -200,7 +201,8 @@ impl BlockingDataset { if namespace_client_managed_versioning && let (Some(namespace_client), Some(tid)) = (namespace, table_id) { - let external_store = LanceNamespaceExternalManifestStore::new(namespace_client, tid); + let external_store = + LanceNamespaceExternalManifestStore::for_table_uri(namespace_client, tid, uri)?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); @@ -688,8 +690,11 @@ fn create_dataset<'local>( if let Some((namespace, table_id)) = namespace_info { // Set up commit handler only if namespace manages versioning if namespace_client_managed_versioning { - let external_store = - LanceNamespaceExternalManifestStore::new(namespace.clone(), table_id.clone()); + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + namespace.clone(), + table_id.clone(), + 
&path_str, + )?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); @@ -951,7 +956,13 @@ fn inner_create_index<'local>( let fragment_ids = env .get_ints_opt(&fragments_jobj)? .map(|vec| vec.into_iter().map(|i| i as u32).collect()); - let index_uuid = env.get_string_opt(&index_uuid_jobj)?; + let index_uuid = env + .get_string_opt(&index_uuid_jobj)? + .map(|s| { + Uuid::parse_str(&s) + .map_err(|e| Error::input_error(format!("Invalid UUID string for index_uuid: {e}"))) + }) + .transpose()?; let arrow_stream_addr_opt = env.get_long_opt(&arrow_stream_addr_jobj)?; let batch_reader = if let Some(arrow_stream_addr) = arrow_stream_addr_opt { let stream_ptr = arrow_stream_addr as *mut FFI_ArrowArrayStream; @@ -974,6 +985,7 @@ fn inner_create_index<'local>( | IndexType::NGram | IndexType::ZoneMap | IndexType::BloomFilter + | IndexType::Fm | IndexType::RTree => { // For scalar indices, create a scalar IndexParams let (index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?; @@ -1088,7 +1100,9 @@ fn inner_merge_index_metadata( index_type_code_jobj: jint, batch_readhead_jobj: JObject, // Optional ) -> Result<()> { - let index_uuid = index_uuid.extract(env)?; + let index_uuid_str = index_uuid.extract(env)?; + let index_uuid = Uuid::parse_str(&index_uuid_str) + .map_err(|e| Error::input_error(format!("Invalid UUID string for index_uuid: {e}")))?; let index_type = IndexType::try_from(index_type_code_jobj)?; let batch_readhead = env .get_int_opt(&batch_readhead_jobj)? 
diff --git a/java/lance-jni/src/blocking_scanner.rs b/java/lance-jni/src/blocking_scanner.rs index f18b0d92a27..335cb2a4fa3 100644 --- a/java/lance-jni/src/blocking_scanner.rs +++ b/java/lance-jni/src/blocking_scanner.rs @@ -30,6 +30,7 @@ use crate::{ RT, blocking_dataset::{BlockingDataset, NATIVE_DATASET}, traits::IntoJava, + utils::parse_approx_mode, }; pub const NATIVE_SCANNER: &str = "nativeScannerHandle"; @@ -247,6 +248,9 @@ pub(crate) struct ScannerOptions<'a> { pub use_scalar_index: jboolean, pub fast_search: jboolean, pub substrait_aggregate_obj: JObject<'a>, + pub include_deleted_rows: jboolean, + pub strict_batch_size: jboolean, + pub disable_scoring_autoprojection: jboolean, } /// Build a scanner with options applied - shared by blocking and async scanners @@ -354,6 +358,9 @@ pub(crate) fn build_scanner_with_options<'a>( .call_method(&java_obj, "getQueryParallelism", "()I", &[])? .i()?; scanner.query_parallelism(query_parallelism); + + let approx_mode_str = env.get_string_from_method(&java_obj, "getApproxModeString")?; + scanner.approx_mode(parse_approx_mode(&approx_mode_str)?); Ok(()) })?; @@ -394,6 +401,16 @@ pub(crate) fn build_scanner_with_options<'a>( scanner.aggregate(AggregateExpr::substrait(substrait_aggregate))?; } + if options.include_deleted_rows == JNI_TRUE { + scanner.include_deleted_rows(); + } + + scanner.strict_batch_size(options.strict_batch_size == JNI_TRUE); + + if options.disable_scoring_autoprojection == JNI_TRUE { + scanner.disable_scoring_autoprojection(); + } + Ok(scanner) } @@ -423,6 +440,9 @@ pub extern "system" fn Java_org_lance_ipc_LanceScanner_createScanner<'local>( fast_search: jboolean, // boolean substrait_aggregate_obj: JObject<'local>, // Optional collect_stats: jboolean, // boolean + include_deleted_rows: jboolean, // boolean + strict_batch_size: jboolean, // boolean + disable_scoring_autoprojection: jboolean, // boolean ) -> JObject<'local> { ok_or_throw!( env, @@ -447,6 +467,9 @@ pub extern "system" fn 
Java_org_lance_ipc_LanceScanner_createScanner<'local>( fast_search, substrait_aggregate_obj, collect_stats, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, ) ) } @@ -473,6 +496,9 @@ fn inner_create_scanner<'local>( fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, collect_stats: jboolean, + include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> Result> { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) }?; @@ -497,6 +523,9 @@ fn inner_create_scanner<'local>( use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, }; let scanner = build_scanner_with_options(env, &dataset, options)?; diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index a6798c2f237..d6603925947 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -4,7 +4,7 @@ use arrow::array::{RecordBatch, RecordBatchIterator, StructArray}; use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi_and_data_type}; use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; -use arrow_schema::DataType; +use arrow_schema::{DataType, Schema as ArrowSchema}; use jni::objects::{JIntArray, JValue, JValueGen}; use jni::{ JNIEnv, @@ -19,7 +19,7 @@ use lance_io::utils::CachedFileSize; use lance_table::rowids::{RowIdSequence, write_row_ids}; use std::iter::once; -use lance::dataset::fragment::FileFragment; +use lance::dataset::fragment::write::FragmentCreateBuilder; use lance::io::ObjectStoreParams; use lance_datafusion::utils::StreamingWriteSource; use lance_io::object_store::{LanceNamespaceStorageOptionsProvider, StorageOptionsProvider}; @@ -108,6 +108,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // 
Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -130,6 +131,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, ), JObject::default() ) @@ -155,6 +157,7 @@ fn inner_create_with_ffi_array<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -186,6 +189,7 @@ fn inner_create_with_ffi_array<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, reader, ) } @@ -210,6 +214,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -231,6 +236,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, ), JObject::null() ) @@ -255,6 +261,7 @@ fn inner_create_with_ffi_stream<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -276,6 +283,7 @@ fn inner_create_with_ffi_stream<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, reader, ) } @@ -298,6 +306,7 @@ fn 
create_fragment<'a>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -345,11 +354,19 @@ fn create_fragment<'a>( }); } - let fragments = RT.block_on(FileFragment::create_fragments( - &path_str, - source, - Some(write_params), - ))?; + let mut builder = FragmentCreateBuilder::new(&path_str).write_params(&write_params); + let schema; + if schema_addr != 0 { + let c_schema_ptr = schema_addr as *mut FFI_ArrowSchema; + let c_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema_ptr) }; + let arrow_schema = ArrowSchema::try_from(&c_schema)?; + // Schema::try_from restores Lance field IDs from the LANCE_FIELD_ID_KEY + // metadata inserted by LanceSchema.asArrowSchemaWithFieldIds(). + schema = Schema::try_from(&arrow_schema)?; + builder = builder.schema(&schema); + } + + let fragments = RT.block_on(builder.write_fragments(source))?; export_vec(env, &fragments) } diff --git a/java/lance-jni/src/index.rs b/java/lance-jni/src/index.rs index 1e533eed9fc..6cb64a05a81 100644 --- a/java/lance-jni/src/index.rs +++ b/java/lance-jni/src/index.rs @@ -173,6 +173,8 @@ fn determine_index_type<'local>( Some("ZONEMAP") } else if lower.contains("bloomfilter") { Some("BLOOM_FILTER") + } else if lower.contains("rtree") { + Some("RTREE") } else if lower.contains("ivfhnsw") { if lower.contains("sq") { Some("IVF_HNSW_SQ") diff --git a/java/lance-jni/src/mem_wal.rs b/java/lance-jni/src/mem_wal.rs index 9ba3fdd7440..20404b6a88b 100644 --- a/java/lance-jni/src/mem_wal.rs +++ b/java/lance-jni/src/mem_wal.rs @@ -27,6 +27,7 @@ use jni::sys::{jdouble, jint, jlong}; use lance::dataset::Dataset as LanceDataset; use lance::dataset::mem_wal::scanner::{ FlushedGeneration, LsmDataSourceCollector, LsmPointLookupPlanner, LsmVectorSearchPlanner, + write_pk_sidecar, }; use 
lance::dataset::mem_wal::write::{MemTableStats, WriteStatsSnapshot}; use lance::dataset::mem_wal::{ @@ -180,6 +181,42 @@ fn inner_put(env: &mut JNIEnv, this: JObject, stream_addr: jlong) -> Result<()> Ok(()) } +/// Test-support: write a primary-key dedup sidecar (`_pk_index/`) for a +/// flushed-generation dataset already staged at `gen_path`, mirroring what +/// production flush emits. Lets Java tests stage a *faithful* flushed +/// generation (dataset + sidecar); production always writes the sidecar during +/// flush, so a dataset-without-sidecar is not a state the system produces. +/// Mirrors the Python `_write_pk_sidecar` binding. +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_memwal_MemWalTest_nativeWritePkSidecar( + mut env: JNIEnv, + _class: JClass, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) { + ok_or_throw_without_return!( + env, + inner_write_pk_sidecar(&mut env, gen_path, stream_addr, pk_columns) + ); +} + +fn inner_write_pk_sidecar( + env: &mut JNIEnv, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) -> Result<()> { + let gen_path: String = env.get_string(&gen_path)?.into(); + let pk_columns = env.get_strings(&pk_columns)?; + let stream_ptr = stream_addr as *mut FFI_ArrowArrayStream; + let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; + let batches: Vec = reader.collect::>()?; + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + RT.block_on(write_pk_sidecar(&gen_path, &batches, &pk_refs))?; + Ok(()) +} + #[unsafe(no_mangle)] pub extern "system" fn Java_org_lance_memwal_ShardWriter_nativeStats<'local>( mut env: JNIEnv<'local>, diff --git a/java/lance-jni/src/merge_insert.rs b/java/lance-jni/src/merge_insert.rs index 0898d1b049f..df4d63bd2f6 100644 --- a/java/lance-jni/src/merge_insert.rs +++ b/java/lance-jni/src/merge_insert.rs @@ -51,6 +51,7 @@ fn inner_merge_insert<'local>( let conflict_retries = extract_conflict_retries(env, &jparam)?; let 
retry_timeout_ms = extract_retry_timeout_ms(env, &jparam)?; let skip_auto_cleanup = extract_skip_auto_cleanup(env, &jparam)?; + let use_index = extract_use_index(env, &jparam)?; let marked_generations = extract_marked_generations(env, &jparam)?; let (new_ds, merge_stats) = unsafe { @@ -69,6 +70,7 @@ fn inner_merge_insert<'local>( .conflict_retries(conflict_retries) .retry_timeout(Duration::from_millis(retry_timeout_ms as u64)) .skip_auto_cleanup(skip_auto_cleanup) + .use_index(use_index) .mark_generations_as_merged(marked_generations) .try_build()?; @@ -234,6 +236,11 @@ fn extract_skip_auto_cleanup<'local>(env: &mut JNIEnv<'local>, jparam: &JObject) Ok(skip_auto_cleanup) } +fn extract_use_index<'local>(env: &mut JNIEnv<'local>, jparam: &JObject) -> Result { + let use_index = env.call_method(jparam, "useIndex", "()Z", &[])?.z()?; + Ok(use_index) +} + fn extract_marked_generations<'local>( env: &mut JNIEnv<'local>, jparam: &JObject, diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 6bc1948ae6a..4f899f56ff2 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -774,12 +774,18 @@ fn inner_commit_to_dataset<'local>( // Set namespace commit handler only if namespace_client_managed_versioning is true let namespace_info = extract_namespace_info(env, &namespace_obj, &table_id_obj)?; let commit_handler = if namespace_client_managed_versioning { - namespace_info.map(|(ns, tid)| { - let external_store = LanceNamespaceExternalManifestStore::new(ns, tid); - Arc::new(ExternalManifestCommitHandler { - external_manifest_store: Arc::new(external_store), - }) as Arc - }) + match namespace_info { + Some((ns, tid)) => { + // The store derives the branch a request targets from the base + // path it is handed, resolved against the table root. 
+ let table_root = java_blocking_ds.inner.branch_location().find_main()?.path; + let external_store = LanceNamespaceExternalManifestStore::new(ns, tid, table_root); + Some(Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new(external_store), + }) as Arc) + } + None => None, + } } else { None }; @@ -1560,7 +1566,8 @@ fn inner_commit_to_uri<'local>( // Set namespace commit handler only if namespace_client_managed_versioning is true if namespace_client_managed_versioning && let Some((namespace_client, tid)) = namespace_info { - let external_store = LanceNamespaceExternalManifestStore::new(namespace_client, tid); + let external_store = + LanceNamespaceExternalManifestStore::for_table_uri(namespace_client, tid, &uri_str)?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 1321e8e71e3..94372ef27cc 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -25,7 +25,7 @@ use crate::error::{Error, Result}; use crate::ffi::JNIEnvExt; use crate::traits::FromJObjectWithEnv; -use lance_index::vector::Query; +use lance_index::vector::{ApproxMode, Query}; use std::collections::HashMap; use std::str::FromStr; @@ -76,6 +76,18 @@ pub fn extract_base_store_params( }) } +pub(crate) fn parse_approx_mode(value: &str) -> Result { + match value { + "fast" => Ok(ApproxMode::Fast), + "normal" => Ok(ApproxMode::Normal), + "accurate" => Ok(ApproxMode::Accurate), + _ => Err(Error::input_error(format!( + "Invalid approx mode '{}'. Expected one of: fast, normal, accurate", + value + ))), + } +} + #[allow(clippy::too_many_arguments)] pub fn extract_write_params( env: &mut JNIEnv, @@ -253,6 +265,8 @@ pub fn get_query(env: &mut JNIEnv, query_obj: JObject) -> Result> let query_parallelism = env .call_method(&java_obj, "getQueryParallelism", "()I", &[])? 
.i()?; + let approx_mode_str = env.get_string_from_method(&java_obj, "getApproxModeString")?; + let approx_mode = parse_approx_mode(&approx_mode_str)?; Ok(Query { column, @@ -268,6 +282,7 @@ pub fn get_query(env: &mut JNIEnv, query_obj: JObject) -> Result> use_index, dist_q_c: 0.0, query_parallelism, + approx_mode, }) })?; diff --git a/java/pom.xml b/java/pom.xml index 4639cd74d2b..15d05b95d68 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.1 + 9.0.0-beta.2 jar Lance Format Java API diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index b27b189bf48..3b12e158617 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -18,6 +18,7 @@ import org.lance.ipc.LanceScanner; import org.lance.ipc.ScanOptions; import org.lance.namespace.LanceNamespace; +import org.lance.schema.LanceSchema; import org.apache.arrow.c.ArrowArray; import org.apache.arrow.c.ArrowArrayStream; @@ -260,6 +261,18 @@ static List create( WriteParams params, LanceNamespace namespaceClient, List tableId) { + return create(datasetUri, allocator, root, params, namespaceClient, tableId, null); + } + + /** Create a fragment from the given arrow array and schema. 
*/ + static List create( + String datasetUri, + BufferAllocator allocator, + VectorSchemaRoot root, + WriteParams params, + LanceNamespace namespaceClient, + List tableId, + LanceSchema schema) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(allocator); Preconditions.checkNotNull(root); @@ -267,6 +280,30 @@ static List create( try (ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator); ArrowArray arrowArray = ArrowArray.allocateNew(allocator)) { Data.exportVectorSchemaRoot(allocator, root, null, arrowArray, arrowSchema); + if (schema != null) { + try (ArrowSchema lanceSchema = ArrowSchema.allocateNew(allocator)) { + Data.exportSchema(allocator, schema.asArrowSchemaWithFieldIds(), null, lanceSchema); + return createWithFfiArray( + datasetUri, + arrowArray.memoryAddress(), + arrowSchema.memoryAddress(), + params.getMaxRowsPerFile(), + params.getMaxRowsPerGroup(), + params.getMaxBytesPerFile(), + params.getMode(), + params.getEnableStableRowIds(), + params.getDataStorageVersion(), + params.getStorageOptions(), + params.getBaseStoreParams(), + params.getInitialBases(), + params.getTargetBases(), + namespaceClient, + tableId, + params.getAllowExternalBlobOutsideBases(), + params.getBlobPackFileSizeThreshold(), + lanceSchema.memoryAddress()); + } + } return createWithFfiArray( datasetUri, arrowArray.memoryAddress(), @@ -284,7 +321,8 @@ static List create( namespaceClient, tableId, params.getAllowExternalBlobOutsideBases(), - params.getBlobPackFileSizeThreshold()); + params.getBlobPackFileSizeThreshold(), + 0L); } } @@ -295,9 +333,45 @@ static List create( WriteParams params, LanceNamespace namespaceClient, List tableId) { + return create(datasetUri, null, stream, params, namespaceClient, tableId, null); + } + + /** Create a fragment from the given arrow stream. 
*/ + static List create( + String datasetUri, + BufferAllocator allocator, + ArrowArrayStream stream, + WriteParams params, + LanceNamespace namespaceClient, + List tableId, + LanceSchema schema) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(stream); Preconditions.checkNotNull(params); + if (schema != null) { + Preconditions.checkNotNull(allocator, "allocator is required with schema"); + try (ArrowSchema lanceSchema = ArrowSchema.allocateNew(allocator)) { + Data.exportSchema(allocator, schema.asArrowSchemaWithFieldIds(), null, lanceSchema); + return createWithFfiStream( + datasetUri, + stream.memoryAddress(), + params.getMaxRowsPerFile(), + params.getMaxRowsPerGroup(), + params.getMaxBytesPerFile(), + params.getMode(), + params.getEnableStableRowIds(), + params.getDataStorageVersion(), + params.getStorageOptions(), + params.getBaseStoreParams(), + params.getInitialBases(), + params.getTargetBases(), + namespaceClient, + tableId, + params.getAllowExternalBlobOutsideBases(), + params.getBlobPackFileSizeThreshold(), + lanceSchema.memoryAddress()); + } + } return createWithFfiStream( datasetUri, stream.memoryAddress(), @@ -314,7 +388,8 @@ static List create( namespaceClient, tableId, params.getAllowExternalBlobOutsideBases(), - params.getBlobPackFileSizeThreshold()); + params.getBlobPackFileSizeThreshold(), + 0L); } /** Create a fragment from the given arrow array and schema. */ @@ -335,7 +410,8 @@ private static native List createWithFfiArray( LanceNamespace namespaceClient, List tableId, Optional allowExternalBlobOutsideBases, - Optional blobPackFileSizeThreshold); + Optional blobPackFileSizeThreshold, + long schemaMemoryAddress); /** Create a fragment from the given arrow stream. 
*/ private static native List createWithFfiStream( @@ -354,5 +430,6 @@ private static native List createWithFfiStream( LanceNamespace namespaceClient, List tableId, Optional allowExternalBlobOutsideBases, - Optional blobPackFileSizeThreshold); + Optional blobPackFileSizeThreshold, + long schemaMemoryAddress); } diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index baece0767a1..32fd5ca7635 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -216,8 +216,8 @@ private Dataset buildFromNamespaceClient() { // Call describe_table to get location and storage options DescribeTableRequest request = new DescribeTableRequest(); request.setId(tableId); - // Only set version if present - options.getVersion().ifPresent(v -> request.setVersion(Long.valueOf(v))); + // Do not set the dataset version here. Some namespace implementations only support describing + // the latest table metadata; the requested version is applied when opening the dataset below. 
DescribeTableResponse response = namespaceClient.describeTable(request); diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 5d7dc1a42b2..2dbef873849 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -14,6 +14,7 @@ package org.lance; import org.lance.namespace.LanceNamespace; +import org.lance.schema.LanceSchema; import org.apache.arrow.c.ArrowArrayStream; import org.apache.arrow.memory.BufferAllocator; @@ -45,6 +46,7 @@ public class WriteFragmentBuilder { private BufferAllocator allocator; private VectorSchemaRoot vectorSchemaRoot; private ArrowArrayStream arrowArrayStream; + private LanceSchema schema; private WriteParams writeParams; private WriteParams.Builder writeParamsBuilder; private LanceNamespace namespaceClient; @@ -100,6 +102,22 @@ public WriteFragmentBuilder data(ArrowArrayStream stream) { return this; } + /** + * Set the Lance dataset schema to use when writing fragments. + * + *

This is useful for distributed writes where workers create uncommitted fragments and a + * coordinator commits them later. When this schema is supplied, lance-core does not need to open + * the existing dataset to infer the schema in APPEND mode. The schema should come from the target + * dataset so Lance field IDs are preserved. + * + * @param schema the target Lance dataset schema + * @return this builder + */ + public WriteFragmentBuilder schema(LanceSchema schema) { + this.schema = schema; + return this; + } + /** * Set the write parameters. * @@ -278,10 +296,22 @@ public List execute() { // storage options provider when these are non-null for credential refresh if (vectorSchemaRoot != null) { return Fragment.create( - datasetUri, allocator, vectorSchemaRoot, finalWriteParams, namespaceClient, tableId); + datasetUri, + allocator, + vectorSchemaRoot, + finalWriteParams, + namespaceClient, + tableId, + schema); } else { return Fragment.create( - datasetUri, arrowArrayStream, finalWriteParams, namespaceClient, tableId); + datasetUri, + allocator, + arrowArrayStream, + finalWriteParams, + namespaceClient, + tableId, + schema); } } @@ -312,6 +342,8 @@ private void validate() { Preconditions.checkState( vectorSchemaRoot == null || allocator != null, "allocator is required when using VectorSchemaRoot"); + Preconditions.checkState( + schema == null || allocator != null, "allocator is required with schema"); Preconditions.checkState( writeParams == null || writeParamsBuilder == null, "Cannot use both writeParams() and individual parameter methods"); diff --git a/java/src/main/java/org/lance/index/IndexType.java b/java/src/main/java/org/lance/index/IndexType.java index 3a03934effd..1fff86fc7e0 100644 --- a/java/src/main/java/org/lance/index/IndexType.java +++ b/java/src/main/java/org/lance/index/IndexType.java @@ -24,6 +24,7 @@ public enum IndexType { MEM_WAL(7), ZONEMAP(8), BLOOM_FILTER(9), + RTREE(10), VECTOR(100), IVF_FLAT(101), IVF_SQ(102), diff --git 
a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java index 345a55f20b2..b3408e2d68d 100644 --- a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java +++ b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java @@ -31,7 +31,7 @@ private ScalarIndexParams(Builder builder) { * Create a new ScalarIndexParams with the given index type and no parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @return ScalarIndexParams */ public static ScalarIndexParams create(String indexType) { @@ -42,7 +42,7 @@ public static ScalarIndexParams create(String indexType) { * Create a new ScalarIndexParams with the given index type and JSON parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @param jsonParams JSON string containing index-specific parameters * @return ScalarIndexParams */ @@ -58,7 +58,7 @@ public static class Builder { * Create a new builder for scalar index parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") */ public Builder(String indexType) { this.indexType = indexType; diff --git a/java/src/main/java/org/lance/ipc/ApproxMode.java b/java/src/main/java/org/lance/ipc/ApproxMode.java new file mode 100644 index 00000000000..e6aa7fc3552 --- /dev/null +++ b/java/src/main/java/org/lance/ipc/ApproxMode.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.ipc; + +/** + * Controls the speed / accuracy tradeoff for approximate vector search. + * + *

This setting currently only affects RQ-quantized vector indexes, such as IVF_RQ. Other index + * types ignore this setting. + */ +public enum ApproxMode { + /** Prefer faster approximate scoring when supported by the RQ index. */ + FAST("fast"), + + /** Use the index's default approximation behavior. */ + NORMAL("normal"), + + /** Prefer more accurate approximate scoring when supported by the RQ index. */ + ACCURATE("accurate"); + + private final String value; + + ApproxMode(String value) { + this.value = value; + } + + /** Returns the lowercase value passed across the JNI boundary. */ + public String toRustString() { + return value; + } +} diff --git a/java/src/main/java/org/lance/ipc/AsyncScanner.java b/java/src/main/java/org/lance/ipc/AsyncScanner.java index 2ec317cb245..6e515e3546c 100644 --- a/java/src/main/java/org/lance/ipc/AsyncScanner.java +++ b/java/src/main/java/org/lance/ipc/AsyncScanner.java @@ -80,7 +80,10 @@ public static AsyncScanner create( options.getColumnOrderings(), options.isUseScalarIndex(), options.isFastSearch(), - options.getSubstraitAggregate()); + options.getSubstraitAggregate(), + options.isIncludeDeletedRows(), + options.isStrictBatchSize(), + options.isDisableScoringAutoprojection()); scanner.allocator = allocator; return scanner; } @@ -103,7 +106,10 @@ static native AsyncScanner createAsyncScanner( Optional> columnOrderings, boolean useScalarIndex, boolean fastSearch, - Optional substraitAggregate); + Optional substraitAggregate, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection); /** * Asynchronously scan batches and return a CompletableFuture. 
diff --git a/java/src/main/java/org/lance/ipc/LanceScanner.java b/java/src/main/java/org/lance/ipc/LanceScanner.java index edd3ebc22cc..3a413e0ccfd 100644 --- a/java/src/main/java/org/lance/ipc/LanceScanner.java +++ b/java/src/main/java/org/lance/ipc/LanceScanner.java @@ -77,7 +77,10 @@ public static LanceScanner create( options.isUseScalarIndex(), options.isFastSearch(), options.getSubstraitAggregate(), - options.isCollectStats()); + options.isCollectStats(), + options.isIncludeDeletedRows(), + options.isStrictBatchSize(), + options.isDisableScoringAutoprojection()); scanner.allocator = allocator; scanner.dataset = dataset; scanner.options = options; @@ -103,7 +106,10 @@ static native LanceScanner createScanner( boolean useScalarIndex, boolean fastSearch, Optional substraitAggregate, - boolean collectStats); + boolean collectStats, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection); /** * Closes this scanner and releases any system resources associated with it. 
If the scanner is diff --git a/java/src/main/java/org/lance/ipc/Query.java b/java/src/main/java/org/lance/ipc/Query.java index 48013b375ee..215865310df 100644 --- a/java/src/main/java/org/lance/ipc/Query.java +++ b/java/src/main/java/org/lance/ipc/Query.java @@ -32,6 +32,7 @@ public class Query { private final Optional distanceType; private final boolean useIndex; private final int queryParallelism; + private final ApproxMode approxMode; private Query(Builder builder) { this.column = Preconditions.checkNotNull(builder.column, "Columns must be set"); @@ -52,6 +53,7 @@ private Query(Builder builder) { this.distanceType = builder.distanceType; this.useIndex = builder.useIndex; this.queryParallelism = builder.queryParallelism; + this.approxMode = builder.approxMode; } public String getColumn() { @@ -98,6 +100,14 @@ public int getQueryParallelism() { return queryParallelism; } + public ApproxMode getApproxMode() { + return approxMode; + } + + public String getApproxModeString() { + return approxMode.toRustString(); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -111,6 +121,7 @@ public String toString() { .add("distanceType", distanceType.orElse(null)) .add("useIndex", useIndex) .add("queryParallelism", queryParallelism) + .add("approxMode", approxMode) .toString(); } @@ -125,6 +136,7 @@ public static class Builder { private Optional distanceType = Optional.empty(); private boolean useIndex = true; private int queryParallelism = 0; + private ApproxMode approxMode = ApproxMode.NORMAL; /** * Sets the column to be searched. @@ -275,6 +287,20 @@ public Builder setQueryParallelism(int queryParallelism) { return this; } + /** + * Sets the speed / accuracy tradeoff for approximate vector search. + * + *

This setting currently only affects RQ-quantized vector indexes, such as IVF_RQ. Other + * index types ignore this setting. + * + * @param approxMode The approximate search mode to use for the query. + * @return The Builder instance for method chaining. + */ + public Builder setApproxMode(ApproxMode approxMode) { + this.approxMode = Preconditions.checkNotNull(approxMode, "ApproxMode must not be null"); + return this; + } + /** * Builds the Query object. * diff --git a/java/src/main/java/org/lance/ipc/ScanOptions.java b/java/src/main/java/org/lance/ipc/ScanOptions.java index 68c485e39a3..a9aad590c2b 100644 --- a/java/src/main/java/org/lance/ipc/ScanOptions.java +++ b/java/src/main/java/org/lance/ipc/ScanOptions.java @@ -40,6 +40,9 @@ public class ScanOptions { private final Optional substraitAggregate; private final boolean collectStats; private final boolean fastSearch; + private final boolean includeDeletedRows; + private final boolean strictBatchSize; + private final boolean disableScoringAutoprojection; public ScanOptions( Optional> fragmentIds, @@ -77,6 +80,9 @@ public ScanOptions( useScalarIndex, substraitAggregate, collectStats, + false, + false, + false, false); } @@ -121,7 +127,10 @@ public ScanOptions( boolean useScalarIndex, Optional substraitAggregate, boolean collectStats, - boolean fastSearch) { + boolean fastSearch, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection) { Preconditions.checkArgument( !(filter.isPresent() && substraitFilter.isPresent()), "cannot set both substrait filter and string filter"); @@ -143,6 +152,9 @@ public ScanOptions( this.substraitAggregate = substraitAggregate; this.collectStats = collectStats; this.fastSearch = fastSearch; + this.includeDeletedRows = includeDeletedRows; + this.strictBatchSize = strictBatchSize; + this.disableScoringAutoprojection = disableScoringAutoprojection; } /** @@ -297,6 +309,33 @@ public boolean isCollectStats() { return collectStats; } + /** + * Get 
whether to include deleted rows in scan results. + * + * @return true if deleted rows should be included, false otherwise. + */ + public boolean isIncludeDeletedRows() { + return includeDeletedRows; + } + + /** + * Get whether to enforce strict batch sizing. + * + * @return true if batch sizes must be strictly enforced, false otherwise. + */ + public boolean isStrictBatchSize() { + return strictBatchSize; + } + + /** + * Get whether to disable scoring autoprojection. + * + * @return true if scoring column autoprojection is disabled, false otherwise. + */ + public boolean isDisableScoringAutoprojection() { + return disableScoringAutoprojection; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -322,6 +361,9 @@ public String toString() { "substraitAggregate", substraitAggregate.map(buf -> "ByteBuffer[" + buf.remaining() + " bytes]").orElse(null)) .add("collectStats", collectStats) + .add("includeDeletedRows", includeDeletedRows) + .add("strictBatchSize", strictBatchSize) + .add("disableScoringAutoprojection", disableScoringAutoprojection) .toString(); } @@ -345,6 +387,9 @@ public static class Builder { private boolean fastSearch = false; private Optional substraitAggregate = Optional.empty(); private boolean collectStats = false; + private boolean includeDeletedRows = false; + private boolean strictBatchSize = false; + private boolean disableScoringAutoprojection = false; public Builder() {} @@ -372,6 +417,9 @@ public Builder(ScanOptions options) { this.fastSearch = options.isFastSearch(); this.substraitAggregate = options.getSubstraitAggregate(); this.collectStats = options.isCollectStats(); + this.includeDeletedRows = options.isIncludeDeletedRows(); + this.strictBatchSize = options.isStrictBatchSize(); + this.disableScoringAutoprojection = options.isDisableScoringAutoprojection(); } /** @@ -577,6 +625,39 @@ public Builder collectStats(boolean collectStats) { return this; } + /** + * Set whether to include deleted rows in scan 
results. Default is false. + * + * @param includeDeletedRows whether to include deleted rows + * @return Builder instance for method chaining. + */ + public Builder includeDeletedRows(boolean includeDeletedRows) { + this.includeDeletedRows = includeDeletedRows; + return this; + } + + /** + * Set whether to enforce strict batch sizing. Default is false. + * + * @param strictBatchSize whether to enforce strict batch sizing + * @return Builder instance for method chaining. + */ + public Builder strictBatchSize(boolean strictBatchSize) { + this.strictBatchSize = strictBatchSize; + return this; + } + + /** + * Set whether to disable scoring column autoprojection. Default is false. + * + * @param disableScoringAutoprojection whether to disable autoprojection + * @return Builder instance for method chaining. + */ + public Builder disableScoringAutoprojection(boolean disableScoringAutoprojection) { + this.disableScoringAutoprojection = disableScoringAutoprojection; + return this; + } + /** * Build the LanceScanOptions instance. 
* @@ -601,7 +682,10 @@ public ScanOptions build() { useScalarIndex, substraitAggregate, collectStats, - fastSearch); + fastSearch, + includeDeletedRows, + strictBatchSize, + disableScoringAutoprojection); } } } diff --git a/java/src/main/java/org/lance/merge/MergeInsertParams.java b/java/src/main/java/org/lance/merge/MergeInsertParams.java index de40c9e4f1c..2ae27b67cba 100644 --- a/java/src/main/java/org/lance/merge/MergeInsertParams.java +++ b/java/src/main/java/org/lance/merge/MergeInsertParams.java @@ -38,6 +38,7 @@ public class MergeInsertParams { private int conflictRetries = 10; private long retryTimeoutMs = 30 * 1000; private boolean skipAutoCleanup = false; + private boolean useIndex = true; private List markedGenerations = Collections.emptyList(); public MergeInsertParams(List on) { @@ -227,6 +228,22 @@ public MergeInsertParams withSkipAutoCleanup(boolean skipAutoCleanup) { return this; } + /** + * Controls whether to use indices for the merge operation. + * + *

When set to false, forces a full table scan even if an index exists on the join key. This + * can be useful for benchmarking or when the optimizer chooses a suboptimal path. + * + *

Default is true (use index if available). + * + * @param useIndex Whether to use indices for the merge join + * @return This MergeInsertParams instance + */ + public MergeInsertParams withUseIndex(boolean useIndex) { + this.useIndex = useIndex; + return this; + } + /** * Mark MemWAL generations as merged into the base table. * @@ -298,6 +315,10 @@ public boolean skipAutoCleanup() { return skipAutoCleanup; } + public boolean useIndex() { + return useIndex; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -315,6 +336,7 @@ public String toString() { .add("conflictRetries", conflictRetries) .add("retryTimeoutMs", retryTimeoutMs) .add("skipAutoCleanup", skipAutoCleanup) + .add("useIndex", useIndex) .toString(); } diff --git a/java/src/main/java/org/lance/schema/LanceField.java b/java/src/main/java/org/lance/schema/LanceField.java index 9c7014092fa..4dbb3a0ea38 100644 --- a/java/src/main/java/org/lance/schema/LanceField.java +++ b/java/src/main/java/org/lance/schema/LanceField.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -156,6 +157,25 @@ public Field asArrowField() { name, new FieldType(nullable, type, dictionaryEncoding, metadata), arrowChildren); } + Field asArrowFieldWithFieldIds() { + List arrowChildren = + children.stream().map(LanceField::asArrowFieldWithFieldIds).collect(Collectors.toList()); + + if (type instanceof ArrowType.FixedSizeList) { + arrowChildren.addAll(childrenForFixedSizeList()); + } + + if (id < 0) { + throw new IllegalStateException("Lance field id is required for schema override: " + name); + } + Map metadataWithFieldId = new HashMap<>(metadata); + metadataWithFieldId.put(LanceSchema.LANCE_FIELD_ID_KEY, Integer.toString(id)); + return new Field( + name, + new FieldType(nullable, type, dictionaryEncoding, metadataWithFieldId), + arrowChildren); + } + private List 
childrenForFixedSizeList() { if (logicalType == null || logicalType.isEmpty()) { return Collections.emptyList(); diff --git a/java/src/main/java/org/lance/schema/LanceSchema.java b/java/src/main/java/org/lance/schema/LanceSchema.java index 9492ef45d5e..50a48e578af 100644 --- a/java/src/main/java/org/lance/schema/LanceSchema.java +++ b/java/src/main/java/org/lance/schema/LanceSchema.java @@ -23,6 +23,7 @@ import java.util.stream.Collectors; public class LanceSchema { + static final String LANCE_FIELD_ID_KEY = "lance:field_id"; private final List fields; private final Map metadata; @@ -68,6 +69,12 @@ public Schema asArrowSchema() { fields.stream().map(LanceField::asArrowField).collect(Collectors.toList()), metadata); } + public Schema asArrowSchemaWithFieldIds() { + return new Schema( + fields.stream().map(LanceField::asArrowFieldWithFieldIds).collect(Collectors.toList()), + metadata); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) diff --git a/java/src/test/java/org/lance/AsyncScannerTest.java b/java/src/test/java/org/lance/AsyncScannerTest.java index 578bf000755..fc786ff57c2 100644 --- a/java/src/test/java/org/lance/AsyncScannerTest.java +++ b/java/src/test/java/org/lance/AsyncScannerTest.java @@ -192,6 +192,75 @@ private static int countRows(ArrowReader reader) throws Exception { return rowCount; } + @Test + void testIncludeDeletedRowsAsync(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("async_scanner_include_deleted_rows").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + assertEquals(10, dataset.countRows()); + + // Delete half the rows + dataset.delete("id >= 5"); + assertEquals(5, dataset.countRows()); + + // Async scan without includeDeletedRows — should only see live rows 
+ ScanOptions defaultOptions = new ScanOptions.Builder().batchSize(20L).build(); + try (AsyncScanner scanner = AsyncScanner.create(dataset, defaultOptions, allocator)) { + ArrowReader reader = scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + assertEquals(5, countRows(reader), "default async scan: should exclude deleted rows"); + reader.close(); + } + + // Async scan with includeDeletedRows=true — should see all rows + ScanOptions includeDeletedOptions = + new ScanOptions.Builder() + .batchSize(20L) + .withRowId(true) // required by includeDeletedRows + .includeDeletedRows(true) + .build(); + try (AsyncScanner scanner = + AsyncScanner.create(dataset, includeDeletedOptions, allocator)) { + ArrowReader reader = scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + assertEquals( + 10, countRows(reader), "includeDeletedRows async: should include deleted rows"); + reader.close(); + } + } + } + } + + @Test + void testStrictBatchSizeAsync(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("async_scanner_strict_batch_size").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 25)) { + int batchSize = 10; + + ScanOptions strictOptions = + new ScanOptions.Builder().batchSize(batchSize).strictBatchSize(true).build(); + + try (AsyncScanner scanner = AsyncScanner.create(dataset, strictOptions, allocator)) { + ArrowReader reader = scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + int totalRows = 0; + while (reader.loadNextBatch()) { + int rows = reader.getVectorSchemaRoot().getRowCount(); + assertTrue( + rows <= batchSize, "strict async: batch " + rows + " should be <= " + batchSize); + totalRows += rows; + } + assertEquals(25, totalRows, "strictBatchSize async: should read all rows"); + reader.close(); + } + } + } + } + /** * 
Example 3: Multiple concurrent async scans. * diff --git a/java/src/test/java/org/lance/DatasetTest.java b/java/src/test/java/org/lance/DatasetTest.java index 3ea6a0812e1..45466a0367c 100644 --- a/java/src/test/java/org/lance/DatasetTest.java +++ b/java/src/test/java/org/lance/DatasetTest.java @@ -1993,18 +1993,20 @@ void testOptimizingIndices(@TempDir Path tempDir) throws Exception { OptimizeOptions options = OptimizeOptions.builder().numIndicesToMerge(0).build(); dsAppended.optimizeIndices(options); - List afterIndexes = dsAppended.getIndexes(); - Index idIndexAfter = - afterIndexes.stream() + List idIndexes = + dsAppended.getIndexes().stream() .filter(idx -> "id_idx".equals(idx.name())) - .findFirst() - .orElse(null); - assertNotNull(idIndexAfter); - List afterFragments = idIndexAfter.fragments().orElse(Collections.emptyList()); - - assertTrue(afterFragments.contains(0)); - assertTrue(afterFragments.contains(1)); - assertEquals(2, afterFragments.size()); + .collect(Collectors.toList()); + assertEquals( + 2, + idIndexes.size(), + "append-only optimize must add a delta segment instead of merging"); + + Set coveredFragments = + idIndexes.stream() + .flatMap(idx -> idx.fragments().orElse(Collections.emptyList()).stream()) + .collect(Collectors.toSet()); + assertEquals(new HashSet<>(Arrays.asList(0, 1)), coveredFragments); } } } diff --git a/java/src/test/java/org/lance/FragmentTest.java b/java/src/test/java/org/lance/FragmentTest.java index 61bfc439290..29a21b5258a 100644 --- a/java/src/test/java/org/lance/FragmentTest.java +++ b/java/src/test/java/org/lance/FragmentTest.java @@ -17,9 +17,12 @@ import org.lance.ipc.LanceScanner; import org.lance.ipc.ScanOptions; import org.lance.operation.Merge; +import org.lance.operation.Project; import org.lance.operation.Update; +import org.lance.schema.LanceField; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.UInt8Vector; import 
org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; @@ -29,6 +32,7 @@ import org.junit.jupiter.api.io.TempDir; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -37,6 +41,7 @@ import java.util.Optional; import java.util.stream.Collectors; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -81,6 +86,70 @@ void testFragmentCreate(@TempDir Path tempDir) throws Exception { } } + @Test + void testWriteFragmentWithSchemaOverride(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("fragment_schema_override").toString(); + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + try (Dataset dataset = testDataset.createEmptyDataset()) { + List fieldList = + new ArrayList<>(testDataset.getSchema().getFields()); + Collections.reverse(fieldList); + + try (Transaction projectTxn = + new Transaction.Builder() + .readVersion(dataset.version()) + .operation(Project.builder().schema(new Schema(fieldList)).build()) + .build(); + Dataset evolvedDataset = new CommitBuilder(dataset).execute(projectTxn); + VectorSchemaRoot root = + VectorSchemaRoot.create(evolvedDataset.getSchema(), allocator)) { + root.allocateNew(); + VarCharVector nameVector = (VarCharVector) root.getVector("name"); + IntVector idVector = (IntVector) root.getVector("id"); + nameVector.setSafe(0, "Person 1".getBytes(StandardCharsets.UTF_8)); + idVector.setSafe(0, 1); + root.setRowCount(1); + + List fragments = + Fragment.write() + .datasetUri(datasetPath) + .allocator(allocator) + .data(root) + .schema(evolvedDataset.getLanceSchema()) + 
.mode(WriteParams.WriteMode.APPEND) + .execute(); + + assertEquals(1, fragments.size()); + assertEquals(1, fragments.get(0).getPhysicalRows()); + assertArrayEquals( + evolvedDataset.getLanceSchema().fields().stream() + .mapToInt(LanceField::getId) + .toArray(), + fragments.get(0).getFiles().get(0).getFields()); + + FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); + try (Dataset appendedDataset = + Dataset.commit( + allocator, datasetPath, appendOp, Optional.of(evolvedDataset.version())); + ArrowReader reader = appendedDataset.newScan().scanBatches()) { + assertEquals(3, appendedDataset.version()); + assertEquals(1, appendedDataset.countRows()); + assertTrue(reader.loadNextBatch()); + VectorSchemaRoot batch = reader.getVectorSchemaRoot(); + assertEquals(1, batch.getRowCount()); + assertEquals( + "Person 1", + new String( + ((VarCharVector) batch.getVector("name")).get(0), StandardCharsets.UTF_8)); + assertEquals(1, ((IntVector) batch.getVector("id")).get(0)); + } + } + } + } + } + @Test void commitWithoutVersion(@TempDir Path tempDir) { String datasetPath = tempDir.resolve("commit_without_version").toString(); diff --git a/java/src/test/java/org/lance/JNITest.java b/java/src/test/java/org/lance/JNITest.java index c0e5f900edc..daa123b3200 100644 --- a/java/src/test/java/org/lance/JNITest.java +++ b/java/src/test/java/org/lance/JNITest.java @@ -20,6 +20,7 @@ import org.lance.index.vector.PQBuildParams; import org.lance.index.vector.SQBuildParams; import org.lance.index.vector.VectorIndexParams; +import org.lance.ipc.ApproxMode; import org.lance.ipc.Query; import org.lance.test.JniTestHelper; @@ -28,6 +29,7 @@ import java.util.Arrays; import java.util.Optional; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class JNITest { @@ -48,6 +50,10 @@ public void testIntsOpt() { @Test public void testQuery() { + Query defaultQuery = + new 
Query.Builder().setColumn("column").setKey(new float[] {1.0f, 2.0f, 3.0f}).build(); + assertEquals(ApproxMode.NORMAL, defaultQuery.getApproxMode()); + JniTestHelper.parseQuery( Optional.of( new Query.Builder() @@ -60,6 +66,7 @@ public void testQuery() { .setDistanceType(DistanceType.L2) .setUseIndex(true) .setQueryParallelism(-1) + .setApproxMode(ApproxMode.ACCURATE) .build())); } diff --git a/java/src/test/java/org/lance/MergeInsertTest.java b/java/src/test/java/org/lance/MergeInsertTest.java index c36ec26b4fa..b738ef8852d 100644 --- a/java/src/test/java/org/lance/MergeInsertTest.java +++ b/java/src/test/java/org/lance/MergeInsertTest.java @@ -275,6 +275,29 @@ private ArrowArrayStream convertToStream(VectorSchemaRoot root, RootAllocator al return stream; } + @Test + public void testMergeInsertWithoutIndex() throws Exception { + // Verify that merge insert with useIndex=false still completes and + // produces results consistent with the default (useIndex=true). + + try (VectorSchemaRoot source = buildSource(testDataset.getSchema(), allocator)) { + try (ArrowArrayStream sourceStream = convertToStream(source, allocator)) { + MergeInsertResult result = + dataset.mergeInsert( + new MergeInsertParams(Collections.singletonList("id")) + .withMatchedUpdateAll() + .withNotMatched(MergeInsertParams.WhenNotMatched.InsertAll) + .withUseIndex(false), + sourceStream); + + Assertions.assertEquals( + "{0=Source 0, 1=Source 1, 2=Source 2, 3=Person 3, 4=Person 4, 7=Source 7, 8=Source 8, 9=Source 9}", + readAll(result.dataset()).toString(), + "merge insert with useIndex=false should produce correct upsert results"); + } + } + } + private TreeMap readAll(Dataset dataset) throws Exception { try (ArrowReader reader = dataset.newScan().scanBatches()) { TreeMap map = new TreeMap<>(); diff --git a/java/src/test/java/org/lance/ScannerTest.java b/java/src/test/java/org/lance/ScannerTest.java index 894b208e8af..00434034b64 100644 --- a/java/src/test/java/org/lance/ScannerTest.java +++ 
b/java/src/test/java/org/lance/ScannerTest.java @@ -697,6 +697,120 @@ void testFastSearchSkipsUnindexedFragments(@TempDir Path tempDir) throws Excepti } } + @Test + void testIncludeDeletedRows(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("include_deleted_rows").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + assertEquals(10, dataset.countRows()); + + // Delete rows where id >= 5 + dataset.delete("id >= 5"); + assertEquals(5, dataset.countRows()); + + // Default scan should exclude deleted rows + try (LanceScanner scanner = + dataset.newScan(new ScanOptions.Builder().batchSize(20).build())) { + assertEquals(5, scanner.countRows(), "default scan: should exclude deleted rows"); + } + + // includeDeletedRows=true should surface deleted rows + // NOTE: includeDeletedRows requires withRowId=true + try (LanceScanner scanner = + dataset.newScan( + new ScanOptions.Builder() + .batchSize(20) + .withRowId(true) + .includeDeletedRows(true) + .build())) { + assertEquals(10, scanner.countRows(), "includeDeletedRows: should include deleted rows"); + } + } + } + } + + @Test + void testStrictBatchSize(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("strict_batch_size").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 25)) { + int batchSize = 10; + + // With strictBatchSize=true, no batch should exceed batchSize + try (Scanner scanner = + dataset.newScan( + new ScanOptions.Builder().batchSize(batchSize).strictBatchSize(true).build())) { + try (ArrowReader reader = 
scanner.scanBatches()) { + int totalRows = 0; + while (reader.loadNextBatch()) { + int rows = reader.getVectorSchemaRoot().getRowCount(); + assertTrue(rows <= batchSize, "strict: batch " + rows + " should be <= " + batchSize); + totalRows += rows; + } + assertEquals(25, totalRows); + } + } + + // strictBatchSize=false (default) — batch size may vary + try (Scanner scanner = + dataset.newScan(new ScanOptions.Builder().batchSize(batchSize).build())) { + try (ArrowReader reader = scanner.scanBatches()) { + int totalRows = 0; + while (reader.loadNextBatch()) { + totalRows += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(25, totalRows); + } + } + } + } + } + + @Test + void testDisableScoringAutoprojection(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("disable_scoring_autoprojection").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + // Smoke test: verify the option is accepted and scan still works + ScanOptions options = + new ScanOptions.Builder().batchSize(20).disableScoringAutoprojection(true).build(); + + try (LanceScanner scanner = dataset.newScan(options)) { + assertEquals( + 10, + scanner.countRows(), + "scan with disableScoringAutoprojection should return all rows"); + } + + // Also verify it doesn't break when combined with other options + ScanOptions combinedOptions = + new ScanOptions.Builder() + .batchSize(20) + .filter("id < 5") + .disableScoringAutoprojection(true) + .includeDeletedRows(false) + .strictBatchSize(false) + .build(); + + try (LanceScanner scanner = dataset.newScan(combinedOptions)) { + assertEquals( + 5, + scanner.countRows(), + "scan with disableScoringAutoprojection + filter should work"); + } + } + } + } + private void validScanResult(Dataset dataset, int fragmentId, int 
rowCount) throws Exception { try (Scanner scanner = dataset.newScan( diff --git a/java/src/test/java/org/lance/index/ScalarIndexTest.java b/java/src/test/java/org/lance/index/ScalarIndexTest.java index b993a7e8a5f..cb090e7c955 100644 --- a/java/src/test/java/org/lance/index/ScalarIndexTest.java +++ b/java/src/test/java/org/lance/index/ScalarIndexTest.java @@ -25,14 +25,18 @@ import org.apache.arrow.c.Data; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.ipc.ArrowStreamReader; import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -318,4 +322,78 @@ public void testCreateZonemapIndex(@TempDir Path tempDir) throws Exception { } } } + + @Test + public void testCreateRTreeIndex(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("rtree_test").toString(); + ArrowType f64 = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + Field geometryField = + new Field( + "geometry", + new FieldType( + true, + new ArrowType.Struct(), + null, + Collections.singletonMap("ARROW:extension:name", "geoarrow.point")), + Arrays.asList(Field.notNullable("x", f64), Field.notNullable("y", f64))); + Schema schema = new Schema(Collections.singletonList(geometryField), null); + + int rowCount = 3; + try (RootAllocator allocator = new RootAllocator(); + 
VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + root.allocateNew(); + StructVector geometry = (StructVector) root.getVector("geometry"); + Float8Vector x = (Float8Vector) geometry.getChild("x"); + Float8Vector y = (Float8Vector) geometry.getChild("y"); + for (int i = 0; i < rowCount; i++) { + geometry.setIndexDefined(i); + x.setSafe(i, (double) i); + y.setSafe(i, i * 2.0); + } + geometry.setValueCount(rowCount); + root.setRowCount(rowCount); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator); + Dataset dataset = + Dataset.write() + .reader(reader) + .uri(datasetPath) + .allocator(allocator) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + // The point data round-trips through Lance. + assertEquals(rowCount, dataset.countRows()); + try (ArrowReader scan = dataset.newScan(new ScanOptions.Builder().build()).scanBatches()) { + assertTrue(scan.loadNextBatch()); + StructVector readGeometry = + (StructVector) scan.getVectorSchemaRoot().getVector("geometry"); + assertEquals(2.0, ((Float8Vector) readGeometry.getChild("x")).get(2)); + assertEquals(4.0, ((Float8Vector) readGeometry.getChild("y")).get(2)); + } + + // Creating and listing an RTree index via the typed IndexType works end-to-end. 
+ Index index = + dataset.createIndex( + Collections.singletonList("geometry"), + IndexType.RTREE, + Optional.of("rtree_geometry_index"), + IndexParams.builder() + .setScalarIndexParams(ScalarIndexParams.create("rtree")) + .build(), + true); + assertEquals(IndexType.RTREE, index.indexType()); + assertTrue( + dataset.listIndexes().contains("rtree_geometry_index"), + "Expected 'rtree_geometry_index' in: " + dataset.listIndexes()); + } + } + } } diff --git a/java/src/test/java/org/lance/memwal/MemWalTest.java b/java/src/test/java/org/lance/memwal/MemWalTest.java index ee26932dd59..5af3bd3f474 100644 --- a/java/src/test/java/org/lance/memwal/MemWalTest.java +++ b/java/src/test/java/org/lance/memwal/MemWalTest.java @@ -50,6 +50,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.UUID; @@ -142,6 +143,30 @@ private static Dataset writeAppendOnlyDataset( } } + /** + * Stage a faithful flushed generation at {@code genPath}: the Lance dataset plus its + * primary-key dedup sidecar ({@code _pk_index/}), mirroring what production flush emits. The LSM + * scanner's cross-generation block-list opens the sidecar, so a dataset alone (no sidecar) is not + * a state production produces. Mirrors the Python {@code _write_flushed_gen} test helper. 
+ */ + private static void writeFlushedGen( + BufferAllocator allocator, String genPath, long[] ids, String prefix) throws Exception { + writeLookupDataset(allocator, genPath, ids, prefix).close(); + try (VectorSchemaRoot root = lookupRoot(allocator, ids, prefix); + ArrowReader reader = toReader(allocator, root); + ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, stream); + nativeWritePkSidecar(genPath, stream.memoryAddress(), Collections.singletonList("id")); + } + } + + /** + * Test-support native: write the primary-key dedup sidecar for a flushed-generation dataset + * already staged at {@code genPath}. See {@link #writeFlushedGen}. + */ + private static native void nativeWritePkSidecar( + String genPath, long streamAddress, List pkColumns); + /** Read an LSM scanner fully into an {@code id -> name} map. */ private static Map readByName(ArrowReader reader) throws Exception { Map byId = new HashMap<>(); @@ -367,7 +392,7 @@ void testLsmScannerFromSnapshots(@TempDir Path tempDir) throws Exception { // Flushed generation overwrites id=2. 
String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); @@ -393,7 +418,7 @@ void testPointLookup(@TempDir Path tempDir) throws Exception { dataset.initializeMemWal(new InitializeMemWalParams()); String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); diff --git a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java index f425ddcc4f9..c622bac9fcd 100644 --- a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java +++ b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java @@ -189,6 +189,33 @@ void testNamespaceId() { "namespaceId should contain 'DirectoryNamespace', got: " + namespaceId); } + @Test + void testOpenSpecificVersionDoesNotPassVersionToDescribeTable() throws Exception { + VersionRejectingNamespace versionRejectingNamespace = + new VersionRejectingNamespace(innerNamespaceClient); + namespaceClient = versionRejectingNamespace; + List tableId = Arrays.asList("test_table"); + + namespaceClient.createTable(new CreateTableRequest().id(tableId), createTestTableData()); + namespaceClient.insertIntoTable( + new InsertIntoTableRequest().id(tableId).mode("append"), createTestTableData()); + + try (Dataset versionOne = + Dataset.open() + .allocator(allocator) + .namespaceClient(namespaceClient) + .tableId(tableId) + .readOptions(new ReadOptions.Builder().setVersion(1L).build()) + .build()) { + assertEquals(1, versionOne.version()); + assertEquals(3, 
versionOne.countRows()); + } + + assertTrue( + versionRejectingNamespace.getDescribeTableCallCount() > 0, + "Expected describeTable to be called when opening through namespace"); + } + @Test void testCreateAndListNamespaces() { // Create a namespace @@ -1439,4 +1466,25 @@ private byte[] createVectorTableData(int numRows, int dim) throws Exception { return out.toByteArray(); } } + + private static class VersionRejectingNamespace extends CustomNamespace { + private final AtomicInteger describeTableCallCount = new AtomicInteger(); + + VersionRejectingNamespace(DirectoryNamespace inner) { + super(inner); + } + + @Override + public DescribeTableResponse describeTable(DescribeTableRequest request) { + describeTableCallCount.incrementAndGet(); + assertNull( + request.getVersion(), + "Dataset version should be passed to dataset open, not describeTable"); + return super.describeTable(request); + } + + int getDescribeTableCallCount() { + return describeTableCallCount.get(); + } + } } diff --git a/memtest/pyproject.toml b/memtest/pyproject.toml index 396d7c442e0..4418d0e19c8 100644 --- a/memtest/pyproject.toml +++ b/memtest/pyproject.toml @@ -7,7 +7,7 @@ name = "lance-memtest" version = "0.1.0" description = "Memory allocation testing utilities for Python test suites" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache-2.0" } authors = [ { name = "Lance Developers" } @@ -17,7 +17,6 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", diff --git a/protos/ann.proto b/protos/ann.proto index c9d3b4dcc2f..f5de5e25e7b 100644 --- a/protos/ann.proto +++ b/protos/ann.proto @@ -9,6 +9,20 @@ import "table_identifier.proto"; import "table.proto"; import "index.proto"; +// 
Query-time approximation mode for vector search. +// +// This currently only affects RQ-quantized vector indexes, such as IVF_RQ. +// Other index types ignore this setting. +enum VectorApproxMode { + // Use all RQ bits for query-time scoring with u8-quantized lookup tables. + Normal = 0; + // Use only one RQ bit for query-time scoring, even for multi-bit indexes. + Fast = 1; + // Use all RQ bits for query-time scoring with u16-quantized lookup tables + // to reduce estimator quantization error. + Accurate = 2; +} + // Serialized vector query parameters. message VectorQueryProto { // Query vector as Arrow IPC bytes (supports Float16, Float32, Float64, UInt8, etc.) @@ -26,6 +40,9 @@ message VectorQueryProto { bool use_index = 11; optional float dist_q_c = 12; optional int32 query_parallelism = 13; + // Query-time approximation mode. Currently only affects RQ-quantized vector + // indexes, such as IVF_RQ. Other index types ignore this setting. + VectorApproxMode approx_mode = 14; } // Serializable form of ANNIvfSubIndexExec — the IVF sub-index search node. 
diff --git a/protos/index.proto b/protos/index.proto index ea21c70387d..b1045f8977c 100644 --- a/protos/index.proto +++ b/protos/index.proto @@ -246,4 +246,6 @@ message JsonIndexDetails { } message BloomFilterIndexDetails {} -message RTreeIndexDetails {} \ No newline at end of file +message RTreeIndexDetails {} + +message FMIndexIndexDetails {} \ No newline at end of file diff --git a/python/Cargo.lock b/python/Cargo.lock index 162a221907f..955f35f97eb 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -185,15 +185,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ar_archive_writer" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" -dependencies = [ - "object", -] - [[package]] name = "arc-swap" version = "1.9.1" @@ -417,7 +408,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "serde_core", "serde_json", ] @@ -517,7 +508,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -528,7 +519,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -582,7 +573,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -642,7 +633,7 @@ dependencies = [ "bytes", "bytes-utils", "fastrand", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "percent-encoding", "pin-project-lite", @@ -669,7 +660,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -693,7 +684,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 
1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -717,7 +708,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -742,7 +733,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -762,7 +753,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -792,7 +783,7 @@ dependencies = [ "bytes-utils", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -811,7 +802,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -868,7 +859,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -888,7 +879,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -906,7 +897,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -954,7 +945,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -987,7 +978,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -1053,9 +1044,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] 
name = "bitpacking" @@ -1068,9 +1059,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -1112,9 +1103,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -1130,9 +1121,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.3" +version = "8.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" +checksum = "5cc91aac060a7a1e25823bdccbfb6af1875b88f17c6daac97894eed8207166b3" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1141,9 +1132,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.1" +version = "5.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" +checksum = "3a32acac15fe1967bc3986b2a6347dffc965602354ea6f450ad07e8bfd253583" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1195,7 +1186,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1218,9 +1209,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -1232,15 +1223,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - [[package]] name = "cbc" version = "0.1.2" @@ -1252,9 +1234,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -1302,9 +1284,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "js-sys", @@ -1365,7 +1347,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1430,13 +1412,9 @@ version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" dependencies = [ - "bzip2", "compression-core", "flate2", - "liblzma", "memchr", - "zstd", - "zstd-safe", ] [[package]] @@ -1764,7 +1742,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1777,7 +1755,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1788,7 +1766,7 @@ checksum = 
"fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1799,7 +1777,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1826,7 +1804,6 @@ dependencies = [ "arrow-schema", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1836,7 +1813,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1853,14 +1829,11 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "datafusion-sql", - "flate2", "futures", "itertools 0.14.0", - "liblzma", "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1868,7 +1841,6 @@ dependencies = [ "tokio", "url", "uuid", - "zstd", ] [[package]] @@ -1938,7 +1910,6 @@ dependencies = [ "object_store", "parquet", "paste", - "recursive", "sqlparser", "tokio", "web-time", @@ -1962,10 +1933,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", - "async-compression", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1976,18 +1945,14 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", - "flate2", "futures", "glob", "itertools 0.14.0", - "liblzma", "log", "object_store", "rand 0.9.4", "tokio", - "tokio-util", "url", - "zstd", ] [[package]] @@ -2138,7 +2103,6 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "paste", - "recursive", "serde_json", "sqlparser", ] @@ -2330,7 +2294,7 @@ checksum = 
"2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2348,7 +2312,6 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "log", - "recursive", "regex", "regex-syntax", ] @@ -2373,7 +2336,6 @@ dependencies = [ "parking_lot", "paste", "petgraph", - "recursive", "tokio", ] @@ -2425,7 +2387,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools 0.14.0", - "recursive", ] [[package]] @@ -2544,7 +2505,6 @@ dependencies = [ "datafusion-functions-nested", "indexmap 2.14.0", "log", - "recursive", "regex", "sqlparser", ] @@ -2569,26 +2529,6 @@ dependencies = [ "url", ] -[[package]] -name = "deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -2628,7 +2568,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2638,7 +2578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2659,7 +2599,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2694,7 +2634,7 @@ checksum = 
"1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2863,7 +2803,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "rustc_version", ] @@ -2919,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2996,7 +2936,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3222,17 +3162,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3253,7 +3191,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3274,18 +3212,42 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + 
[[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3394,7 +3356,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3426,6 +3388,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3439,9 +3412,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3465,7 +3438,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3476,7 +3449,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "pin-project-lite", ] @@ -3519,7 +3492,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ 
-3536,7 +3509,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3544,6 +3517,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -3556,7 +3543,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -3774,12 +3761,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -3852,7 +3833,7 @@ version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "libc", ] @@ -3950,7 +3931,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3995,7 +3976,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4014,7 +3995,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4029,13 +4010,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.99" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", - "once_cell", "wasm-bindgen", ] @@ -4087,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4117,7 +4097,6 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-plan", - "deepsize", "either", "fst", "futures", @@ -4162,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4204,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4213,21 +4192,23 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", + "libm", "log", "moka", "num_cpus", @@ -4243,12 +4224,13 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4280,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = 
"8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4293,12 +4275,20 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", +] + +[[package]] +name = "lance-derive" +version = "9.0.0-beta.2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4333,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4346,7 +4336,6 @@ dependencies = [ "byteorder", "bytes", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4364,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4378,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4399,7 +4388,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "fst", "futures", @@ -4423,7 +4411,7 @@ dependencies = [ "lance-select", "lance-table", "lance-tokenizer", - "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -4435,6 +4423,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", @@ -4442,13 +4431,12 @@ dependencies = [ "tempfile", "tokio", "tracing", - "twox-hash", "uuid", ] [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4465,9 +4453,8 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4490,23 +4477,23 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" 
+version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", "cc", - "deepsize", "half", "lance-arrow", "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4518,7 +4505,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4526,6 +4513,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ -4538,19 +4527,22 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", + "time", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4562,14 +4554,13 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", "byteorder", "bytes", - "deepsize", "itertools 0.13.0", "lance-core", "roaring", @@ -4578,7 +4569,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4591,7 +4582,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4618,13 +4608,14 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", "lindera", "rust-stemmers", "serde", + "stop-words", 
"unicode-normalization", ] @@ -4637,12 +4628,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -4700,12 +4685,6 @@ dependencies = [ "lexical-util", ] -[[package]] -name = "libbz2-rs-sys" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" - [[package]] name = "libc" version = "0.2.186" @@ -4722,26 +4701,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "liblzma" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" -dependencies = [ - "liblzma-sys", -] - -[[package]] -name = "liblzma-sys" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "libm" version = "0.2.16" @@ -4757,6 +4716,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "lindera" version = "3.0.7" @@ -4813,9 +4781,9 @@ dependencies = [ [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = "c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -4846,9 +4814,9 @@ dependencies = [ [[package]] name = "log" 
-version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -4956,9 +4924,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -5055,7 +5023,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5200,7 +5168,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5209,7 +5177,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -5231,15 +5199,6 @@ dependencies = [ "objc2-core-foundation", ] -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - [[package]] name = "object_store" version = "0.13.2" @@ -5254,7 +5213,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ -5331,9 +5290,11 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -5346,7 
+5307,7 @@ dependencies = [ "base64", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -5371,7 +5332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -5415,7 +5376,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5436,7 +5397,7 @@ checksum = "6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5454,7 +5415,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -5465,7 +5426,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5483,7 +5444,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5496,6 +5457,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -5504,7 +5479,7 @@ 
checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5520,7 +5495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5539,7 +5514,7 @@ dependencies = [ "base64", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -5551,6 +5526,23 @@ dependencies = [ "url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.2", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -5853,7 +5845,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5960,7 +5952,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5983,9 +5975,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -5993,9 +5985,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" 
+version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools 0.14.0", @@ -6006,42 +5998,32 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] -[[package]] -name = "psm" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" -dependencies = [ - "ar_archive_writer", - "cc", -] - [[package]] name = "ptr_meta" version = "0.3.1" @@ -6059,13 +6041,14 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "pylance" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ + "alloc-stdlib", "arrow", "arrow-array", "arrow-cast", @@ -6154,7 +6137,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6167,7 +6150,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.117", 
+ "syn 2.0.118", ] [[package]] @@ -6319,7 +6302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -6386,19 +6369,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -6431,26 +6401,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.117", -] - [[package]] name = "redb" version = "3.1.3" @@ -6466,7 +6416,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -6497,14 +6447,14 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "regex" -version = "1.12.3" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = 
"f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -6531,9 +6481,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "regress" @@ -6571,7 +6521,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6590,7 +6540,7 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -6612,7 +6562,7 @@ dependencies = [ "base64", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -6636,7 +6586,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -6666,7 +6616,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -6684,7 +6634,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6692,6 +6642,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" +dependencies = [ + "anyhow", + "http 1.4.2", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" @@ -6704,7 +6667,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6735,6 +6698,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", + "webpki-roots", ] [[package]] @@ -6747,7 +6711,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6785,7 +6749,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -6832,7 +6796,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6924,7 +6888,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "errno", "libc", "linux-raw-sys", @@ -7097,7 +7061,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7129,7 +7093,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -7189,7 +7153,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7200,7 +7164,7 @@ 
checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7235,7 +7199,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7247,7 +7211,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7264,9 +7228,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = "76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64", "bs58", @@ -7284,14 +7248,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7445,9 +7409,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -7467,7 +7431,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7521,7 +7485,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" 
dependencies = [ "log", - "recursive", "sqlparser_derive", ] @@ -7533,7 +7496,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7542,19 +7505,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "stacker" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" -dependencies = [ - "cc", - "cfg-if 1.0.4", - "libc", - "psm", - "windows-sys 0.61.2", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -7583,6 +7533,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" @@ -7617,7 +7576,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7629,7 +7588,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7652,7 +7611,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -7682,9 +7641,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ 
-7708,7 +7667,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7731,7 +7690,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -7771,7 +7730,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -7803,7 +7762,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7814,7 +7773,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7937,14 +7896,14 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "tokio-retry" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" +checksum = "4a129d95275ebf4c493ec53bf0f8cd95f5ac161bc4f381700809a54f595d4470" dependencies = [ "pin-project-lite", "rand 0.10.1", @@ -8016,6 +7975,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "h2", + "http 1.4.2", + 
"http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -8024,9 +8022,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -8038,9 +8039,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -8056,11 +8057,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "async-compression", - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -8117,7 +8118,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8256,7 +8257,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -8274,7 +8275,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 
2.0.118", "typify-impl", ] @@ -8317,12 +8318,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -8373,11 +8368,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.2" +version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d258b83ceec21034727ecee8c382cfa6c3e133699b0742c64571814fb420c9f7" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -8437,20 +8432,11 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -8464,9 +8450,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies 
= [ "cfg-if 1.0.4", "once_cell", @@ -8477,9 +8463,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.72" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -8487,9 +8473,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -8497,48 +8483,26 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.122" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -8565,23 +8529,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.12.1", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" -version = "0.3.99" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -8599,9 +8551,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "webpki-roots" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -8703,7 +8664,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8714,7 +8675,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8936,100 +8897,12 @@ 
dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.12.1", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -9083,7 +8956,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -9124,7 +8997,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", @@ -9157,7 +9030,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", "lazy_static", "more-asserts", @@ -9231,9 +9104,9 @@ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -9248,28 +9121,28 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.50" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.50" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639" +checksum = 
"1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9289,15 +9162,15 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -9331,7 +9204,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] diff --git a/python/Cargo.toml b/python/Cargo.toml index 2ea0d46764b..e76137fc63c 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -19,9 +19,13 @@ arrow-cast = "58.0.0" arrow-data = "58.0.0" arrow-schema = "58.0.0" object_store = "0.13.2" -datafusion = "53.0.0" +datafusion = { version = "53.0.0", default-features = false } datafusion-ffi = "53.0.0" datafusion-common = "53.0.0" +# Keep the Python FFI build on the working Brotli allocator resolution until +# datafusion-ffi no longer enables datafusion-proto/default. +# See https://github.com/lance-format/lance/issues/7271. 
+alloc-stdlib = "=0.2.2" async-trait = "0.1" chrono = "0.4.42" env_logger = "0.11.7" @@ -31,6 +35,7 @@ half = { version = "2.5", default-features = false, features = [ "std", ] } lance = { path = "../rust/lance", features = [ + "goosefs", "dynamodb", "substrait", ] } @@ -46,7 +51,7 @@ lance-index = { path = "../rust/lance-index", features = [ lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-namespace = { path = "../rust/lance-namespace" } -lance-namespace-impls = { path = "../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "dir-goosefs"] } lance-table = { path = "../rust/lance-table" } lance-datafusion = { path = "../rust/lance-datafusion" } libc = "0.2.176" @@ -55,7 +60,7 @@ prost = "0.14.1" prost-types = "0.14.1" pyo3 = { version = "0.28", features = [ "extension-module", - "abi3-py39", + "abi3-py310", "py-clone", "chrono", ] } diff --git a/python/pyproject.toml b/python/pyproject.toml index a1e69855a0f..d863fe38517 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "pylance" dynamic = ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.0,<0.9"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.5,<0.9"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } repository = "https://github.com/lancedb/lance" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" keywords = [ "data-format", "data-science", @@ -30,7 +30,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming 
Language :: Python :: 3.12", @@ -61,7 +60,7 @@ tests = [ # Only test tensorflow on linux for now. We will deprecate tensorflow soon. "tensorflow; sys_platform == 'linux'", "tqdm", - "datafusion>=53,<54; python_version >= '3.10'", + "datafusion>=53,<54", ] dev = ["ruff==0.11.2", "pyright"] benchmarks = ["pytest-benchmark"] @@ -74,7 +73,7 @@ geo = [ [dependency-groups] tests = [ "boto3==1.40.43", - "datasets==4.1.1; python_version >= '3.10'", + "datasets==4.1.1", "duckdb==1.4.0", "ml_dtypes==0.5.3", "pillow==11.3.0", @@ -82,9 +81,9 @@ tests = [ "polars[pyarrow,pandas]==1.34.0", "psutil==7.1.0", "pytest==8.4.2", - "tensorflow==2.20.0; sys_platform == 'linux' and python_version >= '3.10'", + "tensorflow==2.20.0; sys_platform == 'linux'", "tqdm==4.67.1", - "datafusion==53.0.0; python_version >= '3.10'", + "datafusion==53.0.0", ] dev = [ "maturin==1.13.3", diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py index 61076e61687..b4e33338cb1 100644 --- a/python/python/benchmarks/test_search.py +++ b/python/python/benchmarks/test_search.py @@ -78,10 +78,12 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: rows_remaining -= next_batch_length table = create_table(next_batch_length, offset) if offset == 0: - dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False) + dataset = lance.write_dataset( + table, tmp_path, data_storage_version="stable" + ) else: dataset = lance.write_dataset( - table, tmp_path, mode="append", use_legacy_format=False + table, tmp_path, mode="append", data_storage_version="stable" ) offset += next_batch_length @@ -98,7 +100,7 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: dataset.create_scalar_index("category", "BITMAP") dataset.create_scalar_index("genres", "LABEL_LIST") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_delete_dataset(data_dir): @@ -113,7 +115,7 @@ 
def create_delete_dataset(data_dir): dataset = lance.dataset(tmp_path) dataset.delete("filterable % 2 != 0") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_new_rows_dataset(data_dir): @@ -129,7 +131,7 @@ def create_new_rows_dataset(data_dir): table = create_table(NEW_ROWS, offset=NUM_ROWS) dataset = lance.write_dataset(table, tmp_path, mode="append") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) class Datasets(NamedTuple): diff --git a/python/python/ci_benchmarks/benchmarks/test_count_rows.py b/python/python/ci_benchmarks/benchmarks/test_count_rows.py new file mode 100644 index 00000000000..f6228b04f13 --- /dev/null +++ b/python/python/ci_benchmarks/benchmarks/test_count_rows.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Benchmark count_rows acceleration for IS NULL / IS NOT NULL filters. + +Tests five index configurations against an int32 dataset with ~1% NULL +values. Each configuration stores the same data in a separate column so +that only one index type is active per measurement: + + none — no index, full column scan (baseline) + BITMAP — bitmap index + BTREE — btree index + ZONEMAP — zone-map index + BLOOMFILTER — bloom-filter index + +Two filters are exercised for each configuration: + IS NULL — count the ~1% null rows + IS NOT NULL — count the ~99% non-null rows + +Indexed configurations are tested in two cache states to separate first-load +latency from steady-state throughput: + + warm — one prewarm call is made before measuring; the same dataset instance + is reused so its in-memory index cache is already populated. + cold — a fresh ``lance.dataset()`` instance is created inside each measured + round so the in-memory index cache starts empty every time. No + prewarm pass is performed. 
+""" + +from __future__ import annotations + +import lance +import pytest +from ci_benchmarks.datasets import get_dataset_uri + +# --------------------------------------------------------------------------- +# Parameters +# --------------------------------------------------------------------------- + +# Indexed configs only (warm/cold dimension applies to these) +_INDEXED_CONFIGS: list[tuple[str, str]] = [ + ("bitmap", "value_bitmap"), + ("btree", "value_btree"), + ("zonemap", "value_zonemap"), + ("bloomfilter", "value_bloomfilter"), +] +_INDEXED_IDS = [cfg[0] for cfg in _INDEXED_CONFIGS] + +_FILTERS = ["is_null", "is_not_null"] + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def count_rows_ds() -> lance.LanceDataset: + """Shared dataset instance (index cache persists across rounds — use for warm).""" + return lance.dataset(get_dataset_uri("count_rows")) + + +@pytest.fixture(scope="module") +def count_rows_uri() -> str: + return get_dataset_uri("count_rows") + + +# --------------------------------------------------------------------------- +# No-index baseline (no warm/cold — there is no index cache to speak of) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("filter_type", _FILTERS) +def test_count_rows_no_index( + benchmark, + count_rows_ds: lance.LanceDataset, + filter_type: str, +) -> None: + """Full-scan baseline with no scalar index.""" + filt = ( + "value_none IS NULL" if filter_type == "is_null" else "value_none IS NOT NULL" + ) + + def bench() -> int: + return count_rows_ds.count_rows(filter=filt) + + benchmark.pedantic(bench, warmup_rounds=1, rounds=5) + + +# --------------------------------------------------------------------------- +# Indexed benchmarks — warm vs cold +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize("warm", [True, False], ids=["warm", "cold"]) +@pytest.mark.parametrize("filter_type", _FILTERS) +@pytest.mark.parametrize("index_id,column", _INDEXED_CONFIGS, ids=_INDEXED_IDS) +def test_count_rows_indexed( + benchmark, + count_rows_ds: lance.LanceDataset, + count_rows_uri: str, + index_id: str, + column: str, + filter_type: str, + warm: bool, +) -> None: + """Benchmark count_rows with a scalar index, in warm and cold cache states. + + Args: + index_id: Human-readable index name (parametrize label only). + column: Dataset column that carries this index type. + filter_type: ``"is_null"`` or ``"is_not_null"``. + warm: If True, prewarm the index cache before measuring and reuse the + shared dataset instance. If False, create a fresh dataset + instance on every round so the index cache starts empty. + """ + filt = f"{column} IS NULL" if filter_type == "is_null" else f"{column} IS NOT NULL" + + if warm: + + def bench() -> int: + return count_rows_ds.count_rows(filter=filt) + + # warmup_rounds=1 makes one unmeasured call that populates the cache. + benchmark.pedantic(bench, warmup_rounds=1, rounds=5) + else: + + def bench() -> int: + # Fresh instance → empty in-memory index cache every round. + ds = lance.dataset(count_rows_uri) + return ds.count_rows(filter=filt) + + benchmark.pedantic(bench, warmup_rounds=0, rounds=5) diff --git a/python/python/ci_benchmarks/datagen/count_rows.py b/python/python/ci_benchmarks/datagen/count_rows.py new file mode 100644 index 00000000000..ff5fa02cf8a --- /dev/null +++ b/python/python/ci_benchmarks/datagen/count_rows.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Generate the count_rows benchmark dataset. + +Creates a 10-million-row Lance dataset with five int32 columns that all hold the +same values and the same ~1% null mask. 
Each column carries a different scalar +index so the benchmark can compare no-index, BITMAP, BTREE, ZONEMAP, and +BLOOMFILTER side-by-side on the same underlying data. + +Column layout +------------- +value_none — no index (full-scan baseline) +value_bitmap — BITMAP index +value_btree — BTREE index +value_zonemap — ZONEMAP index +value_bloomfilter — BLOOMFILTER index + +Null pattern: row i is null when i % 100 == 0 (~1% of rows exactly). +""" + +import lance +import numpy as np +import pyarrow as pa +from lance.log import LOGGER + +from ci_benchmarks.datasets import get_dataset_uri + +NUM_ROWS = 10_000_000 +BATCH_SIZE = 1_000_000 # 1 M rows per batch → 10 batches total + +COLUMNS = [ + "value_none", + "value_bitmap", + "value_btree", + "value_zonemap", + "value_bloomfilter", +] + +SCHEMA = pa.schema([(col, pa.int32()) for col in COLUMNS]) + + +def _gen_data(): + num_batches = NUM_ROWS // BATCH_SIZE + for batch_idx in range(num_batches): + offset = batch_idx * BATCH_SIZE + values = np.arange(offset, offset + BATCH_SIZE, dtype=np.int32) + # Null mask: True where the value should be null (~1% of rows) + null_mask = (np.arange(BATCH_SIZE) + offset) % 100 == 0 + col = pa.array(values, type=pa.int32(), mask=null_mask) + yield pa.record_batch([col] * len(COLUMNS), schema=SCHEMA) + + +def gen_count_rows() -> lance.LanceDataset: + dataset_uri = get_dataset_uri("count_rows") + + try: + ds = lance.dataset(dataset_uri) + if ds.count_rows() == NUM_ROWS: + LOGGER.info( + "count_rows dataset already exists at %s (%d rows)", + dataset_uri, + NUM_ROWS, + ) + return ds + LOGGER.warning( + "count_rows dataset at %s has unexpected row count %d; regenerating", + dataset_uri, + ds.count_rows(), + ) + except Exception: + pass + + LOGGER.info( + "Writing count_rows dataset (%d rows, %d columns) to %s", + NUM_ROWS, + len(COLUMNS), + dataset_uri, + ) + ds = lance.write_dataset( + _gen_data(), + dataset_uri, + schema=SCHEMA, + mode="overwrite", + ) + LOGGER.info("Dataset written; building 
scalar indexes …") + + for index_type, column in [ + ("BITMAP", "value_bitmap"), + ("BTREE", "value_btree"), + ("ZONEMAP", "value_zonemap"), + ("BLOOMFILTER", "value_bloomfilter"), + ]: + LOGGER.info(" Creating %s index on %s …", index_type, column) + ds.create_scalar_index(column, index_type) + + LOGGER.info("count_rows dataset ready.") + return ds diff --git a/python/python/ci_benchmarks/datagen/gen_all.py b/python/python/ci_benchmarks/datagen/gen_all.py index 1da7c05fd9b..d5120d20ff7 100644 --- a/python/python/ci_benchmarks/datagen/gen_all.py +++ b/python/python/ci_benchmarks/datagen/gen_all.py @@ -6,6 +6,7 @@ from lance.log import LOGGER from ci_benchmarks.datagen.basic import gen_basic +from ci_benchmarks.datagen.count_rows import gen_count_rows from ci_benchmarks.datagen.lineitems import gen_tcph from ci_benchmarks.datagen.wikipedia import gen_wikipedia @@ -40,6 +41,9 @@ def setup_logging(): LOGGER.info("Generating Wikipedia dataset...") gen_wikipedia() + LOGGER.info("Generating count_rows benchmark dataset...") + gen_count_rows() + LOGGER.info("=" * 80) LOGGER.info("All datasets generated successfully!") LOGGER.info("=" * 80) diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index f58b169a47a..be99eb05cc5 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -230,7 +230,9 @@ def dataset( "Both 'namespace_client' and 'table_id' must be provided together." ) - request = DescribeTableRequest(id=table_id, version=version) + # Resolve the latest table metadata here. The requested dataset version is + # applied by the lower-level dataset open path after namespace resolution. 
+ request = DescribeTableRequest(id=table_id, version=None) response = namespace_client.describe_table(request) uri = response.location diff --git a/python/python/lance/blob.py b/python/python/lance/blob.py index 46faf760cdd..a87c9302736 100644 --- a/python/python/lance/blob.py +++ b/python/python/lance/blob.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import ctypes import io from dataclasses import dataclass from typing import IO, Any, Iterator, Optional, Union @@ -9,6 +10,12 @@ from .lance import LanceBlobFile +_BLOB_INLINE_SIZE_THRESHOLD_META_KEY = b"lance-encoding:blob-inline-size-threshold" +_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY = ( + b"lance-encoding:blob-dedicated-size-threshold" +) +_MAX_RUST_USIZE = ctypes.c_size_t(-1).value + @dataclass(frozen=True) class Blob: @@ -190,9 +197,63 @@ def blob_array(values: list[Any]) -> BlobArray: return BlobArray.from_pylist(values) -def blob_field(name: str, *, nullable: bool = True) -> pa.Field: - """Construct an Arrow field for a Lance blob column.""" - return pa.field(name, BlobType(), nullable=nullable) +def _validate_threshold(name: str, value: Optional[int], *, allow_zero: bool) -> None: + if value is None: + return + if isinstance(value, bool) or not isinstance(value, int): + raise TypeError(f"{name} must be an int, got {type(value).__name__}") + if allow_zero: + if value < 0: + raise ValueError(f"{name} must be non-negative") + elif value <= 0: + raise ValueError(f"{name} must be positive") + if value > _MAX_RUST_USIZE: + raise OverflowError(f"{name} must fit in a Rust usize") + + +def blob_field( + name: str, + *, + nullable: bool = True, + inline_size_threshold: Optional[int] = None, + dedicated_size_threshold: Optional[int] = None, +) -> pa.Field: + """ + Construct an Arrow field for a Lance blob column. + + Parameters + ---------- + name : str + Field name. + nullable : bool, default True + Whether the blob column accepts null values. 
+ inline_size_threshold : optional, int + Maximum payload size in bytes to keep inline in the data file before + using packed blob storage. + dedicated_size_threshold : optional, int + Maximum payload size in bytes to store in packed blob storage before + using dedicated blob storage. This threshold is checked before + ``inline_size_threshold``. + """ + _validate_threshold("inline_size_threshold", inline_size_threshold, allow_zero=True) + _validate_threshold( + "dedicated_size_threshold", dedicated_size_threshold, allow_zero=False + ) + + field = pa.field(name, BlobType(), nullable=nullable) + if inline_size_threshold is None and dedicated_size_threshold is None: + return field + + metadata = dict(field.metadata or {}) + if inline_size_threshold is not None: + metadata[_BLOB_INLINE_SIZE_THRESHOLD_META_KEY] = str( + inline_size_threshold + ).encode() + if dedicated_size_threshold is not None: + metadata[_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY] = str( + dedicated_size_threshold + ).encode() + return field.with_metadata(metadata) class BlobIterator: diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 4f7ed434436..45dc1b253d3 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -950,6 +950,9 @@ def create_branch( ds._base_store_params = self._base_store_params ds._namespace_client = self._namespace_client ds._table_id = self._table_id + ds._namespace_client_managed_versioning = ( + self._namespace_client_managed_versioning + ) ds._default_scan_options = self._default_scan_options ds._read_params = self._read_params return ds @@ -958,14 +961,14 @@ def checkout_latest(self): """Check out the latest version of the current branch.""" self._ds.checkout_latest() - def list_indices(self) -> List[Index]: + def list_indices(self) -> List[IndexInformation]: """ Returns index information for all indices in the dataset. 
- This method is deprecated as it requires loading the statistics for each index - which can be a very expensive operation. Instead use describe_indices() to - list index information and index_statistics() to get the statistics for - individual indexes of interest. + This method is deprecated. Use describe_indices() instead, which returns + richer per-index information. + + Each returned :class:`IndexInformation` describes one index segment. """ warnings.warn( "The 'list_indices' method is deprecated. It may be removed in a future " @@ -973,7 +976,19 @@ def list_indices(self) -> List[Index]: DeprecationWarning, ) - return self._ds.load_indices() + return [ + { + "name": desc.name, + "type": desc.index_type, + "uuid": segment.uuid, + "fields": desc.field_names, + "version": segment.dataset_version_at_last_update, + "fragment_ids": segment.fragment_ids, + "base_id": segment.base_id, + } + for desc in self.describe_indices() + for segment in desc.segments + ] def describe_indices(self) -> List[IndexDescription]: """Returns index information for all indices in the dataset.""" @@ -1338,7 +1353,10 @@ def data_storage_version(self) -> str: @property def has_stable_row_ids(self) -> bool: """ - Whether this dataset has stable row IDs enabled + Whether this dataset has stable row IDs enabled. + + This is based on the dataset manifest feature flag and does not depend on + whether the current version has any fragments. """ return self._ds.has_stable_row_ids @@ -2271,9 +2289,9 @@ def alter_columns(self, *alterations: Iterable[AlterColumn]): not changed. - "nullable": bool, optional Whether the column should be nullable. If not specified, the column - nullability is not changed. Only non-nullable columns can be changed - to nullable. Currently, you cannot change a nullable column to - non-nullable. + nullability is not changed. A non-nullable column can always be made + nullable. 
A nullable column can be made non-nullable only if it + contains no NULL values; otherwise an error is raised. - "data_type": pyarrow.DataType, optional The new data type to cast the column to. If not specified, the column data type is not changed. @@ -3062,6 +3080,38 @@ def _prepare_scalar_index_request( else: raise Exception("index_type must be str or IndexConfig") + @staticmethod + def _normalized_index_type( + index_type: Union[str, IndexConfig], + ) -> str: + if isinstance(index_type, IndexConfig): + index_type = index_type.index_type + return index_type.upper() + + @classmethod + def _is_segment_native_scalar_index_type( + cls, + index_type: Union[str, IndexConfig], + ) -> bool: + return cls._normalized_index_type(index_type) in { + "BTREE", + "BITMAP", + "INVERTED", + "FTS", + "ZONEMAP", + } + + @classmethod + def _requires_uncommitted_scalar_index( + cls, + index_type: Union[str, IndexConfig], + ) -> bool: + return cls._normalized_index_type(index_type) in { + "BTREE", + "BITMAP", + "ZONEMAP", + } + def create_scalar_index( self, column: str, @@ -3250,7 +3300,7 @@ def create_scalar_index( import lance dataset = lance.dataset("/tmp/images.lance") - dataset.create_index( + dataset.create_scalar_index( "category", "BTREE", ) @@ -3279,12 +3329,14 @@ def create_scalar_index( column, index_type, kwargs ) - if fragment_ids is not None and logical_index_type == "BTREE": + if fragment_ids is not None and self._requires_uncommitted_scalar_index( + logical_index_type + ): raise ValueError( - "BTree distributed indexing uses create_index_uncommitted(..., " - 'index_type="BTREE", fragment_ids=...)' + f"{logical_index_type} distributed indexing uses " + "create_index_uncommitted(..., " + f'index_type="{logical_index_type}", fragment_ids=...)' ) - # Add fragment_ids and index_uuid to kwargs if provided if fragment_ids is not None: kwargs["fragment_ids"] = fragment_ids @@ -3327,6 +3379,7 @@ def _create_index_impl( streaming_coreset_rate: Optional[int] = None, 
streaming_refine_passes: Optional[int] = None, skip_transpose: bool = False, + rabitq_model: Optional[str] = None, require_commit: bool = True, **kwargs, ) -> Index: @@ -3648,6 +3701,9 @@ def _create_index_impl( if skip_transpose: kwargs["skip_transpose"] = True + if rabitq_model is not None: + kwargs["rabitq_model"] = rabitq_model + # Add fragment_ids and index_uuid to kwargs if provided for # distributed indexing if fragment_ids is not None: @@ -3785,6 +3841,17 @@ def create_index( to the dataset. The returned metadata can be passed to ``merge_existing_index_segments(...)`` if grouping is needed and then committed with ``commit_existing_index_segments(...)``. + + Vector segments support both shared and independent model scopes. If + the caller provides the same IVF centroids, and for IVF_PQ the same + PQ codebook, to each worker, the resulting segments share model + semantics and are suitable for workflows that physically merge + compatible segments. If those artifacts are omitted, each segment can + train its own IVF/PQ model for its assigned fragments. Such segments + can be committed together and are queried independently by segment + UUID; partition ids are interpreted within each segment's own model. + Keep independently trained segments as separate physical segments + unless the merge workflow can preserve or reconcile the model state. index_uuid : str, optional A UUID to use for the segment written by this call. If not provided, a new UUID will be generated. @@ -3970,15 +4037,18 @@ def create_index_uncommitted( streaming_coreset_rate: Optional[int] = None, streaming_refine_passes: Optional[int] = None, skip_transpose: bool = False, + rabitq_model: Optional[str] = None, **kwargs, ) -> Index: """ Create one segment without publishing it and return its metadata. - This is the public distributed-build API for vector and BTREE scalar - index construction. Unlike :meth:`create_index`, this method does not - publish the index into the dataset manifest. 
Instead, it writes one - segment under ``_indices//`` and returns the resulting + This is the public distributed-build API for vector, BTREE scalar, + canonical bitmap scalar, INVERTED scalar, and ZONEMAP scalar index + construction. Unlike + :meth:`create_index`, this method does not publish the index into the + dataset manifest. Instead, it writes one segment under + ``_indices//`` and returns the resulting :class:`Index` metadata. Callers should: @@ -3991,27 +4061,37 @@ def create_index_uncommitted( 4. commit the final segment list with :meth:`commit_existing_index_segments` - BTREE segments do not yet support the segment builder (steps 3-4); collect - the returned segments and pass them straight to - :meth:`commit_existing_index_segments`. - + BTREE, BITMAP, INVERTED, and ZONEMAP segments may + be merged with :meth:`merge_existing_index_segments` before commit. Parameters are the same as :meth:`create_index`, with one additional requirement: - ``fragment_ids`` must be provided + - Vector segments support both shared and independent model scopes. Pass + the same IVF centroids, and for IVF_PQ the same PQ codebook, to each + worker when segments need shared model semantics or physical merge + compatibility. If these artifacts are omitted, each segment may train + its own IVF/PQ model and can be committed with other segments as one + logical index; query execution searches each segment by UUID and + interprets partition ids within that segment. Keep independently + trained segments as separate physical segments unless the merge + workflow can preserve or reconcile the model state. + - ``rabitq_model`` (``IVF_RQ`` only): a JSON string produced by + ``lance.lance.indices.build_rq_model``. It must be identical across all + workers for their segments to be mergeable, since it pins the RaBitQ + rotation so every segment rotates vectors the same way. If omitted, each + call generates its own random rotation, which is only safe for a single, + non-merged segment. 
Returns ------- Index Metadata for the segment that was written by this call. """ - is_btree_request = ( - isinstance(index_type, str) and index_type.upper() == "BTREE" - ) or ( - isinstance(index_type, IndexConfig) - and index_type.index_type.upper() == "BTREE" + is_scalar_segment_request = self._is_segment_native_scalar_index_type( + index_type ) - if is_btree_request: + if is_scalar_segment_request: if fragment_ids is None: raise ValueError( "create_index_uncommitted requires fragment_ids " @@ -4062,6 +4142,7 @@ def create_index_uncommitted( streaming_coreset_rate=streaming_coreset_rate, streaming_refine_passes=streaming_refine_passes, skip_transpose=skip_transpose, + rabitq_model=rabitq_model, require_commit=False, **kwargs, ) @@ -4106,9 +4187,10 @@ def merge_index_metadata( """ Merge distributed scalar index metadata. - Vector distributed indexing no longer uses this API. For vector indices, - build segments with :meth:`create_index_uncommitted`, optionally merge - them with :meth:`merge_existing_index_segments`, and publish them with + Vector and Bitmap distributed indexing no longer use this API. For + those index families, build segments with + :meth:`create_index_uncommitted`, optionally merge caller-defined + groups with :meth:`merge_existing_index_segments`, and publish them with :meth:`commit_existing_index_segments`. This method does NOT commit changes. @@ -4503,6 +4585,7 @@ def commit_batch( ds._base_store_params = base_store_params ds._namespace_client = None ds._table_id = None + ds._namespace_client_managed_versioning = False ds._default_scan_options = None ds._read_params = None return BulkCommitResult( @@ -5053,6 +5136,65 @@ def mem_wal_writer( raw = self._ds.mem_wal_writer(shard_id, **kwargs) return _mw.ShardWriter(raw) + def tracked_files( + self, + *, + min_version: Optional[int] = None, + progress: Optional[Callable] = None, + ) -> pa.RecordBatchReader: + """Stream all files referenced by any manifest version of this dataset. 
+ + Parameters + ---------- + min_version : int, optional + If set, only include manifests with version >= min_version. + progress : callable, optional + Called after each manifest is processed with two arguments: + ``(manifests_processed: int, manifests_total: Optional[int])``. + ``manifests_total`` is ``None`` until all manifest locations + have been listed. Works well with ``tqdm``:: + + from tqdm import tqdm + pbar = tqdm(unit="manifest") + def on_progress(processed, total): + if total is not None: + pbar.total = total + pbar.update(1) + reader = ds.tracked_files(progress=on_progress) + table = reader.read_all() + pbar.close() + + Returns + ------- + pyarrow.RecordBatchReader + Schema: + + - **version** (int64): manifest version number + - **base_uri** (dictionary): storage root URI + - **path** (utf8): file path relative to ``base_uri`` + - **type** (dictionary): one of ``manifest``, + ``data file``, ``deletion file``, ``transaction file``, + ``index file`` + + Output order is non-deterministic. + """ + return self._ds.tracked_files(min_version=min_version, progress=progress) + + def all_files(self) -> pa.RecordBatchReader: + """Stream all files physically present at this dataset's base URI. + + Returns a :class:`pyarrow.RecordBatchReader` with schema: + + - **base_uri** (dictionary): storage root URI + - **path** (utf8): file path relative to ``base_uri`` + - **size_bytes** (int64): file size in bytes + - **last_modified** (timestamp[us, UTC]): last modification time + + Only the primary object store is scanned; alternate ``base_paths`` + entries are not included. 
+ """ + return self._ds.all_files() + class SqlQuery: """ @@ -5287,6 +5429,19 @@ class Index: index_details: Optional[Tuple[str, bytes]] = None +class IndexInformation(TypedDict): + """Information about a single index segment, as returned by + :meth:`LanceDataset.list_indices`.""" + + name: str + type: str + uuid: str + fields: List[str] + version: int + fragment_ids: Set[int] + base_id: Optional[int] + + class AutoCleanupConfig(TypedDict): interval: int older_than_seconds: int @@ -6135,6 +6290,7 @@ def nearest( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", distance_range: Optional[tuple[Optional[float], Optional[float]]] = None, ) -> ScannerBuilder: """Configure nearest neighbor search. @@ -6158,6 +6314,13 @@ def nearest( the CPU pool size. Value 1 uses the single-worker sequential path. Values >= 2 use the partition-parallel path and are clamped to the CPU pool size. + approx_mode: {"fast", "normal", "accurate"}, default "normal" + Controls the speed / accuracy tradeoff for approximate vector search + when supported by the selected index. This currently only affects + RQ-quantized indexes, such as IVF_RQ. Other index types ignore this + setting. ``fast`` favors lower latency and may reduce recall, + ``normal`` uses the default balance, and ``accurate`` favors higher + recall and may increase latency. """ self._nearest = _build_vector_search_query( column, @@ -6172,6 +6335,7 @@ def nearest( use_index=use_index, ef=ef, query_parallelism=query_parallelism, + approx_mode=approx_mode, distance_range=distance_range, ) return self @@ -6496,20 +6660,22 @@ def explain_plan(self, verbose=False) -> str: return self._scanner.explain_plan(verbose=verbose) - def analyze_plan(self) -> str: + def analyze_plan(self, count_rows: bool = False) -> str: """Execute the plan for this scanner and display with runtime metrics. 
Parameters ---------- - verbose : bool, default False - Use a verbose output format. + count_rows : bool, default False + If True, auto-apply a ``COUNT(*)`` aggregate before analyzing so + the returned plan reflects what :py:meth:`count_rows` would + execute (including the optimizer's count-pushdown decisions). Returns ------- plan : str """ - return self._scanner.analyze_plan() + return self._scanner.analyze_plan(count_rows=count_rows) class DatasetOptimizer: @@ -7292,6 +7458,7 @@ def _build_vector_search_query( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", distance_range: Optional[tuple[Optional[float], Optional[float]]] = None, ) -> dict: """Configure nearest neighbor search. @@ -7333,6 +7500,13 @@ def _build_vector_search_query( maps to the single-worker sequential path. Value -1 uses the CPU pool size. Value 1 uses the single-worker sequential path. Values >= 2 use the partition-parallel path and are clamped to the CPU pool size. + approx_mode: {"fast", "normal", "accurate"}, default "normal" + Controls the speed / accuracy tradeoff for approximate vector search + when supported by the selected index. This currently only affects + RQ-quantized indexes, such as IVF_RQ. Other index types ignore this + setting. ``fast`` favors lower latency and may reduce recall, + ``normal`` uses the default balance, and ``accurate`` favors higher + recall and may increase latency. distance_range: tuple[Optional[float], Optional[float]], optional A tuple of (lower_bound, upper_bound) to filter results by distance. Both bounds are optional. 
The lower bound is inclusive and the upper @@ -7406,6 +7580,12 @@ def _build_vector_search_query( if query_parallelism is not None and query_parallelism < -1: raise ValueError("query_parallelism must be >= -1") + if approx_mode not in {"fast", "normal", "accurate"}: + raise ValueError( + "approx_mode must be one of 'fast', 'normal', or 'accurate', " + f"got {approx_mode!r}" + ) + if distance_range is not None: if len(distance_range) != 2: raise ValueError( @@ -7423,6 +7603,7 @@ def _build_vector_search_query( "use_index": use_index, "ef": ef, "query_parallelism": query_parallelism, + "approx_mode": approx_mode, "distance_range": distance_range, } @@ -7579,6 +7760,7 @@ def __init__( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", ): self._inner = _build_vector_search_query( column, @@ -7592,6 +7774,7 @@ def __init__( use_index=use_index, ef=ef, query_parallelism=query_parallelism, + approx_mode=approx_mode, ) def inner(self): diff --git a/python/python/lance/indices/__init__.py b/python/python/lance/indices/__init__.py index 40dc9ed93ac..675754cc2d0 100644 --- a/python/python/lance/indices/__init__.py +++ b/python/python/lance/indices/__init__.py @@ -30,7 +30,6 @@ class IndexFileVersion(str, Enum): class SupportedDistributedIndices(str, Enum): # Scalar index types BTREE = "BTREE" - BITMAP = "BITMAP" INVERTED = "INVERTED" # Precise vector index types supported by distributed merge diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index d3d61c5f8ff..6059166d6ba 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -150,7 +150,7 @@ def train_ivf( max_iters=max_iters, ) num_dims = ivf_centroids.shape[1] - ivf_centroids.shape = -1 + ivf_centroids = ivf_centroids.reshape(-1) flat_centroids_array = pa.array(ivf_centroids) centroids_array = pa.FixedSizeListArray.from_arrays( 
flat_centroids_array, num_dims diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f3bc9a681b2..26ad75a27b7 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -226,9 +226,10 @@ class _Dataset: def replace_field_metadata(self, field_name: str, metadata: Dict[str, str]): ... @property def data_storage_version(self) -> str: ... + @property + def has_stable_row_ids(self) -> bool: ... def index_statistics(self, index_name: str) -> str: ... def serialized_manifest(self) -> bytes: ... - def load_indices(self) -> List[Index]: ... def describe_indices(self) -> List[IndexDescription]: ... def scanner( self, @@ -462,6 +463,27 @@ class _Dataset: def get_transactions( self, recent_transactions=10 ) -> List[Optional[Transaction]]: ... + def hamming_clustering_for_ivf_partition( + self, + index_name: str, + partition_id: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... + def get_ivf_partition_info(self, index_name: str) -> List[dict]: ... + def hamming_clustering_for_sample( + self, + column: str, + sample_size: Optional[int], + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... + def hamming_clustering_for_range( + self, + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... class _MergeInsertBuilder: def __init__(self, dataset: _Dataset, on: str | Iterable[str]): ... @@ -475,7 +497,7 @@ class _Scanner: @property def schema(self) -> pa.Schema: ... def explain_plan(self, verbose: bool) -> str: ... - def analyze_plan(self) -> str: ... + def analyze_plan(self, count_rows: bool = False) -> str: ... def count_rows(self) -> int: ... def to_pyarrow(self) -> pa.RecordBatchReader: ... 
diff --git a/python/python/lance/lance/indices/__init__.pyi b/python/python/lance/lance/indices/__init__.pyi index 384e2528e99..0f5db7037df 100644 --- a/python/python/lance/lance/indices/__init__.pyi +++ b/python/python/lance/lance/indices/__init__.pyi @@ -59,6 +59,11 @@ def transform_vectors( pq_codebook: pa.Array, dst_uri: str, ): ... +def build_rq_model( + dimension: int, + num_bits: int = 1, + dtype: str = "float32", +) -> str: ... class IndexSegmentDescription: uuid: str @@ -67,6 +72,7 @@ class IndexSegmentDescription: index_version: int created_at: Optional[datetime] size_bytes: Optional[int] + base_id: Optional[int] def __repr__(self) -> str: ... diff --git a/python/python/lance/lance/optimize.pyi b/python/python/lance/lance/optimize.pyi index 9a26d23c003..c4b6b6546e6 100644 --- a/python/python/lance/lance/optimize.pyi +++ b/python/python/lance/lance/optimize.pyi @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +from typing import List, Optional from lance import LanceDataset from lance.fragment import FragmentMetadata @@ -51,5 +51,7 @@ class Compaction: def plan(dataset: "LanceDataset", options: CompactionOptions) -> CompactionPlan: ... @staticmethod def commit( - dataset: "LanceDataset", rewrites: List[RewriteResult] + dataset: "LanceDataset", + rewrites: List[RewriteResult], + options: Optional[CompactionOptions] = None, ) -> CompactionMetrics: ... 
diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index f448e5c3368..fec3a1cfb1e 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -32,6 +32,8 @@ CreateMaterializedViewResponse, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -42,6 +44,8 @@ DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, @@ -70,6 +74,8 @@ LanceNamespace, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -850,6 +856,27 @@ def update_table_tag( response_dict = self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: @@ -1420,6 +1447,27 @@ def update_table_tag( response_dict = 
self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: diff --git a/python/python/lance/optimize.py b/python/python/lance/optimize.py index 8b98308d442..3ac7547960b 100644 --- a/python/python/lance/optimize.py +++ b/python/python/lance/optimize.py @@ -57,6 +57,14 @@ class CompactionOptions(TypedDict): The batch size to use when scanning input fragments. You may want to reduce this if you are running out of memory during compaction. + The default will use the same default from ``scanner``. + """ + io_buffer_size: Optional[int] + """ + The number of bytes to allow to queue up in the I/O buffer when scanning + input fragments. Increasing this can avoid a deadlock that occurs when a + single batch of data is larger than the I/O buffer size. + The default will use the same default from ``scanner``. 
""" compaction_mode: Optional[ diff --git a/python/python/lance/udf.py b/python/python/lance/udf.py index de6c7c4ff59..3a80349479e 100644 --- a/python/python/lance/udf.py +++ b/python/python/lance/udf.py @@ -205,7 +205,9 @@ def normalize_transform( ) ) ) - if isinstance(sample_batch, pd.DataFrame): + if _check_for_pandas(sample_batch) and isinstance( + sample_batch, pd.DataFrame + ): sample_batch = pa.RecordBatch.from_pandas(sample_batch) udf_like.output_schema = sample_batch.schema @@ -233,7 +235,9 @@ def normalize_transform( ) ) ) - if isinstance(sample_batch, pd.DataFrame): + if _check_for_pandas(sample_batch) and isinstance( + sample_batch, pd.DataFrame + ): sample_batch = pa.RecordBatch.from_pandas(sample_batch) udf_like = BatchUDF(udf_like, output_schema=sample_batch.schema) diff --git a/python/python/lance/util.py b/python/python/lance/util.py index 5b94ad5c35a..2161c4e0d45 100644 --- a/python/python/lance/util.py +++ b/python/python/lance/util.py @@ -254,4 +254,4 @@ def _target_partition_size_to_num_partitions( if target_partition_size is None: target_partition_size = 8192 num_partitions = num_rows // target_partition_size - return max(1, num_partitions, 4096) + return min(max(1, num_partitions), 4096) diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py index 34a6154a321..5ce5e8b61e5 100644 --- a/python/python/lance/vector.py +++ b/python/python/lance/vector.py @@ -749,3 +749,150 @@ def _partition_and_pq_codes_assignment() -> Iterable[pa.RecordBatch]: data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() ] return dst_dataset_uri, shuffle_buffers + + +# ============================================================================= +# Hamming Distance Clustering +# ============================================================================= + + +def hamming_clustering_for_ivf_partition( + dataset: "LanceDataset", + index_name: str, + partition_id: int, + hamming_threshold: int, +) -> pa.RecordBatchReader: + 
""" + Perform hamming clustering on a partition of an IVF_FLAT index. + + Loads a partition from an IVF_FLAT index on a hash column, computes + pairwise hamming distances between all hashes in the partition, + filters by threshold, and clusters the results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. + index_name : str + Name of the IVF_FLAT index on the hash column + partition_id : int + The partition ID within the IVF_FLAT index + hamming_threshold : int + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_ivf_partition( + index_name, partition_id, hamming_threshold + ) + + +def get_ivf_partition_info( + dataset: "LanceDataset", + index_name: str, +) -> List[dict]: + """ + Get partition information for an IVF_FLAT index. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. + index_name : str + Name of the IVF_FLAT index + + Returns + ------- + list[dict] + List of partition info dicts with 'partition_id' and 'size' + """ + return dataset._ds.get_ivf_partition_info(index_name) + + +def hamming_clustering_for_sample( + dataset: "LanceDataset", + column: str, + sample_size: Optional[int] = None, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a sample of the dataset. + + Randomly samples rows from the dataset, computes pairwise hamming distances + between all hashes in the sample, filters by threshold, and clusters the + results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. 
+ column : str + Name of the hash column (must be FixedSizeList) + sample_size : int, optional + Number of rows to sample. If None, uses all rows. + hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_sample( + column, sample_size, hamming_threshold + ) + + +def hamming_clustering_for_range( + dataset: "LanceDataset", + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a contiguous range of rows. + + Reads a contiguous range of rows from a specific fragment, computes pairwise + hamming distances between all hashes in the range, filters by threshold, + and clusters the results using union-find. + + Unlike sampling, this reads sequential rows which is useful for distributed + processing where each worker handles a specific range of a fragment. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. 
+ column : str + Name of the hash column (must be FixedSizeList) + fragment_id : int + The fragment ID to read from + start_row : int + The starting row offset within the fragment + num_rows : int + Number of rows to read from the start position + hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_range( + column, fragment_id, start_row, num_rows, hamming_threshold + ) diff --git a/python/python/tests/compat/compat_decorator.py b/python/python/tests/compat/compat_decorator.py index 0ab35672410..fdfe09a6879 100644 --- a/python/python/tests/compat/compat_decorator.py +++ b/python/python/tests/compat/compat_decorator.py @@ -153,6 +153,10 @@ def skip_read_after_current_write(self, version: str) -> bool: """Return True to skip the old-version read after current-version writes.""" return False + def skip_write_after_current_write(self, version: str) -> bool: + """Return True to skip the old-version write after current-version writes.""" + return False + def skip_downgrade(self, version: str) -> bool: """Return True to skip the current-write -> old-read downgrade test.""" return False @@ -333,8 +337,10 @@ def test_func({sig_params}): obj.create() # Old version: verify can read venv = venv_factory.get_venv(version) - venv.execute_method(obj, "check_read", obj.compat_env(version, "check_read")) - venv.execute_method(obj, "check_write", obj.compat_env(version, "check_write")) + if not obj.skip_read_after_current_write(version): + venv.execute_method(obj, "check_read", obj.compat_env(version, "check_read")) + if not obj.skip_write_after_current_write(version): + venv.execute_method(obj, "check_write", obj.compat_env(version, "check_write")) ''' else: # 
upgrade_downgrade func_body = f''' diff --git a/python/python/tests/compat/compat_sequence.py b/python/python/tests/compat/compat_sequence.py new file mode 100644 index 00000000000..bec216d6a6f --- /dev/null +++ b/python/python/tests/compat/compat_sequence.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Cross-version index maintenance-sequence search. + +Runs on the same per-ref venv substrate as the rest of this package: venv_factory +(venv_manager.py) provisions one venv per ref, so the *setup* half of a sequence runs +under `from_ref` and the *exercise* half under `to_ref` (the version split). After each +run an oracle checks that the reader did not panic and that an index query agrees with a +full (unindexed) scan. This *discovers* cross-version regressions (e.g. ENT-1662) +without hand-coding the triggering sequence. + +The scenario is parameterized by index *kind* so every scalar index type gets the same +aged-lifecycle, cross-version treatment. The oracle runs the same predicate twice -- +normally and with use_scalar_index=False (lance ignores the index) -- and requires +the results to match. If the two query plans are identical the index wasn't used, so the +comparison is skipped rather than failed (uninformative, not a regression). FTS has no +"ignore the index" mode to diff against, so its oracle reconstructs ground truth from a +full scan: tokenize every live row, then require an FTS search for a spread of sampled +terms to return exactly the rows that contain them. The FTS scenarios run under both +on-disk format versions (LANCE_FTS_FORMAT_VERSION 1 and 2), which take different merge +paths. + +The op vocabulary and bounds are deliberately small so the search is runnable; this is +exhaustive over the maintenance-lifecycle grammar up to the configured lengths, not over +every op permutation. 
+""" + +import itertools +import os +import shutil +from pathlib import Path + +ROWS_PER_WRITE = 200 + +SETUP_TAIL_OPS = ["D", "C", "W"] +EXERCISE_OPS = ["W", "D", "C", "Oa", "Om", "Od"] + +OP_NAMES = { + "W": "write rows", + "I": "create index", + "D": "delete rows", + "C": "compact", + "Oa": "optimize (append)", + "Om": "optimize (merge)", + "Od": "optimize", +} + + +def describe(kind, from_ref, to_ref, setup_ops, exercise_ops, fts_version=None): + """A plain-English description of a scenario for failure output.""" + writer = ", then ".join(OP_NAMES[o] for o in ["W", "I", *setup_ops]) + reader = ", then ".join(OP_NAMES[o] for o in exercise_ops) + tag = f" (fts fmt v{fts_version})" if fts_version is not None else "" + return f"{kind}{tag} ({from_ref} -> {to_ref}): writer [{writer}]; reader [{reader}]" + + +# Index kinds covered by the maintenance-sequence search. +SCALAR_KINDS = ["BTREE", "BITMAP", "LABEL_LIST", "NGRAM", "ZONEMAP", "BLOOMFILTER"] +ALL_KINDS = ["INVERTED", *SCALAR_KINDS] + + +class IndexScenario: + """A picklable, kind-parameterized scenario run across a version split.""" + + def __init__(self, kind, path, setup_ops, exercise_ops): + self.kind = kind + self.path = str(path) + self.setup_ops = list(setup_ops) + self.exercise_ops = list(exercise_ops) + self.next_idx = 0 + + # --- in-venv helpers (only lance + pyarrow available) --- + def _open(self): + import lance + + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + return lance.dataset(self.path, session=session) + + def _batch(self, a, b): + import pyarrow as pa + + idx = list(range(a, b)) + if self.kind == "INVERTED": + # Each row's text mixes tokens of different frequency: a unique term, a + # mid-frequency bucket (~1/7 of rows), and one shared by every row. Sampling + # across that spread exercises postings of varied length. 
+ return pa.table( + {"idx": idx, "key": [f"term{i} bucket{i % 7} shared" for i in idx]} + ) + if self.kind == "LABEL_LIST": + return pa.table({"idx": idx, "key": [[f"l{i % 8}"] for i in idx]}) + if self.kind == "NGRAM": + return pa.table({"idx": idx, "key": [f"w{i % 50}x" for i in idx]}) + # BTREE / BITMAP / ZONEMAP / BLOOMFILTER: integer column + card = 8 if self.kind == "BITMAP" else 50 + key = [i if self.kind == "ZONEMAP" else i % card for i in idx] + return pa.table({"idx": idx, "key": key}) + + def _index_type(self): + return "INVERTED" if self.kind == "INVERTED" else self.kind + + def _oracle_pred(self): + if self.kind == "LABEL_LIST": + return "array_has_any(key, ['l3'])" + if self.kind == "NGRAM": + return "contains(key, 'w3x')" + if self.kind == "ZONEMAP": + return "key >= 100 AND key < 300" + return "key == 3" # BTREE / BITMAP / BLOOMFILTER + + # --- ops --- + def _op_W(self): + import lance + + a, b = self.next_idx, self.next_idx + ROWS_PER_WRITE + self.next_idx = b + tbl = self._batch(a, b) + if not os.path.exists(self.path): + lance.write_dataset(tbl, self.path) # single fragment + else: + self._open().insert(tbl) + + def _op_I(self): + kwargs = {"with_position": True} if self.kind == "INVERTED" else {} + self._open().create_scalar_index("key", self._index_type(), **kwargs) + + def _op_D(self): + # Partial-range delete inside the id space so compaction rewrites and remaps the + # index per-row. 
+ if self.next_idx == 0: + return + lo, hi = self.next_idx // 4, self.next_idx // 2 + if hi > lo: + self._open().delete(f"idx >= {lo} AND idx < {hi}") + + def _op_C(self): + self._open().optimize.compact_files() + + def _op_Oa(self): + self._open().optimize.optimize_indices(num_indices_to_merge=0) + + def _op_Om(self): + self._open().optimize.optimize_indices(num_indices_to_merge=10) + + def _op_Od(self): + self._open().optimize.optimize_indices() + + def _run(self, ops): + for op in ops: + getattr(self, f"_op_{op}")() + + # --- methods invoked across the version split --- + def setup(self): + shutil.rmtree(self.path, ignore_errors=True) + self.next_idx = 0 + self._run(["W", "I"] + self.setup_ops) + return self.next_idx + + def exercise_and_check(self): + self._run(self.exercise_ops) + ds = self._open() + if self.kind == "INVERTED": + # Differential oracle: rebuild the token -> rows map from a full (unindexed) + # scan, then require an FTS search for a spread of sampled terms to return + # exactly those rows. Catches a merge that drops or misassigns postings, not + # just a row-count drift. (Tokens here are alphanumeric and space-separated, + # so a whitespace split reproduces lance's tokenization.) + rows = ds.to_table(columns=["idx", "key"]) + idxs = rows.column("idx").to_pylist() + texts = rows.column("key").to_pylist() + truth = {} + for i, text in zip(idxs, texts): + for tok in text.split(): + truth.setdefault(tok, set()).add(i) + if not truth: + return # everything deleted; nothing to search + vocab = sorted(truth) + # A spread across the vocabulary plus the most common term. 
+ sample = set(vocab[:: max(1, len(vocab) // 6)]) + sample.add(max(truth, key=lambda t: len(truth[t]))) + for term in sorted(sample): + hit = ds.to_table(full_text_query={"query": term, "columns": ["key"]}) + got = set(hit.column("idx").to_pylist()) + want = truth[term] + assert got == want, ( + f"FTS('{term}'): index returned {len(got)} rows, corpus has " + f"{len(want)} (missing {sorted(want - got)[:5]}, " + f"extra {sorted(got - want)[:5]})" + ) + return + # Same column/predicate, index on vs forced off: use_scalar_index=False makes + # lance ignore the index, so the plans differ iff the index is used. If they are + # identical the index wasn't consulted here (the planner chose a scan after + # deletes), so the comparison is vacuous -- skip rather than compare two scans. + pred = self._oracle_pred() + plan_index = ds.scanner(filter=pred).explain_plan(True) + plan_scan = ds.scanner(filter=pred, use_scalar_index=False).explain_plan(True) + if plan_index == plan_scan: + return + got = ds.to_table(filter=pred).num_rows + expected = ds.to_table(filter=pred, use_scalar_index=False).num_rows + assert got == expected, ( + f"{self.kind}: index gave {got} rows, full scan {expected}, for '{pred}'" + ) + + +def generate(max_length): + """Yield every (setup_ops, exercise_ops) whose combined length is 1..max_length, + breadth-first by total length (shorter first). `max_length` is the number of + maintenance ops after the implicit write + create-index, split between the writer + (setup) and reader (exercise) at every position. The order is neutral, so finding a + bug is a real search, not a sorted shortcut. 
The space grows fast with max_length, + so deeper bugs (ENT-1662 needs length 5) cost more to reach.""" + for total in range(1, max_length + 1): + for setup_len in range(total): # exercise gets total - setup_len >= 1 + for s in itertools.product(SETUP_TAIL_OPS, repeat=setup_len): + for e in itertools.product(EXERCISE_OPS, repeat=total - setup_len): + yield list(s), list(e) + + +def search( + venv_factory, + from_ref, + to_ref, + base_path, + kind, + max_length=4, + shard=0, + num_shards=1, + stop_on_first=True, + fts_version=None, +): + """Search index-maintenance sequences up to `max_length` ops for one `kind`, across + (from_ref -> to_ref). Runs only scenarios in this shard (i % num_shards == shard) so + the space can be split across parallel workers. For INVERTED, `fts_version` ("1" or + "2") pins the on-disk FTS format (LANCE_FTS_FORMAT_VERSION) on both sides; both are + Fst token sets and exercise distinct merge paths. Returns failures; stops on the + first when `stop_on_first`.""" + from_venv = venv_factory.get_venv(from_ref) + to_venv = venv_factory.get_venv(to_ref) + env = {} + if kind == "INVERTED" and fts_version is not None: + env["LANCE_FTS_FORMAT_VERSION"] = str(fts_version) + base = Path(base_path) + failures = [] + # Each setup's aged dataset is built once under from_ref and snapshotted; every + # exercise for that setup runs on a *copy* of it (a dir copy is far cheaper + # than rebuilding the index). Cached per shard, keyed by the setup ops. 
+ snapshots = {} # tuple(setup) -> (snapshot_path, next_idx), or None if setup failed + try: + for i, (setup_tail, exercise) in enumerate(generate(max_length)): + if i % num_shards != shard: + continue + key = tuple(setup_tail) + if key not in snapshots: + snap = base / f"snap_{kind}_{len(snapshots)}" + shutil.rmtree(snap, ignore_errors=True) + builder = IndexScenario(kind, snap, setup_tail, []) + try: + next_idx = from_venv.execute_method(builder, "setup", env) + snapshots[key] = (snap, next_idx) + except Exception as e: + label = describe( + kind, from_ref, to_ref, setup_tail, [], fts_version + ) + err = str(e).strip() + failures.append({"run": i, "sequence": label, "error": err}) + snapshots[key] = None + shutil.rmtree(snap, ignore_errors=True) + if stop_on_first: + break + entry = snapshots[key] + if entry is None: + continue # setup failed; skip its exercises + snap, next_idx = entry + ex_path = base / f"ex_{kind}_{i}" + shutil.rmtree(ex_path, ignore_errors=True) + shutil.copytree(snap, ex_path) + scenario = IndexScenario(kind, ex_path, setup_tail, exercise) + scenario.next_idx = next_idx + label = describe(kind, from_ref, to_ref, setup_tail, exercise, fts_version) + try: + to_venv.execute_method(scenario, "exercise_and_check", env) + except Exception as e: + error = str(e).strip() + failures.append({"run": i, "sequence": label, "error": error}) + if stop_on_first: + break + finally: + shutil.rmtree(ex_path, ignore_errors=True) + finally: + for entry in snapshots.values(): + if entry is not None: + shutil.rmtree(entry[0], ignore_errors=True) + return failures diff --git a/python/python/tests/compat/test_index_sequence.py b/python/python/tests/compat/test_index_sequence.py new file mode 100644 index 00000000000..4d0db694064 --- /dev/null +++ b/python/python/tests/compat/test_index_sequence.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Cross-version index maintenance-sequence search, wired 
as a compat test. + +For each index kind, generate maintenance-op sequences from a small grammar, run the +setup half under an older writer ref and the exercise half under a newer reader, and +check after each that the reader did not panic and that an index query matches a full +scan. Discovers cross-version regressions (e.g. ENT-1662, under INVERTED) with no +hand-coded sequence. + +Refs and max length are environment-driven so the suite can run between two refs +(versions, commits, or branches): COMPAT_FROM_REF / COMPAT_TO_REF / COMPAT_MAX_LENGTH / +COMPAT_KINDS (comma-separated subset of kinds) / COMPAT_SHARDS (split each kind's search +into this many cases so pytest-xdist (`-n auto`) parallelizes them across cores). +""" + +import os + +import pytest + +from .compat_decorator import pylance_stable_versions +from .compat_sequence import ALL_KINDS, search + + +def _default_refs(): + """The two most recent published stable releases (older -> newer).""" + versions = pylance_stable_versions() + if len(versions) >= 2: + return str(versions[-2]), str(versions[-1]) + return "6.0.1", "7.0.0" # fallback if PyPI is unreachable + + +_default_from, _default_to = _default_refs() +FROM_REF = os.environ.get("COMPAT_FROM_REF") or _default_from +TO_REF = os.environ.get("COMPAT_TO_REF") or _default_to +MAX_LENGTH = int(os.environ.get("COMPAT_MAX_LENGTH", "4")) +KINDS = os.environ.get("COMPAT_KINDS", ",".join(ALL_KINDS)).split(",") +# Many small shards (default 4x cores) so xdist's dynamic scheduler keeps every worker +# busy and an oversubscribed `-n` has work to overlap. +NUM_SHARDS = int(os.environ.get("COMPAT_SHARDS", str((os.cpu_count() or 1) * 4))) + + +def _cases(): + """(kind, fts_version) cases. 
FTS runs under both on-disk formats (v1, v2); the + scalar kinds are format-agnostic and run once.""" + cases = [] + for kind in KINDS: + if kind == "INVERTED": + cases.extend([("INVERTED", "1"), ("INVERTED", "2")]) + else: + cases.append((kind, None)) + return cases + + +CASES = _cases() +CASE_IDS = [k if v is None else f"{k}-fmtv{v}" for k, v in CASES] + + +@pytest.mark.compat +@pytest.mark.parametrize("kind,fts_version", CASES, ids=CASE_IDS) +@pytest.mark.parametrize("shard", range(NUM_SHARDS)) +def test_index_maintenance_sequence_search( + venv_factory, tmp_path, kind, fts_version, shard +): + failures = search( + venv_factory, + FROM_REF, + TO_REF, + tmp_path, + kind, + max_length=MAX_LENGTH, + shard=shard, + num_shards=NUM_SHARDS, + fts_version=fts_version, + ) + # First line is the failure itself so it shows in pytest's bottom summary; the rest + # (if more than one) appears in the failure body. + assert not failures, "\n".join( + f"{f['sequence']} ==> {f['error']}" for f in failures + ) diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py index b98ffdf63e3..e97d6be8bf6 100644 --- a/python/python/tests/compat/test_vector_indices.py +++ b/python/python/tests/compat/test_vector_indices.py @@ -8,6 +8,7 @@ can be read and written by other versions. """ +import os import shutil from pathlib import Path @@ -268,6 +269,21 @@ class IvfRqVectorIndex(UpgradeDowngradeTest): def __init__(self, path: Path): self.path = path + def current_env(self, method_name: str): + if method_name == "check_read": + return {"LANCE_COMPAT_CURRENT_RUNTIME": "1"} + return {} + + def skip_read_after_current_write(self, version: str) -> bool: + # Newly written IVF_RQ indexes carry raw-query estimator metadata and + # split-code schema that older runtimes cannot query or optimize safely. + # The upgrade_downgrade variant still covers old 1-bit residual-query + # indexes being read and rewritten by the current runtime. 
+ return True + + def skip_write_after_current_write(self, version: str) -> bool: + return True + def create(self): """Create dataset with IVF_RQ vector index.""" shutil.rmtree(self.path, ignore_errors=True) @@ -319,6 +335,12 @@ def check_read(self): stats = ds.stats.index_stats(name) assert stats["num_indexed_rows"] > 0 + if os.environ.get("LANCE_COMPAT_CURRENT_RUNTIME") == "1": + # Old 1-bit IVF_RQ indexes do not have split ex-code columns. + # The successful query above verifies the current reader does not + # require them. + sub_index = stats["indices"][0]["sub_index"] + assert sub_index["num_bits"] == 1 def check_write(self): """Verify can insert vectors and run optimize workflows.""" diff --git a/python/python/tests/compat/venv_manager.py b/python/python/tests/compat/venv_manager.py index 9e16b7e2dc7..c4b23486cd3 100644 --- a/python/python/tests/compat/venv_manager.py +++ b/python/python/tests/compat/venv_manager.py @@ -8,15 +8,39 @@ with specific Lance versions installed. """ +import contextlib +import glob import os import pickle +import re +import shutil import struct import subprocess import sys from pathlib import Path from typing import Any, Optional -from packaging.version import Version +from packaging.version import InvalidVersion, Version + +try: + import fcntl +except ImportError: # pragma: no cover - non-POSIX + fcntl = None + + +@contextlib.contextmanager +def _venv_lock(lock_path: Path): + """Hold an exclusive lock so parallel workers don't race creating the same venv.""" + lock_path.parent.mkdir(parents=True, exist_ok=True) + with open(lock_path, "w") as handle: + if fcntl is not None: + fcntl.flock(handle, fcntl.LOCK_EX) + try: + yield + finally: + if fcntl is not None: + fcntl.flock(handle, fcntl.LOCK_UN) + NAMESPACE_0_6_DEPENDENCY = "lance-namespace<0.7" NAMESPACE_0_7_DEPENDENCY = "lance-namespace>=0.7.2,<0.8" @@ -31,6 +55,47 @@ def _lance_namespace_dependency(pylance_version: str) -> str: return NAMESPACE_0_6_DEPENDENCY +def 
_is_release_version(ref: str) -> bool: + """A ref is treated as a published release (install a wheel) if it parses as a + version; anything else (commit sha, branch, tag) is built from source.""" + try: + Version(ref) + return True + except InvalidVersion: + return False + + +def _prebuilt_wheel_for(ref: str) -> Optional[str]: + """A prebuilt wheel to install for `ref` instead of building it from source. + + When CI has already built a ref (e.g. the PR head, built once by the Python build + job), COMPAT_PREBUILT_REF names that ref and COMPAT_PREBUILT_WHEEL points at the + wheel (a path or glob). Lets the PR workflow reuse that wheel rather than rebuilding + the reader. Returns None when no prebuilt wheel applies to `ref`. + """ + if os.environ.get("COMPAT_PREBUILT_REF") != ref: + return None + pattern = os.environ.get("COMPAT_PREBUILT_WHEEL") + if not pattern: + return None + matches = sorted(glob.glob(pattern)) + if not matches: + raise FileNotFoundError( + f"COMPAT_PREBUILT_WHEEL={pattern!r} matched no wheel for ref {ref!r}" + ) + return matches[0] + + +def _repo_root() -> Path: + """Lance source checkout holding this test file (used to build refs from source).""" + # .../python/python/tests/compat/venv_manager.py -> repo root is parents[4] + return Path(__file__).resolve().parents[4] + + +def _safe(ref: str) -> str: + return re.sub(r"[^A-Za-z0-9._-]", "_", ref) + + class VenvExecutor: """Manages a virtual environment with a specific Lance version.""" @@ -52,6 +117,8 @@ def __init__(self, version: str, venv_path: Path, persistent: bool = False): self.persistent = persistent self._created = False self._subprocess: Optional[subprocess.Popen] = None + self._stderr_path: Optional[Path] = None + self._stderr_file = None @property def python_path(self) -> Path: @@ -59,54 +126,61 @@ def python_path(self) -> Path: return self.venv_path / "Scripts" / "python.exe" return self.venv_path / "bin" / "python" - def _validate_venv(self) -> bool: - """Check if existing venv is 
valid and has correct Lance version.""" - if not self.venv_path.exists(): - return False + @property + def _marker_path(self) -> Path: + return self.venv_path / ".compat_ref" + def _validate_venv(self) -> bool: + """A cached venv is reusable if it exists and its recorded ref matches. A marker + file is used (not `pip show`) so source-built commit refs also validate.""" if not self.python_path.exists(): return False - - # Check if pylance is installed with correct version try: - result = subprocess.run( - [str(self.python_path), "-m", "pip", "show", "pylance"], - capture_output=True, - text=True, - timeout=5, - ) - if result.returncode != 0: - return False - - # Parse version from output - for line in result.stdout.splitlines(): - if line.startswith("Version:"): - installed_version = line.split(":", 1)[1].strip() - return installed_version == self.version - - except Exception: + return self._marker_path.read_text().strip() == self.version + except OSError: return False - return False - def create(self): """Create the virtual environment and install the specified Lance version.""" if self._created: return - - # Check if persistent venv already exists and is valid if self.persistent and self._validate_venv(): self._created = True return - # Create virtual environment + # Lock so parallel workers don't build the same venv at once; re-check in the + # lock since another worker may have just finished it. + with _venv_lock(self.venv_path.parent / f".lock_{_safe(self.version)}"): + if not self._validate_venv(): + if self.venv_path.exists(): + shutil.rmtree(self.venv_path) # drop any partial build + subprocess.run( + [sys.executable, "-m", "venv", str(self.venv_path)], + check=True, + capture_output=True, + ) + # Prefer a wheel CI already built for this ref; else a published + # release installs its wheel; else build the ref (commit/branch/tag) + # from source -- so two arbitrary refs can be compared and only the + # ones without a wheel pay a build. 
+ prebuilt = _prebuilt_wheel_for(self.version) + if prebuilt is not None: + self._install_wheel(prebuilt) + elif _is_release_version(self.version): + self._install_release_wheel() + else: + self._build_from_source() + self._marker_path.write_text(self.version) + self._created = True + + def _install_wheel(self, wheel: str): subprocess.run( - [sys.executable, "-m", "venv", str(self.venv_path)], + [str(self.python_path), "-m", "pip", "install", "--quiet", wheel, "pytest"], check=True, capture_output=True, ) - # Install specific pylance version and pytest + def _install_release_wheel(self): subprocess.run( [ str(self.python_path), @@ -131,7 +205,55 @@ def create(self): capture_output=True, ) - self._created = True + def _build_from_source(self): + """Build a wheel for an arbitrary git ref via a worktree + maturin, then install + it. The worktree/build is cached by ref so it is paid at most once.""" + py = str(self.python_path) + src = self.venv_path.parent / f"src_{_safe(self.version)}" + if not src.exists(): + subprocess.run( + [ + "git", + "-C", + str(_repo_root()), + "worktree", + "add", + "--detach", + str(src), + self.version, + ], + check=True, + capture_output=True, + ) + subprocess.run( + [py, "-m", "pip", "install", "--quiet", "maturin", "pytest", "pyarrow"], + check=True, + capture_output=True, + ) + wheels = src / "target" / "compat-wheels" + subprocess.run( + [ + py, + "-m", + "maturin", + "build", + "--release", + "--interpreter", + py, + "-m", + str(src / "python" / "Cargo.toml"), + "--out", + str(wheels), + ], + check=True, + capture_output=True, + ) + wheel = next(wheels.glob("pylance-*.whl")) + subprocess.run( + [py, "-m", "pip", "install", "--quiet", str(wheel)], + check=True, + capture_output=True, + ) def _ensure_subprocess(self): """Ensure the persistent subprocess is running.""" @@ -147,14 +269,35 @@ def _ensure_subprocess(self): tests_dir = Path(__file__).parent.parent env["PYTHONPATH"] = str(tests_dir) + # Capture stderr to a file so a Rust 
panic (which crashes the runner) can be + # surfaced in the error instead of an opaque "broken pipe". + self._stderr_path = self.venv_path / ".runner_stderr.log" + self._stderr_file = open(self._stderr_path, "w") self._subprocess = subprocess.Popen( [str(self.python_path), "-u", str(runner_script)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=None, # Inherit stderr to see timing messages + stderr=self._stderr_file, env=env, ) + def _last_panic(self) -> str: + """Pull the panic message from the runner's captured stderr, if any.""" + try: + text = self._stderr_path.read_text() + except (OSError, AttributeError): + return "" + lines = text.splitlines() + for i, line in enumerate(lines): + if "panicked at" in line: + # Compact the long path to just "builder.rs:962:57" + loc = line.split("panicked at", 1)[1].strip().rstrip(":") + loc = loc.rsplit("/", 1)[-1] + msg = lines[i + 1].strip() if i + 1 < len(lines) else "" + return f"panic at {loc}: {msg}" if msg else f"panic at {loc}" + tail = [line.strip() for line in lines if line.strip()] + return tail[-1] if tail else "" + def _send_message(self, obj: Any): """Send a length-prefixed pickled message to subprocess.""" data = pickle.dumps(obj) @@ -165,18 +308,19 @@ def _send_message(self, obj: Any): def _receive_message(self) -> Any: """Receive a length-prefixed pickled message from subprocess.""" - # Read 4-byte length header + # Short reads mean the subprocess closed stdout (usually a crash); raise + # EOFError so the caller can surface the panic from captured stderr. 
length_bytes = self._subprocess.stdout.read(4) if len(length_bytes) < 4: - raise RuntimeError("Failed to read message length from subprocess") + raise EOFError("subprocess closed stdout before sending a message length") length = struct.unpack(">I", length_bytes)[0] # Read message data data = self._subprocess.stdout.read(length) if len(data) < length: - raise RuntimeError( - f"Incomplete message: expected {length} bytes, got {len(data)}" + raise EOFError( + f"incomplete message: expected {length} bytes, got {len(data)}" ) return pickle.loads(data) @@ -234,11 +378,15 @@ def execute_method( raise RuntimeError(error_msg) except (BrokenPipeError, EOFError, struct.error) as e: - # Subprocess died or communication failed - raise RuntimeError( - f"Communication with venv subprocess failed (Lance {self.version}):\n" - f"Error: {e}" - ) + # Subprocess died (usually a Rust panic); flush it, then surface that. + if self._subprocess is not None: + try: + self._subprocess.wait(timeout=2) + except Exception: + pass + panic = self._last_panic() + detail = panic or f"subprocess communication failed: {e}" + raise RuntimeError(f"Lance {self.version}: {detail}") def cleanup(self): """Remove the virtual environment directory and terminate subprocess.""" @@ -295,7 +443,7 @@ def get_venv(self, version: str) -> VenvExecutor: Executor for the specified version """ if version not in self.venvs: - venv_path = self.base_path / f"venv_{version}" + venv_path = self.base_path / f"venv_{_safe(version)}" executor = VenvExecutor(version, venv_path, persistent=self.persistent) executor.create() self.venvs[version] = executor diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py index 5a896d21c5d..fc879c9cbaa 100644 --- a/python/python/tests/test_blob.py +++ b/python/python/tests/test_blob.py @@ -45,6 +45,56 @@ def _external_blob_table(blob_path, payload=b"hello"): return pa.table({"blob": lance.blob_array([blob_path.as_uri()])}) +def _add_columns_blob_v2_values(tmp_path): 
+ external_base = tmp_path / "external_base" + external_blob = external_base / "external_blob.bin" + external_blob.parent.mkdir(parents=True, exist_ok=True) + external_blob.write_bytes(b"external") + + payloads = [ + b"inline", + b"p" * (64 * 1024 + 1024), + b"d" * (4 * 1024 * 1024 + 1024), + b"external", + ] + values = [payloads[0], payloads[1], payloads[2], external_blob.as_uri()] + initial_bases = [DatasetBasePath(external_base.as_uri(), name="external", id=1)] + return values, payloads, initial_bases + + +def _assert_blob_v2_add_columns_result(dataset, column, payloads): + desc = dataset.to_table(columns=[column]).column(column).chunk(0) + + assert desc.field("kind").to_pylist() == [0, 1, 2, 3] + assert desc.field("blob_id").to_pylist()[3] == 1 + assert desc.field("blob_uri").to_pylist()[3] == "external_blob.bin" + + blobs = dataset.take_blobs(column, indices=range(len(payloads))) + assert [blob.readall() for blob in blobs] == payloads + + +def _dataset_file_set(dataset_path): + return { + path.relative_to(dataset_path) + for path in dataset_path.rglob("*") + if path.is_file() + } + + +def _write_two_fragment_blob_v2_seed_dataset(tmp_path, name): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + dataset_path = tmp_path / name + ds = lance.write_dataset( + pa.table({"id": range(8)}), + dataset_path, + data_storage_version="2.2", + initial_bases=initial_bases, + max_rows_per_file=4, + max_rows_per_group=4, + ) + return ds, dataset_path, values, payloads + + def _out_of_order_blob_selection(dataset_with_blobs, selection_kind): addresses = _blob_row_addresses(dataset_with_blobs) expected = [(addresses[4], b"quux"), (addresses[0], b"foo")] @@ -533,6 +583,160 @@ def test_blob_extension_write_inline(tmp_path): assert f.read() == b"foo" +def test_blob_field_threshold_metadata(): + field = lance.blob_field( + "blob", + inline_size_threshold=16 * 1024, + dedicated_size_threshold=2 * 1024 * 1024, + ) + + assert 
field.metadata[b"lance-encoding:blob-inline-size-threshold"] == b"16384" + assert field.metadata[b"lance-encoding:blob-dedicated-size-threshold"] == b"2097152" + + +@pytest.mark.parametrize( + ("kwargs", "error", "message"), + [ + pytest.param( + {"inline_size_threshold": -1}, + ValueError, + "inline_size_threshold must be non-negative", + id="negative_inline", + ), + pytest.param( + {"dedicated_size_threshold": 0}, + ValueError, + "dedicated_size_threshold must be positive", + id="zero_dedicated", + ), + pytest.param( + {"dedicated_size_threshold": -1}, + ValueError, + "dedicated_size_threshold must be positive", + id="negative_dedicated", + ), + pytest.param( + {"inline_size_threshold": True}, + TypeError, + "inline_size_threshold must be an int", + id="bool_inline", + ), + pytest.param( + {"dedicated_size_threshold": True}, + TypeError, + "dedicated_size_threshold must be an int", + id="bool_dedicated", + ), + pytest.param( + {"inline_size_threshold": 1.5}, + TypeError, + "inline_size_threshold must be an int", + id="float_inline", + ), + pytest.param( + {"inline_size_threshold": 2**100}, + OverflowError, + "inline_size_threshold must fit in a Rust usize", + id="overflow_inline", + ), + pytest.param( + {"dedicated_size_threshold": 2**100}, + OverflowError, + "dedicated_size_threshold must fit in a Rust usize", + id="overflow_dedicated", + ), + ], +) +def test_blob_field_rejects_invalid_thresholds(kwargs, error, message): + with pytest.raises(error, match=message): + lance.blob_field("blob", **kwargs) + + +def test_blob_extension_inline_threshold_per_column(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field("inline_blob", inline_size_threshold=4096), + lance.blob_field("packed_blob", inline_size_threshold=1024), + ] + ) + table = pa.table( + { + "inline_blob": lance.blob_array([payload]), + "packed_blob": lance.blob_array([payload]), + }, + schema=schema, + ) + ds = lance.write_dataset( + table, + tmp_path / 
"test_ds_v2_inline_threshold_per_column", + data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["inline_blob", "packed_blob"]) + assert desc.column("inline_blob").chunk(0).field("kind").to_pylist() == [0] + assert desc.column("packed_blob").chunk(0).field("kind").to_pylist() == [1] + + +def test_blob_extension_threshold_metadata_persists_after_reopen(tmp_path): + dataset_path = tmp_path / "test_ds_v2_threshold_metadata_persists" + schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)]) + table = pa.table({"blob": lance.blob_array([b"x"])}, schema=schema) + + lance.write_dataset(table, dataset_path, data_storage_version="2.2") + reopened = lance.dataset(dataset_path) + + assert ( + reopened.schema.field("blob").metadata[ + b"lance-encoding:blob-inline-size-threshold" + ] + == b"1024" + ) + + +def test_blob_extension_append_rejects_explicit_threshold_mismatch(tmp_path): + dataset_path = tmp_path / "test_ds_v2_append_threshold_mismatch" + initial_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=4096)]) + initial = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=initial_schema, + ) + lance.write_dataset(initial, dataset_path, data_storage_version="2.2") + + append_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)]) + append = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=append_schema, + ) + + with pytest.raises( + OSError, match="Cannot append data with blob threshold metadata" + ): + lance.write_dataset(append, dataset_path, mode="append") + + +def test_blob_extension_dedicated_threshold_precedes_inline_threshold(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field( + "blob", + inline_size_threshold=4096, + dedicated_size_threshold=1024, + ) + ] + ) + table = pa.table({"blob": lance.blob_array([payload])}, schema=schema) + ds = lance.write_dataset( + table, + tmp_path / "test_ds_v2_dedicated_precedes_inline", + 
data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["blob"]).column("blob").chunk(0) + assert desc.field("kind").to_pylist() == [2] + + def test_blob_extension_write_external(tmp_path): blob_path = tmp_path / "external_blob.bin" blob_path.write_bytes(b"hello") @@ -608,6 +812,137 @@ def test_blob_extension_write_external_ingest_rejects_reference_only_options(tmp ) +def test_blob_extension_add_columns_record_batch_reader_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_reader_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + + ds.add_columns(pa.table({"blob": lance.blob_array(values)}).to_reader()) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +@pytest.mark.parametrize( + "failure_mode", + [ + pytest.param("raises_after_first_fragment", id="reader_raises_mid_stream"), + pytest.param("wrong_schema", id="reader_yields_wrong_schema"), + pytest.param("too_many_rows", id="reader_produces_too_many_rows"), + ], +) +def test_blob_extension_add_columns_record_batch_reader_failure_cleans_files( + tmp_path, + failure_mode, +): + ds, dataset_path, values, payloads = _write_two_fragment_blob_v2_seed_dataset( + tmp_path, + f"test_add_columns_reader_blob_v2_fail_cleanup_{failure_mode}", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + + schema = pa.schema([lance.blob_field("blob")]) + first_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + second_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + + if failure_mode == "raises_after_first_fragment": + match = "reader failed after first fragment" + + def failing_reader(): + yield first_fragment_batch + raise RuntimeError("reader failed after first fragment") + + elif failure_mode == "wrong_schema": + match = "field 
names" + + def failing_reader(): + yield first_fragment_batch + yield pa.record_batch([pa.array(range(4))], ["not_blob"]) + + else: + match = "Stream produced more values than expected for dataset" + + def failing_reader(): + yield first_fragment_batch + yield second_fragment_batch + yield pa.record_batch([lance.blob_array([payloads[0]])], schema=schema) + + with pytest.raises(OSError, match=match): + ds.add_columns(failing_reader(), reader_schema=schema) + + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_failure_cleans_files(tmp_path): + ds, dataset_path, values, _ = _write_two_fragment_blob_v2_seed_dataset( + tmp_path, + "test_add_columns_udf_blob_v2_fail_cleanup", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + call_count = 0 + + @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def fail_on_second_fragment(batch): + nonlocal call_count + call_count += 1 + if call_count == 2: + raise RuntimeError("udf failed after first fragment") + blob_values = [values[row.as_py() % len(values)] for row in batch["id"]] + return pa.record_batch( + [lance.blob_array(blob_values)], + ["blob"], + ) + + with pytest.raises(OSError, match="udf failed after first fragment"): + ds.add_columns(fail_on_second_fragment, read_columns=["id"], batch_size=4) + + assert call_count == 2 + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_udf_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + + 
@lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def make_blob_column(batch): + return pa.record_batch( + [lance.blob_array([values[row.as_py()] for row in batch["id"]])], + ["blob"], + ) + + ds.add_columns(make_blob_column, read_columns=["id"]) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +def test_blob_extension_add_columns_all_nulls_blob_v2(tmp_path): + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_all_nulls_blob_v2", + data_storage_version="2.2", + ) + + ds.add_columns(lance.blob_field("blob")) + + assert ds.to_table(columns=["blob"]).column("blob").to_pylist() == [None] * 4 + assert ds.take_blobs("blob", indices=range(4)) == [] + + def test_blob_extension_write_fragments_external_denied_by_default(tmp_path): blob_path = tmp_path / "external_blob.bin" @@ -1125,6 +1460,38 @@ def test_read_blobs_resolves_nested_field_path(dataset_with_nested_blobs): assert [data for _, data in results] == [b"foo", b"baz"] +def test_write_nested_blob_v2_and_take_by_field_path(tmp_path): + packed = b"x" * (70 * 1024) + blob_field = lance.blob_field("blob") + info_fields = [pa.field("name", pa.string()), blob_field] + info_type = pa.struct(info_fields) + info_array = pa.StructArray.from_arrays( + [pa.array(["a", "b", "c"]), lance.blob_array([b"foo", packed, None])], + fields=info_fields, + ) + table = pa.table( + [info_array], + schema=pa.schema([pa.field("info", info_type)]), + ) + + dataset = lance.write_dataset( + table, + tmp_path / "nested_blob_v2", + data_storage_version="2.2", + ) + + desc = dataset.to_table(columns=["info.blob"]).column("info.blob").chunk(0) + assert desc.field("kind").to_pylist()[:2] == [0, 1] + + blobs = dataset.take_blobs("info.blob", indices=[0, 1]) + with blobs[0] as f: + assert f.read() == b"foo" + with blobs[1] as f: + assert f.read() == packed + + assert dataset.take_blobs("info.blob", indices=[2]) == [] + + def 
test_to_pandas_returns_blob_files_for_projected_nested_fields( dataset_with_nested_blobs, ): diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index f7b5962b523..d402ba5bcbb 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -349,7 +349,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): indices = special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["user-id"] + assert indices[0].field_names == ["`user-id`"] assert indices[0].name == "user-id_idx" # Query using the indexed column (requires backticks in filter) @@ -462,7 +462,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.describe_indices() assert len(indices) == 1 assert indices[0].name == "MetaData.userId_idx" - assert indices[0].field_names == ["userId"] + assert indices[0].field_names == ["MetaData.userId"] # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -512,7 +512,7 @@ def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_datase assert len(indices) == 1 # Should store with correct case from schema assert indices[0].name == "MetaData.userId_idx" - assert indices[0].field_names == ["userId"] + assert indices[0].field_names == ["MetaData.userId"] # Query should also work with correct case result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -576,7 +576,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase indices = nested_special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["user-id"] + assert indices[0].field_names == ["`meta-data`.`user-id`"] assert indices[0].name == "meta-data.user-id_idx" # Query using the indexed column (backticks required in filter) @@ -600,7 +600,7 @@ def 
test_scalar_index_on_top_level_special_chars(self, nested_special_char_datas indices = nested_special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["row-id"] + assert indices[0].field_names == ["`row-id`"] result = nested_special_char_dataset.to_table(filter="`row-id` = 50") assert result.num_rows == 1 diff --git a/python/python/tests/test_count_pushdown.py b/python/python/tests/test_count_pushdown.py new file mode 100644 index 00000000000..896bd629fc9 --- /dev/null +++ b/python/python/tests/test_count_pushdown.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""End-to-end tests for count-from-mask pushdown. + +The optimizer rule under test (`CountPushdown`) rewrites +``SELECT COUNT(*) ... WHERE indexed_col v`` into +``AggregateExec(Final) → CountFromMaskExec → ScalarIndexExec`` when the +index covers every dataset fragment, or splits into a Union of a pushdown +branch over the indexed fragments and a scan branch over the rest when +coverage is partial. This is category 1 (count-from-mask) of the four +aggregate-acceleration categories; the other three (mask-to-answer, +zone-aware, dimension-keyed) are not implemented yet. + +Each test exercises a different state of the dataset (clean, with deletions, +with updates that introduce unindexed fragments, with a fully-deleted indexed +fragment) and asserts: + + 1. The returned count matches the ground truth (correctness), and + 2. The plan routes through ``CountFromMaskExec`` (the rule fired). + +For the cases where the index covers the whole dataset, the tests also assert +no ``LanceRead`` is present in the plan — proof that the count is being +answered from index metadata, not by scanning column data. The happy-path +test additionally re-runs the query and asserts the second call performs no +I/O. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import lance +import pyarrow as pa + +if TYPE_CHECKING: + from pathlib import Path + + +# -------------------------------------------------------------------------- +# Helpers +# -------------------------------------------------------------------------- + +# 4 fragments × 25 rows = 100 rows; values 0..99 in `x`. +NUM_FRAGMENTS = 4 +ROWS_PER_FRAGMENT = 25 +NUM_ROWS = NUM_FRAGMENTS * ROWS_PER_FRAGMENT # 100 + + +def _make_dataset(tmp_path: Path) -> lance.LanceDataset: + """Build a 4-fragment dataset with a BTREE index on `x`.""" + table = pa.table({"x": pa.array(range(NUM_ROWS), pa.int64())}) + dataset = lance.write_dataset( + table, + tmp_path / "ds", + max_rows_per_file=ROWS_PER_FRAGMENT, + ) + assert len(dataset.get_fragments()) == NUM_FRAGMENTS + dataset.create_scalar_index("x", "BTREE") + return dataset + + +def _filtered_count_plan(dataset: lance.LanceDataset, filter: str) -> str: + """Return the ``analyze_plan(count_rows=True)`` output for a filtered + ``COUNT(*)`` — the same plan ``count_rows(filter=…)`` actually executes.""" + return dataset.scanner(columns=[], with_row_id=True, filter=filter).analyze_plan( + count_rows=True + ) + + +def _assert_pushdown_fired(plan: str) -> None: + assert "CountFromMask" in plan, f"expected CountFromMaskExec in plan, got:\n{plan}" + + +def _assert_no_column_scan(plan: str) -> None: + """Stricter: no LanceRead anywhere. Only applies when the index covers + every dataset fragment (no partial-coverage split branch).""" + assert "LanceRead" not in plan, ( + f"unexpected LanceRead in plan — column data was scanned:\n{plan}" + ) + + +# -------------------------------------------------------------------------- +# Tests +# -------------------------------------------------------------------------- + + +def test_filtered_count_with_scalar_index(tmp_path: Path): + """Happy path: filtered count on an indexed column, run twice. 
+ + The second call must perform zero I/O — proof the rule routed the count + through the index/deletion-mask metadata both times and the second call + re-used the cache. The check uses ``dataset.io_stats_incremental()`` + rather than parsing the plan's ``bytes_read=…`` so we get a direct + accounting of every object-store read the dataset performed during the + second call, not just what the plan happens to surface. + """ + dataset = _make_dataset(tmp_path) + filter = "x < 50" + expected = 50 + + # Verify the rule fires for this shape. + _assert_pushdown_fired(_filtered_count_plan(dataset, filter)) + _assert_no_column_scan(_filtered_count_plan(dataset, filter)) + + # First call warms the index + deletion-mask caches. + assert dataset.count_rows(filter=filter) == expected + # Reset counters so the next snapshot only reflects the second call. + dataset.io_stats_incremental() + + # Second call: must do zero I/O. + assert dataset.count_rows(filter=filter) == expected + stats = dataset.io_stats_incremental() + assert stats.read_iops == 0, f"expected 0 read_iops, got {stats.read_iops}" + assert stats.read_bytes == 0, f"expected 0 read_bytes, got {stats.read_bytes}" + + +def test_filtered_count_with_deleted_rows(tmp_path: Path): + """Some matching rows are deleted — the count must reflect the deletions. + + Deletions don't change fragment coverage, so the index still covers every + dataset fragment and the rule emits a single pushdown branch (no scan). + """ + dataset = _make_dataset(tmp_path) + # Delete three rows that match the filter (x < 50). + dataset.delete("x = 10 OR x = 20 OR x = 30") + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + _assert_no_column_scan(plan) + assert dataset.count_rows(filter="x < 50") == 50 - 3 + + +def test_filtered_count_with_updated_rows(tmp_path: Path): + """Updates move rows in/out of the filter set. + + Before: x < 50 ⇒ 50 rows match (values 0..49). 
+ After: + - x = 5 → x = 100 (one row leaves the matched set) + - x = 7 → x = 101 (another row leaves) + - x = 60 → x = 8 (a row joins the matched set) + - x = 70 → x = 9 (another joins) + + Net change: −2 + 2 = 0, so the final count is still 50, but the + underlying row identities have shifted. Each update is materialized as + a delete + insert into a new fragment in Lance — the new fragments are + not in the index's coverage, so the optimizer rule emits a split plan: + pushdown for the originally-indexed fragments, plus a scan branch for + the rewritten fragments. The final count must still be correct. + """ + dataset = _make_dataset(tmp_path) + dataset.update({"x": "100"}, where="x = 5") + dataset.update({"x": "101"}, where="x = 7") + dataset.update({"x": "8"}, where="x = 60") + dataset.update({"x": "9"}, where="x = 70") + + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + # 50 originally matching − 2 that left + 2 that joined = 50. + assert dataset.count_rows(filter="x < 50") == 50 + + +def test_filtered_count_with_whole_fragment_deleted(tmp_path: Path): + """Delete every row in one indexed fragment. + + Fragment 0 covers x ∈ [0, 25). Deleting all of those rows removes 25 + matches of `x < 50`, dropping the count from 50 to 25. + + Lance retires the now-empty fragment, so the dataset has 3 fragments + while the index still claims 4 — the index is a strict *superset* of + the dataset, which is safe (the extra index entries simply don't + apply). The rule emits a single pushdown branch (no scan needed). 
+ """ + dataset = _make_dataset(tmp_path) + dataset.delete("x < 25") + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + _assert_no_column_scan(plan) + assert dataset.count_rows(filter="x < 50") == 25 diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index b5c81669fa1..45866f3c4da 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path): assert dataset.to_table() == table +@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"]) +def test_write_zero_dimension_fixed_size_list( + tmp_path: Path, data_storage_version: str +): + # Zero-dimension fixed-size lists must be rejected with a clean error + # instead of a divide-by-zero panic (#5102) + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("vec", pa.list_(pa.float32(), 0)), + ] + ) + table = pa.table({"id": [1], "vec": [[]]}, schema=schema) + with pytest.raises(OSError, match="dimension must be a positive integer"): + lance.write_dataset( + table, tmp_path / "ds.lance", data_storage_version=data_storage_version + ) + + def test_dataset_overwrite(tmp_path: Path): table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}]) base_dir = tmp_path / "test" @@ -424,16 +443,27 @@ def test_enable_stable_row_ids(tmp_path: Path): assert table_after["_rowaddr"][3].as_py() == (2 << 32) + 3 -def test_has_stable_row_ids_property(tmp_path: Path): - table = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) +@pytest.mark.parametrize("enable_stable_row_ids", [True, False]) +@pytest.mark.parametrize( + "rows", + [[{"a": 1}, {"a": 2}], []], + ids=["non_empty", "empty"], +) +def test_has_stable_row_ids_property(tmp_path: Path, enable_stable_row_ids: bool, rows): + schema = pa.schema([pa.field("a", pa.int64())]) + table = pa.Table.from_pylist(rows, schema=schema) - stable_path = tmp_path / "stable" - lance.write_dataset(table, stable_path, 
enable_stable_row_ids=True) - assert lance.dataset(stable_path).has_stable_row_ids is True + path = tmp_path / f"stable_row_ids_{enable_stable_row_ids}_{len(rows)}" + lance.write_dataset( + table, + path, + enable_stable_row_ids=enable_stable_row_ids, + ) + ds = lance.dataset(path) - non_stable_path = tmp_path / "non_stable" - lance.write_dataset(table, non_stable_path, enable_stable_row_ids=False) - assert lance.dataset(non_stable_path).has_stable_row_ids is False + assert ds.count_rows() == len(rows) + assert len(ds.get_fragments()) == (0 if len(rows) == 0 else 1) + assert ds.has_stable_row_ids is enable_stable_row_ids def _list_manifests(versions_dir): @@ -1742,6 +1772,7 @@ def test_commit_batch_append(): result = lance.LanceDataset.commit_batch(dataset, [txn2, txn3]) dataset = result["dataset"] assert dataset.version == 2 + assert dataset.checkout_version(1).version == 1 assert len(dataset.get_fragments()) == 3 assert dataset.to_table() == pa.concat_tables([data1, data2, data3]) merged_txn = result["merged"] @@ -5538,6 +5569,8 @@ def test_branches(tmp_path: Path): branch1 = ds_main.create_branch("branch1") ds_main.branches.replace_metadata("branch1", {"description": "branch one"}) assert branch1.version == 1 + # The dataset returned by create_branch must be fully constructed + assert branch1.checkout_version(("main", None)).version == 1 branch1_append = pa.Table.from_pydict({"a": [7, 8], "b": [9, 10]}) branch1 = lance.write_dataset(branch1_append, branch1, mode="append") assert branch1.version == 2 @@ -5657,4 +5690,33 @@ def test_default_scan_options_nearest(tmp_path: Path) -> None: distances = result["_distance"].to_pylist() assert distances == sorted(distances) - assert "id" in result.column_names + +def test_tracked_files(tmp_path): + table = pa.table({"x": [1, 2, 3]}) + ds = lance.write_dataset(table, tmp_path / "ds") + ds.delete("x = 2") # adds a deletion file + + reader = ds.tracked_files() + assert isinstance(reader, pa.RecordBatchReader) + + result = 
reader.read_all() + assert result.schema.field("version").type == pa.int64() + assert result.num_rows >= 2 # at least manifest + data file + + types = set(result.column("type").to_pylist()) + assert "manifest" in types + assert "data file" in types + assert "deletion file" in types + + +def test_all_files(tmp_path): + table = pa.table({"x": [1, 2, 3]}) + ds = lance.write_dataset(table, tmp_path / "ds") + + reader = ds.all_files() + assert isinstance(reader, pa.RecordBatchReader) + + result = reader.read_all() + assert result.schema.field("size_bytes").type == pa.int64() + assert result.num_rows >= 2 # at least manifest + data file + assert all(s > 0 for s in result.column("size_bytes").to_pylist()) diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 7f6595f2ecc..02cf64541d6 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -25,7 +25,7 @@ def make_ds(num_rows: int, rows_per_frag: int, tmpdir: pathlib.Path, dtype: str): vectors = np.random.randn(num_rows, DIMENSION).astype(dtype) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) table = pa.Table.from_arrays([vectors], names=["vectors"]) uri = str(tmpdir / "dataset") @@ -53,7 +53,7 @@ def small_rand_dataset(tmpdir, request): @pytest.fixture def mostly_null_dataset(tmpdir, request): vectors = np.random.randn(NUM_ROWS, DIMENSION).astype(np.float32) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) vectors = vectors.to_pylist() vectors = [vec if i % 10 == 0 else None for i, vec in enumerate(vectors)] @@ -219,7 +219,7 @@ def test_ivf_centroids_fragment_ids(tmpdir): ], axis=0, ) - vectors.shape = -1 + vectors = vectors.reshape(-1) table = pa.Table.from_arrays( [pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)], names=["vectors"] ) diff --git a/python/python/tests/test_mem_wal.py 
b/python/python/tests/test_mem_wal.py index b8c859cb637..c21e88b2416 100644 --- a/python/python/tests/test_mem_wal.py +++ b/python/python/tests/test_mem_wal.py @@ -60,9 +60,16 @@ def _write_flushed_gen(base_path: str, shard_id: str, gen_folder: str, data: pa. The collector resolves flushed generation paths as: {base_dataset_path}/_mem_wal/{shard_id}/{gen_folder} + + Production flush also writes a primary-key dedup sidecar (`_pk_index/`) that + the LSM scanner opens to dedup across generations; stage it here too so the + flushed generation faithfully matches what flush produces. """ + from lance.lance import _write_pk_sidecar + gen_path = os.path.join(base_path, "_mem_wal", shard_id, gen_folder) lance.write_dataset(data, gen_path, schema=_LOOKUP_SCHEMA) + _write_pk_sidecar(gen_path, data, ["id"]) def test_point_lookup_with_memtables(tmp_path): diff --git a/python/python/tests/test_namespace_dir.py b/python/python/tests/test_namespace_dir.py index 1991b82946e..fa1bc93b422 100644 --- a/python/python/tests/test_namespace_dir.py +++ b/python/python/tests/test_namespace_dir.py @@ -29,6 +29,8 @@ CountTableRowsRequest, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -37,6 +39,8 @@ CreateTableVersionResponse, DeclareTableRequest, DeclareTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, @@ -54,6 +58,8 @@ InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -71,6 +77,8 @@ InvalidInputError, NamespaceNotEmptyError, NamespaceNotFoundError, + TableBranchAlreadyExistsError, + TableBranchNotFoundError, TableNotFoundError, ) @@ -151,6 +159,21 @@ def create_table_version( ) -> CreateTableVersionResponse: return 
self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + return self._inner.create_table_branch(request) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + return self._inner.list_table_branches(request) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + return self._inner.delete_table_branch(request) + def create_table_index( self, request: CreateTableIndexRequest ) -> CreateTableIndexResponse: @@ -564,6 +587,110 @@ def test_register_table_rejects_path_traversal(self, temp_ns_client): assert "Path traversal is not allowed" in str(exc_info.value) +class TestTableBranchOperations: + """Branch CRUD through the python bindings - mirrors the Rust branch + CRUD tests.""" + + def test_branch_crud_round_trip(self, temp_ns_client): + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "branched_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" in listed.branches + assert listed.branches["dev"].parent_version == 1 + + # Duplicate creation and deleting a missing branch surface the typed + # branch errors (codes 23 and 22), not InternalError. 
+ temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + with pytest.raises(TableBranchAlreadyExistsError): + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" not in listed.branches + with pytest.raises(TableBranchNotFoundError): + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + + def test_create_branch_from_other_branch(self, temp_ns_client): + """Forking from a non-main source branch records the right parent.""" + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "fork_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + ) + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="child", from_branch="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert listed.branches["child"].parent_branch == "dev" + + +class _ForeignCodeError(Exception): + """Not a LanceNamespaceError, but carries the same integer code as + TABLE_NOT_FOUND.""" + + code = 4 + + +class _RaisingNamespace(LanceNamespace): + """A namespace whose describe_table raises the configured exception.""" + + def __init__(self, exc: Exception): + self._exc = exc + + def namespace_id(self) -> str: + return "raising" + + def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse: + raise self._exc + + +class TestPythonNamespaceErrorMapping: + """The Rust adapter must trust the `code` attribute only on the + lance_namespace 
exception hierarchy.""" + + def test_namespace_error_identity_preserved(self): + ns = _RaisingNamespace(TableNotFoundError("no such table")) + with pytest.raises(TableNotFoundError, match="no such table"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + # Branch error codes (22/23) survive the round trip too. + ns = _RaisingNamespace(TableBranchNotFoundError("no such branch")) + with pytest.raises(TableBranchNotFoundError, match="no such branch"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + def test_foreign_code_attribute_not_trusted(self): + # The foreign exception must surface as itself, not be reinterpreted + # as a namespace error via its `code` attribute. + ns = _RaisingNamespace(_ForeignCodeError("boom")) + with pytest.raises(_ForeignCodeError, match="boom"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + class TestChildNamespaceOperations: """Tests for operations in child namespaces - mirrors Rust tests.""" @@ -979,6 +1106,49 @@ def test_external_manifest_store_invokes_namespace_apis(use_custom): ), "describe_table_version should be called once when opening version 1" +def test_dataset_namespace_open_does_not_pass_version_to_describe_table(): + """Dataset versions are applied to dataset open, not namespace describe_table.""" + + class VersionRejectingNamespace(CustomNamespace): + def __init__(self, inner: lance.namespace.DirectoryNamespace): + super().__init__(inner) + self.describe_versions = [] + + def describe_table( + self, request: DescribeTableRequest + ) -> DescribeTableResponse: + self.describe_versions.append(request.version) + assert request.version is None + return super().describe_table(request) + + with tempfile.TemporaryDirectory() as tmpdir: + inner_ns_client = lance.namespace.DirectoryNamespace(root=tmpdir) + ns_client = VersionRejectingNamespace(inner_ns_client) + table_id = ["test_table"] + + table1 = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) + ds = lance.write_dataset( + table1, namespace_client=ns_client, 
table_id=table_id, mode="create" + ) + assert ds.count_rows() == 2 + assert ds.version == 1 + + table2 = pa.Table.from_pylist([{"a": 3}]) + ds = lance.write_dataset( + table2, namespace_client=ns_client, table_id=table_id, mode="append" + ) + assert ds.count_rows() == 3 + assert ds.version == 2 + + version_one = lance.dataset( + namespace_client=ns_client, table_id=table_id, version=1 + ) + assert version_one.count_rows() == 2 + assert version_one.version == 1 + assert ns_client.describe_versions + assert all(version is None for version in ns_client.describe_versions) + + @pytest.mark.skipif( sys.platform == "win32", reason="Windows file locking prevents reliable concurrent filesystem operations", diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 4605b755816..fc08370d247 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -31,6 +31,8 @@ from lance_namespace import ( CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableRequest, CreateTableResponse, CreateTableVersionRequest, @@ -136,6 +138,11 @@ def create_table_version( ) -> CreateTableVersionResponse: return self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + return self._inner.create_table_branch(request) + def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]: return self._inner.retrieve_ops_metrics() @@ -199,6 +206,7 @@ def create_tracking_namespace( storage_options: dict, credential_expires_in_seconds: int = 60, use_custom: bool = False, + managed_versioning: bool = False, ): """Create a DirectoryNamespace with ops metrics and credential vending enabled. @@ -212,6 +220,9 @@ def create_tracking_namespace( storage_options: Storage options to pass through (credentials, endpoint, etc.) 
credential_expires_in_seconds: Interval in seconds for credential expiration use_custom: If True, wrap in CustomNamespace for testing custom implementations + managed_versioning: If True, enable the manifest catalog so table versions + are tracked by the namespace and commits route through + create_table_version Returns: Tuple of (namespace_client, inner_namespace_client) where inner is always @@ -238,6 +249,10 @@ def create_tracking_namespace( dir_props["vend_input_storage_options_refresh_interval_millis"] = str( credential_expires_in_seconds * 1000 ) + if managed_versioning: + dir_props["manifest_enabled"] = "true" + dir_props["table_version_tracking_enabled"] = "true" + dir_props["table_version_storage_enabled"] = "true" inner_ns_client = DirectoryNamespace(**dir_props) ns_client = _wrap_if_custom(inner_ns_client, use_custom) @@ -558,6 +573,87 @@ def test_namespace_write_overwrite_mode(s3_bucket: str, use_custom: bool): assert get_describe_call_count(inner_ns_client) == call_count_before_reads +@pytest.mark.integration +@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) +def test_namespace_managed_branches(s3_bucket: str, use_custom: bool): + """Branches on a managed-versioning table over S3. + + Branch commits must route through the catalog (create_table_version) and + leave main's chain untouched. A cross-branch checkout at an overlapping + version number must resolve the requested chain: branch version numbers + continue from the fork point, so the same number exists on both chains + with different data. 
+ """ + storage_options = copy.deepcopy(CONFIG) + + ns_client, inner_ns_client = create_tracking_namespace( + bucket_name=s3_bucket, + storage_options=storage_options, + credential_expires_in_seconds=3600, + use_custom=use_custom, + managed_versioning=True, + ) + + table_name = uuid.uuid4().hex + table_id = ["test_ns", table_name] + + def commit_count() -> int: + return inner_ns_client.retrieve_ops_metrics().get("create_table_version", 0) + + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), + namespace_client=ns_client, + table_id=table_id, + mode="create", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert commit_count() >= 2 + + ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev", from_version=2) + ) + + dev = ds.checkout_version(("dev", None)) + commits_before_branch_append = commit_count() + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + assert commit_count() == commits_before_branch_append + 1 + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + + # Diverge main to the same version number as dev's tip. 
+ ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + fresh = lance.dataset( + namespace_client=ns_client, + table_id=table_id, + storage_options=storage_options, + ) + assert sorted(fresh.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration @pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) def test_namespace_distributed_write(s3_bucket: str, use_custom: bool): diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 801efcbd4f2..049ce2cc3a5 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -324,17 +324,56 @@ def test_defer_index_remap(tmp_path: Path): assert any(idx.name == "__lance_frag_reuse" for idx in indices) +@pytest.mark.parametrize("use_commit_options", [True, False]) +def test_defer_index_remap_via_commit_options(tmp_path: Path, use_commit_options: bool): + """Compaction.commit respects defer_index_remap passed in options. + + When options={"defer_index_remap": True} is supplied to Compaction.commit + the __lance_frag_reuse system index must appear in describe_indices(). + When the option is omitted (default) no such system index is written. 
+ """ + base_dir = tmp_path / f"dataset_commit_opts_{use_commit_options}" + data = pa.table({"i": range(6_000), "val": range(6_000)}) + dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000) + dataset.create_scalar_index("i", "BTREE") + dataset.delete("i < 500") + + plan = Compaction.plan( + dataset, + options=dict(target_rows_per_fragment=2_000, num_threads=1), + ) + rewrites = [task.execute(dataset) for task in plan.tasks] + + if use_commit_options: + Compaction.commit(dataset, rewrites, options={"defer_index_remap": True}) + else: + Compaction.commit(dataset, rewrites) + + dataset = lance.dataset(base_dir) + indices = dataset.describe_indices() + has_frag_reuse = any(idx.name == "__lance_frag_reuse" for idx in indices) + + if use_commit_options: + assert has_frag_reuse, ( + "expected __lance_frag_reuse system index when defer_index_remap=True " + "is passed to Compaction.commit" + ) + else: + assert not has_frag_reuse, ( + "did not expect __lance_frag_reuse system index when options is omitted " + "from Compaction.commit" + ) + + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path): """describe_indices() and list_indices() must agree on the index_type string for every index, including the __lance_frag_reuse system index that defer_index_remap produces. - list_indices() special-cases system indices via infer_system_index_type() - in python/src/dataset.rs. describe_indices() in - rust/lance/src/index.rs::IndexDescriptionImpl::try_new does not, so it - falls through to a plugin lookup that has no entry for - FragmentReuseIndexDetails and reports 'Unknown' instead. + list_indices() is a wrapper over describe_indices(), so the two must stay + in sync. System indices are identified by name via infer_system_index_type() + in rust/lance/src/index.rs::IndexDescriptionImpl::try_new. 
""" base_dir = tmp_path / "dataset" data = pa.table({"i": range(6_000), "val": range(6_000)}) diff --git a/python/python/tests/test_s3_ddb.py b/python/python/tests/test_s3_ddb.py index b9c9e4be6c0..dc9744115e2 100644 --- a/python/python/tests/test_s3_ddb.py +++ b/python/python/tests/test_s3_ddb.py @@ -212,6 +212,58 @@ def writh_dataset_with_start_barrier(): assert lance.dataset(table_dir).count_rows() == expected_version * 2 +@pytest.mark.integration +def test_s3_ddb_branches(s3_bucket: str, ddb_table: str): + """Branches on a table committed through the DynamoDB external manifest + store. + + The DDB store keys version chains by base uri, so each branch chain must + get its own entries via its branch-qualified path. Both chains are given + the same version number with diverged data so a wrong-chain resolution + cannot pass silently. + """ + storage_options = copy.deepcopy(CONFIG) + table_name = uuid.uuid4().hex + table_dir = f"s3+ddb://{s3_bucket}/{table_name}?ddbTableName={ddb_table}" + + # main: v1 (a=1), v2 (a=2) + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), table_dir, storage_options=storage_options + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + # Fork "dev" at v2 and commit on it, then diverge main to the same + # version number. + dev = ds.create_branch("dev", 2) + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + # Cross-branch checkout at the overlapping version number resolves each + # chain's own data. 
+ on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration def test_s3_unsafe(s3_bucket: str): storage_options = copy.deepcopy(CONFIG) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index d173ef807ba..b6e882633f5 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -108,6 +108,23 @@ def _commit_segmented_btree_index(dataset, column, index_name): return dataset.commit_existing_index_segments(index_name, column, segments) +def test_create_scalar_index_rejects_invalid_uuid(tmp_path): + """Invalid UUID strings passed to create_scalar_index and merge_index_metadata + must surface as a Python ValueError at the FFI boundary.""" + data = pa.table({"id": pa.array(range(100), type=pa.int64())}) + dataset = lance.write_dataset(data, tmp_path / "ds") + + with pytest.raises(ValueError, match="Invalid UUID"): + dataset.create_scalar_index( + column="id", + index_type="BTREE", + index_uuid="not-a-uuid", + ) + + with pytest.raises(ValueError, match="Invalid UUID"): + dataset.merge_index_metadata("also-not-a-uuid", index_type="BTREE") + + @pytest.fixture def btree_comparison_datasets(tmp_path): """Setup datasets for B-tree comparison tests""" @@ -147,7 +164,7 @@ def btree_comparison_datasets(tmp_path): } -def test_load_indices(indexed_dataset: lance.LanceDataset): +def test_describe_indices_vector_and_scalar(indexed_dataset: lance.LanceDataset): indices = indexed_dataset.describe_indices() vec_idx = next(idx for idx in indices if "VectorIndex" in idx.type_url) scalar_idx = next(idx for idx in indices if idx.index_type == "BTree") @@ -155,6 +172,154 @@ def test_load_indices(indexed_dataset: lance.LanceDataset): assert scalar_idx is not None +def 
test_list_indices_characterization(indexed_dataset: lance.LanceDataset): + """Lock down the backwards-compatible shape of the deprecated list_indices(). + + list_indices() returns a list of plain dicts (one per index segment), not + Index dataclasses. This characterization test guards the dict keys and + values so the deprecated method stays backwards compatible. + """ + with pytest.warns(DeprecationWarning): + indices = indexed_dataset.list_indices() + + assert len(indices) == 2 + by_name = {idx["name"]: idx for idx in indices} + assert set(by_name) == {"vector_idx", "meta_idx"} + + expected_keys = { + "name", + "type", + "uuid", + "fields", + "version", + "fragment_ids", + "base_id", + } + for idx in indices: + assert set(idx) == expected_keys + assert isinstance(idx["uuid"], str) and len(idx["uuid"]) > 0 + assert isinstance(idx["fields"], list) + assert isinstance(idx["fragment_ids"], set) + assert isinstance(idx["version"], int) + assert idx["type"] != "Unknown" + assert idx["base_id"] is None + + vector_idx = by_name["vector_idx"] + assert vector_idx["type"] == "IVF_PQ" + assert vector_idx["fields"] == ["vector"] + assert vector_idx["fragment_ids"] == {0} + + meta_idx = by_name["meta_idx"] + assert meta_idx["type"] == "BTree" + assert meta_idx["fields"] == ["meta"] + assert meta_idx["fragment_ids"] == {0} + + +def test_list_indices_nested_field_path(tmp_path): + """list_indices() reports nested fields as full dotted paths.""" + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("meta", pa.struct([pa.field("lang", pa.string())])), + ] + ) + data = pa.table( + { + "id": [1, 2, 3], + "meta": [{"lang": "en"}, {"lang": "fr"}, {"lang": "en"}], + }, + schema=schema, + ) + ds = lance.write_dataset(data, tmp_path) + ds.create_scalar_index(column="meta.lang", index_type="BTREE") + + with pytest.warns(DeprecationWarning): + indices = ds.list_indices() + + assert len(indices) == 1 + assert indices[0]["fields"] == ["meta.lang"] + + +def _commit_index(ds, 
index): + """Commit a single raw Index entry via the CreateIndex operation.""" + return lance.LanceDataset.commit( + ds.uri, + lance.LanceOperation.CreateIndex(new_indices=[index], removed_indices=[]), + read_version=ds.version, + ) + + +def test_list_indices_index_without_details(tmp_path): + """An index whose manifest entry has no index details (e.g. committed by an + older writer) is still reported on a best-effort basis: describe_indices() + does not error, and the type is reported as "Unknown".""" + from lance.dataset import Index + + data = pa.table({"id": range(100), "val": range(100)}) + ds = lance.write_dataset(data, tmp_path) + + field_id = ds.schema.get_field_index("id") + fragment_ids = {f.fragment_id for f in ds.get_fragments()} + ds = _commit_index( + ds, + Index( + uuid=str(uuid.uuid4()), + name="legacy_idx", + fields=[field_id], + dataset_version=ds.version, + fragment_ids=fragment_ids, + index_version=0, + ), + ) + + described = ds.describe_indices() + assert len(described) == 1 + assert described[0].name == "legacy_idx" + assert described[0].index_type == "Unknown" + assert described[0].type_url == "" + + with pytest.warns(DeprecationWarning): + listed = ds.list_indices() + assert len(listed) == 1 + assert listed[0]["name"] == "legacy_idx" + assert listed[0]["type"] == "Unknown" + + +def test_list_indices_legacy_vector_index_without_details(tmp_path): + """A legacy vector index predates VectorIndexDetails: it has no index + details but stores a monolithic index file. 
Its type is recognized as + "Vector" from the index file rather than reported as "Unknown".""" + from lance.dataset import Index, IndexFile + + data = pa.table({"id": range(100), "val": range(100)}) + ds = lance.write_dataset(data, tmp_path) + + field_id = ds.schema.get_field_index("id") + fragment_ids = {f.fragment_id for f in ds.get_fragments()} + ds = _commit_index( + ds, + Index( + uuid=str(uuid.uuid4()), + name="legacy_vector_idx", + fields=[field_id], + dataset_version=ds.version, + fragment_ids=fragment_ids, + index_version=0, + # "index.idx" is the legacy monolithic index file name; its presence + # is how a pre-details vector index is recognized. + files=[IndexFile(path="index.idx", size_bytes=0)], + ), + ) + + described = ds.describe_indices() + assert len(described) == 1 + assert described[0].index_type == "Vector" + + with pytest.warns(DeprecationWarning): + listed = ds.list_indices() + assert listed[0]["type"] == "Vector" + + def test_indexed_scalar_scan(indexed_dataset: lance.LanceDataset, data_table: pa.Table): sample_meta = data_table["meta"][50] expected_price = data_table["price"][50] @@ -483,7 +648,10 @@ def make_fts_search(ds): assert "ScalarIndexQuery" in plan assert "MaterializeIndex" not in plan assert "FlatMatchQuery" in plan - assert "LanceScan" in plan + # Flat FTS now reads via FilteredReadExec (prints as `LanceRead`) so the + # BTree on `id` pushes into the unindexed-fragment scan too. 
+ assert "LanceRead" in plan + assert "LanceScan" not in plan assert make_fts_search(ds).to_table().num_rows == 12 # Update vector index but NOT scalar index @@ -703,6 +871,51 @@ def test_fts_custom_stop_words(tmp_path): assert len(results["_rowid"].to_pylist()) == 1 +def test_fts_stop_words_respect_language_for_simple_tokenizer(tmp_path): + data = pa.table({"text": ["the lance data", "的 lance data"]}) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="simple", + stem=False, + ) + + results = ds.to_table(full_text_query="the", with_row_id=True) + assert results.num_rows == 0 + + results = ds.to_table(full_text_query="的", with_row_id=True) + assert results["text"].to_pylist() == ["的 lance data"] + + +def test_fts_icu_stop_words_are_all_or_none(tmp_path): + data = pa.table({"text": ["the 的 lance data", "useful data"]}) + ds = lance.write_dataset(data, tmp_path / "enabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=True, + ) + + assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="lance", with_row_id=True).num_rows == 1 + + ds = lance.write_dataset(data, tmp_path / "disabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=False, + ) + + assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 1 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 1 + + def test_rowid_order(dataset): dataset.create_scalar_index("doc", index_type="INVERTED", with_position=False) results = dataset.scanner( @@ -3212,6 +3425,52 @@ def test_build_distributed_fts_index_basic(tmp_path): assert results.num_rows > 0, "No results found for search term 'frodo'" 
+@pytest.mark.parametrize("index_type", ["INVERTED", "FTS"]) +def test_segment_fts(tmp_path, index_type): + ds = generate_multi_fragment_dataset( + tmp_path, num_fragments=3, rows_per_fragment=100 + ) + + index_name = f"text_{index_type.lower()}_segment_idx" + segments = [ + ds.create_index_uncommitted( + column="text", + index_type=index_type, + name=index_name, + fragment_ids=[fragment.fragment_id], + with_position=False, + remove_stop_words=False, + ) + for fragment in ds.get_fragments() + ] + committed_ds = ds.commit_existing_index_segments(index_name, "text", segments) + + query = MatchQuery("frodo", "text") + results_without_index = committed_ds.scanner( + full_text_query=query, + columns=["id", "text"], + use_scalar_index=False, + ).to_table() + results_with_index = committed_ds.scanner( + full_text_query=query, + columns=["id", "text"], + use_scalar_index=True, + ).to_table() + + compare_fts_results(results_without_index, results_with_index) + assert any( + idx.name == index_name and idx.index_type == "Inverted" + for idx in committed_ds.describe_indices() + ) + assert ( + "FlatMatchQuery" + not in committed_ds.scanner( + full_text_query=query, + use_scalar_index=True, + ).explain_plan() + ) + + def test_compare_fts_results_identical(tmp_path): """ Test compare_fts_results function with identical results. 
@@ -3783,41 +4042,52 @@ def test_distribute_btree_index_build(tmp_path): ) -def _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids): - ds.merge_index_metadata(index_id, index_type="BITMAP") +def test_bitmap_uncommitted_segments_can_be_committed_from_python(tmp_path): + dataset_path = tmp_path / "bitmap_segments.lance" + ds = generate_multi_fragment_bitmap_dataset( + dataset_path, num_fragments=4, rows_per_fragment=40 + ) - from lance.dataset import Index + index_name = "bitmap_segment_idx" + fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] + fragment_groups = [ + fragment_ids[idx : idx + 2] for idx in range(0, len(fragment_ids), 2) + ] + assert len(fragment_groups) >= 2 - field_id = ds.schema.get_field_index("category") - index = Index( - uuid=index_id, - name=index_name, - fields=[field_id], - dataset_version=ds.version, - fragment_ids=set(fragment_ids), - index_version=0, - ) - create_index_op = lance.LanceOperation.CreateIndex( - new_indices=[index], - removed_indices=[], - ) - lance.LanceDataset.commit( - ds.uri, - create_index_op, - read_version=ds.version, - ) - reopened_ds = lance.dataset(ds.uri) + staged_segments = [ + ds.create_index_uncommitted( + column="category", + index_type="BITMAP", + name=index_name, + fragment_ids=fragment_group, + ) + for fragment_group in fragment_groups + ] + + assert len({segment.uuid for segment in staged_segments}) == len(staged_segments) + for segment, fragment_group in zip(staged_segments, fragment_groups): + assert segment.fragment_ids == set(fragment_group) + assert any(file.path == "bitmap_page_lookup.lance" for file in segment.files) + assert all(not file.path.startswith("part_") for file in segment.files) - stats = reopened_ds.stats.index_stats(index_name) - assert stats["index_type"] == "Bitmap" + merged_segment = ds.merge_existing_index_segments(staged_segments) + assert merged_segment.uuid not in {segment.uuid for segment in staged_segments} + assert 
merged_segment.fragment_ids == set(fragment_ids) + assert any(file.path == "bitmap_page_lookup.lance" for file in merged_segment.files) + assert all(not file.path.startswith("part_") for file in merged_segment.files) + + ds = ds.commit_existing_index_segments(index_name, "category", [merged_segment]) + descriptions = {index.name: index for index in ds.describe_indices()} + assert len(descriptions[index_name].segments) == 1 filter_expr = "category = 3" - without_index = reopened_ds.scanner( + without_index = ds.scanner( filter=filter_expr, columns=["id", "category"], use_scalar_index=False, ).to_table() - with_index = reopened_ds.scanner( + with_index = ds.scanner( filter=filter_expr, columns=["id", "category"], use_scalar_index=True, @@ -3826,77 +4096,83 @@ def _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragmen assert with_index.num_rows == without_index.num_rows assert with_index["id"].to_pylist() == without_index["id"].to_pylist() assert set(with_index["category"].to_pylist()) == {3} - - explain = reopened_ds.scanner( - filter=filter_expr, - use_scalar_index=True, - ).explain_plan() - assert "ScalarIndexQuery" in explain - - empty_without_index = reopened_ds.scanner( - filter="category = 99", - use_scalar_index=False, - ).to_table() - empty_with_index = reopened_ds.scanner( - filter="category = 99", - use_scalar_index=True, - ).to_table() - assert empty_with_index.num_rows == empty_without_index.num_rows == 0 + assert ( + "ScalarIndexQuery" + in ds.scanner(filter=filter_expr, use_scalar_index=True).explain_plan() + ) -def test_distributed_bitmap_index_build(tmp_path): - ds = generate_multi_fragment_bitmap_dataset( - tmp_path / "bitmap_dist.lance", num_fragments=4, rows_per_fragment=40 +def test_zonemap_fragment_ids_parameter_validation(tmp_path): + ds = generate_multi_fragment_dataset( + tmp_path, num_fragments=2, rows_per_fragment=100 ) - index_id = str(uuid.uuid4()) - index_name = "bitmap_multiple_fragment_idx" - fragments = 
ds.get_fragments() - fragment_ids = [fragment.fragment_id for fragment in fragments] - fragment_groups = [ - fragment_ids[idx : idx + 2] for idx in range(0, len(fragment_ids), 2) - ] - assert len(fragment_groups) >= 2 - - for shard_id, fragment_group in enumerate(fragment_groups): + fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] + with pytest.raises(ValueError, match="create_index_uncommitted"): ds.create_scalar_index( - column="category", - index_type=IndexConfig( - index_type="bitmap", - parameters={"shard_id": shard_id}, - ), - name=index_name, - replace=False, - index_uuid=index_id, - fragment_ids=fragment_group, + column="id", + index_type="ZONEMAP", + fragment_ids=[fragment_ids[0]], ) - _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids) - -def test_distributed_bitmap_index_build_single_fragment_shards(tmp_path): - ds = generate_multi_fragment_bitmap_dataset( - tmp_path / "bitmap_single_fragment_dist.lance", - num_fragments=4, - rows_per_fragment=40, +def test_zonemap_segment_merge_and_commit_from_python(tmp_path): + rows_per_fragment = 20_000 + ds = generate_multi_fragment_dataset( + tmp_path, num_fragments=4, rows_per_fragment=rows_per_fragment ) - index_id = str(uuid.uuid4()) - index_name = "bitmap_single_fragment_idx" + index_name = "id_zonemap_segments" fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] - assert len(fragment_ids) >= 2 - - for fragment_id in fragment_ids: - ds.create_scalar_index( - column="category", - index_type="BITMAP", + staged_segments = [ + ds.create_index_uncommitted( + column="id", + index_type="ZONEMAP", name=index_name, - replace=False, - index_uuid=index_id, fragment_ids=[fragment_id], ) + for fragment_id in fragment_ids + ] - _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids) + assert len({segment.uuid for segment in staged_segments}) == len(staged_segments) + for segment, fragment_id in zip(staged_segments, 
fragment_ids): + files = segment.files + assert files is not None + assert segment.fragment_ids == {fragment_id} + assert any(file.path == "zonemap.lance" for file in files) + assert all(not file.path.startswith("part_") for file in files) + + merged_segment = ds.merge_existing_index_segments(staged_segments) + merged_files = merged_segment.files + assert merged_files is not None + assert merged_segment.uuid not in {segment.uuid for segment in staged_segments} + assert merged_segment.fragment_ids == set(fragment_ids) + assert any(file.path == "zonemap.lance" for file in merged_files) + assert all(not file.path.startswith("part_") for file in merged_files) + + ds = ds.commit_existing_index_segments(index_name, "id", [merged_segment]) + descriptions = {index.name: index for index in ds.describe_indices()} + assert descriptions[index_name].index_type == "ZoneMap" + assert len(descriptions[index_name].segments) == 1 + + filter_expr = "id >= 8200 AND id < 8300" + without_index = ds.scanner( + filter=filter_expr, + columns=["id", "text"], + use_scalar_index=False, + ).to_table() + with_index = ds.scanner( + filter=filter_expr, + columns=["id", "text"], + use_scalar_index=True, + ).to_table() + + assert with_index.num_rows == without_index.num_rows + assert with_index["id"].to_pylist() == without_index["id"].to_pylist() + assert ( + "ScalarIndexQuery" + in ds.scanner(filter=filter_expr, use_scalar_index=True).explain_plan() + ) def test_merge_index_metadata_btree_soft_break(tmp_path): @@ -4243,7 +4519,7 @@ def test_nested_field_btree_index(tmp_path): # Verify index was created indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["lang"] + assert indices[0].field_names == ["meta.lang"] assert indices[0].index_type == "BTree" # Test query using the index - filter for English language @@ -4344,7 +4620,7 @@ def test_nested_field_fts_index(tmp_path): # Verify index was created indices = ds.describe_indices() assert len(indices) == 1 
- assert indices[0].field_names == ["text"] + assert indices[0].field_names == ["data.text"] assert indices[0].index_type == "Inverted" # Test full text search on nested field @@ -4418,7 +4694,7 @@ def test_nested_field_bitmap_index(tmp_path): # Verify index was created indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["color"] + assert indices[0].field_names == ["attributes.color"] assert indices[0].index_type == "Bitmap" # Test equality query diff --git a/python/python/tests/test_schema_evolution.py b/python/python/tests/test_schema_evolution.py index 205aaa4fa66..7df6962789e 100644 --- a/python/python/tests/test_schema_evolution.py +++ b/python/python/tests/test_schema_evolution.py @@ -6,6 +6,8 @@ from pathlib import Path import lance +import lance.dependencies as dependencies +import lance.udf as udf_module import numpy as np import pandas as pd import pyarrow as pa @@ -287,6 +289,37 @@ def mapper(batch: pa.RecordBatch): check_add_columns(dataset, expected, use_fragments, mapper) +@pytest.mark.parametrize("use_batch_udf", [False, True]) +def test_add_columns_arrow_udf_without_pandas_dependency( + tmp_path: Path, monkeypatch, use_batch_udf +): + table = pa.table({"caption": ["a Shutterstock photo", "clean"]}) + dataset = lance.write_dataset(table, tmp_path) + + def mapper(batch: pa.RecordBatch) -> pa.RecordBatch: + flags = pc.match_substring_regex( + pc.utf8_lower(batch["caption"]), "shutterstock" + ) + return pa.record_batch([flags], names=["wm"]) + + if use_batch_udf: + mapper = lance.batch_udf()(mapper) + + # CI has pandas installed, so simulate Lance's no-pandas lazy proxy state. + # Without the guard, accessing pd.DataFrame raises ModuleNotFoundError. 
+ monkeypatch.setattr(dependencies, "_PANDAS_AVAILABLE", False) + monkeypatch.setattr( + udf_module, + "pd", + dependencies._LazyModule("pandas", module_available=False), + ) + + dataset.add_columns(mapper, read_columns=["caption"], batch_size=64) + + expected = table.append_column("wm", pa.array([True, False])) + assert dataset.to_table() == expected + + def test_query_after_merge(tmp_path): # https://github.com/lancedb/lance/issues/1905 tab = pa.table( diff --git a/python/python/tests/test_vector.py b/python/python/tests/test_vector.py index c02c8312f88..4ea4e7d425e 100644 --- a/python/python/tests/test_vector.py +++ b/python/python/tests/test_vector.py @@ -5,7 +5,7 @@ import numpy as np import pyarrow as pa import pytest -from lance.vector import vec_to_table +from lance.vector import hamming_clustering_for_sample, vec_to_table def test_dict(): @@ -147,3 +147,38 @@ def test_binary_vectors_invalid_metric(tmp_path): "metric": "l2", } ).to_table() + + +def _hash_table(hashes): + """Build a table with a ``hash`` column of FixedSizeList. + + ``hashes`` is a list of 8-byte sequences, one per row. + """ + flat = [byte for row in hashes for byte in row] + values = pa.FixedSizeListArray.from_arrays( + pa.array(flat, type=pa.uint8()), list_size=8 + ) + return pa.Table.from_arrays([values], names=["hash"]) + + +def test_hamming_clustering_for_sample(tmp_path): + hash_a = [0, 0, 0, 0, 0, 0, 0, 0] + hash_b = [255, 0, 0, 0, 0, 0, 0, 0] # 8 bits from hash_a + hash_c = [1, 2, 3, 4, 5, 6, 7, 8] # far from both + # Rows 0,1,2 share hash_a; rows 3,4 share hash_b; row 5 is unique. + table = _hash_table([hash_a, hash_a, hash_a, hash_b, hash_b, hash_c]) + dataset = lance.write_dataset(table, tmp_path / "hashes") + + # threshold 0 => only exact-match hashes cluster together. Full scan + # (sample_size=None) yields deterministic row ids 0..5. 
+ result = hamming_clustering_for_sample(dataset, "hash", None, 0).read_all() + + clusters = { + rep: sorted(dups) + for rep, dups in zip( + result["representative"].to_pylist(), + result["duplicates"].to_pylist(), + ) + } + # Singleton row 5 is not emitted as a cluster. + assert clusters == {0: [1, 2], 3: [4]} diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9df41dd8300..4e3addfedb8 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -23,7 +23,10 @@ from lance.dataset import VectorIndexReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery -from lance.util import validate_vector_index # noqa: E402 +from lance.util import ( # noqa: E402 + _target_partition_size_to_num_partitions, + validate_vector_index, +) from lance.vector import vec_to_table # noqa: E402 @@ -856,6 +859,12 @@ def test_create_ivf_pq_with_target_partition_size(dataset, tmp_path): assert ann_ds.stats.index_stats("vector_idx")["indices"][0]["num_partitions"] == 2 +def test_target_partition_size_to_num_partitions_clamps(): + assert _target_partition_size_to_num_partitions(1000, 1000) == 1 + assert _target_partition_size_to_num_partitions(1000, 500) == 2 + assert _target_partition_size_to_num_partitions(8192 * 5000, 8192) == 4096 + + def test_index_size_stats(tmp_path: Path): num_rows = 512 dims = 32 @@ -1058,16 +1067,57 @@ def test_create_ivf_rq_skip_transpose(): assert stats["indices"][0]["sub_index"]["packed"] is False -def test_create_ivf_rq_rejects_unsupported_num_bits(): - ds = lance.write_dataset(create_table(), "memory://") +def _assert_recall_at_least(ds, query, metric=None, k=10, recall_requirement=0.5): + nearest = {"column": "vector", "q": query, "k": k} + if metric is not None: + nearest["metric"] = metric - with pytest.raises(NotImplementedError, match="only num_bits=1 is supported"): - ds.create_index( - "vector", - 
index_type="IVF_RQ", - num_partitions=4, - num_bits=2, + gt_ids = ds.to_table(nearest=nearest, columns=["id"])["id"].to_numpy() + create_index_kwargs = { + "index_type": "IVF_RQ", + "num_partitions": 4, + "num_bits": 9, + } + if metric is not None: + create_index_kwargs["metric"] = metric + indexed = ds.create_index("vector", **create_index_kwargs) + result_ids = indexed.to_table(nearest=nearest, columns=["id"])["id"].to_numpy() + + assert result_ids.shape[0] == k + recall = len(set(gt_ids) & set(result_ids)) / k + assert recall >= recall_requirement, ( + f"recall={recall}, gt={gt_ids}, result={result_ids}" + ) + return indexed + + +def test_create_ivf_rq_multi_bit_searches_l2_and_cosine(): + rng = np.random.default_rng(42) + mat = rng.standard_normal((1000, 128)).astype(np.float32) + tbl = vec_to_table(data=mat).append_column("id", pa.array(range(len(mat)))) + + ds = lance.write_dataset(tbl, "memory://") + ds = _assert_recall_at_least(ds, mat[0]) + stats = ds.stats.index_stats("vector_idx") + assert stats["indices"][0]["sub_index"]["num_bits"] == 9 + assert stats["indices"][0]["sub_index"]["query_estimator"] == "raw_query" + for approx_mode in ["fast", "normal", "accurate"]: + result = ds.to_table( + nearest={ + "column": "vector", + "q": mat[0], + "k": 10, + "approx_mode": approx_mode, + }, + columns=["id"], ) + assert result.num_rows == 10 + + cosine_ds = lance.write_dataset(tbl, "memory://") + cosine_ds = _assert_recall_at_least(cosine_ds, mat[1], metric="cosine") + cosine_stats = cosine_ds.stats.index_stats("vector_idx") + assert cosine_stats["indices"][0]["sub_index"]["num_bits"] == 9 + assert cosine_stats["indices"][0]["sub_index"]["query_estimator"] == "raw_query" def test_create_ivf_rq_requires_dim_divisible_by_8(): @@ -1722,6 +1772,8 @@ def test_index_cast_centroids(tmp_path): values = pa.array([x for arr in centroids for x in arr], pa.float32()) centroids = pa.FixedSizeListArray.from_arrays(values, 128) + # Cast invalidates the attached index; drop it 
first per the new contract. + dataset.drop_index(index_name) dataset.alter_columns(dict(path="vector", data_type=pa.list_(pa.float16(), 128))) # centroids are f32, but the column is now f16 @@ -1797,7 +1849,7 @@ def test_fragment_scan_disallowed_on_ann_with_index_scan_prefilter(tmp_path): assert results == results_no_scalar_index -def test_load_indices(dataset): +def test_describe_indices(dataset): indices = dataset.describe_indices() assert len(indices) == 0 @@ -2058,6 +2110,33 @@ def test_vector_index_invalid_query_parallelism(indexed_dataset): ) +def test_vector_index_with_approx_mode(indexed_dataset): + q = np.random.randn(128) + + for approx_mode in ["fast", "normal", "accurate"]: + result = indexed_dataset.to_table( + nearest={ + "column": "vector", + "q": q, + "k": 10, + "approx_mode": approx_mode, + } + ) + assert len(result) == 10 + + +def test_vector_index_invalid_approx_mode(indexed_dataset): + with pytest.raises(ValueError, match="approx_mode"): + indexed_dataset.scanner( + nearest={ + "column": "vector", + "q": np.random.randn(128), + "k": 10, + "approx_mode": "hacc", + } + ) + + def test_knn_deleted_rows(tmp_path): data = create_table() ds = lance.write_dataset(data, tmp_path) @@ -2141,7 +2220,7 @@ def test_nested_field_vector_index(tmp_path): # Verify index was created indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["embedding"] + assert indices[0].field_names == ["data.embedding"] # Test querying with the index query_vec = vectors[0] @@ -3000,6 +3079,51 @@ def test_commit_existing_index_segments_accepts_index_metadata(tmp_path): assert 0 < len(results) <= 5 +def test_distributed_ivf_rq_shared_rotation(tmp_path): + """Two IVF_RQ segments built on separate fragments with one shared RaBitQ rotation + merge into a single committed, queryable index. 
The shared ``rabitq_model`` (from + ``lance.lance.indices.build_rq_model``) is what makes the independently built + segments mergeable.""" + from lance.lance import indices + + dim = 32 + ds = _make_sample_dataset_base( + tmp_path, "dist_rq_merge", n_rows=512, dim=dim, max_rows_per_file=256 + ) + frags = ds.get_fragments() + assert len(frags) == 2 + + ivf_model = IndicesBuilder(ds, "vector").train_ivf( + num_partitions=2, + distance_type="l2", + sample_rate=8, + ) + rabitq_model = indices.build_rq_model(dimension=dim, num_bits=1) + base_kwargs = { + "column": "vector", + "index_type": "IVF_RQ", + "num_partitions": 2, + "num_bits": 1, + "ivf_centroids": ivf_model.centroids, + "rabitq_model": rabitq_model, + } + first = ds.create_index_uncommitted( + **base_kwargs, + fragment_ids=[frags[0].fragment_id], + ) + second = ds.create_index_uncommitted( + **base_kwargs, + fragment_ids=[frags[1].fragment_id], + ) + + merged = ds.merge_existing_index_segments([first, second]) + ds = ds.commit_existing_index_segments("vector_idx", "vector", [merged]) + + q = np.random.rand(dim).astype(np.float32) + results = ds.to_table(nearest={"column": "vector", "q": q, "k": 5}) + assert 0 < len(results) <= 5 + + def test_commit_existing_index_segments_accepts_uncommitted_vector_segments(tmp_path): ds = _make_sample_dataset_base(tmp_path, "segment_commit_ds", 2000, 128) frags = ds.get_fragments() diff --git a/python/src/dataset.rs b/python/src/dataset.rs index f70a9c7b1fb..31eaa96a654 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -20,10 +20,11 @@ use blob::LanceBlobFile; use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; +use lance_index::vector::bq::storage::RabitQuantizationMetadata; use log::error; use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; -use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; +use pyo3::types::{PyBytes, PyInt, PyList, 
PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; use pyo3::{ PyResult, @@ -59,9 +60,7 @@ use lance::dataset::{ transaction::{Operation, Transaction}, }; use lance::index::vector::utils::get_vector_type; -use lance::index::{ - DatasetIndexExt, DatasetIndexInternalExt, IndexSegment, vector::VectorIndexParams, -}; +use lance::index::{DatasetIndexExt, IndexSegment, vector::VectorIndexParams}; use lance::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion}; use lance_arrow::as_fixed_size_list_array; use lance_core::Error; @@ -69,6 +68,7 @@ use lance_core::datatypes::BlobHandling; use lance_datafusion::utils::reader_to_stream; use lance_encoding::decoder::DecoderConfig; use lance_file::reader::FileReaderOptions; +use lance_index::scalar::inverted::query::Occur; use lance_index::scalar::inverted::query::{ BooleanQuery, BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, Operator, PhraseQuery, }; @@ -78,13 +78,11 @@ use lance_index::{ progress::{IndexBuildProgress, NoopIndexBuildProgress}, scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams}, vector::{ - DEFAULT_QUERY_PARALLELISM, Query as VectorQuery, hnsw::builder::HnswBuildParams, - ivf::IvfBuildParams, pq::PQBuildParams, sq::builder::SQBuildParams, + ApproxMode, DEFAULT_QUERY_PARALLELISM, Query as VectorQuery, + hnsw::builder::HnswBuildParams, ivf::IvfBuildParams, pq::PQBuildParams, + sq::builder::SQBuildParams, }, }; -use lance_index::{ - infer_system_index_type, metrics::NoOpMetricsCollector, scalar::inverted::query::Occur, -}; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStoreParams, StorageOptionsAccessor, }; @@ -820,8 +818,14 @@ impl Dataset { // Set up commit handler only if namespace manages versioning if namespace_client_managed_versioning { - let external_store = - LanceNamespaceExternalManifestStore::new(ns_client, tid.clone()); + // The store derives the branch a request targets from the base + // path it is handed, resolved against the 
table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + ns_client, + tid.clone(), + &uri, + ) + .infer_error()?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), @@ -952,79 +956,6 @@ impl Dataset { Ok(dict.into()) } - /// Load index metadata. - /// - /// This call will open the index and return its concrete index type. - fn load_indices(self_: PyRef<'_, Self>) -> PyResult>> { - let index_metadata = rt() - .block_on(Some(self_.py()), self_.ds.load_indices())? - .map_err(|err| PyValueError::new_err(err.to_string()))?; - let py = self_.py(); - index_metadata - .iter() - .map(|idx| { - let dict = PyDict::new(py); - let schema = self_.ds.schema(); - let field_paths = idx - .fields - .iter() - .map(|field_id| schema.field_path(*field_id).unwrap()) - .collect::>(); - - let ds = self_.ds.clone(); - let idx_type = match rt().block_on(Some(self_.py()), async { - if let Some(system_index_type) = infer_system_index_type(idx) { - Ok::<_, lance::Error>(system_index_type.to_string()) - } else { - let idx = ds - .open_generic_index( - &field_paths[0], - &idx.uuid.to_string(), - &NoOpMetricsCollector, - ) - .await?; - Ok::<_, lance::Error>(idx.index_type().to_string()) - } - })? { - Ok(r) => r, - Err(error) => { - log::warn!( - "Cannot derive index type for index {} (uuid={}, type_url={:?}, version={}) on dataset {}: {}", - idx.name, - idx.uuid, - idx.index_details.as_ref().map(|d| d.type_url.as_str()), - idx.index_version, - self_.ds.uri(), - error, - ); - // mark the type as unknown for any new index type - "Unknown".to_owned() - } - }; - - let fragment_set = PySet::empty(py).unwrap(); - if let Some(bitmap) = &idx.fragment_bitmap { - for fragment_id in bitmap.iter() { - fragment_set.add(fragment_id).unwrap(); - } - } - - dict.set_item("name", idx.name.clone()).unwrap(); - // TODO: once we add more than vector indices, we need to: - // 1. 
Change protos and write path to persist index type - // 2. Use the new field from idx instead of hard coding it to Vector - dict.set_item("type", idx_type).unwrap(); - dict.set_item("uuid", idx.uuid.to_string()).unwrap(); - dict.set_item("fields", field_paths).unwrap(); - dict.set_item("version", idx.dataset_version).unwrap(); - dict.set_item("fragment_ids", fragment_set).unwrap(); - dict.set_item("base_id", idx.base_id.map(|id| id as i64)) - .unwrap(); - dict.into_py_any(py) - }) - .collect::>>() - } - #[allow(clippy::too_many_arguments)] #[pyo3(signature=(columns=None, columns_with_transform=None, filter=None, search_filter=None, prefilter=None, limit=None, offset=None, nearest=None, batch_size=None, batch_size_bytes=None, io_buffer_size=None, batch_readahead=None, fragment_readahead=None, scan_in_order=None, fragments=None, index_segments=None, with_row_id=None, with_row_address=None, use_stats=None, substrait_filter=None, fast_search=None, full_text_query=None, late_materialization=None, blob_handling=None, use_scalar_index=None, include_deleted_rows=None, scan_stats_callback=None, strict_batch_size=None, order_by=None, disable_scoring_autoprojection=None, substrait_aggregate=None))] fn scanner( @@ -1303,6 +1234,7 @@ impl Dataset { use_index, ef, query_parallelism, + approx_mode, ) = vector_query_params_from_dict(nearest, default_k)?; let (_, element_type) = get_vector_type(self_.ds.schema(), &column) @@ -1369,6 +1301,7 @@ impl Dataset { s = s.ef(ef); } s = s.query_parallelism(query_parallelism); + s = s.approx_mode(approx_mode); s.use_index(use_index); if let Some((lower, upper)) = distance_range { s.distance_range(lower, upper); @@ -2235,6 +2168,7 @@ impl Dataset { "LABEL_LIST" => IndexType::LabelList, "RTREE" => IndexType::RTree, "INVERTED" | "FTS" => IndexType::Inverted, + "FM" => IndexType::Fm, "IVF_FLAT" | "IVF_PQ" | "IVF_SQ" | "IVF_RQ" | "IVF_HNSW_FLAT" | "IVF_HNSW_PQ" | "IVF_HNSW_SQ" => IndexType::Vector, _ => { @@ -2274,6 +2208,27 @@ impl Dataset { 
index_type: "rtree".to_string(), params: None, }), + "FM" => { + let mut params_json = serde_json::Map::new(); + if let Some(kwargs) = kwargs + && let Some(num_segments) = kwargs.get_item("num_segments")? + { + let n: u32 = num_segments.extract()?; + params_json.insert( + "num_segments".to_string(), + serde_json::Value::Number(n.into()), + ); + } + let params = if params_json.is_empty() { + None + } else { + Some(serde_json::Value::Object(params_json).to_string()) + }; + Box::new(ScalarIndexParams { + index_type: "fm".to_string(), + params, + }) + } "SCALAR" => { let Some(kwargs) = kwargs else { return Err(PyValueError::new_err( @@ -2380,10 +2335,22 @@ impl Dataset { None }; - let index_uuid: Option = if let Some(kwargs) = kwargs { + let index_uuid: Option = if let Some(kwargs) = kwargs { kwargs .get_item("index_uuid")? - .and_then(|v| if v.is_none() { None } else { Some(v.extract()) }) + .and_then(|v| { + if v.is_none() { + None + } else { + Some(v.extract::()) + } + }) + .transpose()? + .map(|s| { + Uuid::parse_str(&s).map_err(|e| { + PyValueError::new_err(format!("Invalid UUID string for index_uuid: {e}")) + }) + }) .transpose()? 
} else { None @@ -2486,6 +2453,9 @@ impl Dataset { batch_readhead: Option, progress_callback: Option<&Bound<'_, PyAny>>, ) -> PyResult<()> { + let parsed_uuid = Uuid::parse_str(index_uuid).map_err(|e| { + PyValueError::new_err(format!("Invalid UUID string for index_uuid: {e}")) + })?; let mut progress_handler = Self::make_index_progress_handler_from_callback(progress_callback)?; let progress: Arc = progress_handler @@ -2497,7 +2467,7 @@ impl Dataset { async { self.ds .merge_index_metadata( - index_uuid, + &parsed_uuid, IndexType::try_from(index_type)?, batch_readhead, progress, @@ -2691,9 +2661,16 @@ impl Dataset { && let (Some(ns_client), Some(tid)) = (namespace_client, table_id) { // Create ExternalManifestCommitHandler from namespace client and table_id - // only when namespace manages versioning + // only when namespace manages versioning. The store derives the + // branch a request targets from the base path it is handed, + // resolved against the table root. let ns_client = extract_namespace_arc(ns_client.py(), ns_client)?; - let external_store = LanceNamespaceExternalManifestStore::new(ns_client, tid); + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + ns_client, + tid, + &dest.table_root_uri()?, + ) + .infer_error()?; Some(Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }) as Arc) @@ -2909,6 +2886,54 @@ impl Dataset { Ok(PyArrowType(reader)) } + #[pyo3(signature = (*, min_version=None, progress=None))] + fn tracked_files( + &self, + min_version: Option, + progress: Option<&Bound<'_, PyAny>>, + ) -> PyResult>> { + use lance::dataset::files::{TrackedFilesOptions, TrackedFilesProgress}; + + let progress_cb: Option> = + if let Some(cb) = progress { + if !cb.is_callable() { + return Err(PyValueError::new_err("progress must be callable")); + } + let cb = cb.clone().unbind(); + Some(Box::new(move |p: TrackedFilesProgress| { + Python::attach(|py| { + let total: Option = p.manifests_total; + match 
cb.call1(py, (p.manifests_processed, total)) { + Ok(_) => (), + Err(e) => { + log::error!("Error in tracked_files progress callback: {}", e); + } + } + }); + })) + } else { + None + }; + + let options = TrackedFilesOptions { + min_version, + progress: progress_cb, + }; + let stream = rt().block_on(None, self.ds.tracked_files_with_options(options))?; + let reader = Box::new(LanceReader::from_stream(DatasetRecordBatchStream::new( + stream, + ))); + Ok(PyArrowType(reader)) + } + + fn all_files(&self) -> PyResult>> { + let stream = rt().block_on(None, self.ds.all_files())?; + let reader = Box::new(LanceReader::from_stream(DatasetRecordBatchStream::new( + stream, + ))); + Ok(PyArrowType(reader)) + } + #[pyo3(signature = (keys))] fn delete_config_keys(&mut self, keys: Vec) -> PyResult<()> { let mut new_self = self.ds.as_ref().clone(); @@ -3098,11 +3123,7 @@ impl Dataset { let vindex = self .ds - .open_vector_index( - column_name, - &idx_meta.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index(column_name, &idx_meta.uuid, &NoOpMetricsCollector) .await .infer_error()?; @@ -3407,6 +3428,188 @@ impl Dataset { self.ds.clone(), )) } + + /// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. + /// + /// This function loads a specific partition from an IVF_FLAT index on a hash column, + /// computes pairwise hamming distances between all hashes in the partition, + /// filters by threshold, and clusters the results using union-find. 
+ /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index on the hash column + /// partition_id : int + /// The partition ID within the IVF_FLAT index + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (index_name, partition_id, hamming_threshold))] + fn hamming_clustering_for_ivf_partition( + &self, + py: Python<'_>, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_ivf_partition; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_ivf_partition( + ds, + index_name, + partition_id, + hamming_threshold, + ), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Get partition information for an IVF_FLAT index. + /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index + /// + /// Returns + /// ------- + /// List[dict] + /// List of partition info dicts with 'partition_id' and 'size' + #[pyo3(signature = (index_name))] + fn get_ivf_partition_info( + &self, + py: Python<'_>, + index_name: &str, + ) -> PyResult>> { + use lance::index::vector::hamming::get_ivf_partition_info; + + let ds = self.ds.as_ref(); + let result = rt() + .block_on(Some(py), get_ivf_partition_info(ds, index_name))? 
+ .map_err(|err| PyValueError::new_err(err.to_string()))?; + + let partitions: PyResult> = result + .iter() + .map(|p| { + let dict = PyDict::new(py); + dict.set_item("partition_id", p.partition_id)?; + dict.set_item("size", p.size)?; + Ok(dict.into()) + }) + .collect(); + + partitions + } + + /// Perform pairwise hamming distance clustering on sampled rows from a dataset. + /// + /// This function samples N rows randomly from the dataset, extracts hashes, + /// computes pairwise hamming distances, and clusters the results. + /// It's useful for benchmarking and testing without requiring an IVF index. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// sample_size : int, optional + /// Number of rows to sample (if None or >= total rows, uses all rows) + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, sample_size, hamming_threshold))] + fn hamming_clustering_for_sample( + &self, + py: Python<'_>, + column: &str, + sample_size: Option, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_sample; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_sample(ds, column, sample_size, hamming_threshold), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. + /// + /// This function reads a contiguous range of rows from a specific fragment, + /// extracts hashes, computes pairwise hamming distances, and clusters the results. 
+ /// Unlike sampling, this reads sequential rows which is useful for distributed + /// processing where each worker handles a specific range of a fragment. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// fragment_id : int + /// The fragment ID to read from + /// start_row : int + /// The starting row offset within the fragment + /// num_rows : int + /// Number of rows to read from the start position + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, fragment_id, start_row, num_rows, hamming_threshold))] + fn hamming_clustering_for_range( + &self, + py: Python<'_>, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_range; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_range( + ds, + column, + fragment_id, + start_row, + num_rows, + hamming_threshold, + ), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } } #[pyclass(name = "SqlQuery", module = "_lib", subclass, skip_from_py_object)] @@ -3606,9 +3809,18 @@ impl PyWriteDest { Self::Uri(uri) => WriteDestination::Uri(uri), } } + + /// The table root uri of this destination (a branch dataset resolves to + /// its main location). Used to root the namespace manifest store. 
+ pub fn table_root_uri(&self) -> PyResult { + match self { + Self::Dataset(ds) => Ok(ds.ds.branch_location().find_main().infer_error()?.uri), + Self::Uri(uri) => Ok(uri.to_string()), + } + } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] enum IndexProgressEventType { Start, Progress, @@ -3764,7 +3976,20 @@ impl IndexProgressDispatcher { fn drain(&mut self) -> PyResult<()> { while let Ok(event) = self.receiver.try_recv() { - self.dispatch(event)?; + let is_complete = event.event == IndexProgressEventType::Complete; + if let Err(err) = self.dispatch(event) { + if is_complete { + // Complete events are purely informational — the stage's work + // is already done. Propagating a callback error here would + // abort the operation after the real work has succeeded. + log::warn!( + "Ignoring progress callback error on stage-complete event: {}", + err + ); + } else { + return Err(err); + } + } } Ok(()) } @@ -3948,7 +4173,7 @@ pub fn write_dataset( dest: PyWriteDest, options: &Bound<'_, PyDict>, ) -> PyResult { - let params = get_write_params(options)?; + let params = get_write_params(options, &dest.table_root_uri()?)?; let py = options.py(); let ds = if reader.is_instance_of::() { let scanner: Scanner = reader.extract()?; @@ -4017,8 +4242,13 @@ fn get_dict_opt<'py, D: FromPyObjectOwned<'py>>( .transpose() } +/// `table_uri` is the destination table's root uri; it roots the namespace +/// manifest store when `namespace_client_managed_versioning` is requested. 
#[allow(deprecated)] -pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult> { +pub fn get_write_params( + options: &Bound<'_, PyDict>, + table_uri: &str, +) -> PyResult> { let params = if options.is_none() { None } else { @@ -4188,9 +4418,15 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); @@ -4361,6 +4597,13 @@ fn prepare_vector_index_params( pq_params.codebook = Some(codebook.values().clone()) }; + if let Some(r) = kwargs.get_item("rabitq_model")? { + let json: String = r.extract()?; + let meta: RabitQuantizationMetadata = serde_json::from_str(&json) + .map_err(|e| PyValueError::new_err(format!("Invalid rabitq_model JSON: {e}")))?; + rq_params.rotation = Some(meta); + }; + if let Some(version) = kwargs.get_item("index_file_version")? { let version: String = version.extract()?; index_file_version = IndexFileVersion::try_from(&version) @@ -4681,6 +4924,7 @@ type VectorQueryParams = ( bool, Option, i32, + ApproxMode, ); fn extract_query_parallelism(value: &Bound<'_, PyAny>) -> PyResult { @@ -4702,6 +4946,23 @@ fn vector_query_query_parallelism_from_dict(dict: &Bound<'_, PyDict>) -> PyResul } } +fn vector_query_approx_mode_from_dict(dict: &Bound<'_, PyDict>) -> PyResult { + if let Some(approx_mode) = dict.get_item("approx_mode")? 
+ && !approx_mode.is_none() + { + match approx_mode.to_string().to_lowercase().as_str() { + "fast" => Ok(ApproxMode::Fast), + "normal" => Ok(ApproxMode::Normal), + "accurate" => Ok(ApproxMode::Accurate), + value => Err(PyValueError::new_err(format!( + "approx_mode must be one of 'fast', 'normal', or 'accurate', got '{value}'" + ))), + } + } else { + Ok(ApproxMode::Normal) + } +} + fn vector_query_params_from_dict( dict: &Bound<'_, PyDict>, default_k: usize, @@ -4808,6 +5069,7 @@ fn vector_query_params_from_dict( }; let query_parallelism = vector_query_query_parallelism_from_dict(dict)?; + let approx_mode = vector_query_approx_mode_from_dict(dict)?; Ok(( column, @@ -4820,6 +5082,7 @@ fn vector_query_params_from_dict( use_index, ef, query_parallelism, + approx_mode, )) } @@ -4856,6 +5119,7 @@ impl PySearchFilter { use_index, ef, query_parallelism, + approx_mode, ) = vector_query_params_from_dict(query, default_k)?; let metric_type = Some(metric_type_opt.unwrap_or(MetricType::L2)); @@ -4874,6 +5138,7 @@ impl PySearchFilter { use_index, query_parallelism, dist_q_c: 0.0, + approx_mode, }; Ok(Self { diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs index 321d7157b86..4bb29246f45 100644 --- a/python/src/dataset/optimize.rs +++ b/python/src/dataset/optimize.rs @@ -58,6 +58,9 @@ fn parse_compaction_options( "batch_size" => { opts.batch_size = value.extract()?; } + "io_buffer_size" => { + opts.io_buffer_size = value.extract()?; + } "compaction_mode" => { let mode_str: Option = value.extract()?; if let Some(mode_str) = mode_str { @@ -551,26 +554,34 @@ impl PyCompaction { /// new version once committed. /// rewrites : List[RewriteResult] /// The results of the compaction tasks to include in the commit. + /// options : dict, optional + /// Compaction options to apply at commit time. + /// When absent or ``None``, defaults to ``CompactionOptions::default()``. 
/// /// Returns /// ------- /// CompactionMetrics #[staticmethod] + #[pyo3(signature = (dataset, rewrites, options = None))] pub fn commit( dataset: Bound, rewrites: Vec, + options: Option>, ) -> PyResult { let dataset_ref = unwrap_dataset(dataset)?; let dataset = dataset_ref.borrow().clone(); + let config = dataset.ds.manifest.config.clone(); + let opts = match options { + Some(ref dict) => parse_compaction_options(dict, &config)?, + None => CompactionOptions::default(), + }; let rewrites: Vec = rewrites.into_iter().map(|r| r.0).collect(); let mut new_ds = dataset.ds.as_ref().clone(); - // TODO: pass compaction option from plan and execute time - let options: CompactionOptions = CompactionOptions::default(); let fut = commit_compaction( &mut new_ds, rewrites, Arc::new(DatasetIndexRemapperOptions::default()), - &options, + &opts, ); let metrics = rt() .block_on(None, fut)? diff --git a/python/src/file.rs b/python/src/file.rs index ab5bda77fb0..b0bc20f9d0a 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -347,8 +347,10 @@ impl LanceFileWriter { } pub fn finish(&self) -> PyResult { - rt().block_on(None, async { self.inner.lock().await.finish().await })? - .infer_error() + rt().block_on(None, async { + self.inner.lock().await.finish().await.map(|s| s.num_rows) + })? 
+ .infer_error() } pub fn add_global_buffer(&self, bytes: Vec) -> PyResult { diff --git a/python/src/fragment.rs b/python/src/fragment.rs index 1da99492fac..e6060b1ac4e 100644 --- a/python/src/fragment.rs +++ b/python/src/fragment.rs @@ -23,7 +23,7 @@ use lance::Error; use lance::dataset::fragment::FileFragment as LanceFragment; use lance::dataset::scanner::ColumnOrdering; use lance::dataset::transaction::{Operation, Transaction}; -use lance::dataset::{InsertBuilder, NewColumnTransform}; +use lance::dataset::{InsertBuilder, NewColumnTransform, WriteParams}; use lance_core::datatypes::BlobHandling; use lance_io::utils::CachedFileSize; use lance_table::format::{ @@ -119,7 +119,7 @@ impl FileFragment { kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult> { let params = if let Some(kw_params) = kwargs { - get_write_params(kw_params)? + get_write_params(kw_params, dataset_uri)? } else { None }; @@ -435,10 +435,10 @@ fn do_write_fragments( ) -> PyResult { let batches = convert_reader(reader)?; - let params = kwargs - .and_then(|params| get_write_params(params).transpose()) - .transpose()? - .unwrap_or_default(); + let params = match kwargs { + Some(params) => get_write_params(params, &dest.table_root_uri()?)?.unwrap_or_default(), + None => WriteParams::default(), + }; rt().block_on( Some(reader.py()), diff --git a/python/src/indices.rs b/python/src/indices.rs index fe988206117..7ce7a297924 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -145,11 +145,7 @@ async fn do_get_ivf_model(dataset: &Dataset, index_name: &str) -> PyResult( codebook.to_pyarrow(py) } +/// Mint one RaBitQ rotation and return it as a JSON string. +/// +/// Distributed IVF_RQ builds must pin a single rotation across all workers so that +/// independently built per-fragment segments rotate vectors identically and their +/// binary codes remain comparable when merged. 
A driver calls this once and broadcasts +/// the resulting string to every `create_index_uncommitted(..., rabitq_model=...)` call. +/// +/// The rotation is always the "fast" rotation since its sign vector is JSON-serializable, +/// whereas the "matrix" rotation stores a dense matrix in a binary buffer that is dropped by +/// the JSON wire format. `dtype` is accepted for API symmetry but does not affect the fast +/// rotation. +/// +/// # Example (Python) +/// +/// ```python +/// from lance.lance import indices +/// +/// # Mint one model and broadcast `model` to every worker. +/// model = indices.build_rq_model(dimension=128, num_bits=1) +/// seg = ds.create_index_uncommitted( +/// column="vector", +/// index_type="IVF_RQ", +/// num_partitions=256, +/// ivf_centroids=centroids, +/// rabitq_model=model, +/// fragment_ids=my_fragments, +/// ) +/// ``` +#[pyfunction] +#[pyo3(signature = (dimension, num_bits=1, dtype="float32"))] +pub fn build_rq_model(dimension: usize, num_bits: u8, dtype: &str) -> PyResult { + use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; + use lance_index::vector::bq::RQRotationType; + use lance_index::vector::bq::builder::RabitQuantizer; + use lance_index::vector::quantizer::Quantization; + + if !dimension.is_multiple_of(u8::BITS as usize) { + return Err(PyValueError::new_err( + "dimension must be divisible by 8 for IVF_RQ", + )); + } + let dim = dimension as i32; + let rotation = RQRotationType::Fast; + let quantizer = match dtype.to_lowercase().as_str() { + "float16" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + "float32" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + "float64" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + other => { + return Err(PyValueError::new_err(format!("unsupported dtype: {other}"))); + } + }; + serde_json::to_string(&quantizer.metadata(None)) + .map_err(|e| PyValueError::new_err(format!("failed to serialize RQ model: {e}"))) +} + 
#[allow(clippy::too_many_arguments)] async fn do_transform_vectors( dataset: &Dataset, @@ -579,6 +630,9 @@ pub struct PyIndexSegmentDescription { /// The total size in bytes of all files in this segment /// (None for backward compatibility with indices created before file tracking) pub size_bytes: Option, + /// The id of the dataset base path that stores this segment + /// (None when the segment is stored in the dataset's default base path) + pub base_id: Option, } impl PyIndexSegmentDescription { @@ -597,18 +651,20 @@ impl PyIndexSegmentDescription { index_version: segment.index_version, created_at: segment.created_at, size_bytes, + base_id: segment.base_id.map(|id| id as i64), } } pub fn __repr__(&self) -> String { format!( - "IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})", + "IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?}, base_id={:?})", self.uuid, self.dataset_version_at_last_update, self.fragment_ids, self.index_version, self.created_at, - self.size_bytes + self.size_bytes, + self.base_id ) } } @@ -623,7 +679,8 @@ pub struct PyIndexDescription { pub index_type: String, /// The ids of the fields that the index is built on pub fields: Vec, - /// The names of the fields that the index is built on + /// The full paths of the fields that the index is built on + /// (dotted, with backtick-quoted segments for non-identifier names) pub field_names: Vec, /// The number of rows indexed by the index pub num_rows_indexed: u64, @@ -644,9 +701,8 @@ impl PyIndexDescription { .map(|field| { dataset .schema() - .field_by_id(*field as i32) - .map(|f| f.name.clone()) - .unwrap_or("".to_string()) + .field_path(*field as i32) + .unwrap_or_else(|_| "".to_string()) }) .collect(); @@ -696,6 +752,7 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let indices = 
PyModule::new(py, "indices")?; indices.add_wrapped(wrap_pyfunction!(train_ivf_model))?; indices.add_wrapped(wrap_pyfunction!(train_pq_model))?; + indices.add_wrapped(wrap_pyfunction!(build_rq_model))?; indices.add_wrapped(wrap_pyfunction!(transform_vectors))?; indices.add_wrapped(wrap_pyfunction!(shuffle_transformed_vectors))?; indices.add_wrapped(wrap_pyfunction!(load_shuffled_vectors))?; diff --git a/python/src/lib.rs b/python/src/lib.rs index cf29b26c46a..3bf4eab221e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -293,6 +293,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_wrapped(wrap_pyfunction!(mem_wal::py_evaluate_sharding_spec))?; + m.add_wrapped(wrap_pyfunction!(mem_wal::py_write_pk_sidecar))?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/mem_wal.rs b/python/src/mem_wal.rs index 25127c95ea4..dc9718c0dce 100644 --- a/python/src/mem_wal.rs +++ b/python/src/mem_wal.rs @@ -51,6 +51,31 @@ pub fn py_evaluate_sharding_spec<'py>( result.to_pyarrow(py) } +/// Write a primary-key dedup sidecar (`_pk_index/`) for a flushed-generation +/// dataset already written at `gen_path`, mirroring what production flush emits. +/// +/// Test-support only: lets Python tests stage a *faithful* flushed generation +/// (dataset + sidecar). Production always writes the sidecar during flush, so a +/// dataset-without-sidecar is not a state the system otherwise produces. 
+#[pyfunction(name = "_write_pk_sidecar", signature = (gen_path, data, pk_columns))] +pub fn py_write_pk_sidecar( + py: Python<'_>, + gen_path: String, + data: &Bound<'_, PyAny>, + pk_columns: Vec, +) -> PyResult<()> { + let reader = ArrowArrayStreamReader::from_pyarrow_bound(data) + .map_err(|e| PyValueError::new_err(format!("Cannot read data as Arrow: {}", e)))?; + let batches: Vec = reader + .collect::>() + .map_err(|e| PyIOError::new_err(format!("Failed to read batches: {}", e)))?; + rt().block_on(Some(py), async move { + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + lance::dataset::mem_wal::scanner::write_pk_sidecar(&gen_path, &batches, &pk_refs).await + })? + .map_err(|e: lance::Error| PyIOError::new_err(e.to_string())) +} + fn sharding_spec_from_py(spec: &Bound<'_, PyAny>) -> PyResult { let spec_id = get_py_value(spec, "spec_id")?.extract::()?; let fields_obj = get_py_value(spec, "fields")?; diff --git a/python/src/namespace.rs b/python/src/namespace.rs index cf5f7c41b0f..e88ff40de2c 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -392,6 +392,44 @@ impl PyDirectoryNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? 
+ .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1054,6 +1092,44 @@ impl PyRestNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? 
+ .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1472,6 +1548,30 @@ fn get_dict_with_model_dump_class(py: Python<'_>) -> PyResult> Ok(class) } +/// Convert a Python namespace exception into a lance error, preserving the +/// namespace error identity when the exception is a `lance_namespace` +/// `LanceNamespaceError` carrying an error `code`, so callers can react to +/// e.g. TableNotFound the same way they do for native clients. Foreign +/// exceptions that happen to carry an integer `code` (e.g. SystemExit) must +/// not be reinterpreted, so the extraction is gated on the exception type. +fn namespace_error_from_py(method_name: &'static str, e: PyErr) -> lance_core::Error { + Python::attach(|py| { + let value = e.value(py); + let is_namespace_error = py + .import("lance_namespace.errors") + .and_then(|module| module.getattr("LanceNamespaceError")) + .and_then(|class| value.is_instance(&class)) + .unwrap_or(false); + if is_namespace_error + && let Ok(code) = value.getattr("code").and_then(|code| code.extract::()) + { + return lance_namespace::error::NamespaceError::from_code(code, value.to_string()) + .into(); + } + lance_core::Error::io(format!("Python error in {}: {}", method_name, e)) + }) +} + /// Helper to call a Python namespace method with JSON serialization. /// For methods that take a request and return a response. /// Uses DictWithModelDump to pass a dict that also has model_dump() method, @@ -1519,7 +1619,7 @@ where }) .await .map_err(|e| lance_core::Error::io(format!("Task join error for {}: {}", method_name, e)))? 
- .map_err(|e: PyErr| lance_core::Error::io(format!("Python error in {}: {}", method_name, e)))?; + .map_err(|e: PyErr| namespace_error_from_py(method_name, e))?; serde_json::from_str(&response_json).map_err(|e| { lance_core::Error::io(format!( diff --git a/python/src/scanner.rs b/python/src/scanner.rs index 691f7f53294..bbf1b3f35a3 100644 --- a/python/src/scanner.rs +++ b/python/src/scanner.rs @@ -125,14 +125,17 @@ impl Scanner { Ok(res) } - #[pyo3(signature = (*))] - fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult { + #[pyo3(signature = (*, count_rows = false))] + fn analyze_plan(self_: PyRef<'_, Self>, count_rows: bool) -> PyResult { let scanner = self_.scanner.clone(); let res = rt() - .spawn( - Some(self_.py()), - async move { scanner.analyze_plan().await }, - )? + .spawn(Some(self_.py()), async move { + if count_rows { + scanner.analyze_count_plan().await + } else { + scanner.analyze_plan().await + } + })? .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(res) diff --git a/python/uv.lock b/python/uv.lock index 69e061b8075..5f1fa45d755 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,13 +1,12 @@ version = 1 revision = 3 -requires-python = ">=3.9" +requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14'", "python_full_version == '3.13.*'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] [[package]] @@ -30,139 +29,139 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.4" +version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "aiohappyeyeballs", marker = "python_full_version >= '3.10'" }, - { name = "aiosignal", marker = "python_full_version >= '3.10'" }, - { name = "async-timeout", marker = "python_full_version == '3.10.*'" }, - { name = "attrs", marker = "python_full_version >= '3.10'" }, - { name = "frozenlist", marker = 
"python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, - { name = "yarl", marker = "python_full_version >= '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/45/4a/064321452809dae953c1ed6e017504e72551a26b6f5708a5a80e4bf556ff/aiohttp-3.13.4.tar.gz", hash = "sha256:d97a6d09c66087890c2ab5d49069e1e570583f7ac0314ecf98294c1b6aaebd38", size = 7859748, upload-time = "2026-03-28T17:19:40.6Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/05/6817e0390eb47b0867cf8efdb535298191662192281bc3ca62a0cb7973eb/aiohttp-3.13.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6290fe12fe8cefa6ea3c1c5b969d32c010dfe191d4392ff9b599a3f473cbe722", size = 753094, upload-time = "2026-03-28T17:14:59.928Z" }, - { url = "https://files.pythonhosted.org/packages/b4/c1/e5b7f25f6dd1ab57da92aa9d226b2c8b56f223dd20475d3ddfddaba86ab8/aiohttp-3.13.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7520d92c0e8fbbe63f36f20a5762db349ff574ad38ad7bc7732558a650439845", size = 505213, upload-time = "2026-03-28T17:15:01.989Z" }, - { url = "https://files.pythonhosted.org/packages/b4/e5/8f42033c7ce98b54dfd3791f03e60231cfe4a2db4471b5fc188df2b8a6ad/aiohttp-3.13.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d2710ae1e1b81d0f187883b6e9d66cecf8794b50e91aa1e73fc78bfb5503b5d9", size = 498580, upload-time = "2026-03-28T17:15:03.879Z" }, - { url = "https://files.pythonhosted.org/packages/8c/a4/bbc989f5362066b81930da1a66084a859a971d03faab799dc59a3ce3a220/aiohttp-3.13.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:717d17347567ded1e273aa09918650dfd6fd06f461549204570c7973537d4123", size = 1692718, upload-time = "2026-03-28T17:15:05.541Z" }, - { url = 
"https://files.pythonhosted.org/packages/1c/72/3775116969931f151be116689d2ae6ddafff2ec2887d8f9b4e7043f32e74/aiohttp-3.13.4-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:383880f7b8de5ac208fa829c7038d08e66377283b2de9e791b71e06e803153c2", size = 1660714, upload-time = "2026-03-28T17:15:08.23Z" }, - { url = "https://files.pythonhosted.org/packages/a1/e8/d2f1a2da2743e32fe348ebf8a4c59caad14a92f5f18af616fd33381275e1/aiohttp-3.13.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1867087e2c1963db1216aedf001efe3b129835ed2b05d97d058176a6d08b5726", size = 1744152, upload-time = "2026-03-28T17:15:10.828Z" }, - { url = "https://files.pythonhosted.org/packages/4c/a6/575886f417ac3c08e462f2ca237cc49f436bd992ca3f7ff95b7dd9c44205/aiohttp-3.13.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6234bf416a38d687c3ab7f79934d7fb2a42117a5b9813aca07de0a5398489023", size = 1836278, upload-time = "2026-03-28T17:15:12.537Z" }, - { url = "https://files.pythonhosted.org/packages/4a/4c/0051d4550fb9e8b5ca4e0fe1ccd58652340915180c5164999e6741bf2083/aiohttp-3.13.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3cdd3393130bf6588962441ffd5bde1d3ea2d63a64afa7119b3f3ba349cebbe7", size = 1687953, upload-time = "2026-03-28T17:15:14.248Z" }, - { url = "https://files.pythonhosted.org/packages/c9/54/841e87b8c51c2adc01a3ceb9919dc45c7899fe4c21deb70aada734ea5a38/aiohttp-3.13.4-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0d0dbc6c76befa76865373d6aa303e480bb8c3486e7763530f7f6e527b471118", size = 1572484, upload-time = "2026-03-28T17:15:15.911Z" }, - { url = "https://files.pythonhosted.org/packages/da/f1/21cbf5f7fa1e267af6301f886cab9b314f085e4d0097668d189d165cd7da/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:10fb7b53262cf4144a083c9db0d2b4d22823d6708270a9970c4627b248c6064c", size = 1662851, upload-time = "2026-03-28T17:15:17.822Z" }, - { url = "https://files.pythonhosted.org/packages/40/15/bcad6b68d7bef27ae7443288215767263c7753ede164267cf6cf63c94a87/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:eb10ce8c03850e77f4d9518961c227be569e12f71525a7e90d17bca04299921d", size = 1671984, upload-time = "2026-03-28T17:15:19.561Z" }, - { url = "https://files.pythonhosted.org/packages/ff/fa/ab316931afc7a73c7f493bb1b30fbd61e28ec2d3ea50353336e76293e8ec/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:7c65738ac5ae32b8feef699a4ed0dc91a0c8618b347781b7461458bbcaaac7eb", size = 1713880, upload-time = "2026-03-28T17:15:21.589Z" }, - { url = "https://files.pythonhosted.org/packages/1c/45/314e8e64c7f328174964b6db511dd5e9e60c9121ab5457bc2c908b7d03a4/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:6b335919ffbaf98df8ff3c74f7a6decb8775882632952fd1810a017e38f15aee", size = 1560315, upload-time = "2026-03-28T17:15:23.66Z" }, - { url = "https://files.pythonhosted.org/packages/18/e7/93d5fa06fe00219a81466577dacae9e3732f3b4f767b12b2e2cc8c35c970/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ec75fc18cb9f4aca51c2cbace20cf6716e36850f44189644d2d69a875d5e0532", size = 1735115, upload-time = "2026-03-28T17:15:25.77Z" }, - { url = "https://files.pythonhosted.org/packages/19/9f/f64b95392ddd4e204fd9ab7cd33dd18d14ac9e4b86866f1f6a69b7cda83d/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:463fa18a95c5a635d2b8c09babe240f9d7dbf2a2010a6c0b35d8c4dff2a0e819", size = 1673916, upload-time = "2026-03-28T17:15:27.526Z" }, - { url = "https://files.pythonhosted.org/packages/52/c1/bb33be79fd285c69f32e5b074b299cae8847f748950149c3965c1b3b3adf/aiohttp-3.13.4-cp310-cp310-win32.whl", hash = "sha256:13168f5645d9045522c6cef818f54295376257ed8d02513a37c2ef3046fc7a97", size = 440277, upload-time = "2026-03-28T17:15:29.173Z" 
}, - { url = "https://files.pythonhosted.org/packages/23/f9/7cf1688da4dd0885f914ee40bc8e1dce776df98fe6518766de975a570538/aiohttp-3.13.4-cp310-cp310-win_amd64.whl", hash = "sha256:a7058af1f53209fdf07745579ced525d38d481650a989b7aa4a3b484b901cdab", size = 463015, upload-time = "2026-03-28T17:15:30.802Z" }, - { url = "https://files.pythonhosted.org/packages/d4/7e/cb94129302d78c46662b47f9897d642fd0b33bdfef4b73b20c6ced35aa4c/aiohttp-3.13.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8ea0c64d1bcbf201b285c2246c51a0c035ba3bbd306640007bc5844a3b4658c1", size = 760027, upload-time = "2026-03-28T17:15:33.022Z" }, - { url = "https://files.pythonhosted.org/packages/5e/cd/2db3c9397c3bd24216b203dd739945b04f8b87bb036c640da7ddb63c75ef/aiohttp-3.13.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6f742e1fa45c0ed522b00ede565e18f97e4cf8d1883a712ac42d0339dfb0cce7", size = 508325, upload-time = "2026-03-28T17:15:34.714Z" }, - { url = "https://files.pythonhosted.org/packages/36/a3/d28b2722ec13107f2e37a86b8a169897308bab6a3b9e071ecead9d67bd9b/aiohttp-3.13.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dcfb50ee25b3b7a1222a9123be1f9f89e56e67636b561441f0b304e25aaef8f", size = 502402, upload-time = "2026-03-28T17:15:36.409Z" }, - { url = "https://files.pythonhosted.org/packages/fa/d6/acd47b5f17c4430e555590990a4746efbcb2079909bb865516892bf85f37/aiohttp-3.13.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3262386c4ff370849863ea93b9ea60fd59c6cf56bf8f93beac625cf4d677c04d", size = 1771224, upload-time = "2026-03-28T17:15:38.223Z" }, - { url = "https://files.pythonhosted.org/packages/98/af/af6e20113ba6a48fd1cd9e5832c4851e7613ef50c7619acdaee6ec5f1aff/aiohttp-3.13.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:473bb5aa4218dd254e9ae4834f20e31f5a0083064ac0136a01a62ddbae2eaa42", size = 1731530, upload-time = "2026-03-28T17:15:39.988Z" }, - { url = 
"https://files.pythonhosted.org/packages/81/16/78a2f5d9c124ad05d5ce59a9af94214b6466c3491a25fb70760e98e9f762/aiohttp-3.13.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e56423766399b4c77b965f6aaab6c9546617b8994a956821cc507d00b91d978c", size = 1827925, upload-time = "2026-03-28T17:15:41.944Z" }, - { url = "https://files.pythonhosted.org/packages/2a/1f/79acf0974ced805e0e70027389fccbb7d728e6f30fcac725fb1071e63075/aiohttp-3.13.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8af249343fafd5ad90366a16d230fc265cf1149f26075dc9fe93cfd7c7173942", size = 1923579, upload-time = "2026-03-28T17:15:44.071Z" }, - { url = "https://files.pythonhosted.org/packages/af/53/29f9e2054ea6900413f3b4c3eb9d8331f60678ec855f13ba8714c47fd48d/aiohttp-3.13.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bc0a5cf4f10ef5a2c94fdde488734b582a3a7a000b131263e27c9295bd682d9", size = 1767655, upload-time = "2026-03-28T17:15:45.911Z" }, - { url = "https://files.pythonhosted.org/packages/f3/57/462fe1d3da08109ba4aa8590e7aed57c059af2a7e80ec21f4bac5cfe1094/aiohttp-3.13.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5c7ff1028e3c9fc5123a865ce17df1cb6424d180c503b8517afbe89aa566e6be", size = 1630439, upload-time = "2026-03-28T17:15:48.11Z" }, - { url = "https://files.pythonhosted.org/packages/d7/4b/4813344aacdb8127263e3eec343d24e973421143826364fa9fc847f6283f/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ba5cf98b5dcb9bddd857da6713a503fa6d341043258ca823f0f5ab7ab4a94ee8", size = 1745557, upload-time = "2026-03-28T17:15:50.13Z" }, - { url = "https://files.pythonhosted.org/packages/d4/01/1ef1adae1454341ec50a789f03cfafe4c4ac9c003f6a64515ecd32fe4210/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d85965d3ba21ee4999e83e992fecb86c4614d6920e40705501c0a1f80a583c12", size = 1741796, upload-time = 
"2026-03-28T17:15:52.351Z" }, - { url = "https://files.pythonhosted.org/packages/22/04/8cdd99af988d2aa6922714d957d21383c559835cbd43fbf5a47ddf2e0f05/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:49f0b18a9b05d79f6f37ddd567695943fcefb834ef480f17a4211987302b2dc7", size = 1805312, upload-time = "2026-03-28T17:15:54.407Z" }, - { url = "https://files.pythonhosted.org/packages/fb/7f/b48d5577338d4b25bbdbae35c75dbfd0493cb8886dc586fbfb2e90862239/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7f78cb080c86fbf765920e5f1ef35af3f24ec4314d6675d0a21eaf41f6f2679c", size = 1621751, upload-time = "2026-03-28T17:15:56.564Z" }, - { url = "https://files.pythonhosted.org/packages/bc/89/4eecad8c1858e6d0893c05929e22343e0ebe3aec29a8a399c65c3cc38311/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:67a3ec705534a614b68bbf1c70efa777a21c3da3895d1c44510a41f5a7ae0453", size = 1826073, upload-time = "2026-03-28T17:15:58.489Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5c/9dc8293ed31b46c39c9c513ac7ca152b3c3d38e0ea111a530ad12001b827/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d6630ec917e85c5356b2295744c8a97d40f007f96a1c76bf1928dc2e27465393", size = 1760083, upload-time = "2026-03-28T17:16:00.677Z" }, - { url = "https://files.pythonhosted.org/packages/1e/19/8bbf6a4994205d96831f97b7d21a0feed120136e6267b5b22d229c6dc4dc/aiohttp-3.13.4-cp311-cp311-win32.whl", hash = "sha256:54049021bc626f53a5394c29e8c444f726ee5a14b6e89e0ad118315b1f90f5e3", size = 439690, upload-time = "2026-03-28T17:16:02.902Z" }, - { url = "https://files.pythonhosted.org/packages/0c/f5/ac409ecd1007528d15c3e8c3a57d34f334c70d76cfb7128a28cffdebd4c1/aiohttp-3.13.4-cp311-cp311-win_amd64.whl", hash = "sha256:c033f2bc964156030772d31cbf7e5defea181238ce1f87b9455b786de7d30145", size = 463824, upload-time = "2026-03-28T17:16:05.058Z" }, - { url = 
"https://files.pythonhosted.org/packages/1e/bd/ede278648914cabbabfdf95e436679b5d4156e417896a9b9f4587169e376/aiohttp-3.13.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ee62d4471ce86b108b19c3364db4b91180d13fe3510144872d6bad5401957360", size = 752158, upload-time = "2026-03-28T17:16:06.901Z" }, - { url = "https://files.pythonhosted.org/packages/90/de/581c053253c07b480b03785196ca5335e3c606a37dc73e95f6527f1591fe/aiohttp-3.13.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c0fd8f41b54b58636402eb493afd512c23580456f022c1ba2db0f810c959ed0d", size = 501037, upload-time = "2026-03-28T17:16:08.82Z" }, - { url = "https://files.pythonhosted.org/packages/fa/f9/a5ede193c08f13cc42c0a5b50d1e246ecee9115e4cf6e900d8dbd8fd6acb/aiohttp-3.13.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4baa48ce49efd82d6b1a0be12d6a36b35e5594d1dd42f8bfba96ea9f8678b88c", size = 501556, upload-time = "2026-03-28T17:16:10.63Z" }, - { url = "https://files.pythonhosted.org/packages/d6/10/88ff67cd48a6ec36335b63a640abe86135791544863e0cfe1f065d6cef7a/aiohttp-3.13.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d738ebab9f71ee652d9dbd0211057690022201b11197f9a7324fd4dba128aa97", size = 1757314, upload-time = "2026-03-28T17:16:12.498Z" }, - { url = "https://files.pythonhosted.org/packages/8b/15/fdb90a5cf5a1f52845c276e76298c75fbbcc0ac2b4a86551906d54529965/aiohttp-3.13.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0ce692c3468fa831af7dceed52edf51ac348cebfc8d3feb935927b63bd3e8576", size = 1731819, upload-time = "2026-03-28T17:16:14.558Z" }, - { url = "https://files.pythonhosted.org/packages/ec/df/28146785a007f7820416be05d4f28cc207493efd1e8c6c1068e9bdc29198/aiohttp-3.13.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8e08abcfe752a454d2cb89ff0c08f2d1ecd057ae3e8cc6d84638de853530ebab", size = 1793279, upload-time = 
"2026-03-28T17:16:16.594Z" }, - { url = "https://files.pythonhosted.org/packages/10/47/689c743abf62ea7a77774d5722f220e2c912a77d65d368b884d9779ef41b/aiohttp-3.13.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5977f701b3fff36367a11087f30ea73c212e686d41cd363c50c022d48b011d8d", size = 1891082, upload-time = "2026-03-28T17:16:18.71Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b6/f7f4f318c7e58c23b761c9b13b9a3c9b394e0f9d5d76fbc6622fa98509f6/aiohttp-3.13.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:54203e10405c06f8b6020bd1e076ae0fe6c194adcee12a5a78af3ffa3c57025e", size = 1773938, upload-time = "2026-03-28T17:16:21.125Z" }, - { url = "https://files.pythonhosted.org/packages/aa/06/f207cb3121852c989586a6fc16ff854c4fcc8651b86c5d3bd1fc83057650/aiohttp-3.13.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:358a6af0145bc4dda037f13167bef3cce54b132087acc4c295c739d05d16b1c3", size = 1579548, upload-time = "2026-03-28T17:16:23.588Z" }, - { url = "https://files.pythonhosted.org/packages/6c/58/e1289661a32161e24c1fe479711d783067210d266842523752869cc1d9c2/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:898ea1850656d7d61832ef06aa9846ab3ddb1621b74f46de78fbc5e1a586ba83", size = 1714669, upload-time = "2026-03-28T17:16:25.713Z" }, - { url = "https://files.pythonhosted.org/packages/96/0a/3e86d039438a74a86e6a948a9119b22540bae037d6ba317a042ae3c22711/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7bc30cceb710cf6a44e9617e43eebb6e3e43ad855a34da7b4b6a73537d8a6763", size = 1754175, upload-time = "2026-03-28T17:16:28.18Z" }, - { url = "https://files.pythonhosted.org/packages/f4/30/e717fc5df83133ba467a560b6d8ef20197037b4bb5d7075b90037de1018e/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4a31c0c587a8a038f19a4c7e60654a6c899c9de9174593a13e7cc6e15ff271f9", size = 1762049, upload-time = 
"2026-03-28T17:16:30.941Z" }, - { url = "https://files.pythonhosted.org/packages/e4/28/8f7a2d4492e336e40005151bdd94baf344880a4707573378579f833a64c1/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2062f675f3fe6e06d6113eb74a157fb9df58953ffed0cdb4182554b116545758", size = 1570861, upload-time = "2026-03-28T17:16:32.953Z" }, - { url = "https://files.pythonhosted.org/packages/78/45/12e1a3d0645968b1c38de4b23fdf270b8637735ea057d4f84482ff918ad9/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d1ba8afb847ff80626d5e408c1fdc99f942acc877d0702fe137015903a220a9", size = 1790003, upload-time = "2026-03-28T17:16:35.468Z" }, - { url = "https://files.pythonhosted.org/packages/eb/0f/60374e18d590de16dcb39d6ff62f39c096c1b958e6f37727b5870026ea30/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b08149419994cdd4d5eecf7fd4bc5986b5a9380285bcd01ab4c0d6bfca47b79d", size = 1737289, upload-time = "2026-03-28T17:16:38.187Z" }, - { url = "https://files.pythonhosted.org/packages/02/bf/535e58d886cfbc40a8b0013c974afad24ef7632d645bca0b678b70033a60/aiohttp-3.13.4-cp312-cp312-win32.whl", hash = "sha256:fc432f6a2c4f720180959bc19aa37259651c1a4ed8af8afc84dd41c60f15f791", size = 434185, upload-time = "2026-03-28T17:16:40.735Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1a/d92e3325134ebfff6f4069f270d3aac770d63320bd1fcd0eca023e74d9a8/aiohttp-3.13.4-cp312-cp312-win_amd64.whl", hash = "sha256:6148c9ae97a3e8bff9a1fc9c757fa164116f86c100468339730e717590a3fb77", size = 461285, upload-time = "2026-03-28T17:16:42.713Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ac/892f4162df9b115b4758d615f32ec63d00f3084c705ff5526630887b9b42/aiohttp-3.13.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:63dd5e5b1e43b8fb1e91b79b7ceba1feba588b317d1edff385084fcc7a0a4538", size = 745744, upload-time = "2026-03-28T17:16:44.67Z" }, - { url = 
"https://files.pythonhosted.org/packages/97/a9/c5b87e4443a2f0ea88cb3000c93a8fdad1ee63bffc9ded8d8c8e0d66efc6/aiohttp-3.13.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:746ac3cc00b5baea424dacddea3ec2c2702f9590de27d837aa67004db1eebc6e", size = 498178, upload-time = "2026-03-28T17:16:46.766Z" }, - { url = "https://files.pythonhosted.org/packages/94/42/07e1b543a61250783650df13da8ddcdc0d0a5538b2bd15cef6e042aefc61/aiohttp-3.13.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bda8f16ea99d6a6705e5946732e48487a448be874e54a4f73d514660ff7c05d3", size = 498331, upload-time = "2026-03-28T17:16:48.9Z" }, - { url = "https://files.pythonhosted.org/packages/20/d6/492f46bf0328534124772d0cf58570acae5b286ea25006900650f69dae0e/aiohttp-3.13.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4b061e7b5f840391e3f64d0ddf672973e45c4cfff7a0feea425ea24e51530fc2", size = 1744414, upload-time = "2026-03-28T17:16:50.968Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4d/e02627b2683f68051246215d2d62b2d2f249ff7a285e7a858dc47d6b6a14/aiohttp-3.13.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b252e8d5cd66184b570d0d010de742736e8a4fab22c58299772b0c5a466d4b21", size = 1719226, upload-time = "2026-03-28T17:16:53.173Z" }, - { url = "https://files.pythonhosted.org/packages/7b/6c/5d0a3394dd2b9f9aeba6e1b6065d0439e4b75d41f1fb09a3ec010b43552b/aiohttp-3.13.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20af8aad61d1803ff11152a26146d8d81c266aa8c5aa9b4504432abb965c36a0", size = 1782110, upload-time = "2026-03-28T17:16:55.362Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2d/c20791e3437700a7441a7edfb59731150322424f5aadf635602d1d326101/aiohttp-3.13.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:13a5cc924b59859ad2adb1478e31f410a7ed46e92a2a619d6d1dd1a63c1a855e", size = 
1884809, upload-time = "2026-03-28T17:16:57.734Z" }, - { url = "https://files.pythonhosted.org/packages/c8/94/d99dbfbd1924a87ef643833932eb2a3d9e5eee87656efea7d78058539eff/aiohttp-3.13.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:534913dfb0a644d537aebb4123e7d466d94e3be5549205e6a31f72368980a81a", size = 1764938, upload-time = "2026-03-28T17:17:00.221Z" }, - { url = "https://files.pythonhosted.org/packages/49/61/3ce326a1538781deb89f6cf5e094e2029cd308ed1e21b2ba2278b08426f6/aiohttp-3.13.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:320e40192a2dcc1cf4b5576936e9652981ab596bf81eb309535db7e2f5b5672f", size = 1570697, upload-time = "2026-03-28T17:17:02.985Z" }, - { url = "https://files.pythonhosted.org/packages/b6/77/4ab5a546857bb3028fbaf34d6eea180267bdab022ee8b1168b1fcde4bfdd/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9e587fcfce2bcf06526a43cb705bdee21ac089096f2e271d75de9c339db3100c", size = 1702258, upload-time = "2026-03-28T17:17:05.28Z" }, - { url = "https://files.pythonhosted.org/packages/79/63/d8f29021e39bc5af8e5d5e9da1b07976fb9846487a784e11e4f4eeda4666/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9eb9c2eea7278206b5c6c1441fdd9dc420c278ead3f3b2cc87f9b693698cc500", size = 1740287, upload-time = "2026-03-28T17:17:07.712Z" }, - { url = "https://files.pythonhosted.org/packages/55/3a/cbc6b3b124859a11bc8055d3682c26999b393531ef926754a3445b99dfef/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:29be00c51972b04bf9d5c8f2d7f7314f48f96070ca40a873a53056e652e805f7", size = 1753011, upload-time = "2026-03-28T17:17:10.053Z" }, - { url = "https://files.pythonhosted.org/packages/e0/30/836278675205d58c1368b21520eab9572457cf19afd23759216c04483048/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:90c06228a6c3a7c9f776fe4fc0b7ff647fffd3bed93779a6913c804ae00c1073", size = 1566359, upload-time = 
"2026-03-28T17:17:12.433Z" }, - { url = "https://files.pythonhosted.org/packages/50/b4/8032cc9b82d17e4277704ba30509eaccb39329dc18d6a35f05e424439e32/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a533ec132f05fd9a1d959e7f34184cd7d5e8511584848dab85faefbaac573069", size = 1785537, upload-time = "2026-03-28T17:17:14.721Z" }, - { url = "https://files.pythonhosted.org/packages/17/7d/5873e98230bde59f493bf1f7c3e327486a4b5653fa401144704df5d00211/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1c946f10f413836f82ea4cfb90200d2a59578c549f00857e03111cf45ad01ca5", size = 1740752, upload-time = "2026-03-28T17:17:17.387Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f2/13e46e0df051494d7d3c68b7f72d071f48c384c12716fc294f75d5b1a064/aiohttp-3.13.4-cp313-cp313-win32.whl", hash = "sha256:48708e2706106da6967eff5908c78ca3943f005ed6bcb75da2a7e4da94ef8c70", size = 433187, upload-time = "2026-03-28T17:17:19.523Z" }, - { url = "https://files.pythonhosted.org/packages/ea/c0/649856ee655a843c8f8664592cfccb73ac80ede6a8c8db33a25d810c12db/aiohttp-3.13.4-cp313-cp313-win_amd64.whl", hash = "sha256:74a2eb058da44fa3a877a49e2095b591d4913308bb424c418b77beb160c55ce3", size = 459778, upload-time = "2026-03-28T17:17:21.964Z" }, - { url = "https://files.pythonhosted.org/packages/6d/29/6657cc37ae04cacc2dbf53fb730a06b6091cc4cbe745028e047c53e6d840/aiohttp-3.13.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:e0a2c961fc92abeff61d6444f2ce6ad35bb982db9fc8ff8a47455beacf454a57", size = 749363, upload-time = "2026-03-28T17:17:24.044Z" }, - { url = "https://files.pythonhosted.org/packages/90/7f/30ccdf67ca3d24b610067dc63d64dcb91e5d88e27667811640644aa4a85d/aiohttp-3.13.4-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:153274535985a0ff2bff1fb6c104ed547cec898a09213d21b0f791a44b14d933", size = 499317, upload-time = "2026-03-28T17:17:26.199Z" }, - { url = 
"https://files.pythonhosted.org/packages/93/13/e372dd4e68ad04ee25dafb050c7f98b0d91ea643f7352757e87231102555/aiohttp-3.13.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:351f3171e2458da3d731ce83f9e6b9619e325c45cbd534c7759750cabf453ad7", size = 500477, upload-time = "2026-03-28T17:17:28.279Z" }, - { url = "https://files.pythonhosted.org/packages/e5/fe/ee6298e8e586096fb6f5eddd31393d8544f33ae0792c71ecbb4c2bef98ac/aiohttp-3.13.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f989ac8bc5595ff761a5ccd32bdb0768a117f36dd1504b1c2c074ed5d3f4df9c", size = 1737227, upload-time = "2026-03-28T17:17:30.587Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b9/a7a0463a09e1a3fe35100f74324f23644bfc3383ac5fd5effe0722a5f0b7/aiohttp-3.13.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d36fc1709110ec1e87a229b201dd3ddc32aa01e98e7868083a794609b081c349", size = 1694036, upload-time = "2026-03-28T17:17:33.29Z" }, - { url = "https://files.pythonhosted.org/packages/57/7c/8972ae3fb7be00a91aee6b644b2a6a909aedb2c425269a3bfd90115e6f8f/aiohttp-3.13.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42adaeea83cbdf069ab94f5103ce0787c21fb1a0153270da76b59d5578302329", size = 1786814, upload-time = "2026-03-28T17:17:36.035Z" }, - { url = "https://files.pythonhosted.org/packages/93/01/c81e97e85c774decbaf0d577de7d848934e8166a3a14ad9f8aa5be329d28/aiohttp-3.13.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:92deb95469928cc41fd4b42a95d8012fa6df93f6b1c0a83af0ffbc4a5e218cde", size = 1866676, upload-time = "2026-03-28T17:17:38.441Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5f/5b46fe8694a639ddea2cd035bf5729e4677ea882cb251396637e2ef1590d/aiohttp-3.13.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:0c0c7c07c4257ef3a1df355f840bc62d133bcdef5c1c5ba75add3c08553e2eed", size = 1740842, upload-time = "2026-03-28T17:17:40.783Z" }, - { url = "https://files.pythonhosted.org/packages/20/a2/0d4b03d011cca6b6b0acba8433193c1e484efa8d705ea58295590fe24203/aiohttp-3.13.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f062c45de8a1098cb137a1898819796a2491aec4e637a06b03f149315dff4d8f", size = 1566508, upload-time = "2026-03-28T17:17:43.235Z" }, - { url = "https://files.pythonhosted.org/packages/98/17/e689fd500da52488ec5f889effd6404dece6a59de301e380f3c64f167beb/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:76093107c531517001114f0ebdb4f46858ce818590363e3e99a4a2280334454a", size = 1700569, upload-time = "2026-03-28T17:17:46.165Z" }, - { url = "https://files.pythonhosted.org/packages/d8/0d/66402894dbcf470ef7db99449e436105ea862c24f7ea4c95c683e635af35/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:6f6ec32162d293b82f8b63a16edc80769662fbd5ae6fbd4936d3206a2c2cc63b", size = 1707407, upload-time = "2026-03-28T17:17:48.825Z" }, - { url = "https://files.pythonhosted.org/packages/2f/eb/af0ab1a3650092cbd8e14ef29e4ab0209e1460e1c299996c3f8288b3f1ff/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5903e2db3d202a00ad9f0ec35a122c005e85d90c9836ab4cda628f01edf425e2", size = 1752214, upload-time = "2026-03-28T17:17:51.206Z" }, - { url = "https://files.pythonhosted.org/packages/5a/bf/72326f8a98e4c666f292f03c385545963cc65e358835d2a7375037a97b57/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2d5bea57be7aca98dbbac8da046d99b5557c5cf4e28538c4c786313078aca09e", size = 1562162, upload-time = "2026-03-28T17:17:53.634Z" }, - { url = "https://files.pythonhosted.org/packages/67/9f/13b72435f99151dd9a5469c96b3b5f86aa29b7e785ca7f35cf5e538f74c0/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bcf0c9902085976edc0232b75006ef38f89686901249ce14226b6877f88464fb", size = 
1768904, upload-time = "2026-03-28T17:17:55.991Z" }, - { url = "https://files.pythonhosted.org/packages/18/bc/28d4970e7d5452ac7776cdb5431a1164a0d9cf8bd2fffd67b4fb463aa56d/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3295f98bfeed2e867cab588f2a146a9db37a85e3ae9062abf46ba062bd29165", size = 1723378, upload-time = "2026-03-28T17:17:58.348Z" }, - { url = "https://files.pythonhosted.org/packages/53/74/b32458ca1a7f34d65bdee7aef2036adbe0438123d3d53e2b083c453c24dd/aiohttp-3.13.4-cp314-cp314-win32.whl", hash = "sha256:a598a5c5767e1369d8f5b08695cab1d8160040f796c4416af76fd773d229b3c9", size = 438711, upload-time = "2026-03-28T17:18:00.728Z" }, - { url = "https://files.pythonhosted.org/packages/40/b2/54b487316c2df3e03a8f3435e9636f8a81a42a69d942164830d193beb56a/aiohttp-3.13.4-cp314-cp314-win_amd64.whl", hash = "sha256:c555db4bc7a264bead5a7d63d92d41a1122fcd39cc62a4db815f45ad46f9c2c8", size = 464977, upload-time = "2026-03-28T17:18:03.367Z" }, - { url = "https://files.pythonhosted.org/packages/47/fb/e41b63c6ce71b07a59243bb8f3b457ee0c3402a619acb9d2c0d21ef0e647/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45abbbf09a129825d13c18c7d3182fecd46d9da3cfc383756145394013604ac1", size = 781549, upload-time = "2026-03-28T17:18:05.779Z" }, - { url = "https://files.pythonhosted.org/packages/97/53/532b8d28df1e17e44c4d9a9368b78dcb6bf0b51037522136eced13afa9e8/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:74c80b2bc2c2adb7b3d1941b2b60701ee2af8296fc8aad8b8bc48bc25767266c", size = 514383, upload-time = "2026-03-28T17:18:08.096Z" }, - { url = "https://files.pythonhosted.org/packages/1b/1f/62e5d400603e8468cd635812d99cb81cfdc08127a3dc474c647615f31339/aiohttp-3.13.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c97989ae40a9746650fa196894f317dafc12227c808c774929dda0ff873a5954", size = 518304, upload-time = "2026-03-28T17:18:10.642Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/57/2326b37b10896447e3c6e0cbef4fe2486d30913639a5cfd1332b5d870f82/aiohttp-3.13.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dae86be9811493f9990ef44fff1685f5c1a3192e9061a71a109d527944eed551", size = 1893433, upload-time = "2026-03-28T17:18:13.121Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b4/a24d82112c304afdb650167ef2fe190957d81cbddac7460bedd245f765aa/aiohttp-3.13.4-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1db491abe852ca2fa6cc48a3341985b0174b3741838e1341b82ac82c8bd9e871", size = 1755901, upload-time = "2026-03-28T17:18:16.21Z" }, - { url = "https://files.pythonhosted.org/packages/9e/2d/0883ef9d878d7846287f036c162a951968f22aabeef3ac97b0bea6f76d5d/aiohttp-3.13.4-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0e5d701c0aad02a7dce72eef6b93226cf3734330f1a31d69ebbf69f33b86666e", size = 1876093, upload-time = "2026-03-28T17:18:18.703Z" }, - { url = "https://files.pythonhosted.org/packages/ad/52/9204bb59c014869b71971addad6778f005daa72a96eed652c496789d7468/aiohttp-3.13.4-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8ac32a189081ae0a10ba18993f10f338ec94341f0d5df8fff348043962f3c6f8", size = 1970815, upload-time = "2026-03-28T17:18:21.858Z" }, - { url = "https://files.pythonhosted.org/packages/d6/b5/e4eb20275a866dde0f570f411b36c6b48f7b53edfe4f4071aa1b0728098a/aiohttp-3.13.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e968cdaba43e45c73c3f306fca418c8009a957733bac85937c9f9cf3f4de27", size = 1816223, upload-time = "2026-03-28T17:18:24.729Z" }, - { url = "https://files.pythonhosted.org/packages/d8/23/e98075c5bb146aa61a1239ee1ac7714c85e814838d6cebbe37d3fe19214a/aiohttp-3.13.4-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:ca114790c9144c335d538852612d3e43ea0f075288f4849cf4b05d6cd2238ce7", size = 1649145, upload-time = "2026-03-28T17:18:27.269Z" }, - { url = "https://files.pythonhosted.org/packages/d6/c1/7bad8be33bb06c2bb224b6468874346026092762cbec388c3bdb65a368ee/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ea2e071661ba9cfe11eabbc81ac5376eaeb3061f6e72ec4cc86d7cdd1ffbdbbb", size = 1816562, upload-time = "2026-03-28T17:18:29.847Z" }, - { url = "https://files.pythonhosted.org/packages/5c/10/c00323348695e9a5e316825969c88463dcc24c7e9d443244b8a2c9cf2eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:34e89912b6c20e0fd80e07fa401fd218a410aa1ce9f1c2f1dad6db1bd0ce0927", size = 1800333, upload-time = "2026-03-28T17:18:32.269Z" }, - { url = "https://files.pythonhosted.org/packages/84/43/9b2147a1df3559f49bd723e22905b46a46c068a53adb54abdca32c4de180/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0e217cf9f6a42908c52b46e42c568bd57adc39c9286ced31aaace614b6087965", size = 1820617, upload-time = "2026-03-28T17:18:35.238Z" }, - { url = "https://files.pythonhosted.org/packages/a9/7f/b3481a81e7a586d02e99387b18c6dafff41285f6efd3daa2124c01f87eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:0c296f1221e21ba979f5ac1964c3b78cfde15c5c5f855ffd2caab337e9cd9182", size = 1643417, upload-time = "2026-03-28T17:18:37.949Z" }, - { url = "https://files.pythonhosted.org/packages/8f/72/07181226bc99ce1124e0f89280f5221a82d3ae6a6d9d1973ce429d48e52b/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d99a9d168ebaffb74f36d011750e490085ac418f4db926cce3989c8fe6cb6b1b", size = 1849286, upload-time = "2026-03-28T17:18:40.534Z" }, - { url = "https://files.pythonhosted.org/packages/1a/e6/1b3566e103eca6da5be4ae6713e112a053725c584e96574caf117568ffef/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cb19177205d93b881f3f89e6081593676043a6828f59c78c17a0fd6c1fbed2ba", size = 1782635, upload-time = 
"2026-03-28T17:18:43.073Z" }, - { url = "https://files.pythonhosted.org/packages/37/58/1b11c71904b8d079eb0c39fe664180dd1e14bebe5608e235d8bfbadc8929/aiohttp-3.13.4-cp314-cp314t-win32.whl", hash = "sha256:c606aa5656dab6552e52ca368e43869c916338346bfaf6304e15c58fb113ea30", size = 472537, upload-time = "2026-03-28T17:18:46.286Z" }, - { url = "https://files.pythonhosted.org/packages/bc/8f/87c56a1a1977d7dddea5b31e12189665a140fdb48a71e9038ff90bb564ec/aiohttp-3.13.4-cp314-cp314t-win_amd64.whl", hash = "sha256:014dcc10ec8ab8db681f0d68e939d1e9286a5aa2b993cbbdb0db130853e02144", size = 506381, upload-time = "2026-03-28T17:18:48.74Z" }, - { url = "https://files.pythonhosted.org/packages/0a/f9/17e8a70abe874ec694395119338fde2f13ee1903bd14f3fd5b310b77a1ea/aiohttp-3.13.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b3f00bb9403728b08eb3951e982ca0a409c7a871d709684623daeab79465b181", size = 755716, upload-time = "2026-03-28T17:18:51.918Z" }, - { url = "https://files.pythonhosted.org/packages/27/b3/fdb36e59b9fb37297b1651248d3d84e61faa49af2faabc1e243d3f75585f/aiohttp-3.13.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cb15595eb52870f84248d7cc97013a76f52ab02ff74d394be093b1d9b8b82bc0", size = 506500, upload-time = "2026-03-28T17:18:54.755Z" }, - { url = "https://files.pythonhosted.org/packages/cf/fb/dacf759c43cfb5fa32568bd369f054eeb23906ab23f4e3663e01e04c7988/aiohttp-3.13.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:907ad36b6a65cff7d88d7aca0f77c650546ba850a4f92c92ecb83590d4613249", size = 499881, upload-time = "2026-03-28T17:18:57.302Z" }, - { url = "https://files.pythonhosted.org/packages/52/cd/7824ee57dde8ca7f62e7fbc247ebe1aa3b5495d3598f0c516f06de1ef7ab/aiohttp-3.13.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5539ec0d6a3a5c6799b661b7e79166ad1b7ae71ccb59a92fcb6b4ef89295bc94", size = 1681734, upload-time = "2026-03-28T17:19:00.057Z" }, - { url = 
"https://files.pythonhosted.org/packages/7a/40/6f4ca61736a16deed2d2762a8dbeaaa48ad292974489be2a2f32f62a4e0b/aiohttp-3.13.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3b4e07d8803a70dd886b5f38588e5b49f894995ca8e132b06c31a2583ae2ef6e", size = 1653787, upload-time = "2026-03-28T17:19:03.026Z" }, - { url = "https://files.pythonhosted.org/packages/89/80/3793f0a1148a42190f6824ce9a0af79910cd3df8dfc58fa784234a7d9e41/aiohttp-3.13.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ce7320a945aac4bf0bb8901600e4f9409eb602f25ce3ef4d275b48f6d704a862", size = 1737964, upload-time = "2026-03-28T17:19:05.77Z" }, - { url = "https://files.pythonhosted.org/packages/15/fd/e41981d0f9e0dccfb8f2580d4e64e6c59d293b9b0815849950cc499fe53a/aiohttp-3.13.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:26ed03f7d3d6453634729e2c7600d7255d65e879559c5a48fe1bb78355cde74b", size = 1832226, upload-time = "2026-03-28T17:19:08.809Z" }, - { url = "https://files.pythonhosted.org/packages/fa/69/e6b566c638b37bfa14b98c2c429fcdba3b097a990acc9845fcc779ce39cc/aiohttp-3.13.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c3f733916e85506b8000dddc071c6b82f8c68f56c99adb328d6550017db062d", size = 1681476, upload-time = "2026-03-28T17:19:11.502Z" }, - { url = "https://files.pythonhosted.org/packages/7d/8c/f1b7f03e745fa6281dd949673297c7ac54d7cc54d2e58beb5135ac5c6204/aiohttp-3.13.4-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b3d525648fe7c8b4977e460c18098f9f81d7991d72edfdc2f13cf96068f279bc", size = 1573061, upload-time = "2026-03-28T17:19:14.437Z" }, - { url = "https://files.pythonhosted.org/packages/bc/56/e7e972f1bed922297d72cc1d27bae6b2e28fdc2d6a895320e396a93c0f8a/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:4e2e68085730a03704beb2cff035fa8648f62c9f93758d7e6d70add7f7bb5b3b", size = 1653248, upload-time = "2026-03-28T17:19:17.432Z" }, - { url = "https://files.pythonhosted.org/packages/cf/98/3d63d2f2e06808911e103d6d47c400548cf26a23dd3275de594339ff8e96/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:797613182ffaaca0b9ad5f3b3d3ce5d21242c768f75e66c750b8292bd97c9de3", size = 1666599, upload-time = "2026-03-28T17:19:20.17Z" }, - { url = "https://files.pythonhosted.org/packages/da/c8/31e487fb16d37c89cc6ee190a424b218471750ac48a227e042e200a17687/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2d15e7e4f1099d9e4d863eaf77a8eee5dcb002b7d7188061b0fbee37f845899e", size = 1709919, upload-time = "2026-03-28T17:19:22.872Z" }, - { url = "https://files.pythonhosted.org/packages/c1/86/3b742bd9204b7deb4f61e6723b1f42a8211ccc60dfddb3e52a6cd4329d46/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:19f60011ad60e40a01d242238bb335399e3a4d8df958c63cbb835add8d5c3b5a", size = 1560523, upload-time = "2026-03-28T17:19:25.879Z" }, - { url = "https://files.pythonhosted.org/packages/72/63/6b80cef343a0527690588808d02aad7604cc4e23eaab207179e77dd607be/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c344c47e85678e410b064fc2ace14db86bb69db7ed5520c234bf13aed603ec30", size = 1731336, upload-time = "2026-03-28T17:19:29.02Z" }, - { url = "https://files.pythonhosted.org/packages/d4/3c/9b39bc9609cac87e19b3394b7ed4bbab3787b434b14e012b9e16be64e9d5/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d904084985ca66459e93797e5e05985c048a9c0633655331144c089943e53d12", size = 1667646, upload-time = "2026-03-28T17:19:31.797Z" }, - { url = "https://files.pythonhosted.org/packages/21/72/3fb0ea857c891de89f6914f737f7423b7fa4dd1f46d8ce621eb07595ff4c/aiohttp-3.13.4-cp39-cp39-win32.whl", hash = "sha256:1746338dc2a33cf706cd7446575d13d451f28f9860bebc908c7632b22e71ae3f", size = 441019, upload-time = "2026-03-28T17:19:34.79Z" }, - { url = 
"https://files.pythonhosted.org/packages/b1/61/8a7191782a31ae3c7f7cee2cd2e37b3ee5849666767db116d449cfe20b88/aiohttp-3.13.4-cp39-cp39-win_amd64.whl", hash = "sha256:a5444dce2e6fba0a1dc2d58d026e674f25f21de178c6f844342629bcef019f2f", size = 464025, upload-time = "2026-03-28T17:19:37.362Z" }, + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/f0/f81190ba488cd106c2fc6d92680e56bb223bbbbf1e6908c2617011290112/aiohttp-3.14.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:692e409052e7436029bbb32977cd7c5bf806ac5fa4085b973996785ffadad33c", size = 760606, upload-time = "2026-06-01T19:36:39.054Z" }, + { url = "https://files.pythonhosted.org/packages/f6/54/444d37eebf0f15db661ca44ec7caf93962f3c5ca92eb4c9a5d888b70aaa2/aiohttp-3.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:40af7ebe53c7990e110dc4ad03566b12c3ac996254298a3d39046dd69cfcb2c2", size = 514677, upload-time = "2026-06-01T19:36:42.408Z" }, + { url = "https://files.pythonhosted.org/packages/d0/d1/da280e23321c132c0a3fa7c8cc2830621d79174edc64c829443346489a36/aiohttp-3.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02cb2ffbb7da32f82e21ad9952669c45bd88a80e0878264c2f59fe1c6fb2badd", size = 510155, upload-time = "2026-06-01T19:36:44.072Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/b8/2e36d54d0991ec5bba451444004591ee0af58cb1662a3a81c562878b9c1f/aiohttp-3.14.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2514cb7195f6d7c219339635bea71ae47d1569b051300d32df9dcfabcdb869", size = 1699947, upload-time = "2026-06-01T19:36:45.762Z" }, + { url = "https://files.pythonhosted.org/packages/57/95/a31d8ea1a0b9ecc084f5a7dd0b431ce64ef585918bb7bdc82afe11843877/aiohttp-3.14.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:30e8b7eeb42d02c120ca90d6c6e076a221a16b70a6dac9ae44c7ab5104cc7fe4", size = 1664364, upload-time = "2026-06-01T19:36:47.653Z" }, + { url = "https://files.pythonhosted.org/packages/01/f6/5de3ddffc87a9e8d09b3be38fbd6dd1a736b2ad477a7e787dcb85f57f338/aiohttp-3.14.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63e38be0d75a654deaa06be32fb4cab883a4222940be1d05861b6717679cbadb", size = 1761186, upload-time = "2026-06-01T19:36:49.355Z" }, + { url = "https://files.pythonhosted.org/packages/33/8c/03c5438ec35d7e3a4f33fe895d6c3ec7540a7cec46065f21851211e1ee4d/aiohttp-3.14.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1210d4c87cc00128160c7384ab41877a701295b97cffa6362f908a49b6e8a7ca", size = 1849727, upload-time = "2026-06-01T19:36:51.478Z" }, + { url = "https://files.pythonhosted.org/packages/22/32/5a05303b0874458920b73f48b8779cc3a93d503f121b38dcc0456dbd698c/aiohttp-3.14.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a78a77366ed158a0a54b076990e575d7b7cdb728cbfd02711eadab150f2269f", size = 1708197, upload-time = "2026-06-01T19:36:53.241Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/478f169488d61414c0a05e7fe423b59ae3d9dcc933d1f0e4acc2c5d5bc3e/aiohttp-3.14.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f4d2038c64f36df96cfd3fa0937910e231eafbf897e70a06c155a817bb632fa6", size = 1578147, upload-time = "2026-06-01T19:36:55.154Z" }, + { url = "https://files.pythonhosted.org/packages/1d/af/b20af85765658972d3337834bd5eebba91b962794f2b4fc3e0ee8c85c0e1/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4714c70067a08b604d0bf3bc4dfdf82e52944afab41d0428d460862763d2f79b", size = 1665836, upload-time = "2026-06-01T19:36:56.94Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a3/771879cfd59948f4544b172189048905feff802f20f1c6c5411e998a3e06/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:f79bfd2847513a7ac801bbafd1de02348a37926ac439eeb4bfe96fcff4eada15", size = 1680335, upload-time = "2026-06-01T19:36:58.642Z" }, + { url = "https://files.pythonhosted.org/packages/f4/16/582e36ad1d32133cd40659f3bc98e71c22179665a1cfbbb4713bce339c06/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:25e9f1d2465a210d60edb64d7b204a147e85d4c194eecef3d1604fb5ace678ce", size = 1731180, upload-time = "2026-06-01T19:37:00.583Z" }, + { url = "https://files.pythonhosted.org/packages/11/bc/80708fe3f64a07a2c306a42fc7b009118a952709761d215f6d1b4c57195b/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b5314743ebe926c2fda35d0a298c565c885505f6635c2a30936363404cf274a7", size = 1565805, upload-time = "2026-06-01T19:37:02.446Z" }, + { url = "https://files.pythonhosted.org/packages/57/8f/8d25897f8273a32fe4ad40a8885eec4f397377ed46e8e383078169f60316/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:28eee8de1d69711c53116df8202f1c2aa0e3f80ef912a88fc18d159d53e7110b", size = 1742496, upload-time = "2026-06-01T19:37:04.222Z" }, + { url = "https://files.pythonhosted.org/packages/9f/7d/c341d32ab2dec56c8478740695743dc6c21b383cace9376a3eab16311a07/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:89ed35666c95d3efe1955056afcde09e62a57a34e2a4398b17f9f6c1564f0b25", size = 1691240, upload-time = 
"2026-06-01T19:37:06.277Z" }, + { url = "https://files.pythonhosted.org/packages/37/0f/a81207dd7a2d4a4f645b3a3f8b5a1da1159dc63117ffb137b698fd6df50f/aiohttp-3.14.0-cp310-cp310-win32.whl", hash = "sha256:5e4646e9a6af29af354204011bf5769cb0276ec5b64653e42f90b3e13845169f", size = 454686, upload-time = "2026-06-01T19:37:07.96Z" }, + { url = "https://files.pythonhosted.org/packages/7f/ae/842357f2afb9c915715c6f5775239d987f5d0f845abf7675fa794e0a9d40/aiohttp-3.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:22a8d06f204e0518a586d770032db3c7043c9ba3693081b3e3ad425e1458d594", size = 478677, upload-time = "2026-06-01T19:37:09.652Z" }, + { url = "https://files.pythonhosted.org/packages/6b/d1/330fb22c9535ec177b52396905131c6e39447244b6ca876262939af668ef/aiohttp-3.14.0-cp310-cp310-win_arm64.whl", hash = "sha256:4acfc34bd4d3c58754fc9f22ff1b5e92aabce68f3d4bf7b71a0b732d9bceb78a", size = 450364, upload-time = "2026-06-01T19:37:11.279Z" }, + { url = "https://files.pythonhosted.org/packages/67/47/7727bfe8db93f8835a001bd4359d8480cc68d1259b8bce334668f8be97bd/aiohttp-3.14.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:54bf3522d6f7351e55f89a62d5c2bf138ad557b031670266c5df604ae88e0b5a", size = 759147, upload-time = "2026-06-01T19:37:12.918Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f2/cd3fedff6fade73d71df9ec908c210cec518ef90fd00289250684b90aecf/aiohttp-3.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0746d9fb0ac4fdef643a84494efe3f06d50335dd8c7a530228b86448aae0a803", size = 513705, upload-time = "2026-06-01T19:37:14.633Z" }, + { url = "https://files.pythonhosted.org/packages/5a/fe/49746b6b610144a06323bebd8e1211a390310d8c69b98dd6d52df341bc3e/aiohttp-3.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f3a96b6d39a4872222beee72e1df41d2ff886ae96152cf3e757ef8c5673ef0e", size = 509627, upload-time = "2026-06-01T19:37:16.385Z" }, + { url = 
"https://files.pythonhosted.org/packages/4c/3f/28f2f6cf3d5c0e7b01b27140d0e7873fd11fb341169ad3ce78ad04aba628/aiohttp-3.14.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d336820adbb914debbc90a1d8c1bfc4bea55996aecf64866a989d35d1f9fd903", size = 1769293, upload-time = "2026-06-01T19:37:18.067Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/2e5f1b525d5474b12b3c60abf733a755845f3bceff21542081ada515f837/aiohttp-3.14.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:71b2604c9bfc1b115547d63a094d5244b3f02799833513a99a68aaa7b167c4cb", size = 1732363, upload-time = "2026-06-01T19:37:20.138Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ce/596120faa85ca7b19cd061e3f2f3be23aa8f11a0aedf9191db9e0da1bd76/aiohttp-3.14.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:610d68800435903e303ca0542b9d3e4eb72a12ff33a6d471a070c1d81eebd3c2", size = 1840375, upload-time = "2026-06-01T19:37:22.104Z" }, + { url = "https://files.pythonhosted.org/packages/72/3c/a7ffe05a757a4a7867643da69357ec41f506879fbd1b231d2ed90af246b2/aiohttp-3.14.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:514db9a79337068981ee2137310283a07b4b885c584991097a91a4da419bcb81", size = 1921484, upload-time = "2026-06-01T19:37:24.068Z" }, + { url = "https://files.pythonhosted.org/packages/93/fa/2c861170bbd4a491de93a69e081db1d971092569e0d593a98ef62c384dc1/aiohttp-3.14.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c452d17eeb95d563fc8b936f3050301dbd1d268126c4632d8b70ede9696202ee", size = 1774153, upload-time = "2026-06-01T19:37:26.256Z" }, + { url = "https://files.pythonhosted.org/packages/9d/da/1d2f5a165f47ec9b1f69d37b8b977fdc4d501aa72ffb7930db27bb9e49ea/aiohttp-3.14.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:ed94a81506e3d1bdbad5108f497a58f2a2354aedb4ca314d5326f07d1fd1ac2d", size = 1632569, upload-time = "2026-06-01T19:37:28.192Z" }, + { url = "https://files.pythonhosted.org/packages/46/1d/7a6e295c4257252f70f69e90864fdad74b6a1293054fb3f9e65a15de6d63/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1394dce36e0f0d260ac0b555a654de19cb989f3c1b8bdd24f505314dfea18a00", size = 1740325, upload-time = "2026-06-01T19:37:30.08Z" }, + { url = "https://files.pythonhosted.org/packages/f1/7e/e1899b1ca3ec62f1eab2a5cbde14039b97493f7f53eb88d9b668562ffa8d/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d1467d1e7b48a73ca7237e0ee4335f3d02b923dbc27b82fd254bc301c97d4026", size = 1748691, upload-time = "2026-06-01T19:37:32.211Z" }, + { url = "https://files.pythonhosted.org/packages/ec/54/4e6b61c1fe7d3433f82bcc6bd7e4d7c683a742a10c9b12a025fd3695c047/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6a5f3532125233c261cf61f32df4059cfcf482eb793c7d3db8452e3142028b86", size = 1814477, upload-time = "2026-06-01T19:37:34.173Z" }, + { url = "https://files.pythonhosted.org/packages/9c/38/86fd51be2e08d8e45c83d879d255f10391903cd9fe2a16512f7591a15873/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3ea81eb518a2ecb319d8ec6d1424a37c773f6634bd87d6985eb606b2faac419f", size = 1623393, upload-time = "2026-06-01T19:37:36.281Z" }, + { url = "https://files.pythonhosted.org/packages/78/49/466e947a42a88ee23c486d036e7e5d1b097f1bafd8084ad9c9a0a92f0f43/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:32e735c3182de7b64f6941a4ede48b38c7f47d9437bd615dd30b5bda8fa1bc93", size = 1824097, upload-time = "2026-06-01T19:37:38.421Z" }, + { url = "https://files.pythonhosted.org/packages/f3/89/35f3410bc284682338a1be6b6ea0c5abfa05f063942cfaa9256608440434/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c21ca9a1c63d4509158f478aeb9d02914dcc52adc68d1bc9dee2452284ee5996", size = 1764790, upload-time = 
"2026-06-01T19:37:40.755Z" }, + { url = "https://files.pythonhosted.org/packages/42/80/2d4291bd5724d3d17e5951aff5a3e02281483fb47295f0788276ee66cd73/aiohttp-3.14.0-cp311-cp311-win32.whl", hash = "sha256:19ca5fc84130675ba11c6ca5c7da5cb65f7bf8a32cdd2b616bf49cd334688aae", size = 454176, upload-time = "2026-06-01T19:37:42.837Z" }, + { url = "https://files.pythonhosted.org/packages/59/ed/41d0ad4f6ececffc32bdf1f7b494e5498f7ca5c849ea2e3cc9bbd1668251/aiohttp-3.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:d488e6e9d3bb8ba5ae7066d5be885ae9670eba021b8c6ccb9a3a568e6b19d6e5", size = 479334, upload-time = "2026-06-01T19:37:44.776Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/c0b5e305c770053f8c3d069bb52b8196917ba91949d1962d52eb307fb0d2/aiohttp-3.14.0-cp311-cp311-win_arm64.whl", hash = "sha256:8b93618102caf12801638a01a2b478a55410ddd71bd41cfaf6f707953a49ac43", size = 450262, upload-time = "2026-06-01T19:37:46.461Z" }, + { url = "https://files.pythonhosted.org/packages/89/97/2b6889bfb6b6847520d50d95eb8c4307a45e28aaca39faf4a9454b3d1b2f/aiohttp-3.14.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b29518c9c2ec7e373e68259206a137c7f4f5439c58baaec4b5ab3ab799850a4e", size = 750194, upload-time = "2026-06-01T19:37:48.164Z" }, + { url = "https://files.pythonhosted.org/packages/21/e2/62634b7fff918ed98c3c6b2f0e70d520f7f28846cb412d451b04354c6459/aiohttp-3.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dbec68ce61b64cb73cab4d33df9433427b1713c8bcccb181dce695c1b6f8e87c", size = 506966, upload-time = "2026-06-01T19:37:50.014Z" }, + { url = "https://files.pythonhosted.org/packages/dd/fb/5ce075150828c797a5106f1c2fb26034e709d4289b9d2bf8b07f1e59fac6/aiohttp-3.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3cdf534aa455593e589302990c5097aa5c92c06c4262a20da22934f9186a5fff", size = 507527, upload-time = "2026-06-01T19:37:51.96Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/d5/405a0ae4e6b081754a3609c1c97c63a950e000a2def16046f1e736933a0e/aiohttp-3.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb6c657104393b5fbff01a5f59b2023db74058a8077d94475d6c25d03882a108", size = 1762420, upload-time = "2026-06-01T19:37:53.839Z" }, + { url = "https://files.pythonhosted.org/packages/ae/1d/e05a7c896b15a6bc6fb8fc5319eb437861c2c49c34559ef928add6590315/aiohttp-3.14.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:46fbbec4e4fab7428d4396a3823f9320e4560aa3113b89eeebce712c27c9ed5a", size = 1733672, upload-time = "2026-06-01T19:37:55.791Z" }, + { url = "https://files.pythonhosted.org/packages/cc/22/a72f7c459e195fa41bf4f7abd1f925b91fe91f8097e51c654229ba144a33/aiohttp-3.14.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2c2c7e05dd5335b298085abf45ddf98673934c3ee1c083d0b9ea13d4186ad500", size = 1805064, upload-time = "2026-06-01T19:37:57.931Z" }, + { url = "https://files.pythonhosted.org/packages/80/50/e85bdaba0be59ca4838005ebfef4048fcdd5f35a02b07057a9a123394440/aiohttp-3.14.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3c7139100fbaae76515b73051d8f0aa3a3ff02e415eec8a8eee8e2223d9ba955", size = 1902125, upload-time = "2026-06-01T19:38:00.225Z" }, + { url = "https://files.pythonhosted.org/packages/19/d8/51de5c6b971c27bb1ef620293b8d1ca611ec78736b34b3f6ccf68e4c8785/aiohttp-3.14.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78d6f9286a629ce52728430afe18f8ed2b6c39a1fddb3802d7244b9983910ad2", size = 1783112, upload-time = "2026-06-01T19:38:02.641Z" }, + { url = "https://files.pythonhosted.org/packages/73/ae/b4402bfde77e43dfb1b6ccff83c7b7ab63ed06b50c4754f0c5423fb374fe/aiohttp-3.14.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:cc3c3e12cdaeb92d7dcf13db00e9f6b1956b910e47256e696df1cfa946d02159", size = 1586356, upload-time = "2026-06-01T19:38:04.637Z" }, + { url = "https://files.pythonhosted.org/packages/bc/05/750a3265ca4dc54a460bd0cb1121a8f2ce9171fce4a135fb47ea7fd594d2/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4d6a998191f5ebe3b8c28463ff72bc030250008b3193c402464efadd08b5ca02", size = 1723119, upload-time = "2026-06-01T19:38:06.713Z" }, + { url = "https://files.pythonhosted.org/packages/37/01/8c0812c50b3b1b1c37b323bf170d6be8847a8f234060485b7d1e71953f60/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0fc2b75ae8d169d853be2862d960be8550da6c5c65711d5476407eb3fdb006bd", size = 1757216, upload-time = "2026-06-01T19:38:08.736Z" }, + { url = "https://files.pythonhosted.org/packages/47/2a/50fb98028a26887cbe48dcc1df92a90825615bc73b5584301304090cded8/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:16eee56bcc72d04600bc56c1759982c2385ec0b41d3fd3521f836bf64a0957ef", size = 1770500, upload-time = "2026-06-01T19:38:11.111Z" }, + { url = "https://files.pythonhosted.org/packages/bd/32/0ffd598a2fa2b9a423daf242e700cfdabda35d6e602394ad9ae58972c1c7/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5a2e7ca615c3ddc15b82687e05a624e5f5cba3f1d6c20cb81172d70ea498451e", size = 1576224, upload-time = "2026-06-01T19:38:13.391Z" }, + { url = "https://files.pythonhosted.org/packages/0b/f9/b9fc381dd9b66afb33f2634c40e229d106467be0afcabe79648631ab6712/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f0b7b8bbbec3ce9467ee0ebe334622fd90624f593edd3136c567811453fc4fae", size = 1794252, upload-time = "2026-06-01T19:38:15.498Z" }, + { url = "https://files.pythonhosted.org/packages/a8/fb/05d9214c975f23225a8cd5c439325e338c7c377b315480ef3871db51f54e/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ba10966d4f03dd96a14365be4b8e37c327c76f11c3ca867116966cdd9f98066", size = 1760193, upload-time = 
"2026-06-01T19:38:17.624Z" }, + { url = "https://files.pythonhosted.org/packages/d9/4b/02992fc4fb9e1b6673ee3f888a8e587a6447afda1f6f4aca776c148c2876/aiohttp-3.14.0-cp312-cp312-win32.whl", hash = "sha256:101df7779c80c0636014a6b2c6642acd3efb5b355d48347c9d7dfb720aee9430", size = 448650, upload-time = "2026-06-01T19:38:19.545Z" }, + { url = "https://files.pythonhosted.org/packages/39/e9/246532214c3abda518477cbaaf16d420295ad8effa5233844cbb38f299ab/aiohttp-3.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:b0a5747586d4467efd1f932710b269131c9717a872dce082cd92a00c1c13123a", size = 476145, upload-time = "2026-06-01T19:38:21.505Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c3/63f8c20090048915711598b0adf475b149216d736157961de06480a45b15/aiohttp-3.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:5f1c5be60add78fabb4aacd13c5a348ae79d2fcbfc7fa78da8f1eb192273b370", size = 444250, upload-time = "2026-06-01T19:38:24.027Z" }, + { url = "https://files.pythonhosted.org/packages/21/61/d11f7d9a3144bffe825247d6367cd93053666da50b94707c9129c78868d5/aiohttp-3.14.0-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:25400d710641a8040bf022a8a99f579e581ffa1c5bd42c33255d7d6f3957c127", size = 502399, upload-time = "2026-06-01T19:38:25.955Z" }, + { url = "https://files.pythonhosted.org/packages/4f/9b/a7e317625d36356844f8bb022cabd305b541f968856cc3c2e0b58e53ee6e/aiohttp-3.14.0-cp313-cp313-android_21_x86_64.whl", hash = "sha256:c5492b9929826e07cc3fcb9739ae87aab05dff6b5e67a9b73fd1700c6d008981", size = 510068, upload-time = "2026-06-01T19:38:27.828Z" }, + { url = "https://files.pythonhosted.org/packages/11/41/cc2d2cfbfbdc3126ba258f3cd27d1ac8a33492ae3c35a4583ee21f0ba7f1/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3366751d68d237c621264233a32f3078bbc21b7904ab90a77e03d21390c742c6", size = 481670, upload-time = "2026-06-01T19:38:29.836Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/07/381f4023c3b08cb616e520f566d8c58957abad54e56441d41fe67cfb0195/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:57ea07d28695a7a40304d42251892a8df765e5588c10ee32afeddcd5df33c0a2", size = 487591, upload-time = "2026-06-01T19:38:31.704Z" }, + { url = "https://files.pythonhosted.org/packages/fb/4d/4506fdb7a022bdf70011a3bbb4ca00c5c570026ef6a3c5bd7bc70c39089c/aiohttp-3.14.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:076cb014191ae2e65d949e1ad01f1dcfe33e32789b5172510f3e79c79fc04d50", size = 496503, upload-time = "2026-06-01T19:38:33.6Z" }, + { url = "https://files.pythonhosted.org/packages/ef/7d/c814111e04894a45d9e2defc94443879a6f118d9633d5fedfe6e2e8af5f0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2f3fc37054564dee64a855b5b092d87ec35dcddfaabf7dacb1c8a2b1f83dc0a9", size = 745870, upload-time = "2026-06-01T19:38:36.013Z" }, + { url = "https://files.pythonhosted.org/packages/c6/ee/80eee0efddfe187e7cd05027086b7ce1c0e492e82a4eda58f5c5543a44a0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8fcaef74d2ab0f607d7ff85a0d15e21bb5a258c4a58df1908396eb50d7f4ed3c", size = 505588, upload-time = "2026-06-01T19:38:38.282Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f8/0f28f04eef75d52fc9c715dde7ce9c0abb810fd20cfeb0fea7afd2ab1e98/aiohttp-3.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4c01b0bfc6209590960e68eac083cd22d5d87c21f974dd6208cafa5d3542bc8", size = 504492, upload-time = "2026-06-01T19:38:40.611Z" }, + { url = "https://files.pythonhosted.org/packages/ff/db/44c755232085545065c94378dfce38641b1aee647f4939fcd32f5b32e719/aiohttp-3.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f12eb7896e81caf403a2b18c9406426f1207361e7239c057ab29c076d4257e83", size = 1752111, upload-time = "2026-06-01T19:38:42.682Z" }, + { url = 
"https://files.pythonhosted.org/packages/5e/6a/42e030a46743841414402a3b00cd3d78419055e86c66fb5822c14b5abfc6/aiohttp-3.14.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6c79a044cacf360ec46738d863d2f41c9300d2a06ef4a7402ea0df306a350e61", size = 1729674, upload-time = "2026-06-01T19:38:44.79Z" }, + { url = "https://files.pythonhosted.org/packages/34/26/3199beb415202e3108e7b83ecebe10914d806d33fb9860c3e4aa60a19be3/aiohttp-3.14.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:85e0675f47be4eff0636bf88c02140ea89168ae0df3ff1f3f464e9de9610d277", size = 1798808, upload-time = "2026-06-01T19:38:47.01Z" }, + { url = "https://files.pythonhosted.org/packages/bd/94/b9b6fcf0ee17c21d0d19fb8c22bf83ad18f82e702a9c3bd901a868f5e446/aiohttp-3.14.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7b33e751cab03fdc960095b1e326cb5a03f5ee577d6ded59f3d1c100f8668882", size = 1891921, upload-time = "2026-06-01T19:38:49.233Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a3/3800dbd095cb2bb165a7ea5d94d790914677e27f45638c7d80e3f34c8945/aiohttp-3.14.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:26d9224c6dd7f5c749aba4f61315a894601448b28d94d12f4dea0903e26d2096", size = 1777241, upload-time = "2026-06-01T19:38:52.04Z" }, + { url = "https://files.pythonhosted.org/packages/21/2a/45be91ad1b860508557448d4cc2e165a2ee68dd865657b73bf66cc5a00fb/aiohttp-3.14.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6281aecdf2732940f4fe06bd6adec5ae4d59b78b080b8e3a6b81467301010988", size = 1579554, upload-time = "2026-06-01T19:38:54.508Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3d/dc94df99ed1511fdf28314f722643ed334112643cab00223577085e788c4/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:23e8314e7aed8576fbe33314d218bd81447a3adbc91dc36f1163bf583cd3084c", size = 1714864, upload-time = "2026-06-01T19:38:56.788Z" }, + { url = "https://files.pythonhosted.org/packages/ae/e4/1f1c8acbb3acd5c8f795473b92c9c3d44eb60a5692c6104256c8a1c83a0c/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3b54fbff46127aeafdd764cecd0d99fa2f24a0e37ea5c18a7c3a4ac450df1db3", size = 1749803, upload-time = "2026-06-01T19:38:59.367Z" }, + { url = "https://files.pythonhosted.org/packages/0b/c8/c45ea6e7ed84cebba939b9c334498a045ba19d79c61b0110df5f21580de3/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b27d89af91a555f58e08e4902dbcbc48862fd40095720ca705990476bd93b7ac", size = 1765023, upload-time = "2026-06-01T19:39:01.651Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a1/a932941784432962fe390e1066823aaef64b4e5ac9fa595df57b5fe472a9/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:25d2326a4967bf705a9f9913a13005e93b6020ad8a9f6bd6bd78850d5171332e", size = 1571671, upload-time = "2026-06-01T19:39:04.044Z" }, + { url = "https://files.pythonhosted.org/packages/b0/01/e1280feac522597a4d46eb67a0cdfa053cfae263033030b761ab146f29fb/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a1d209375c503472b3c0a340cdf3c55fcd82e84b46dda7caeaced59faba373ec", size = 1789904, upload-time = "2026-06-01T19:39:06.294Z" }, + { url = "https://files.pythonhosted.org/packages/fa/10/ab28818262f4d26bdb47ed5f1fc7999b69e2fc6e0370b02d0f49011f45ea/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:666c7c5036df57b693026398b69b41874a1931ac5b3485fd910e57bfac253869", size = 1754516, upload-time = "2026-06-01T19:39:08.788Z" }, + { url = "https://files.pythonhosted.org/packages/af/cc/c122eabd7a1b7e0c9bbdd6be60e4715905b858399145d9df872bb94f1427/aiohttp-3.14.0-cp313-cp313-win32.whl", hash = "sha256:23f094a1ef64823fd35854ddf5c7a80a078162f37f9d2f7c6142b51a6affa456", size = 448656, upload-time = 
"2026-06-01T19:39:11.171Z" }, + { url = "https://files.pythonhosted.org/packages/41/a5/bab07d79848a00eedd8ed979ccb302aaea3ac6eb9fa16bd0ed87135869b4/aiohttp-3.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:e03abdaa17d553f17e1d1d06bb266b3970106c78051d06795723e748d8e49d11", size = 475803, upload-time = "2026-06-01T19:39:13.439Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a0/f03ade8566c153666a3871afccbedf6d99911da006325e1fc6cf72a2de99/aiohttp-3.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:acdb400538cf4769543548bb5d1eb23d39bed4f96554a6078cb728c7cb2c268b", size = 443889, upload-time = "2026-06-01T19:39:15.945Z" }, + { url = "https://files.pythonhosted.org/packages/28/03/5f36ab196a88ba5e9648ae5643e6531e67a3a8c0e96f9c6510ff41540fec/aiohttp-3.14.0-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:363ef9e91014e7891679bfb2ac0a7c6ea93435dbbfd10ecf41b9f06fcf506c5f", size = 503330, upload-time = "2026-06-01T19:39:18.195Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ce/8b49ec2f30f68e02f314f4832186cd45e583360a5a386058be36855d23b6/aiohttp-3.14.0-cp314-cp314-android_24_x86_64.whl", hash = "sha256:884a4edbdad77be9d0ef36142c8b504351b170df0bf62b51e784fadabf311c42", size = 509822, upload-time = "2026-06-01T19:39:20.396Z" }, + { url = "https://files.pythonhosted.org/packages/1a/fe/6edbf5d39bf29322b6816365b17ed8ede4dace164a3aea1abcd30110eb78/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:70ea956f6cc4a37620966b56c2e205d88ca3e6d85ec063277e414b1035cddad3", size = 483329, upload-time = "2026-06-01T19:39:22.607Z" }, + { url = "https://files.pythonhosted.org/packages/1b/5a/fae531bdbc6456fb6241f46b7b81e4d8a0dd3fc09118a0055dc7141ac1ec/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:ea3b9806c89f61da22fddf1f12dd524fb368e5e28f1261fbdafe5c3cd8ce893b", size = 489502, upload-time = "2026-06-01T19:39:24.881Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/f4/48a7b0414db7fed77a03d5dde34508c026afd83510ab6bca08c313855776/aiohttp-3.14.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a071be341c2bd9b0188e62d173509f024e0a35b1c342c53c50f8daaeda8c3bd8", size = 497357, upload-time = "2026-06-01T19:39:27.197Z" }, + { url = "https://files.pythonhosted.org/packages/75/75/e85a13a370acc007fca5feb1fd1b88ac2d8426e6dadd625479b7cadd55a3/aiohttp-3.14.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:198cfe61bf253b19da1fb3e0fa122249dc4f14c12709493fed8054aa0411cc76", size = 750898, upload-time = "2026-06-01T19:39:29.563Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e4/3d637f800c724eff0e2bed64df72557444482366fd0a35b0cec0e6968f6c/aiohttp-3.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9dc203d6ce6b9106d54e2a93f41dfdfebfbca2d99962ba503bfd3e5921a6549e", size = 506986, upload-time = "2026-06-01T19:39:31.872Z" }, + { url = "https://files.pythonhosted.org/packages/1d/df/35161f3598bf7501d2b2a805b41ab4f45a2e34150c421bcb4ef8c0d281a7/aiohttp-3.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9e19d17ab02bf16832a2c8c0d55a486792c5b1645665652ee9531aebcc30cb72", size = 508033, upload-time = "2026-06-01T19:39:34.137Z" }, + { url = "https://files.pythonhosted.org/packages/e5/39/b36e5d3d31e850fb4691dd3e941684ac490a2559249f6fa634b6b0fdf020/aiohttp-3.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d925fba0c14d5b498a8028b0107beebdfd16c5d48d702ff54f879cb017aaaca3", size = 1746213, upload-time = "2026-06-01T19:39:36.654Z" }, + { url = "https://files.pythonhosted.org/packages/b1/28/24e1409e605a9aa5d84abe0e2acb365354b70ae56d40948101cabe3341ab/aiohttp-3.14.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d33e61021222ce7f9792bcac870d6f58d8adfceda33ab857b01264f4560f2c5f", size = 1705862, upload-time = "2026-06-01T19:39:38.968Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/d0/e5eb3ff1daeaf644c7e36a957517672494122628e067c38b263fa04eda77/aiohttp-3.14.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:44eca38755d0105bb32f47d085f5dd449846a449e1245fc105889e3279dcf8e3", size = 1798909, upload-time = "2026-06-01T19:39:41.334Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ba/8943f906f0570342886ababb9a722a44e360f786a028c5e0b0e29e3f735b/aiohttp-3.14.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f13087e06f68fea4941c21a0c541c00553aa16e4f8fd7bbe2b198df761e964d6", size = 1868892, upload-time = "2026-06-01T19:39:43.807Z" }, + { url = "https://files.pythonhosted.org/packages/3a/05/27df32c844b2156e1675a8d8ec22d963e3c8ba469ed7ceb1863320c7b521/aiohttp-3.14.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff82be7f1ef73634cb77890a770743239bc3d487b848669be1c599889336dc0a", size = 1751659, upload-time = "2026-06-01T19:39:46.398Z" }, + { url = "https://files.pythonhosted.org/packages/7f/62/da182e5910ab912b2e88aa919b61a16046a37a95714a5795b02eb57b2d18/aiohttp-3.14.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a150c0875ac8fd87f1c398650841308a30d65facf7416b12dbdb9cfdcbe5a48c", size = 1578775, upload-time = "2026-06-01T19:39:48.902Z" }, + { url = "https://files.pythonhosted.org/packages/66/e3/53c67097e8a5ce98625e91e3fa7f43c9c6940de680345d03b3509a72a078/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:edc01ea4e1ec5a1649a28866262bf24195889ff7b27bdd947029a6086741de9b", size = 1710090, upload-time = "2026-06-01T19:39:51.392Z" }, + { url = "https://files.pythonhosted.org/packages/dd/55/0e2732ca598c7a4dfe8a775662376d0ca2977cb1030e48386d4da5d9a456/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:540632bf882ff8fc88f2e1697be0761578e89e0d79fb4a8a6d65dc5da7e729d4", size = 1715016, upload-time 
= "2026-06-01T19:39:53.807Z" }, + { url = "https://files.pythonhosted.org/packages/5a/96/f0b73730798c9ca525afc30b39f1f81bbe24e245d9654c54d3b39d63212d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:860a86bc2c80237f5dff52edcf427e10a8d8352271fd84845429a3e60199e02c", size = 1763810, upload-time = "2026-06-01T19:39:56.31Z" }, + { url = "https://files.pythonhosted.org/packages/71/cc/11acb6c4518f448323405a7312b6f255d0f974a34373ad1db7633c4aadc8/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5cbd50e6a50d6b99283a826b18cbdebf65b0797689a7535cb0e9dd37be0f63c3", size = 1573064, upload-time = "2026-06-01T19:39:58.718Z" }, + { url = "https://files.pythonhosted.org/packages/de/2d/28c31dde0a7dc98c0ee7d0da2ddcec3f7688c4fc131e5989e278d0c03c0a/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:20144819e99db593e22bbd2f3f2691a5e149f879142d6b8670254708853ff4fb", size = 1775765, upload-time = "2026-06-01T19:40:01.195Z" }, + { url = "https://files.pythonhosted.org/packages/b8/69/155c4ef3aec96417d47024800472b33b16c5d8a665371dcd044c2afdf25d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:26b6d79aa54cb4ed50cc7d41ed14e99e0f1fc8e7c2d42f2e05b37aea897b2b52", size = 1733716, upload-time = "2026-06-01T19:40:03.631Z" }, + { url = "https://files.pythonhosted.org/packages/5f/44/6126116fd8a316b712bb615660b855c78466bb67ba1bb1742427eafcf7ac/aiohttp-3.14.0-cp314-cp314-win32.whl", hash = "sha256:106ed074a856f3e21d186b8579e2c8afb6da598e267cdaab01059e13db2fc44d", size = 453684, upload-time = "2026-06-01T19:40:06.277Z" }, + { url = "https://files.pythonhosted.org/packages/a2/d7/eff4c58a88c5cac5e38b55f44fb8a6d3929c3cbd77356e383e094d3220bd/aiohttp-3.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f770846edae8f00ecc57af825bce811f787f87a7dcf0e90d191790efe5b31f7", size = 481758, upload-time = "2026-06-01T19:40:08.653Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/ed/17b5bd9fbcb46e688f02e572f517754a9a75831e7b54702f027761dc4fa5/aiohttp-3.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:acf1581c4f21ed4b80a2dded504d87b055a071a84d5737ea966435f768275ac6", size = 450557, upload-time = "2026-06-01T19:40:11.03Z" }, + { url = "https://files.pythonhosted.org/packages/12/34/6180103ce9aabc8ebff3f7bb55a1228ffe60f61042823031d9692cb7b101/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6aa1a40f9cbb3da9f80714c5966b8946c21e6a2530d809b9498b33161e3c8733", size = 787878, upload-time = "2026-06-01T19:40:13.401Z" }, + { url = "https://files.pythonhosted.org/packages/92/e9/08954a40e8b7baa3d8beadd2b074b186e9b1e9c8ddabc288678a6265de50/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b62af5a8cc96a194eaa01a9ed7b34a3ffa58d3d8daaa1a0d7a749353ad12d228", size = 524400, upload-time = "2026-06-01T19:40:15.972Z" }, + { url = "https://files.pythonhosted.org/packages/08/6a/b5965a634ac4d5ba99a463314cf4ab214ca073fcdc38a15e0294273701fc/aiohttp-3.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6eb63b1417efaf7d1002a6ad034a40d44376afcc16508a57f8e74b49ad26a095", size = 527904, upload-time = "2026-06-01T19:40:18.28Z" }, + { url = "https://files.pythonhosted.org/packages/06/b4/932bcdd850c354d9bcca30f360e475d7852e30413fbbd44b182782ed5432/aiohttp-3.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c20b9ad156a79eb97be5cf9e069eec01d2f0dc8472ffbd75299a8b2d4c2cbbde", size = 1912162, upload-time = "2026-06-01T19:40:20.825Z" }, + { url = "https://files.pythonhosted.org/packages/c6/85/ce79bab0310d2e3fd2d7bc7e44412abeff7c8338f8a21dd0f2f1714989e5/aiohttp-3.14.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:40ae7b0642c25632c7eabc4a04754012691864d2a1b93becf7cddb76027b838a", size = 1778813, upload-time = "2026-06-01T19:40:23.726Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/54/ba62ac2d1bc87e010aad23751e383b8794e45d931df67677313a2da78823/aiohttp-3.14.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:95f5217e76a046b9f228a101717ef8d42b1eb3d9d196d15202db5bf41df88936", size = 1899969, upload-time = "2026-06-01T19:40:26.406Z" }, + { url = "https://files.pythonhosted.org/packages/dc/82/7cc7907725d83a19f31551334061e1ab8e108b1d7ac52632a2a844a4acb5/aiohttp-3.14.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1a4a9f17e85b80878c176695c1998c790e83731d8271881e5d356488652a1f9e", size = 1991771, upload-time = "2026-06-01T19:40:29.061Z" }, + { url = "https://files.pythonhosted.org/packages/d0/1c/a57de71a4508c93a830b77c28af3d08cd97f606dedfc6b94275347744508/aiohttp-3.14.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:145262119b07d7f95abc1839add35ba2bfc84551d4b4660ca11542c0b215455b", size = 1868606, upload-time = "2026-06-01T19:40:31.843Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ae/3839726cd49150a53ed340cc24ce5ba09d4c2117020ef9d45542bec5eb2f/aiohttp-3.14.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:49a33ded29b0b2fa7a367a02cf0fb89af602bb87542a16177ec8ce1c9c51d12a", size = 1665437, upload-time = "2026-06-01T19:40:35.01Z" }, + { url = "https://files.pythonhosted.org/packages/35/1e/c237923232c7da7f0392ea25d89fc5e60c0e93f685f4ebca8e7bcdd5271c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cc736a9c9fc2bc4dd71fd404815741b6573df27c3f985948ec4076989ac57de", size = 1834090, upload-time = "2026-06-01T19:40:37.733Z" }, + { url = "https://files.pythonhosted.org/packages/98/02/a5a7a2524f92d3911761b405a7c067c751891942144adc13e2ad79611e39/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b4141a3e5342ee3053a9cab54d25b64ed28289c1041e4c54b3d99839314d90ce", size = 1816907, 
upload-time = "2026-06-01T19:40:40.46Z" }, + { url = "https://files.pythonhosted.org/packages/fa/76/a8b9f0d09234d516af9f2d7dd715557f33b5da3b0b56ead41d1170e86e3c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e30871b2d58996cb81aac52d2b1d15ac05257131ef0f90f18c2115a380fbfe7c", size = 1840382, upload-time = "2026-06-01T19:40:43.48Z" }, + { url = "https://files.pythonhosted.org/packages/c9/8e/140e715a0a4bbc211979ea30ec8396ad2ed5bf90ab87d8058fc4668b1923/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:667b881d083ccae3900ea5a241e17e5007ca78844c53ed389bb63d48f729d9c7", size = 1659497, upload-time = "2026-06-01T19:40:46.265Z" }, + { url = "https://files.pythonhosted.org/packages/10/c7/7ba5de8af9650b9767b063c675427b8685f43fa7ce563673a7bc3af60f08/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:b584dfe615d151e9b8f0a8ecb3aee6147f2927ec5b95ba25fe621f5377510928", size = 1870829, upload-time = "2026-06-01T19:40:49.583Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bc/2aaab2f85cadb26ea59c091fa2b8e370d625154b5c14b478f1b489d07551/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6199707cc40e0e9cd39c36fbc97bec416c704e1d0ddce03412bb3b3e6a90ccd0", size = 1832281, upload-time = "2026-06-01T19:40:52.303Z" }, + { url = "https://files.pythonhosted.org/packages/39/98/31b9ad9fbc01f0075ee7221002df5fd2d10b647f451ca5f30edc802d9dd6/aiohttp-3.14.0-cp314-cp314t-win32.whl", hash = "sha256:a8d93334d4961c9d566b1f046c81dee475b7c21eb730728d38237bfa70d1c8e6", size = 490597, upload-time = "2026-06-01T19:40:54.937Z" }, + { url = "https://files.pythonhosted.org/packages/59/1f/299b21441c8de42ff70fddc7cfe65e92f810abcf740739a09b56f7835364/aiohttp-3.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2d2ffe9b614f50f069068b3b52e73414e4107fc10b7efc939a76acff9251fdd2", size = 525789, upload-time = "2026-06-01T19:40:57.306Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/11/7f83fcba9ee05d4c54d61b3f8104da0d43a59adac44dd28effc0c9a10422/aiohttp-3.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:7a3fc4358e65826c515350f199c210de747cf669998211b1ee6c2e46de364b24", size = 467399, upload-time = "2026-06-01T19:40:59.993Z" }, ] [[package]] @@ -170,8 +169,8 @@ name = "aiosignal" version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "frozenlist", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } wheels = [ @@ -235,19 +234,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/8a/79c76ad88b16f2fac25684f7313593738f353355eb1af2307e43efd7b1ca/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:de74a2512e2e2366d4b064c498c38672bf6ddea38acec8b1999b4e66182dd001", size = 3104663, upload-time = "2025-10-13T23:11:00.582Z" }, { url = "https://files.pythonhosted.org/packages/20/66/9152feaa87f851a37c1a2bd74fb89d7e82e4c76447ee590bf8e6fff5e9d8/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:806ca8e20507675b2de68b3d009f76e898cc3c3e441c834ea5220866f68aac50", size = 2956440, upload-time = "2025-10-13T23:11:03.769Z" }, { url = "https://files.pythonhosted.org/packages/ad/66/f4179ef64d5c18fe76ec93cfbff42c0f401438ef771c6766b880044d7e13/arro3_core-0.6.5-cp313-cp313t-win_amd64.whl", hash = "sha256:8f6f0cc78877ade7ad6e678a4671b191406547e7b407bc9637436869c017ed47", size = 2845345, upload-time = "2025-10-13T23:11:07.447Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/c2/407d6bc19813fb74cc2b087ad3e959e102b29ff81e35dcc0ad0dfb5b946c/arro3_core-0.6.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:dfac7fac3c6a302399d94644d48682a19488a5b67bd1ccbdf6c560a7ffabde6d", size = 2680237, upload-time = "2025-10-13T23:11:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/d3/73/c67156794d7e9734f4cc03d2eca7e44a1cc014686e6b7663f5110f58581d/arro3_core-0.6.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fc70042e558d1cd5fbe917b58e8ef52701441e38ff30b1912858050f796a62c", size = 2386228, upload-time = "2025-10-13T23:11:14.02Z" }, - { url = "https://files.pythonhosted.org/packages/79/e8/817ee1abb0cfa7e266ef00749b144553d2bb9c4679ca932ecbca9dc7dea9/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1583b29b2ba83927a33e5435e5d9d134114c45a6360a8bb4db4beda13dab4fd8", size = 2886476, upload-time = "2025-10-13T23:11:17.579Z" }, - { url = "https://files.pythonhosted.org/packages/8e/d6/1b9beceab797c4510abfc25ef6e657e4c940d06a9615927ce506463691dd/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a170fe53f18dda4a4647fd3b8b4a9373fc11ac42c41a4b65f55d79ad531a33e", size = 2911941, upload-time = "2025-10-13T23:11:21.131Z" }, - { url = "https://files.pythonhosted.org/packages/dc/ed/4fe1fb9a24698fe6189111836d22c9582cbc92fa159b24b8664e924738dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83047b4e6e18835c91c8d12c5494e6ababc7c185c5a772d3429e8f9b0c185894", size = 3150419, upload-time = "2025-10-13T23:11:24.503Z" }, - { url = "https://files.pythonhosted.org/packages/a1/91/d6215b782fa91493f504ae13623db889beeaf0519037c28fc6744464439a/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3d4393d281d1ef18927915a11187da27287d279f99d5325bc9afb417f76084f", size = 2777891, upload-time = "2025-10-13T23:11:28.11Z" }, - { url = 
"https://files.pythonhosted.org/packages/d4/de/0aa3504e6cbf406086de49b59cb0dcb3ab11f64acbb38602143e479831dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:f0c88d8babcf51affdd69390882e2f0ecb1890a1b8a5abfc087d003e7181eb6e", size = 2519673, upload-time = "2025-10-13T23:11:31.426Z" }, - { url = "https://files.pythonhosted.org/packages/05/69/47bf9c9ab66bafc7056a41f6db9d2149639eea6417299e3fe6c01ef99b6c/arro3_core-0.6.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36424e1d62212466a5cacdc27d414e99bf0fdab1544cc2b7e5b81e41437e5970", size = 3026254, upload-time = "2025-10-13T23:11:36.199Z" }, - { url = "https://files.pythonhosted.org/packages/b1/e8/638582437ab41ba52d3c7f2a1b0a98e4a05a51e3f660985e594b4f6c18d5/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4eb4d96f7db618f100758a8b7ec1b221c8737d543073701b7ffee74bc5019d46", size = 2704582, upload-time = "2025-10-13T23:11:39.408Z" }, - { url = "https://files.pythonhosted.org/packages/aa/0a/7bc46ee799459cce72a2e15b0eb184170f26cac37eace0b813e855fbc4d8/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2cfe9b4b1dd663d256754f1aa7aae783a1cddd3eb5698892b9caf381431f0af7", size = 3155815, upload-time = "2025-10-13T23:11:43.304Z" }, - { url = "https://files.pythonhosted.org/packages/99/8a/f20eff8f4ff5bd7db9b37b70ea058b37375a930a10e03d584a7597b6b740/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a3b2621505f97eb5ce80f1c6fa8c77d18d757ab48d1f11d33a805e9ccbcd6fb6", size = 3107791, upload-time = "2025-10-13T23:11:46.735Z" }, - { url = "https://files.pythonhosted.org/packages/79/da/60c66f0cc4a6af7f54e57973190540f77b84da1218fad2a9917e17bd897b/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c1becbb96ceba0b20f3d4318dd35f3417ee9a49065813d99f52b0fa285fc569", size = 2957730, upload-time = "2025-10-13T23:11:49.875Z" }, - { url = 
"https://files.pythonhosted.org/packages/dd/8d/6e3235894196e1fd2be34e01ac2d4280dd24e6c9019e3b12603858651e91/arro3_core-0.6.5-cp39-cp39-win_amd64.whl", hash = "sha256:5459e7bd39bb9dd8c57aa06856d2bebc5c1ca782cbccab0e186c6c89530e4ca9", size = 2839298, upload-time = "2025-10-13T23:11:53.566Z" }, { url = "https://files.pythonhosted.org/packages/10/ca/b2139dbb25f9fefb9b1cdce8a73785615de6763af6a16bf6ff96a3b630f2/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:26d5b50139f1a96727fa1760b4d70393acf5ee0fba45346ad2d4f69824d3bdc2", size = 2676788, upload-time = "2025-10-13T23:11:56.965Z" }, { url = "https://files.pythonhosted.org/packages/34/a1/c68dde2944f493c8ccfcb91bf6da6d27a27c3674316dd09c9560f9e6ab1a/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b65b3d8d7f65f2f3c36002dc467380d7a31ea771132986dddc6341c5a9dc726f", size = 2382809, upload-time = "2025-10-13T23:12:00.175Z" }, { url = "https://files.pythonhosted.org/packages/c6/fc/2fb81d42a3cecd632deace97dc23ac74083d60d158106440c783bae4ff01/arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c3442a79a757ed3fbd7793de180019ae3201f04237537c2e2e3f1e3dd99b31c", size = 2882818, upload-time = "2025-10-13T23:12:03.721Z" }, @@ -314,8 +300,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/d0/3888673417202262ddd7e6361cab8e01ee2705e39643af8445e2eb276eab/botocore-1.40.43.tar.gz", hash = "sha256:d87412dc1ea785df156f412627d3417c9f9eb45601fd0846d8fe96fe3c78b630", size = 14389164, upload-time = 
"2025-10-01T19:38:16.06Z" } wheels = [ @@ -392,17 +377,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" }, { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" }, { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 107400, upload-time = "2025-08-09T07:56:55.172Z" }, - { url = "https://files.pythonhosted.org/packages/c2/ca/9a0983dd5c8e9733565cf3db4df2b0a2e9a82659fd8aa2a868ac6e4a991f/charset_normalizer-3.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:70bfc5f2c318afece2f5838ea5e4c3febada0be750fcf4775641052bbba14d05", size = 207520, upload-time = "2025-08-09T07:57:11.026Z" }, - { url = "https://files.pythonhosted.org/packages/39/c6/99271dc37243a4f925b09090493fb96c9333d7992c6187f5cfe5312008d2/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23b6b24d74478dc833444cbd927c338349d6ae852ba53a0d02a2de1fce45b96e", size = 147307, upload-time = "2025-08-09T07:57:12.4Z" }, - { url = "https://files.pythonhosted.org/packages/e4/69/132eab043356bba06eb333cc2cc60c6340857d0a2e4ca6dc2b51312886b3/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:34a7f768e3f985abdb42841e20e17b330ad3aaf4bb7e7aeeb73db2e70f077b99", size = 160448, upload-time = "2025-08-09T07:57:13.712Z" }, - { url = "https://files.pythonhosted.org/packages/04/9a/914d294daa4809c57667b77470533e65def9c0be1ef8b4c1183a99170e9d/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb731e5deb0c7ef82d698b0f4c5bb724633ee2a489401594c5c88b02e6cb15f7", size = 157758, upload-time = "2025-08-09T07:57:14.979Z" }, - { url = "https://files.pythonhosted.org/packages/b0/a8/6f5bcf1bcf63cb45625f7c5cadca026121ff8a6c8a3256d8d8cd59302663/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:257f26fed7d7ff59921b78244f3cd93ed2af1800ff048c33f624c87475819dd7", size = 152487, upload-time = "2025-08-09T07:57:16.332Z" }, - { url = "https://files.pythonhosted.org/packages/c4/72/d3d0e9592f4e504f9dea08b8db270821c909558c353dc3b457ed2509f2fb/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1ef99f0456d3d46a50945c98de1774da86f8e992ab5c77865ea8b8195341fc19", size = 150054, upload-time = "2025-08-09T07:57:17.576Z" }, - { url = "https://files.pythonhosted.org/packages/20/30/5f64fe3981677fe63fa987b80e6c01042eb5ff653ff7cec1b7bd9268e54e/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2c322db9c8c89009a990ef07c3bcc9f011a3269bc06782f916cd3d9eed7c9312", size = 161703, upload-time = "2025-08-09T07:57:20.012Z" }, - { url = "https://files.pythonhosted.org/packages/e1/ef/dd08b2cac9284fd59e70f7d97382c33a3d0a926e45b15fc21b3308324ffd/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:511729f456829ef86ac41ca78c63a5cb55240ed23b4b737faca0eb1abb1c41bc", size = 159096, upload-time = "2025-08-09T07:57:21.329Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/8c/dcef87cfc2b3f002a6478f38906f9040302c68aebe21468090e39cde1445/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:88ab34806dea0671532d3f82d82b85e8fc23d7b2dd12fa837978dad9bb392a34", size = 153852, upload-time = "2025-08-09T07:57:22.608Z" }, - { url = "https://files.pythonhosted.org/packages/63/86/9cbd533bd37883d467fcd1bd491b3547a3532d0fbb46de2b99feeebf185e/charset_normalizer-3.4.3-cp39-cp39-win32.whl", hash = "sha256:16a8770207946ac75703458e2c743631c79c59c5890c80011d536248f8eaa432", size = 99840, upload-time = "2025-08-09T07:57:23.883Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/7e805c8e5c46ff9729c49950acc4ee0aeb55efb8b3a56687658ad10c3216/charset_normalizer-3.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:d22dbedd33326a4a5190dd4fe9e9e693ef12160c77382d9e87919bce54f3d4ca", size = 107438, upload-time = "2025-08-09T07:57:25.287Z" }, { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" }, ] @@ -420,9 +394,9 @@ name = "datafusion" version = "53.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" } wheels = [ @@ -433,42 +407,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" }, ] -[[package]] -name = "datasets" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/cd/fe/4d2874473a753d59c83335691bd9532704f2605418a0d288a1d70fa003fc/datasets-0.0.9.zip", hash = "sha256:86d54441bab87aebb2aa3bf0853aa7fb7abed8c708f9bb08a88e86a498972010", size = 4013, upload-time = "2015-08-18T00:07:40.556Z" } - [[package]] name = "datasets" version = "4.1.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.10'" }, - { name = "huggingface-hub", marker = "python_full_version >= '3.10'" }, - { name = "multiprocess", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = 
"huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pandas", marker = "python_full_version >= '3.10'" }, - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "pyyaml", marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "xxhash", marker = "python_full_version >= '3.10'" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, ] sdist = { url = "https://files.pythonhosted.org/packages/91/a4/73f8e6ef52c535e1d20d5b2ca83bfe6de399d8b8b8a61ccc8d63d60735aa/datasets-4.1.1.tar.gz", hash = "sha256:7d8d5ba8b12861d2c44bfff9c83484ebfafff1ff553371e5901a8d3aab5450e2", size = 579324, upload-time = "2025-09-18T13:14:27.108Z" } wheels = [ @@ -514,12 +472,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/23/32/57866cf8881288b3dfb9212720221fb890daaa534dbdc6fe3fff3979ecd1/duckdb-1.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2de258a93435c977a0ec3a74ec8f60c2f215ddc73d427ee49adc4119558facd3", size = 18421289, upload-time = "2025-09-16T10:22:21.564Z" }, { url = 
"https://files.pythonhosted.org/packages/a0/83/7438fb43be451a7d4a04650aaaf662b2ff2d95895bbffe3e0e28cbe030c9/duckdb-1.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6d3659641d517dd9ed1ab66f110cdbdaa6900106f116effaf2dbedd83c38de3", size = 20426547, upload-time = "2025-09-16T10:22:23.759Z" }, { url = "https://files.pythonhosted.org/packages/21/b2/98fb89ae81611855f35984e96f648d871f3967bb3f524b51d1372d052f0c/duckdb-1.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:07fcc612ea5f0fe6032b92bcc93693034eb00e7a23eb9146576911d5326af4f7", size = 12290467, upload-time = "2025-09-16T10:22:25.923Z" }, - { url = "https://files.pythonhosted.org/packages/8d/42/0f355319b3e8ee1703d0e17378dd829db391434306621f85c110134f2763/duckdb-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c97ee61c582002b654331f7fd967d6b1e83bf7fdb0772f409dfd4b6af3a70f4", size = 31292373, upload-time = "2025-09-16T10:22:28.118Z" }, - { url = "https://files.pythonhosted.org/packages/fd/52/091dbef5eb2ac4e60a9c6d38fcc7c7530a75fafa0f37658450e8731a265b/duckdb-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:74e3d6295355160df5d3588b880e8bcae23fdd6f573f538793a8a1abf4c2c29d", size = 17288145, upload-time = "2025-09-16T10:22:30.346Z" }, - { url = "https://files.pythonhosted.org/packages/c9/6c/879317d9c3ac7a2a1f0618ca536a48ebfa4b9fe202f9783e07070e168192/duckdb-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0c76425e4ffe98069dd4fc4752ab919a4125dc0d176bb676b3065fdea152c42", size = 14816258, upload-time = "2025-09-16T10:22:32.442Z" }, - { url = "https://files.pythonhosted.org/packages/95/87/83ac8e67c0530b69fe39f91bbb7f3bd0a49b0c24216cffa9c5561fb2845c/duckdb-1.4.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c122bd7d80ab5057f53024ee3922d7612a5cdc99583fae730990964aebc3fd4", size = 18391043, upload-time = "2025-09-16T10:22:34.616Z" }, - { url = 
"https://files.pythonhosted.org/packages/d6/01/1d70bd6c594ef915c004edc0f1119d1602173dc5ce91c1eed7368f6aab34/duckdb-1.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:30689c1436bca723526be6102fe1f4f82ea6d4780fb9ca196bda7ed5ec227950", size = 20385348, upload-time = "2025-09-16T10:22:36.982Z" }, - { url = "https://files.pythonhosted.org/packages/b6/04/0650128cdcdc5208c4f51341a0a3f8db436ecaba51032c6065e20ea0baae/duckdb-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c55a367c1296617cff89c5e1c7153f1dc3c3b556ef70711a45b0236515f80c2", size = 12283322, upload-time = "2025-09-16T10:22:39.388Z" }, ] [[package]] @@ -543,29 +495,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] -[[package]] -name = "flatbuffers" -version = "2.0.7" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/d1/90/0532e737a11e1dc50e9e352c3ccc97338cb75991f83279c2edbc9234e022/flatbuffers-2.0.7.tar.gz", hash = "sha256:0ae7d69c5b82bf41962ca5fde9cc43033bc9501311d975fd5a25e8a7d29c1245", size = 22686, upload-time = "2022-08-23T22:50:07.903Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/0d/b5bfb553a6ac66d6ec2b6d7f1e814a908fba7188356ac94bb36ae3d905c3/flatbuffers-2.0.7-py2.py3-none-any.whl", hash = "sha256:71e135d533be527192819aaab757c5e3d109cb10fbb01e687f6bdb7a61ad39d1", size = 26562, upload-time = "2022-08-23T22:50:56.342Z" }, -] - [[package]] name = "flatbuffers" version = "25.9.23" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == 
'3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" }, @@ -662,23 +595,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, - { url = "https://files.pythonhosted.org/packages/dd/b1/ee59496f51cd244039330015d60f13ce5a54a0f2bd8d79e4a4a375ab7469/frozenlist-1.7.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cea3dbd15aea1341ea2de490574a4a37ca080b2ae24e4b4f4b51b9057b4c3630", size = 82434, upload-time = "2025-06-09T23:02:05.195Z" }, - { url = 
"https://files.pythonhosted.org/packages/75/e1/d518391ce36a6279b3fa5bc14327dde80bcb646bb50d059c6ca0756b8d05/frozenlist-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d536ee086b23fecc36c2073c371572374ff50ef4db515e4e503925361c24f71", size = 48232, upload-time = "2025-06-09T23:02:07.728Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8d/a0d04f28b6e821a9685c22e67b5fb798a5a7b68752f104bfbc2dccf080c4/frozenlist-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dfcebf56f703cb2e346315431699f00db126d158455e513bd14089d992101e44", size = 47186, upload-time = "2025-06-09T23:02:09.243Z" }, - { url = "https://files.pythonhosted.org/packages/93/3a/a5334c0535c8b7c78eeabda1579179e44fe3d644e07118e59a2276dedaf1/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974c5336e61d6e7eb1ea5b929cb645e882aadab0095c5a6974a111e6479f8878", size = 226617, upload-time = "2025-06-09T23:02:10.949Z" }, - { url = "https://files.pythonhosted.org/packages/0a/67/8258d971f519dc3f278c55069a775096cda6610a267b53f6248152b72b2f/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c70db4a0ab5ab20878432c40563573229a7ed9241506181bba12f6b7d0dc41cb", size = 224179, upload-time = "2025-06-09T23:02:12.603Z" }, - { url = "https://files.pythonhosted.org/packages/fc/89/8225905bf889b97c6d935dd3aeb45668461e59d415cb019619383a8a7c3b/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1137b78384eebaf70560a36b7b229f752fb64d463d38d1304939984d5cb887b6", size = 235783, upload-time = "2025-06-09T23:02:14.678Z" }, - { url = "https://files.pythonhosted.org/packages/54/6e/ef52375aa93d4bc510d061df06205fa6dcfd94cd631dd22956b09128f0d4/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e793a9f01b3e8b5c0bc646fb59140ce0efcc580d22a3468d70766091beb81b35", size = 229210, upload-time = "2025-06-09T23:02:16.313Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/55/62c87d1a6547bfbcd645df10432c129100c5bd0fd92a384de6e3378b07c1/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74739ba8e4e38221d2c5c03d90a7e542cb8ad681915f4ca8f68d04f810ee0a87", size = 215994, upload-time = "2025-06-09T23:02:17.9Z" }, - { url = "https://files.pythonhosted.org/packages/45/d2/263fea1f658b8ad648c7d94d18a87bca7e8c67bd6a1bbf5445b1bd5b158c/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e63344c4e929b1a01e29bc184bbb5fd82954869033765bfe8d65d09e336a677", size = 225122, upload-time = "2025-06-09T23:02:19.479Z" }, - { url = "https://files.pythonhosted.org/packages/7b/22/7145e35d12fb368d92124f679bea87309495e2e9ddf14c6533990cb69218/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ea2a7369eb76de2217a842f22087913cdf75f63cf1307b9024ab82dfb525938", size = 224019, upload-time = "2025-06-09T23:02:20.969Z" }, - { url = "https://files.pythonhosted.org/packages/44/1e/7dae8c54301beb87bcafc6144b9a103bfd2c8f38078c7902984c9a0c4e5b/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:836b42f472a0e006e02499cef9352ce8097f33df43baaba3e0a28a964c26c7d2", size = 239925, upload-time = "2025-06-09T23:02:22.466Z" }, - { url = "https://files.pythonhosted.org/packages/4b/1e/99c93e54aa382e949a98976a73b9b20c3aae6d9d893f31bbe4991f64e3a8/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e22b9a99741294b2571667c07d9f8cceec07cb92aae5ccda39ea1b6052ed4319", size = 220881, upload-time = "2025-06-09T23:02:24.521Z" }, - { url = "https://files.pythonhosted.org/packages/5e/9c/ca5105fa7fb5abdfa8837581be790447ae051da75d32f25c8f81082ffc45/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:9a19e85cc503d958abe5218953df722748d87172f71b73cf3c9257a91b999890", size = 234046, upload-time = "2025-06-09T23:02:26.206Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/4d/e99014756093b4ddbb67fb8f0df11fe7a415760d69ace98e2ac6d5d43402/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f22dac33bb3ee8fe3e013aa7b91dc12f60d61d05b7fe32191ffa84c3aafe77bd", size = 235756, upload-time = "2025-06-09T23:02:27.79Z" }, - { url = "https://files.pythonhosted.org/packages/8b/72/a19a40bcdaa28a51add2aaa3a1a294ec357f36f27bd836a012e070c5e8a5/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ccec739a99e4ccf664ea0775149f2749b8a6418eb5b8384b4dc0a7d15d304cb", size = 222894, upload-time = "2025-06-09T23:02:29.848Z" }, - { url = "https://files.pythonhosted.org/packages/08/49/0042469993e023a758af81db68c76907cd29e847d772334d4d201cbe9a42/frozenlist-1.7.0-cp39-cp39-win32.whl", hash = "sha256:b3950f11058310008a87757f3eee16a8e1ca97979833239439586857bc25482e", size = 39848, upload-time = "2025-06-09T23:02:31.413Z" }, - { url = "https://files.pythonhosted.org/packages/5a/45/827d86ee475c877f5f766fbc23fb6acb6fada9e52f1c9720e2ba3eae32da/frozenlist-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:43a82fce6769c70f2f5a06248b614a7d268080a9d20f7457ef10ecee5af82b63", size = 44102, upload-time = "2025-06-09T23:02:32.808Z" }, { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, ] @@ -693,32 +609,13 @@ wheels = [ [package.optional-dependencies] http = [ - { name = "aiohttp", marker = "python_full_version >= '3.10'" }, -] - -[[package]] -name = "gast" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/83/4a/07c7e59cef23fb147454663c3271c21da68ba2ab141427c20548ae5a8a4d/gast-0.4.0.tar.gz", hash = 
"sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1", size = 13804, upload-time = "2020-08-07T21:45:23.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/48/583c032b79ae5b3daa02225a675aeb673e58d2cb698e78510feceb11958c/gast-0.4.0-py3-none-any.whl", hash = "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4", size = 9824, upload-time = "2020-08-07T21:45:21.32Z" }, + { name = "aiohttp" }, ] [[package]] name = "gast" version = "0.6.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3c/14/c566f5ca00c115db7725263408ff952b8ae6d6a4e792ef9c84e77d9af7a1/gast-0.6.0.tar.gz", hash = "sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb", size = 27708, upload-time = "2024-06-27T20:31:49.527Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a3/61/8001b38461d751cd1a0c3a6ae84346796a5758123f3ed97a1b121dfbf4f3/gast-0.6.0-py3-none-any.whl", hash = "sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54", size = 21173, upload-time = "2024-07-09T13:15:15.615Z" }, @@ -726,69 +623,61 @@ wheels = [ [[package]] name = "geoarrow-rust-core" -version = "0.6.1" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { 
registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/77/2d/3e994dd76223fac0eb597a6f55647cca51bd5a4f446d09b668697f901724/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:84d972cc3dd45a797fd99588d7ee68f257e4083ebdcecad9ec773260067f71a6", size = 3570129, upload-time = "2025-12-03T18:51:07.148Z" }, - { url = "https://files.pythonhosted.org/packages/5f/2a/e19df203b4ffb225f39627e1bd1b89ce7b2220e39f1d6972692174820c57/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bc0f382d4ed41e85d2d89fc2c7c8c3d046681c9a5e19350ce79e0e930cf69821", size = 3333881, upload-time = "2025-11-21T01:49:28.959Z" }, - { url = "https://files.pythonhosted.org/packages/52/98/b749a2165dfc5d9c54a1c19eb3e6a75b6d005ecde42289b25c1c355346b7/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80e719edcaf6698ed2b1aa9525bd97cf79e23a500a39b1e83566cd9a16a294d3", size = 3806366, upload-time = "2025-11-21T01:48:03.525Z" }, - { url = "https://files.pythonhosted.org/packages/84/93/7c0e42ba7d46208fb0f851e06c05de071962170f3a3b2a2260d8a3f66e7a/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d0f3546a15503329880063aca31266b301b0b781f618f832585bcd1c9efcc876", size = 3981800, upload-time = "2025-11-21T01:48:17.789Z" }, - { url = "https://files.pythonhosted.org/packages/de/43/9c5736569dead60b33e46b7c485e24804d950693df70dee306e153547789/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6937f3cabebf673f8b726d60d8ca160b46401de8b08c8e257be22772c12c2001", size = 5068955, upload-time = "2025-11-21T01:48:32.569Z" }, - { url = "https://files.pythonhosted.org/packages/71/5e/f26f9bea2af96b0d070e980dcc2196d369a678e06141ed260de5ca72bcc2/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f29ba92053e8ad4bd60d72188518f033ca4abc1f34eecebeb41ee7b790612e00", size = 4104946, upload-time = "2025-11-21T01:48:45.801Z" }, - { url = "https://files.pythonhosted.org/packages/fa/08/473796b3e0c03b35292220de88c8efa3e74d6174e807b26a371f2523a4b0/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a5d05a312fbb76821566b1d144c64d0923fcbd790b2c7376ee11f62472b2fe", size = 3917533, upload-time = "2025-11-21T01:49:14.631Z" }, - { url = "https://files.pythonhosted.org/packages/b9/7a/7b62b839c3a9878a7d91b8395e0b7b04483e4bec687e073df0fbd4056583/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88fe8fd33b16a06e9b3b7638b51d24047f1d01af12cc2e3e2653140877bddef6", size = 4318837, upload-time = "2025-11-21T01:48:58.953Z" }, - { url = "https://files.pythonhosted.org/packages/ea/86/309c55a9c63f316e3a04949ade8847b8e5acbdd21645696911175f0e1814/geoarrow_rust_core-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:dbecc2487cc95526ac77797cd70c199e196811b0a9e877c1b61fcaca508575fa", size = 3320081, upload-time = "2025-11-21T01:49:58.861Z" }, - { url = "https://files.pythonhosted.org/packages/1a/ed/514cff089185d71242a62e774e2c59dda147baab65929851b66d72198d5d/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e26ca240d7a6a0fa1b4f56a9ebe07b2e14fc7c1c9507aa862bd31ef14e0521f0", size = 3572326, upload-time = "2025-12-03T18:51:08.477Z" }, - { url = "https://files.pythonhosted.org/packages/77/21/22f8233235bd020db22b4f2bf888f9aeed08813eda7b8b421a6963bdc7e4/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46876e3528685673e08b4cbc696dca7f22fb073a83318708b0eaf640107b923b", size = 3335166, upload-time = "2025-11-21T01:49:30.632Z" }, - { url = "https://files.pythonhosted.org/packages/bb/eb/0c2e40a6a1bd450347a8a9fc7648ca840710bc177ff6eed3fc5da6ef981a/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d5502bd12ede712d9b4725753df4db231a0aa6d3e131079bc4b6452c436e37b7", size = 3800540, upload-time = "2025-11-21T01:48:05.583Z" }, - { url = "https://files.pythonhosted.org/packages/4c/42/22d3b8441bb7041a6fcdb4cf0a1108e150513a52f8a407715188412bc71f/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f04dd7dd03449dba6d15f7d35c6c708637ac05f125638f56206e876756cd4c5", size = 3984840, upload-time = "2025-11-21T01:48:19.719Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/477b6b2389398dc983026a4ab7dbb7ec121284ad5fb864a1b7a4658c3881/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2afce33d0c3fa87d5d4d24d6617732e4297da3372b1746569b759f9b62aede1", size = 5067358, upload-time = "2025-11-21T01:48:34.373Z" }, - { url = "https://files.pythonhosted.org/packages/62/50/6995e9d11462635972b2fc09c8e1e510928563ca4fb0fd2c9145cf6ef771/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e63cdb661652a9836dc86cb5995ad269817d88b80f4cce6ed236a7f80f0aba", size = 4105773, upload-time = "2025-11-21T01:48:47.461Z" }, - { url = "https://files.pythonhosted.org/packages/a3/21/b369208495f213db0a0e7d563358307a706cc6af0cb9c897dacf28ae06a1/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adbaf97cb770aef69df8a16437c9faa67adb2b04856faf45bcb61d5b986101dc", size = 3914659, upload-time = "2025-11-21T01:49:16.35Z" }, - { url = "https://files.pythonhosted.org/packages/1d/49/fccb14c6ee9bb715451e4d5bbe3d571eb59a8a1abe21b2abe0d9d48a7fac/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:202f35b301caa5154d95fd74424a1ef6449306e4f6fbfb5140270e48e94188a5", size = 4315153, upload-time = "2025-11-21T01:49:01.075Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/1c/88b16510e24a4a3332284669085673701b9fe4d6a511b4466c90655a9daf/geoarrow_rust_core-0.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:491405dfcc821a2c599e381cc9923e04a758deb1cc84fdb5794b519446c2f8a8", size = 3320510, upload-time = "2025-11-21T01:50:15.545Z" }, - { url = "https://files.pythonhosted.org/packages/cb/5f/1dbdbc1dde2140937cff20188cb25034b6f39e1734c14ca6510cf464bf77/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a8145a562e94419402dd0882bb62429853804c53d47dbea944f2a24abc57abd2", size = 3568115, upload-time = "2025-12-03T18:51:09.743Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e1/b62676f89ef3b866676967989ee8dbbd3d16c77f69aa4287825703268c42/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:51040a5afcfa0cd3ab372d981375c7fe8eb652d155e3964d52ed51d14faa04e8", size = 3325336, upload-time = "2025-11-21T01:49:32.67Z" }, - { url = "https://files.pythonhosted.org/packages/1f/89/94e20f255712ff0eaccf9bfeac4bf51953ebcef0599cfc92f67037f8ab1a/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fbf8506848b0254b3c89b27c045be38bbef6372b21714cad45d76b0c8cb92ce", size = 3808535, upload-time = "2025-11-21T01:48:07.618Z" }, - { url = "https://files.pythonhosted.org/packages/e7/e4/37c7e2c9e251148be17292d0656d7d1ab35019678f6bd11090a41c270d18/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c1a0d9c14bf2f36676016c753517d9470381969c2a67859716cceae33735f3ee", size = 3978997, upload-time = "2025-11-21T01:48:21.551Z" }, - { url = "https://files.pythonhosted.org/packages/71/27/c4ba353d9b77889136bdfd1c0cd1a04d6eade9da6e0748b06719c458afb5/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df97301782ecbaf5f2f0252011a9ff309471cde25435bdf1e17b29c263ebc16", size = 5066492, upload-time = "2025-11-21T01:48:36.142Z" }, - { url = 
"https://files.pythonhosted.org/packages/a6/81/34107fc9aacc489e41afed420202645675b41d85b46dc70d5ba222312791/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1948cfdd0e1c7d03a0c2067821dd536ab34d1e726515202e51fbd6b0d9f775f", size = 4106130, upload-time = "2025-11-21T01:48:49.144Z" }, - { url = "https://files.pythonhosted.org/packages/92/5f/2e348b884738fb213fb3b4745955baeeaf047aecb37639e39a4dd8f12d99/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95b1611b66c386cc6c74e990df4f114bcf24956a35e18e51bf6331c079a36688", size = 3913166, upload-time = "2025-11-21T01:49:18.228Z" }, - { url = "https://files.pythonhosted.org/packages/bf/81/fdda8bb5f84df82bc9e000435a88be46d46dda41eb5149f624ed96b7031c/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1751357a1aaa26aeb5feb6f66873b6a2d369655039f7278dedcb692b512111cc", size = 4313573, upload-time = "2025-11-21T01:49:03.184Z" }, - { url = "https://files.pythonhosted.org/packages/a0/14/ca0bc7d3b158094e769ba2bbc43d203330e7e457ed67b50af97d3eac45df/geoarrow_rust_core-0.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:16fe159043a444579948864808ebec8c49ec167ec0df3cb772dfb88de268bc91", size = 3318746, upload-time = "2025-11-21T01:50:17.319Z" }, - { url = "https://files.pythonhosted.org/packages/85/b8/94e4f8fb32ef705cf65031a24c58cdc441042a68a794b74757a6561cbc60/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6c1b692f76b613757438bf23cfe3be4a8715f0268afd8ad3ca0063c257a3be4b", size = 3568328, upload-time = "2025-12-03T18:51:11.291Z" }, - { url = "https://files.pythonhosted.org/packages/7c/45/a96e64f9febc3436766c5055508c4e823cce56577529d7b76c4e4f584bc4/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a2b4f9a8cfe852a0ba9a667258307db9e354b470b7e0a03edffd0b7daf9b6f5", size = 3325879, upload-time = "2025-11-21T01:49:34.941Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/c0/c719ce3fb4e982e28c71f65a80cf697d07d733336e6b74d7d1b8a7daf9d0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8248330f5c3e7ec5852d0a23c23b31a08395300ef9544109e2991317beddfee3", size = 3809144, upload-time = "2025-11-21T01:48:09.562Z" }, - { url = "https://files.pythonhosted.org/packages/e2/8e/2ab3563b2ffd13f2dd69c050a901de0a4bb325879531a66f56d30bc7337e/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:775e9fe45c06d02be59b1497c60aa4f7a7c1d460387bf5f63142faf39b8ad4ff", size = 3978886, upload-time = "2025-11-21T01:48:23.335Z" }, - { url = "https://files.pythonhosted.org/packages/db/0a/31625caa0a32e8e9e7aaf2514a840dda0dadf8e2452710ebc10e5f469494/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94de8fb01da3f22332eab28b03570c43cc36492ce482c254fe87e851ae21285b", size = 5065429, upload-time = "2025-11-21T01:48:37.896Z" }, - { url = "https://files.pythonhosted.org/packages/11/8d/ee247bd4ccf3b0791b8669357d440e3960d4dbd5cca940a2e226e8910c31/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c70a63d1d36687a53dc6c2933446b1435c187e4c616cd84844d89b6ba13bc4f6", size = 4105436, upload-time = "2025-11-21T01:48:50.874Z" }, - { url = "https://files.pythonhosted.org/packages/a9/fb/c1e92716ee5aa00d48b650f0cb43220a1bf4088c8d572dfc21d400b16723/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e505312f2761393fe5158242f3f2d77e9daa5cca63badd8d66e6d1d69fc17bf", size = 3913672, upload-time = "2025-11-21T01:49:19.873Z" }, - { url = "https://files.pythonhosted.org/packages/f8/6f/ef47f6070c5d5cf0d061d5f5ba95aed7e895e4720a784b84c911c0209fc0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a732e58549108df8267ab72fa6cc7c54e5a9e30b818d8d869e301a9de9d3029e", 
size = 4313496, upload-time = "2025-11-21T01:49:04.953Z" }, - { url = "https://files.pythonhosted.org/packages/3c/ac/2696b979623ea02129e342f8820c89d03fa5a253a913ad00b588d6dd2948/geoarrow_rust_core-0.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:9e1d6492b1388b9d5ae898728838ada78dbf2340d2e9dd25ad3df6ccdd058813", size = 3318780, upload-time = "2025-11-21T01:50:18.928Z" }, - { url = "https://files.pythonhosted.org/packages/4e/42/0cb3af24b01d3897a9eee6af5cc0676bf6b80364e0d4638e45a5fc873d35/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3748cc8e8cb2bcedaede27cefed6749d4eea93e358b49a2f0b061d8974dd1b91", size = 3560313, upload-time = "2025-12-03T18:51:12.897Z" }, - { url = "https://files.pythonhosted.org/packages/51/bc/33f8c918e46188707ab358752b993bee9184fa62e580998c1ec4c37885c1/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b0e232fe4e239ca435d0bab638934eee87d758024c1727ee24a2b8bc4d8bc7b", size = 3321855, upload-time = "2025-12-03T18:51:00.056Z" }, - { url = "https://files.pythonhosted.org/packages/f4/d7/aeb2a3922670ad57f62cb591bd0309a8300ceeec6efc7f925a563c9da672/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:843444ada2c7f7670fd9df3bdebd93e5247b376d1dd20c4fb3828632847ab78e", size = 3799057, upload-time = "2025-12-03T18:50:28.982Z" }, - { url = "https://files.pythonhosted.org/packages/76/08/606e55fc2a0e85b02e0fde7dec2014eb8f1463e8a823496d72a3095de73d/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:880641183a09ebfbca3a6357071f137d1a4b0f1ba606fb9127a01cf58faaef56", size = 3968892, upload-time = "2025-12-03T18:50:34.661Z" }, - { url = "https://files.pythonhosted.org/packages/10/1f/e75fd5b59e9e582190c11ec73c91728d96e90608a22e0aed7365439d9534/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6bb69024257d2fd20da691d1e15bcced874d278884218b64690256982fa30cb1", 
size = 5049247, upload-time = "2025-12-03T18:50:40.542Z" }, - { url = "https://files.pythonhosted.org/packages/7e/95/2257b9b148c8c6557387e67828a5096ebc519b997a158ffb67a0987589e5/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85464a1bab81068789de5fb19684e43709d2ba6d64d5655aace7c50b35893d6d", size = 4099850, upload-time = "2025-12-03T18:50:45.341Z" }, - { url = "https://files.pythonhosted.org/packages/b9/07/8c8aaf8755ee7c137f0898823bd005ffb16edaa6accc0cc1a9a747d56ddc/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7eb773a101f1d9716d750bb326991885a7c4576e85d9a016a567a3b07380bf07", size = 3908308, upload-time = "2025-12-03T18:50:55.587Z" }, - { url = "https://files.pythonhosted.org/packages/dc/7e/b8f1933be03d9a3a6416edf29fc23d520e45f00fbde6bd8f0614ad6f8a69/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:920e6fed857acd2145a8fca7c6fad17094873f586ac5efed7049ce43a7af4ff6", size = 4307178, upload-time = "2025-12-03T18:50:50.429Z" }, - { url = "https://files.pythonhosted.org/packages/df/95/a8ba3d7e51ec02ec954d0247c6021b36de5935a9a3845c1cf6c1348cd6e3/geoarrow_rust_core-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:9887119cc31a763c34ed8676d06434b47971517e86f8e35c640b494d05e7d5ac", size = 3316511, upload-time = "2025-12-03T18:51:18.831Z" }, - { url = "https://files.pythonhosted.org/packages/ea/6d/4b2f51d0e4ac683217852d79c3acef719ca116f418d9ce8f4dcc6d717716/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:420a720217b5a7ec6f7977cfe7e7a729c73381ed5e63112fdef33bd805b9cf8a", size = 3572216, upload-time = "2025-12-03T18:51:14.544Z" }, - { url = "https://files.pythonhosted.org/packages/f0/55/85a2948b10ad9ea347597f90355d8992745f00fedae54916205c8c9b80fb/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fb9c8c6bba4e712edf475ce3c78bf13f7b10f750256f57deb29c3222eaef033", size = 3335928, 
upload-time = "2025-11-21T01:49:51.601Z" }, - { url = "https://files.pythonhosted.org/packages/4e/98/fdd6c34ff8acd878c31e9f5fe4792f49d437e0465e0b60c24d6cdc287ed7/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9547ead76eac906b7a583ee65fa137e6b8ed34c0f128c1745a290c451726f27", size = 3808249, upload-time = "2025-11-21T01:48:11.192Z" }, - { url = "https://files.pythonhosted.org/packages/8a/a1/fd6741b5c1d7d48b5f6ab58a994a91c86e29d19ee7bca2636590b8ac9a54/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eaa8e8f40ca8fcd367735cb4226c5aa5171a713d75bc2caab9a03bd9f59d7bf2", size = 3984081, upload-time = "2025-11-21T01:48:25.595Z" }, - { url = "https://files.pythonhosted.org/packages/91/1e/2b5a9b65bf19a79d212ea0fe60fa5632ec4c89bb64ee446272b47e5cd6ac/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08992719a2accbf993837a6aad615e3f2bf1954d2d9152e507dd79621c87e9d3", size = 5071749, upload-time = "2025-11-21T01:48:39.673Z" }, - { url = "https://files.pythonhosted.org/packages/08/7a/6b37f5e52300b60854b74f4cdc9fbe613c692a15c3ae42f1952f3849bc86/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:071c0e72c4c2047326ebec8d76ce2debcdd59e187207433c3a29ac2da861ca92", size = 4107621, upload-time = "2025-11-21T01:48:52.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/3e/f849642ef4e1f54bcc651903f19a219c3d2be68d27f4ceb282a07ebba7cd/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49d5a9e7b73c30dd1790a3e0faf30b7a4ee393c127c5a799d543653d1d80f0c", size = 3919352, upload-time = "2025-11-21T01:49:21.495Z" }, - { url = "https://files.pythonhosted.org/packages/84/c8/57318cb04d061788d5ba523984915c98523e9eb9b7ba4937ff3438e045ef/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:909152922ee42197b8ae846a8b6c5383c6f3ab39fe627ec8539765e3a634de68", size = 4320006, upload-time = "2025-11-21T01:49:06.588Z" }, - { url = "https://files.pythonhosted.org/packages/13/9f/be16e191fdedbac4d9c01096327917a948625619423c666ec3db2191b4ab/geoarrow_rust_core-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:796c84184fe5e65e30df9f9f45aa8c1680f07689ea71ed1960faa7324fb67e52", size = 3321071, upload-time = "2025-11-21T01:50:20.844Z" }, + { url = "https://files.pythonhosted.org/packages/70/a7/9de5cdcb86089ef4d9a24940838a72ef0655d5be11b46dc4ee807b0d7772/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e1dbbca927858c05ef4eaa5e13a3977a62183cfa3f17fe7b19dd2d88ecf24e91", size = 3855749, upload-time = "2026-06-11T19:24:32.965Z" }, + { url = "https://files.pythonhosted.org/packages/54/48/da86c2bd1db71849f003f5a8eb78ce54f7a33341d5b33ddcdb480b5aafb4/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce7e126d340f335bcc108327cbf7264539e856cb6a299f59757a6ee8329f6643", size = 3710538, upload-time = "2026-06-11T19:24:34.925Z" }, + { url = "https://files.pythonhosted.org/packages/f6/65/7f8ecc05447a85f14643170de8a29715e7c3e732fbb7132617772d39eac7/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88eb7982c1345fc4c4b18d9895602f0148c9495fe7ac00df03a92c20c8058149", size = 4198382, upload-time = "2026-06-11T19:24:37.02Z" }, + { url = "https://files.pythonhosted.org/packages/41/57/b11fbb277fab166d8a8940bc1151bbd1aeef537e70c55f495ff85178f827/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8368b91d4cab5cb5ad1b0f7369da4cec196d82bf73aa3823618a99c1bd4cf04", size = 4270350, upload-time = "2026-06-11T19:24:38.726Z" }, + { url = "https://files.pythonhosted.org/packages/6d/16/0c35e5aff4aca77d818b28d79f9ce20fe1c282ef26d6a2fcc764f3a55f26/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:2955d82d0204197c8e96adbfb70f252fa5987821dd8f202e712a84bfb5b876d3", size = 5602389, upload-time = "2026-06-11T19:24:40.198Z" }, + { url = "https://files.pythonhosted.org/packages/e0/06/58e4d0c94f7d8897ca5e2469fe5db0dd937bfc3cd676dea43c6ce488effe/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacbc2231b03c674975d5a25ff549c367dd8c07147c41edb5461c8ebda693739", size = 4414385, upload-time = "2026-06-11T19:24:41.779Z" }, + { url = "https://files.pythonhosted.org/packages/09/65/902e986d01d4978e752c1d0d5b15873de712321ce3f61c285f491e4149b9/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f5726fd638563d11dfefd7d17dd769e679ac1efb868178791573de19d16b41f", size = 4251263, upload-time = "2026-06-11T19:24:43.556Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f1/b1e0f93ea5288706f08ac7c01f332eb0feaa128251f3c2c9896e5f42cba5/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df7a0319cdec5d0e4ffc3f17a171e16787e7719f85f82c8cf0035d873ec31e62", size = 4747229, upload-time = "2026-06-11T19:24:45.281Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/77ebd20cb5cf5eb18c5bb0e32e07f76ec915a728ea123e075365f0b6c53c/geoarrow_rust_core-0.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:19ce5fb18025480461253d0a03f20cbb635163214b5f193b0700bc1a407dfe4d", size = 3601298, upload-time = "2026-06-11T19:24:46.721Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/d50e482a56d9543119be40000bc405b725242b6056809bbee3a75eff2411/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d91b5249d5e1da53a79268759601c107beb69a8944dd3b5b225e9515ab63d519", size = 3856056, upload-time = "2026-06-11T19:24:48.331Z" }, + { url = "https://files.pythonhosted.org/packages/04/e3/f4de7795959d95d88b32b85740d5d2d6b0a2e17233258f0331aee6cb7b13/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:14412f02c1e60c92d2f88bc9f92835cf6d80f1da37fe8ba462eafdb7bd570f3c", size = 3710092, upload-time = "2026-06-11T19:24:49.802Z" }, + { url = "https://files.pythonhosted.org/packages/b4/48/04888477c2a12fbe6a6f8898bd026facdc3a929b4e747d7b569e6d20dd58/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc5d6db2341568b1e44678ccc0ade1ca1e7660a2c186ebf8bf847acdb160f2cf", size = 4197891, upload-time = "2026-06-11T19:24:51.245Z" }, + { url = "https://files.pythonhosted.org/packages/fb/2d/c16b6eb6f9f2ab213dcd0cd2ac0dec2eae1e2ce5922b3fbeb7bb1ac2a865/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45f4193b9d6f6caae969d8448f3687a19f0998d757519a091df609c06ffa68a0", size = 4269771, upload-time = "2026-06-11T19:24:52.781Z" }, + { url = "https://files.pythonhosted.org/packages/47/fd/2ee73341c37d554ce8d0b67a95525700ec32194fa785261c17262afadfc8/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf9ca054562fb4610c8e5ea140fa1bf746ccc16de505d3a5684abd2fa11f9538", size = 5601846, upload-time = "2026-06-11T19:24:54.63Z" }, + { url = "https://files.pythonhosted.org/packages/67/05/229234ae7bf1d39306e41896f3055a2ae847707ce58f21bd0872b9a5764e/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ec9530fefb653f9a2e605cc26fc1c0d1ffa5c4923ec1037323ba9a16744f8ccc", size = 4413741, upload-time = "2026-06-11T19:24:56.015Z" }, + { url = "https://files.pythonhosted.org/packages/eb/5a/7875548a48231b02f909d3d8c7d74ba47867b2af3396e7aed59cd3b2b40d/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2813aceabb29567d96f29fd2d3099d6f8decd0f5f968ff81ed1a664751dc84a3", size = 4251434, upload-time = "2026-06-11T19:24:57.527Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/46/ed0370def1a950f185edda603a02276bb412a9c95ad5a052c9e919b2df78/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:49686767d1379ff3b165f9d35a73e96fc25daba786ce27cf3359c5feac880fd0", size = 4746598, upload-time = "2026-06-11T19:24:58.979Z" }, + { url = "https://files.pythonhosted.org/packages/44/bc/3a1720be855d7d0011416b7f0a7b7e33546b0fc7320faf59b05e401adff7/geoarrow_rust_core-0.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:fd9cc8c47af736dd087575306088e73b28a720f52e5c3342968851ddd2fb5778", size = 3601329, upload-time = "2026-06-11T19:25:00.459Z" }, + { url = "https://files.pythonhosted.org/packages/24/b2/65db3af5fcc7d64ac7ac86d7debc6a90803bb076c8f7d4599c167be79fd6/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:86aaa60e5b6d99be08f9adc9e58bd088135e1dcfebd290085228ed8a0e93e90f", size = 3848323, upload-time = "2026-06-11T19:25:02.079Z" }, + { url = "https://files.pythonhosted.org/packages/27/9a/37bdd36d7feb9d591b9ccdc1952c6171b04dc777b999e2082b810eb1dd45/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fec148cd63e616d9a7aa00c4ab08693eeec55aca7c9d700aa6451cd8001d0e08", size = 3707679, upload-time = "2026-06-11T19:25:03.594Z" }, + { url = "https://files.pythonhosted.org/packages/45/b7/8d2998284de21d0feb2a0935c41636f8ebf2b65723d8139026e7f9f3d5e8/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1944f3d548b6296e9fbd668602accae0ad68e49ee0f5b8df9e7ea4f474e4ae", size = 4190279, upload-time = "2026-06-11T19:25:05.21Z" }, + { url = "https://files.pythonhosted.org/packages/25/f3/140209f53a70f261ef1459b08eea25c4edef3ad9f6ec0924033b5285ee7e/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f5c04195cbedf5d1684a50203e862d979cda0d6218aac32f607d6e3f7cd65c8", size = 4264876, upload-time = "2026-06-11T19:25:06.654Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/32/0097bfb92816ef91b38f7e757f65fe8456e56152ca51cd7a05b1be8a2e40/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:671c6be9cbc68295a68598fc8c6ddd875de063a795d64b2cfd10d36abd1ee324", size = 5586563, upload-time = "2026-06-11T19:25:08.376Z" }, + { url = "https://files.pythonhosted.org/packages/fd/86/508fe299aa44afe95399d9fa73cdbc7a451841803b8f1431e8c3d0b26ec1/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f4726fbe09d545a507993f2f76c2be7812fef3c20c994ff33c32aaa96aaa212", size = 4402886, upload-time = "2026-06-11T19:25:10.302Z" }, + { url = "https://files.pythonhosted.org/packages/46/81/fc34afcce2b0f17424610405481f69f3c6e4d670c5c94170d71ed6719794/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0fa37a90312e7ca06921be56cee183c12c442b345fadd982480cd1f8ed2eede", size = 4247331, upload-time = "2026-06-11T19:25:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0d/af42431f80282a2f7e1f3e496c39483dd2362e11f8008c65033be9d2ba4c/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f41a8c0a9f3558d73537dcad83c88b29c2a169bcc7766dc677e8245a98a5e95", size = 4741954, upload-time = "2026-06-11T19:25:13.964Z" }, + { url = "https://files.pythonhosted.org/packages/cc/e5/be80aa4384f16be6a20828fd4cc67da18bd2266366f80c9bfefa481559f8/geoarrow_rust_core-0.6.3-cp312-cp312-win_amd64.whl", hash = "sha256:382f0914c75d84b87420aef7b6f11e8b5d4d58b5f5db7c8d199815e4dd282a42", size = 3599115, upload-time = "2026-06-11T19:25:15.357Z" }, + { url = "https://files.pythonhosted.org/packages/19/52/93bbf15979ce656d09821f02f82420957fdc99ee4cd37e5e2d8c99a324da/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c11190008ed6a571b8ca4ef769198e95434dbe7c3caefa9acd5f0ceba1ed868f", size = 3848682, upload-time = "2026-06-11T19:25:16.914Z" 
}, + { url = "https://files.pythonhosted.org/packages/a8/1e/1665171a3756b1977b7240a8f518bbbdfa778dcc156e0f90d659723468fb/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1308ad09912fb67a43ff7dd7dbc685ca8a8fbd8028d3876eb187b6b082a98a7b", size = 3707868, upload-time = "2026-06-11T19:25:22.483Z" }, + { url = "https://files.pythonhosted.org/packages/ec/38/e344ccb72473b8756c8f2dae3a8a9339e1821884a2a50befbad45150d178/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1247b961c61656596631ca3380d405f8d0a2f60f045f8b8a3a335b1a849dc55", size = 4189835, upload-time = "2026-06-11T19:25:24.116Z" }, + { url = "https://files.pythonhosted.org/packages/22/10/bc92b9fcdc628fa1ff7e234219701cd575b0a78da5fdf3a6c8884e5ca445/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c2cb90116255c3f74d5aee563405f3a440bd4eb75471adac13cd0c80a2564dc", size = 4265584, upload-time = "2026-06-11T19:25:25.628Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ed/67edd70967851bef3ef9e35d8ccef242923ed69104ecb885ad3adf4de9a2/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a993d3a0964b8cf55a51bd404225dc3037b51f34b01c6bb1312611ce61f9b2d", size = 5586300, upload-time = "2026-06-11T19:25:27.32Z" }, + { url = "https://files.pythonhosted.org/packages/76/a6/a20fba654caa314b4688ad9dceb5e99fa7956bbf92b3059baa36e06c59b3/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbd153a3348d166ecb57b2770b69b17c2df14cf303d41cd9168adba77532a31b", size = 4402375, upload-time = "2026-06-11T19:25:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/ed/5d/c8949bb5916ff80186c854792b9ddadc9f3069db09d31311f24d82ba7096/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb5aaf3a6f104145b4c5a3188b1be589849b2599626c0e40181a18fc2e79f68", size 
= 4246712, upload-time = "2026-06-11T19:25:31.015Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/c9b7afa2929b697a164ae18f35aba517bcab85efcf19cb48ffa5ac66642b/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3b33be8308a479f3a3a6d3a664861d6b5f8b1ad8822798f5a7e5d9af0b924eb", size = 4742863, upload-time = "2026-06-11T19:25:32.468Z" }, + { url = "https://files.pythonhosted.org/packages/57/5c/55a8d753bff924959837c39c9aa37c7813c5929570a2629ae4ece811505f/geoarrow_rust_core-0.6.3-cp313-cp313-pyemscripten_2025_0_wasm32.whl", hash = "sha256:a090191ae224e8490a95e68038db7a14df8f0326706f10c2e958621bf6c06ef5", size = 1979216, upload-time = "2026-06-11T19:25:33.905Z" }, + { url = "https://files.pythonhosted.org/packages/71/c7/a9f93af9306fd3743a96cc61bfdd7fc9194c38026f7904c067d4b4a99f0c/geoarrow_rust_core-0.6.3-cp313-cp313-win_amd64.whl", hash = "sha256:2606d6f5afacdb49145b39d3e024efadf33f847b596c19c9b6d3030d6beb2721", size = 3599237, upload-time = "2026-06-11T19:25:35.452Z" }, + { url = "https://files.pythonhosted.org/packages/8a/7a/6993bd89e12d0b227b611a53c657b38e63f906dfca773accae3a1f3815a4/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:370cd1ef46bf18fa598f3038fe6f417b016da211ffe060f2b60e47dd2f684a34", size = 3854961, upload-time = "2026-06-11T19:25:37.045Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c4/92cbcabd2a6add1b69a76a22a349fa219bdfed8026dfab4b8ec230bf9943/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4dbf733db0bc57859d1a34c4bc8c50805f19e60081496967588e43f1f606e885", size = 3708325, upload-time = "2026-06-11T19:25:38.638Z" }, + { url = "https://files.pythonhosted.org/packages/07/b3/8fc34c5efa95cd597328876b6295fbe280d4b71df615655aaa2cd1618881/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45ac6715e790b1ca9be38ceb8ee39cdfe29395d29c83541f7a1190812290d81d", size = 4196828, upload-time = 
"2026-06-11T19:25:40.329Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f2/bd2026862995ff96eb6b94d2fc56f7bf737d13f6bac9662481eaae23d079/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d14917d471dce8ee5a0976ec50b5da800bab0117bfd72bc56e23518a1dbbdb3a", size = 4265577, upload-time = "2026-06-11T19:25:41.91Z" }, + { url = "https://files.pythonhosted.org/packages/3e/01/73d69c5205a34e043026a73048d210f448a986ebb577deee7ceb1923fb5a/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43a371299305388663131321f0d623fc70ca4a3840f973598946b5183e5ba4e4", size = 5592303, upload-time = "2026-06-11T19:25:43.503Z" }, + { url = "https://files.pythonhosted.org/packages/98/20/fe35466e526a5d363ebd9c9dd16985dbad7fd677b90e1f123a8180bceb44/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23eddb8dd65dfefb397762cc3c3f6bfaffb4271641bd9dc8043a9ab3aa4cd72a", size = 4409972, upload-time = "2026-06-11T19:25:45.114Z" }, + { url = "https://files.pythonhosted.org/packages/e5/c8/dc588827ad6e8dad75413bc1d35b5189c8a011a2be4827499a4ab9402253/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43ce7b3aaeb0e8c8ad7c37c84ceed49e10d0929a5a92042c3f6ec5ef33271de4", size = 4250885, upload-time = "2026-06-11T19:25:46.649Z" }, + { url = "https://files.pythonhosted.org/packages/e6/e2/a9923e4c5848ace6e3e6f09a40d3860955f7d836675affe35bc79bc27033/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c67201bd662e4732a822f91651111bc024329b3e71eba9f4eed19e58c9cf789b", size = 4742518, upload-time = "2026-06-11T19:25:48.098Z" }, + { url = "https://files.pythonhosted.org/packages/e6/c7/3112def9e93e88341210dd22b4d04c598fb4d0726adef2114b68157354d5/geoarrow_rust_core-0.6.3-cp314-cp314-pyemscripten_2026_0_wasm32.whl", hash = 
"sha256:8461e6d07a7b39ab099c9885a68d5e7983d4e83a82a42dd5b331c543683c9d6e", size = 1959191, upload-time = "2026-06-11T19:25:49.668Z" }, + { url = "https://files.pythonhosted.org/packages/ed/0f/de74ce2171c408e4b4a7660f69f6dfaa294797a18a209fa85b1ea79be141/geoarrow_rust_core-0.6.3-cp314-cp314-win_amd64.whl", hash = "sha256:5d2fd45d09bf700e0ca4d30b51ebcd59fb8d1a9eb4a4d7b4fc5f53a6cca59475", size = 3603948, upload-time = "2026-06-11T19:25:51.078Z" }, ] [[package]] @@ -797,8 +686,7 @@ version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ @@ -842,14 +730,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/14/1ec1ba4df851b477d802285e8b770f65e6774f0d6272e4e8548c8758892c/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a10e67d95a134dbb5f657fe3436ea645c6760a4ffef44df211f7d9b8fb687e6", size = 10499137, upload-time = "2025-12-03T19:02:24.514Z" }, { url = "https://files.pythonhosted.org/packages/a5/66/7ad618415790671664e76596c000e812e0bd39e8f347f4eb7b8e3f519a55/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:61ccbb528bbe4834849c501e5990a4a6f4b87976ca6a22df7859f16760c79590", size = 10394123, upload-time = "2025-12-03T19:02:01.248Z" }, { url = 
"https://files.pythonhosted.org/packages/43/4b/4520af8c694ca0932f995c91d604837741522bd02b66414fdff4521abc98/geoarrow_rust_io-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:aa46f6beda6c267f420ea390f071fadd0161094c1db8d71ad54002c006fe7f21", size = 8989484, upload-time = "2025-12-03T19:02:40.081Z" }, - { url = "https://files.pythonhosted.org/packages/69/87/efadbf1bb9d359f55791f7198cf9aa87f0272be6a2d373f5844f5e59cd1e/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:46e3e41b726b250b44a829ab41489e5008280acb8af8e68001230babf04bafd8", size = 9780411, upload-time = "2025-11-21T02:11:30.128Z" }, - { url = "https://files.pythonhosted.org/packages/95/73/5e108b286b219d3a46042cfa0830e0f075f4addd01f83f7c851a933919ae/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb95364b726c34c23fb93ebc9c08b8fa1d52062a4a9c1ac614ff8761a339ba7a", size = 9316307, upload-time = "2025-11-21T02:11:21.195Z" }, - { url = "https://files.pythonhosted.org/packages/06/76/89c387d6d4d303feef328fc9c63df76cea52963e2046f2c092b434fb04a9/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:691a67ef3a5214fb704d1a19d33a9ddf173483c3943056fb965101c19b0edd28", size = 10309182, upload-time = "2025-11-21T02:10:34.063Z" }, - { url = "https://files.pythonhosted.org/packages/ff/08/34ed2d76ebfb34ed6bf3312defad16b2b5246e40d59e46443a6fe19e85dd/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91c82e9cbae6759798a8e4a87adb13ea617090a5498f384fc56c44775653d7f0", size = 11291230, upload-time = "2025-11-21T02:10:57.771Z" }, - { url = "https://files.pythonhosted.org/packages/e9/f5/9c25512c1f31101125555367e55ff28f72f449c8f56ff06c5be9e3feb9e5/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9c2b609addc7a810eab5cd573243710d95afe8486f829edd05b311d51bbb5af", size = 13300664, upload-time = "2025-11-21T02:10:46.082Z" }, - { url = 
"https://files.pythonhosted.org/packages/f5/aa/14be165b439d3a3ffc6ced96f971b02df255e86b82c7e1f9f340d35689c3/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6abdc80e130f472f55598543a4bb9ba522d6502a5d80017a952027a9e9c1d1ce", size = 10486589, upload-time = "2025-11-21T02:11:09.681Z" }, - { url = "https://files.pythonhosted.org/packages/5a/df/1c36bae723561785ce47e463f6366a3c52994795a168d7c4ed5e457e9a37/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c4638a89d61629110dde474b3d410ee2e71c89d2035ab2f2557857e7eee4ea30", size = 10395106, upload-time = "2025-11-21T02:10:20.832Z" }, - { url = "https://files.pythonhosted.org/packages/47/d4/4e9cffad7647c07a5cd1cce68c97102dd011652168e3e09a2dedc1253a5e/geoarrow_rust_io-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:4811e96b1777fcf12ac2416872407b1e4717f9a59fe5b80ce02b1e9a087d1b5e", size = 8988735, upload-time = "2025-11-21T02:11:39.164Z" }, { url = "https://files.pythonhosted.org/packages/e6/9f/32059400bb853eafe5d37d8c4ae9e48cd9c43820287e435cc1566f42208e/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef94f84ba4efb42d63588241733e1b62bbdb4edeac5513baeb7bfb07db4f204a", size = 10303111, upload-time = "2025-11-21T02:10:36.067Z" }, { url = "https://files.pythonhosted.org/packages/6c/a2/7db0a685eafa41e9565a3c4e441f41d2630c084f616d2669c5fe8f5805ef/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:872dd92c52b2df342d34ac42d1b710c91c58e9dd93f5c88098816f9cd9dc8a84", size = 11299498, upload-time = "2025-11-21T02:11:00.19Z" }, { url = "https://files.pythonhosted.org/packages/13/b4/1bfbfbe828ca51b4f314d9f70514c2ff19923714aa7d51ef1b0ec8600aed/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:235a7ea94faa95a4699f6577765a5e5a88bee079828c3d9015d9d5c6c240459c", size = 13299230, upload-time = 
"2025-11-21T02:10:48.12Z" }, @@ -913,13 +793,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" }, { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" }, { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e2/33efd823a879dc7b60c10192df1900ee5c200f8e782663a41a3b2aecd143/grpcio-1.75.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:c09fba33327c3ac11b5c33dbdd8218eef8990d78f83b1656d628831812a8c0fb", size = 5706679, upload-time = "2025-09-26T09:03:10.218Z" }, - { url = "https://files.pythonhosted.org/packages/77/90/b80e75f8cce758425b2772742eed4e9db765a965d902ba4b7f239b2513de/grpcio-1.75.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c12121e509b9f8b0914d10054d24120237d19e870b1cd82acbb8a9b9ddd198a3", size = 6291926, upload-time = "2025-09-26T09:03:16.282Z" }, - { url = "https://files.pythonhosted.org/packages/40/5f/e6033d8f99063350e20873a46225468b73045b9ef2c8cba73d66a87c3fd5/grpcio-1.75.1-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:73577a93e692b3474b1bfe84285d098de36705dbd838bb4d6a056d326e4dc880", size = 6950040, upload-time = "2025-09-26T09:03:18.874Z" }, - { url = 
"https://files.pythonhosted.org/packages/01/12/34076c079b45af5aed40f037fffe388d7fbe90dd539ed01e4744c926d227/grpcio-1.75.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e19e7dfa0d7ca7dea22be464339e18ac608fd75d88c56770c646cdabe54bc724", size = 6465780, upload-time = "2025-09-26T09:03:21.219Z" }, - { url = "https://files.pythonhosted.org/packages/e4/c5/ee6fd69a9f6e7288d04da010ad7480a0566d2aac81097ff4dafbc5ffa9b6/grpcio-1.75.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e1c28f51c1cf67eccdfc1065e8e866c9ed622f09773ca60947089c117f848a1", size = 7098308, upload-time = "2025-09-26T09:03:23.875Z" }, - { url = "https://files.pythonhosted.org/packages/78/32/f2be13f13035361768923159fe20470a7d22db2c7c692b952e21284f56e5/grpcio-1.75.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:030a6164bc2ca726052778c0cf8e3249617a34e368354f9e6107c27ad4af8c28", size = 8042268, upload-time = "2025-09-26T09:03:26.268Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2d/1bb0572f0a2eaab100b4635c6c2cd0d37e3cda5554037e3f90b1bc428d56/grpcio-1.75.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:67697efef5a98d46d5db7b1720fa4043536f8b8e5072a5d61cfca762f287e939", size = 7491470, upload-time = "2025-09-26T09:03:28.906Z" }, ] [[package]] @@ -927,8 +800,7 @@ name = "h5py" version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/5d/57/dfb3c5c3f1bf5f5ef2e59a22dec4ff1f3d7408b55bfcefcfb0ea69ef21c6/h5py-3.14.0.tar.gz", hash = "sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4", size = 424323, upload-time = "2025-06-06T14:06:15.01Z" } @@ -941,8 +813,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f9/f00de11c82c88bfc1ef22633557bfba9e271e0cb3189ad704183fc4a2644/h5py-3.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882", size = 4929422, upload-time = "2025-06-06T14:05:18.399Z" }, { url = "https://files.pythonhosted.org/packages/0d/ce/3a21d87896bc7e3e9255e0ad5583ae31ae9e6b4b00e0bcb2a67e2b6acdbc/h5py-3.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb", size = 4700675, upload-time = "2025-06-06T14:05:37.38Z" }, { url = "https://files.pythonhosted.org/packages/e7/ec/86f59025306dcc6deee5fda54d980d077075b8d9889aac80f158bd585f1b/h5py-3.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13", size = 4921632, upload-time = "2025-06-06T14:05:43.464Z" }, - { url = "https://files.pythonhosted.org/packages/66/40/b423b57696514e05aa7bb06150ef96667d0e0006cc6de7ab52c71734ab51/h5py-3.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:573c33ad056ac7c1ab6d567b6db9df3ffc401045e3f605736218f96c1e0490c6", size = 4326368, upload-time = "2025-06-06T14:06:00.782Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/e088f89f04fdbe57ddf9de377f857158d3daa38cf5d0fb20ef9bd489e313/h5py-3.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccbe17dc187c0c64178f1a10aa274ed3a57d055117588942b8a08793cc448216", size = 4559686, upload-time = "2025-06-06T14:06:07.416Z" }, ] [[package]] @@ -965,14 +835,14 @@ name = 
"huggingface-hub" version = "0.35.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", marker = "python_full_version >= '3.10'" }, - { name = "hf-xet", marker = "(python_full_version >= '3.10' and platform_machine == 'aarch64') or (python_full_version >= '3.10' and platform_machine == 'amd64') or (python_full_version >= '3.10' and platform_machine == 'arm64') or (python_full_version >= '3.10' and platform_machine == 'x86_64')" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pyyaml", marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } wheels = [ @@ -981,23 +851,11 @@ wheels = [ [[package]] name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, -] - -[[package]] -name = "importlib-metadata" -version = "8.7.1" +version = "3.15" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] [[package]] @@ -1030,83 +888,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = 
"2022-06-17T18:00:10.251Z" }, ] -[[package]] -name = "keras" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/8b/065f94ba03282fa41b2d76942b87a180a9913312c4611ea7d6508fbbc114/keras-2.7.0-py2.py3-none-any.whl", hash = "sha256:0c33ae1f728064ca0d35dfba999e9c316f03623bf5688c82fb83cc74a80ea248", size = 1332171, upload-time = "2021-11-03T16:16:34.318Z" }, -] - [[package]] name = "keras" version = "3.11.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = "python_full_version >= '3.10'" }, - { name = "namex", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "h5py" }, + { name = "ml-dtypes" }, + { name = "namex" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "optree", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "rich", marker = "python_full_version >= '3.10'" }, + { name = "optree" }, + { name = "packaging" }, + { name = "rich" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6a/89/646425fe9a46f9053430e1271f817c36041c6f33469950a3caafc3d2591e/keras-3.11.3.tar.gz", 
hash = "sha256:efda616835c31b7d916d72303ef9adec1257320bc9fd4b2b0138840fc65fb5b7", size = 1065906, upload-time = "2025-08-21T22:08:57.643Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/94/5b/4c778cc921ce4b864b238f63f8e3ff6e954ab19b80c9fa680593ad8093d4/keras-3.11.3-py3-none-any.whl", hash = "sha256:f484f050e05ee400455b05ec8c36ed35edc34de94256b6073f56cfe68f65491f", size = 1408438, upload-time = "2025-08-21T22:08:55.858Z" }, ] -[[package]] -name = "keras-preprocessing" -version = "1.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5e/f1/b44337faca48874333769a29398fe4666686733c8880aa160b9fd5dfe600/Keras_Preprocessing-1.1.2.tar.gz", hash = "sha256:add82567c50c8bc648c14195bf544a5ce7c1f76761536956c3d2978970179ef3", size = 163598, upload-time = "2020-05-14T03:53:48.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/4c/7c3275a01e12ef9368a892926ab932b33bb13d55794881e3573482b378a7/Keras_Preprocessing-1.1.2-py2.py3-none-any.whl", hash = "sha256:7b82029b130ff61cc99b55f3bd27427df4838576838c5b2f65940e4fcec99a7b", size = 42581, upload-time = "2020-05-14T03:53:47.192Z" }, -] - [[package]] name = "lance-namespace" -version = "0.8.0" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/80/2b6eaa08c5e25915acaa6368a70211a25b5ba9d2d6006450e68a73936164/lance_namespace-0.8.0.tar.gz", hash = "sha256:c4a79ee221a3b2315c29863ad12d85fcf219a13158e26149d63e21dc4b4673a7", size = 10756, upload-time = "2026-06-01T08:47:10.183Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/af/12/f7ab93b29be3edbf5fc3610714bf2d06088e7f4524bfb38dfd6852458b08/lance_namespace-0.8.6.tar.gz", hash = "sha256:18232e721c8188145f4ec9389cc2dfbeeabf54a619d94885ea1b3375bee9f4af", size = 11529, upload-time = "2026-06-12T17:36:41.651Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/bd/7b40a08fb132fab39a6caebf832fdf6b9befc71be9413beb9be0a9d927d4/lance_namespace-0.8.0-py3-none-any.whl", hash = "sha256:782cf9e332f46bf06836722dd98b53ca8495ad98bb541501ff6876c89b67ec90", size = 12579, upload-time = "2026-06-01T08:47:10.91Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1b/5b1668ee2dc8910965f390640359112a31157092fcf8e000b89c79b58708/lance_namespace-0.8.6-py3-none-any.whl", hash = "sha256:571eae34f9aad70e5b05020416c2860889b9ec82993ccd0eb015e7b39c3ea309", size = 13383, upload-time = "2026-06-12T17:36:43.456Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.8.0" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, { name = "python-dateutil" }, { name = "typing-extensions" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/37/06fcd5a8969381e0ba953d51990af8d331bdccbc62458bf2eed30d064573/lance_namespace_urllib3_client-0.8.0.tar.gz", hash = "sha256:4f060f05ebf3c04aeaeb0d2022cbe77648a3df290f02cd2c305e5797d0fc1fdd", size = 203710, upload-time = "2026-06-01T08:47:13.404Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 
228772, upload-time = "2026-06-12T17:36:42.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/43/e280727feee958f303bc58d5fa912b07734a0831f756d841654d500c2c34/lance_namespace_urllib3_client-0.8.0-py3-none-any.whl", hash = "sha256:6734e341b726e5cc96a0cd257cef27eb9d03013f2d151526ee426cef8e63e228", size = 336669, upload-time = "2026-06-01T08:47:11.88Z" }, + { url = "https://files.pythonhosted.org/packages/c5/90/1e27de15cd1b16785a1c7312beb0a59e75c8344a815f600f58173a565bd1/lance_namespace_urllib3_client-0.8.6-py3-none-any.whl", hash = "sha256:9d78249c3fb15aa3d15d668f78f04a275af3d08d800a7027492f37996ac4968b", size = 369950, upload-time = "2026-06-12T17:36:40.438Z" }, ] [[package]] @@ -1125,9 +951,6 @@ wheels = [ name = "markdown" version = "3.9" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/8d/37/02347f6d6d8279247a5837082ebc26fc0d5aaeaf75aa013fcbb433c777ab/markdown-3.9.tar.gz", hash = "sha256:d2900fe1782bd33bdbbd56859defef70c2e78fc46668f8eb9df3128138f2cb6a", size = 364585, upload-time = "2025-09-04T20:25:22.885Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/70/ae/44c4a6a4cbb496d93c6257954260fe3a6e91b7bed2240e5dad2a717f5111/markdown-3.9-py3-none-any.whl", hash = "sha256:9f4d91ed810864ea88a6f32c07ba8bee1346c0cc1f6b1f9f6c822f2a9667d280", size = 107441, upload-time = "2025-09-04T20:25:21.784Z" }, @@ -1138,7 +961,7 @@ name = "markdown-it-py" version = "4.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mdurl", marker = "python_full_version >= '3.10'" }, + { name = "mdurl" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = 
"2025-08-11T12:57:52.854Z" } wheels = [ @@ -1228,17 +1051,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, - { url = "https://files.pythonhosted.org/packages/56/23/0d8c13a44bde9154821586520840643467aee574d8ce79a17da539ee7fed/markupsafe-3.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26", size = 11623, upload-time = "2025-09-27T18:37:29.296Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/07a2cb9a8045d5f3f0890a8c3bc0859d7a47bfd9a560b563899bec7b72ed/markupsafe-3.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc", size = 12049, upload-time = "2025-09-27T18:37:30.234Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e4/6be85eb81503f8e11b61c0b6369b6e077dcf0a74adbd9ebf6b349937b4e9/markupsafe-3.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c", size = 21923, upload-time = "2025-09-27T18:37:31.177Z" }, - { url = 
"https://files.pythonhosted.org/packages/6f/bc/4dc914ead3fe6ddaef035341fee0fc956949bbd27335b611829292b89ee2/markupsafe-3.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42", size = 20543, upload-time = "2025-09-27T18:37:32.168Z" }, - { url = "https://files.pythonhosted.org/packages/89/6e/5fe81fbcfba4aef4093d5f856e5c774ec2057946052d18d168219b7bd9f9/markupsafe-3.0.3-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b", size = 20585, upload-time = "2025-09-27T18:37:33.166Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f6/e0e5a3d3ae9c4020f696cd055f940ef86b64fe88de26f3a0308b9d3d048c/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758", size = 21387, upload-time = "2025-09-27T18:37:34.185Z" }, - { url = "https://files.pythonhosted.org/packages/c8/25/651753ef4dea08ea790f4fbb65146a9a44a014986996ca40102e237aa49a/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2", size = 20133, upload-time = "2025-09-27T18:37:35.138Z" }, - { url = "https://files.pythonhosted.org/packages/dc/0a/c3cf2b4fef5f0426e8a6d7fce3cb966a17817c568ce59d76b92a233fdbec/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d", size = 20588, upload-time = "2025-09-27T18:37:36.096Z" }, - { url = "https://files.pythonhosted.org/packages/cd/1b/a7782984844bd519ad4ffdbebbba2671ec5d0ebbeac34736c15fb86399e8/markupsafe-3.0.3-cp39-cp39-win32.whl", hash = "sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7", size = 14566, upload-time = "2025-09-27T18:37:37.09Z" }, - { url = 
"https://files.pythonhosted.org/packages/18/1f/8d9c20e1c9440e215a44be5ab64359e207fcb4f675543f1cf9a2a7f648d0/markupsafe-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e", size = 15053, upload-time = "2025-09-27T18:37:38.054Z" }, - { url = "https://files.pythonhosted.org/packages/4e/d3/fe08482b5cd995033556d45041a4f4e76e7f0521112a9c9991d40d39825f/markupsafe-3.0.3-cp39-cp39-win_arm64.whl", hash = "sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8", size = 13928, upload-time = "2025-09-27T18:37:39.037Z" }, ] [[package]] @@ -1279,8 +1091,7 @@ name = "ml-dtypes" version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" } @@ -1315,10 +1126,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" }, { url = 
"https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" }, { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" }, - { url = "https://files.pythonhosted.org/packages/19/2d/c61af51173083bbf2a3b0f1a1a01d50ef1830436880027433d1b75271083/ml_dtypes-0.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5ee72568d46b9533ad54f78b1e1f3067c0534c5065120ea8ecc6f210d22748b3", size = 663552, upload-time = "2025-07-29T18:39:13.102Z" }, - { url = "https://files.pythonhosted.org/packages/61/0e/a628f2aefd719745e8a13492375a55cedea77c0cfc917b1ce11bde435c68/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01de48de4537dc3c46e684b969a40ec36594e7eeb7c69e9a093e7239f030a28a", size = 4952704, upload-time = "2025-07-29T18:39:14.829Z" }, - { url = "https://files.pythonhosted.org/packages/f8/2e/5ba92f1f99d1f5f62bffec614a5b8161e55c3961257c902fa26dbe909baa/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b1a6e231b0770f2894910f1dce6d2f31d65884dbf7668f9b08d73623cdca909", size = 4923538, upload-time = "2025-07-29T18:39:16.581Z" }, - { url = "https://files.pythonhosted.org/packages/70/3b/f801c69027866ea6e387224551185fedef62ad8e2e71181ec0d9dda905f7/ml_dtypes-0.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:a4f39b9bf6555fab9bfb536cf5fdd1c1c727e8d22312078702e9ff005354b37f", size = 206567, upload-time = "2025-07-29T18:39:18.047Z" }, ] [[package]] @@ -1335,7 +1142,7 @@ name = 
"multidict" version = "6.6.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version == '3.10.*'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } wheels = [ @@ -1429,24 +1236,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, upload-time = "2025-08-11T12:08:12.439Z" }, { url = "https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" }, { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" }, - { url = "https://files.pythonhosted.org/packages/d4/d3/f04c5db316caee9b5b2cbba66270b358c922a959855995bedde87134287c/multidict-6.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:af7618b591bae552b40dbb6f93f5518328a949dac626ee75927bba1ecdeea9f4", size = 76977, upload-time = "2025-08-11T12:08:16.667Z" }, - { url = "https://files.pythonhosted.org/packages/70/39/a6200417d883e510728ab3caec02d3b66ff09e1c85e0aab2ba311abfdf06/multidict-6.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:b6819f83aef06f560cb15482d619d0e623ce9bf155115150a85ab11b8342a665", size = 44878, upload-time = "2025-08-11T12:08:18.157Z" }, - { url = "https://files.pythonhosted.org/packages/6f/7e/815be31ed35571b137d65232816f61513fcd97b2717d6a9d7800b5a0c6e0/multidict-6.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4d09384e75788861e046330308e7af54dd306aaf20eb760eb1d0de26b2bea2cb", size = 44546, upload-time = "2025-08-11T12:08:19.694Z" }, - { url = "https://files.pythonhosted.org/packages/e2/f1/21b5bff6a8c3e2aff56956c241941ace6b8820e1abe6b12d3c52868a773d/multidict-6.6.4-cp39-cp39-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:a59c63061f1a07b861c004e53869eb1211ffd1a4acbca330e3322efa6dd02978", size = 223020, upload-time = "2025-08-11T12:08:21.554Z" }, - { url = "https://files.pythonhosted.org/packages/15/59/37083f1dd3439979a0ffeb1906818d978d88b4cc7f4600a9f89b1cb6713c/multidict-6.6.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350f6b0fe1ced61e778037fdc7613f4051c8baf64b1ee19371b42a3acdb016a0", size = 240528, upload-time = "2025-08-11T12:08:23.45Z" }, - { url = "https://files.pythonhosted.org/packages/d1/f0/f054d123c87784307a27324c829eb55bcfd2e261eb785fcabbd832c8dc4a/multidict-6.6.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0c5cbac6b55ad69cb6aa17ee9343dfbba903118fd530348c330211dc7aa756d1", size = 219540, upload-time = "2025-08-11T12:08:24.965Z" }, - { url = "https://files.pythonhosted.org/packages/e8/26/8f78ce17b7118149c17f238f28fba2a850b660b860f9b024a34d0191030f/multidict-6.6.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:630f70c32b8066ddfd920350bc236225814ad94dfa493fe1910ee17fe4365cbb", size = 251182, upload-time = "2025-08-11T12:08:26.511Z" }, - { url = 
"https://files.pythonhosted.org/packages/00/c3/a21466322d69f6594fe22d9379200f99194d21c12a5bbf8c2a39a46b83b6/multidict-6.6.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8d4916a81697faec6cb724a273bd5457e4c6c43d82b29f9dc02c5542fd21fc9", size = 249371, upload-time = "2025-08-11T12:08:28.075Z" }, - { url = "https://files.pythonhosted.org/packages/c2/8e/2e673124eb05cf8dc82e9265eccde01a36bcbd3193e27799b8377123c976/multidict-6.6.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e42332cf8276bb7645d310cdecca93a16920256a5b01bebf747365f86a1675b", size = 239235, upload-time = "2025-08-11T12:08:29.937Z" }, - { url = "https://files.pythonhosted.org/packages/2b/2d/bdd9f05e7c89e30a4b0e4faf0681a30748f8d1310f68cfdc0e3571e75bd5/multidict-6.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f3be27440f7644ab9a13a6fc86f09cdd90b347c3c5e30c6d6d860de822d7cb53", size = 237410, upload-time = "2025-08-11T12:08:31.872Z" }, - { url = "https://files.pythonhosted.org/packages/46/4c/3237b83f8ca9a2673bb08fc340c15da005a80f5cc49748b587c8ae83823b/multidict-6.6.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:21f216669109e02ef3e2415ede07f4f8987f00de8cdfa0cc0b3440d42534f9f0", size = 232979, upload-time = "2025-08-11T12:08:33.399Z" }, - { url = "https://files.pythonhosted.org/packages/55/a6/a765decff625ae9bc581aed303cd1837955177dafc558859a69f56f56ba8/multidict-6.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:d9890d68c45d1aeac5178ded1d1cccf3bc8d7accf1f976f79bf63099fb16e4bd", size = 240979, upload-time = "2025-08-11T12:08:35.02Z" }, - { url = "https://files.pythonhosted.org/packages/6b/2d/9c75975cb0c66ea33cae1443bb265b2b3cd689bffcbc68872565f401da23/multidict-6.6.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:edfdcae97cdc5d1a89477c436b61f472c4d40971774ac4729c613b4b133163cb", size = 246849, upload-time = "2025-08-11T12:08:37.038Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/71/d21ac0843c1d8751fb5dcf8a1f436625d39d4577bc27829799d09b419af7/multidict-6.6.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0b2e886624be5773e69cf32bcb8534aecdeb38943520b240fed3d5596a430f2f", size = 241798, upload-time = "2025-08-11T12:08:38.669Z" }, - { url = "https://files.pythonhosted.org/packages/94/3d/1d8911e53092837bd11b1c99d71de3e2a9a26f8911f864554677663242aa/multidict-6.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:be5bf4b3224948032a845d12ab0f69f208293742df96dc14c4ff9b09e508fc17", size = 235315, upload-time = "2025-08-11T12:08:40.266Z" }, - { url = "https://files.pythonhosted.org/packages/86/c5/4b758df96376f73e936b1942c6c2dfc17e37ed9d5ff3b01a811496966ca0/multidict-6.6.4-cp39-cp39-win32.whl", hash = "sha256:10a68a9191f284fe9d501fef4efe93226e74df92ce7a24e301371293bd4918ae", size = 41434, upload-time = "2025-08-11T12:08:41.965Z" }, - { url = "https://files.pythonhosted.org/packages/58/16/f1dfa2a0f25f2717a5e9e5fe8fd30613f7fe95e3530cec8d11f5de0b709c/multidict-6.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee25f82f53262f9ac93bd7e58e47ea1bdcc3393cef815847e397cba17e284210", size = 46186, upload-time = "2025-08-11T12:08:43.367Z" }, - { url = "https://files.pythonhosted.org/packages/88/7d/a0568bac65438c494cb6950b29f394d875a796a237536ac724879cf710c9/multidict-6.6.4-cp39-cp39-win_arm64.whl", hash = "sha256:f9867e55590e0855bcec60d4f9a092b69476db64573c9fe17e92b0c50614c16a", size = 43115, upload-time = "2025-08-11T12:08:45.126Z" }, { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, ] @@ -1455,14 +1244,12 @@ name = "multiprocess" version = "0.70.16" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, + 
{ name = "dill" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980, upload-time = "2024-01-28T18:52:15.731Z" }, { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", size = 134982, upload-time = "2024-01-28T18:52:17.783Z" }, - { url = "https://files.pythonhosted.org/packages/d8/94/8638a89f93c80df329116e6781a060506c7e91e1f4370dc831e9d17a041d/multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", size = 133497, upload-time = "2024-01-28T18:52:22.644Z" }, - { url = "https://files.pythonhosted.org/packages/89/21/222066f6bb8d8af287923ae3bd26cf4699a9ce020228ac273caca1de8250/multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", size = 133498, upload-time = "2024-01-28T18:52:24.576Z" }, { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, { url = 
"https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, @@ -1479,24 +1266,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/bc/465daf1de06409cdd4532082806770ee0d8d7df434da79c76564d0f69741/namex-0.1.0-py3-none-any.whl", hash = "sha256:e2012a474502f1e2251267062aae3114611f07df4224b6e06334c57b0f2ce87c", size = 5905, upload-time = "2025-05-26T23:17:37.695Z" }, ] -[[package]] -name = "networkx" -version = "3.2.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928, upload-time = "2023-10-28T08:41:39.364Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2", size = 1647772, upload-time = "2023-10-28T08:41:36.945Z" }, -] - [[package]] name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = 
"https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } wheels = [ @@ -1527,67 +1302,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] -[[package]] -name = "numpy" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" }, - { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" }, - { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" 
}, - { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" }, - { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" }, - { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" }, - { url = 
"https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = "2024-08-26T20:08:15.83Z" }, - { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" }, - { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" }, - { url = 
"https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" }, - { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" }, - { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, upload-time = "2024-08-26T20:10:19.732Z" }, - { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" }, - { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" }, - { url = 
"https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" }, - { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" }, - { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" }, - { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" }, - { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" }, - { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = 
"sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" }, - { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" }, - { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" }, - { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" }, - { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" }, - { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = 
"2024-08-26T20:17:36.72Z" }, - { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" }, - { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" }, - { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" }, -] - [[package]] name = "numpy" version = "2.2.6" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ @@ -1874,7 +1594,7 @@ name = "optree" version = "0.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { 
name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/56/c7/0853e0c59b135dff770615d2713b547b6b3b5cde7c10995b4a5825244612/optree-0.17.0.tar.gz", hash = "sha256:5335a5ec44479920620d72324c66563bd705ab2a698605dd4b6ee67dbcad7ecd", size = 163111, upload-time = "2025-07-25T11:26:11.586Z" } wheels = [ @@ -1915,11 +1635,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/d3/8819a2d5105a240d6793d11a61d597db91756ce84da5cee08808c6b8f61f/optree-0.17.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:875c017890a4b5d566af5593cab67fe3c4845544942af57e6bb9dea17e060297", size = 439080, upload-time = "2025-07-25T11:25:42.605Z" }, { url = "https://files.pythonhosted.org/packages/c6/ef/9dbd34dfd1ad89feb239ca9925897a14ac94f190379a3bd991afdfd94186/optree-0.17.0-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ffa5686191139f763e13445a169765c83517164bc28e60dbedb19bed2b2655f1", size = 439422, upload-time = "2025-07-25T11:25:43.672Z" }, { url = "https://files.pythonhosted.org/packages/86/ca/a7a7549af2951925a692df508902ed2a6a94a51bc846806d2281b1029ef9/optree-0.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:575cf48cc2190acb565bd2b26b6f9b15c4e3b60183e86031215badc9d5441345", size = 426579, upload-time = "2025-07-25T11:25:44.765Z" }, - { url = "https://files.pythonhosted.org/packages/1d/29/3bb53de2de3b36a51e46b6d9ada7ee1a3a312ac461cd54292a023adc807c/optree-0.17.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:537498cf7bf7a4fe71f7ffd815e72b8672aea0fac82e1513f6b6e35e8569f5aa", size = 350302, upload-time = "2025-07-25T11:25:52.016Z" }, - { url = "https://files.pythonhosted.org/packages/2b/3b/d17a31447ed7ef6f10bd0caf40742b016fcdeaa3abb7568307b04a0f50cf/optree-0.17.0-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3b3bb2326b550ddb048e3454fad40183b7fed74dda4351b016d20362809180af", size = 405358, upload-time = 
"2025-07-25T11:25:53.085Z" }, - { url = "https://files.pythonhosted.org/packages/db/f3/b9f0a8c98fd0c7f53fa9d9a46d75bb1182aeecd7ecde6f353d3e69ec9618/optree-0.17.0-cp39-cp39-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d3d702044e5acbec2cf8349789f6b096057bd00dc8e1e1c97b990347279fda", size = 402694, upload-time = "2025-07-25T11:25:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/cb/dd/0d9d7426fd6b5d90ad40e4d93717a955d4257d06574dfe7a1da0d24cb06c/optree-0.17.0-cp39-cp39-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a9155e82717be1dda1f3c1244e9cb5b3733d5dd3ba47702730c7816be083a5cb", size = 398857, upload-time = "2025-07-25T11:25:55.921Z" }, - { url = "https://files.pythonhosted.org/packages/d8/57/dacec3f8c70f4685bb07fce19cf3361037fde2b596f6f7228e1a4b39677b/optree-0.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8e825501f55360e8381718623b094579dedc485e57010e01593d72a43b43e68", size = 387849, upload-time = "2025-07-25T11:25:57.046Z" }, { url = "https://files.pythonhosted.org/packages/ed/d7/3036d15c028c447b1bd65dcf8f66cfd775bfa4e52daa74b82fb1d3c88faf/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adde1427e0982cfc5f56939c26b4ebbd833091a176734c79fb95c78bdf833dff", size = 350952, upload-time = "2025-07-25T11:26:02.692Z" }, { url = "https://files.pythonhosted.org/packages/71/45/e710024ef77324e745de48efd64f6270d8c209f14107a48ffef4049ac57a/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a80b7e5de5dd09b9c8b62d501e29a3850b047565c336c9d004b07ee1c01f4ae1", size = 389568, upload-time = "2025-07-25T11:26:04.094Z" }, { url = "https://files.pythonhosted.org/packages/69/c4/94a187ed3ca71194b9da6a276790e1703c7544c8f695ac915214ae8ce934/optree-0.17.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f87f6f39015fc82d7adeee19900d246b89911319726e93cb2dbd4d1a809899bd", size 
= 363728, upload-time = "2025-07-25T11:26:07.959Z" }, @@ -1940,8 +1655,7 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "python-dateutil" }, { name = "pytz" }, @@ -1996,13 +1710,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" }, { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" }, { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, - { url = "https://files.pythonhosted.org/packages/56/b4/52eeb530a99e2a4c55ffcd352772b599ed4473a0f892d127f4147cf0f88e/pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2", size = 11567720, upload-time = "2025-09-29T23:33:06.209Z" }, - { url = "https://files.pythonhosted.org/packages/48/4a/2d8b67632a021bced649ba940455ed441ca854e57d6e7658a6024587b083/pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8", size = 10810302, upload-time = "2025-09-29T23:33:35.846Z" }, - { url = "https://files.pythonhosted.org/packages/13/e6/d2465010ee0569a245c975dc6967b801887068bc893e908239b1f4b6c1ac/pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff", size = 12154874, upload-time = "2025-09-29T23:33:49.939Z" }, - { url = "https://files.pythonhosted.org/packages/1f/18/aae8c0aa69a386a3255940e9317f793808ea79d0a525a97a903366bb2569/pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29", size = 12790141, upload-time = "2025-09-29T23:34:05.655Z" }, - { url = "https://files.pythonhosted.org/packages/f7/26/617f98de789de00c2a444fbe6301bb19e66556ac78cff933d2c98f62f2b4/pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73", size = 13208697, upload-time = "2025-09-29T23:34:21.835Z" }, - { url = "https://files.pythonhosted.org/packages/b9/fb/25709afa4552042bd0e15717c75e9b4a2294c3dc4f7e6ea50f03c5136600/pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9", size = 13879233, upload-time = "2025-09-29T23:34:35.079Z" }, - { url = "https://files.pythonhosted.org/packages/98/af/7be05277859a7bc399da8ba68b88c96b27b48740b6cf49688899c6eb4176/pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa", size = 11359119, 
upload-time = "2025-09-29T23:34:46.339Z" }, ] [[package]] @@ -2091,17 +1798,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" }, { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" }, { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8e/9c089f01677d1264ab8648352dcb7773f37da6ad002542760c80107da816/pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f", size = 5316478, upload-time = "2025-07-01T09:15:52.209Z" }, - { url = "https://files.pythonhosted.org/packages/b5/a9/5749930caf674695867eb56a581e78eb5f524b7583ff10b01b6e5048acb3/pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081", size = 4686522, upload-time = "2025-07-01T09:15:54.162Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/0b85b763eb292b691030795f9f6bb6fcaf8948c39413c81696a01c3577f7/pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4", size = 5853376, upload-time = "2025-07-03T13:11:01.066Z" }, - { url = 
"https://files.pythonhosted.org/packages/5e/c6/1a230ec0067243cbd60bc2dad5dc3ab46a8a41e21c15f5c9b52b26873069/pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc", size = 7626020, upload-time = "2025-07-03T13:11:06.479Z" }, - { url = "https://files.pythonhosted.org/packages/63/dd/f296c27ffba447bfad76c6a0c44c1ea97a90cb9472b9304c94a732e8dbfb/pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06", size = 5956732, upload-time = "2025-07-01T09:15:56.111Z" }, - { url = "https://files.pythonhosted.org/packages/a5/a0/98a3630f0b57f77bae67716562513d3032ae70414fcaf02750279c389a9e/pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a", size = 6624404, upload-time = "2025-07-01T09:15:58.245Z" }, - { url = "https://files.pythonhosted.org/packages/de/e6/83dfba5646a290edd9a21964da07674409e410579c341fc5b8f7abd81620/pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978", size = 6067760, upload-time = "2025-07-01T09:16:00.003Z" }, - { url = "https://files.pythonhosted.org/packages/bc/41/15ab268fe6ee9a2bc7391e2bbb20a98d3974304ab1a406a992dcb297a370/pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d", size = 6700534, upload-time = "2025-07-01T09:16:02.29Z" }, - { url = "https://files.pythonhosted.org/packages/64/79/6d4f638b288300bed727ff29f2a3cb63db054b33518a95f27724915e3fbc/pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71", size = 6277091, upload-time = "2025-07-01T09:16:04.4Z" }, - { url = 
"https://files.pythonhosted.org/packages/46/05/4106422f45a05716fd34ed21763f8ec182e8ea00af6e9cb05b93a247361a/pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada", size = 6986091, upload-time = "2025-07-01T09:16:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/63/c6/287fd55c2c12761d0591549d48885187579b7c257bef0c6660755b0b59ae/pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb", size = 2422632, upload-time = "2025-07-01T09:16:08.142Z" }, { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" }, { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" }, { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" }, @@ -2250,52 +1946,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, { url = 
"https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" }, { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, - { url = "https://files.pythonhosted.org/packages/6c/39/8ea9bcfaaff16fd0b0fc901ee522e24c9ec44b4ca0229cfffb8066a06959/propcache-0.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a7fad897f14d92086d6b03fdd2eb844777b0c4d7ec5e3bac0fbae2ab0602bbe5", size = 74678, upload-time = "2025-06-09T22:55:41.227Z" }, - { url = "https://files.pythonhosted.org/packages/d3/85/cab84c86966e1d354cf90cdc4ba52f32f99a5bca92a1529d666d957d7686/propcache-0.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1f43837d4ca000243fd7fd6301947d7cb93360d03cd08369969450cc6b2ce3b4", size = 43829, upload-time = "2025-06-09T22:55:42.417Z" }, - { url = "https://files.pythonhosted.org/packages/23/f7/9cb719749152d8b26d63801b3220ce2d3931312b2744d2b3a088b0ee9947/propcache-0.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:261df2e9474a5949c46e962065d88eb9b96ce0f2bd30e9d3136bcde84befd8f2", size = 43729, upload-time = "2025-06-09T22:55:43.651Z" }, - { url = "https://files.pythonhosted.org/packages/a2/a2/0b2b5a210ff311260002a315f6f9531b65a36064dfb804655432b2f7d3e3/propcache-0.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e514326b79e51f0a177daab1052bc164d9d9e54133797a3a58d24c9c87a3fe6d", size = 204483, upload-time = "2025-06-09T22:55:45.327Z" }, - { url = 
"https://files.pythonhosted.org/packages/3f/e0/7aff5de0c535f783b0c8be5bdb750c305c1961d69fbb136939926e155d98/propcache-0.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4a996adb6904f85894570301939afeee65f072b4fd265ed7e569e8d9058e4ec", size = 217425, upload-time = "2025-06-09T22:55:46.729Z" }, - { url = "https://files.pythonhosted.org/packages/92/1d/65fa889eb3b2a7d6e4ed3c2b568a9cb8817547a1450b572de7bf24872800/propcache-0.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76cace5d6b2a54e55b137669b30f31aa15977eeed390c7cbfb1dafa8dfe9a701", size = 214723, upload-time = "2025-06-09T22:55:48.342Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e2/eecf6989870988dfd731de408a6fa366e853d361a06c2133b5878ce821ad/propcache-0.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31248e44b81d59d6addbb182c4720f90b44e1efdc19f58112a3c3a1615fb47ef", size = 200166, upload-time = "2025-06-09T22:55:49.775Z" }, - { url = "https://files.pythonhosted.org/packages/12/06/c32be4950967f18f77489268488c7cdc78cbfc65a8ba8101b15e526b83dc/propcache-0.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abb7fa19dbf88d3857363e0493b999b8011eea856b846305d8c0512dfdf8fbb1", size = 194004, upload-time = "2025-06-09T22:55:51.335Z" }, - { url = "https://files.pythonhosted.org/packages/46/6c/17b521a6b3b7cbe277a4064ff0aa9129dd8c89f425a5a9b6b4dd51cc3ff4/propcache-0.3.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d81ac3ae39d38588ad0549e321e6f773a4e7cc68e7751524a22885d5bbadf886", size = 203075, upload-time = "2025-06-09T22:55:52.681Z" }, - { url = "https://files.pythonhosted.org/packages/62/cb/3bdba2b736b3e45bc0e40f4370f745b3e711d439ffbffe3ae416393eece9/propcache-0.3.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:cc2782eb0f7a16462285b6f8394bbbd0e1ee5f928034e941ffc444012224171b", size = 195407, upload-time = "2025-06-09T22:55:54.048Z" }, - { url = 
"https://files.pythonhosted.org/packages/29/bd/760c5c6a60a4a2c55a421bc34a25ba3919d49dee411ddb9d1493bb51d46e/propcache-0.3.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:db429c19a6c7e8a1c320e6a13c99799450f411b02251fb1b75e6217cf4a14fcb", size = 196045, upload-time = "2025-06-09T22:55:55.485Z" }, - { url = "https://files.pythonhosted.org/packages/76/58/ced2757a46f55b8c84358d6ab8de4faf57cba831c51e823654da7144b13a/propcache-0.3.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:21d8759141a9e00a681d35a1f160892a36fb6caa715ba0b832f7747da48fb6ea", size = 208432, upload-time = "2025-06-09T22:55:56.884Z" }, - { url = "https://files.pythonhosted.org/packages/bb/ec/d98ea8d5a4d8fe0e372033f5254eddf3254344c0c5dc6c49ab84349e4733/propcache-0.3.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2ca6d378f09adb13837614ad2754fa8afaee330254f404299611bce41a8438cb", size = 210100, upload-time = "2025-06-09T22:55:58.498Z" }, - { url = "https://files.pythonhosted.org/packages/56/84/b6d8a7ecf3f62d7dd09d9d10bbf89fad6837970ef868b35b5ffa0d24d9de/propcache-0.3.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:34a624af06c048946709f4278b4176470073deda88d91342665d95f7c6270fbe", size = 200712, upload-time = "2025-06-09T22:55:59.906Z" }, - { url = "https://files.pythonhosted.org/packages/bf/32/889f4903ddfe4a9dc61da71ee58b763758cf2d608fe1decede06e6467f8d/propcache-0.3.2-cp39-cp39-win32.whl", hash = "sha256:4ba3fef1c30f306b1c274ce0b8baaa2c3cdd91f645c48f06394068f37d3837a1", size = 38187, upload-time = "2025-06-09T22:56:01.212Z" }, - { url = "https://files.pythonhosted.org/packages/67/74/d666795fb9ba1dc139d30de64f3b6fd1ff9c9d3d96ccfdb992cd715ce5d2/propcache-0.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:7a2368eed65fc69a7a7a40b27f22e85e7627b74216f0846b04ba5c116e191ec9", size = 42025, upload-time = "2025-06-09T22:56:02.875Z" }, { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = 
"sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] -[[package]] -name = "protobuf" -version = "3.19.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/51/d1/79bfd1f481469b661a2eddab551255536401892722189433282bfb13cfb1/protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4", size = 218071, upload-time = "2022-09-29T22:07:23.03Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/3b/90f805b9e5ecacf8a216f2e5acabc2d3ad965b62803510be41804e6bfbfe/protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1", size = 913631, upload-time = "2022-09-29T21:17:39.095Z" }, - { url = "https://files.pythonhosted.org/packages/26/ef/bd6ba3b4ff9a35944bdd325e2c9ee56f71e855757f7d43938232499f0278/protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6", size = 1055327, upload-time = "2022-09-29T21:17:41.054Z" }, - { url = "https://files.pythonhosted.org/packages/bc/db/8b33c9558f1f27dd74e7f9ad730c6b32efab431419af556b1659e125b041/protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39", size = 913657, upload-time = "2022-09-29T21:18:18.359Z" }, - { url = "https://files.pythonhosted.org/packages/51/61/e80b7a04f4e1b4eecc86582335205fd876abca0abafee4a6c001f70a375e/protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0", size = 1055457, upload-time = "2022-09-29T21:18:20.212Z" }, - { url = 
"https://files.pythonhosted.org/packages/32/27/1141a8232723dcb10a595cc0ce4321dcbbd5215300bf4acfc142343205bf/protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4", size = 162648, upload-time = "2022-09-29T22:07:20.303Z" }, -] - [[package]] name = "protobuf" version = "6.32.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/fa/a4/cc17347aa2897568beece2e674674359f911d6fe21b0b8d6268cd42727ac/protobuf-6.32.1.tar.gz", hash = "sha256:ee2469e4a021474ab9baafea6cd070e5bf27c7d29433504ddea1a4ee5850f68d", size = 440635, upload-time = "2025-09-11T21:38:42.935Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3f/be/8dd0a927c559b37d7a6c8ab79034fd167dcc1f851595f2e641ad62be8643/protobuf-6.32.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:2f5b80a49e1eb7b86d85fcd23fe92df154b9730a725c3b38c4e43b9d77018bf4", size = 322874, upload-time = "2025-09-11T21:38:35.509Z" }, @@ -2333,8 +1990,7 @@ name = "pyarrow" version = "21.0.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } wheels = [ @@ -2373,13 +2029,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cc/ce4939f4b316457a083dc5718b3982801e8c33f921b3c98e7a93b7c7491f/pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3", size = 31211248, upload-time = "2025-07-18T00:56:59.7Z" }, - { url = "https://files.pythonhosted.org/packages/1f/c2/7a860931420d73985e2f340f06516b21740c15b28d24a0e99a900bb27d2b/pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1", size = 32676896, upload-time = "2025-07-18T00:57:03.884Z" }, - { url = "https://files.pythonhosted.org/packages/68/a8/197f989b9a75e59b4ca0db6a13c56f19a0ad8a298c68da9cc28145e0bb97/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d", size = 41067862, upload-time = "2025-07-18T00:57:07.587Z" }, - { url = "https://files.pythonhosted.org/packages/fa/82/6ecfa89487b35aa21accb014b64e0a6b814cc860d5e3170287bf5135c7d8/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e", size = 42747508, upload-time = "2025-07-18T00:57:13.917Z" 
}, - { url = "https://files.pythonhosted.org/packages/3b/b7/ba252f399bbf3addc731e8643c05532cf32e74cebb5e32f8f7409bc243cf/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4", size = 43345293, upload-time = "2025-07-18T00:57:19.828Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0a/a20819795bd702b9486f536a8eeb70a6aa64046fce32071c19ec8230dbaa/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7", size = 45060670, upload-time = "2025-07-18T00:57:24.477Z" }, - { url = "https://files.pythonhosted.org/packages/10/15/6b30e77872012bbfe8265d42a01d5b3c17ef0ac0f2fae531ad91b6a6c02e/pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f", size = 26227521, upload-time = "2025-07-18T00:57:29.119Z" }, ] [[package]] @@ -2552,19 +2201,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, - { url = 
"https://files.pythonhosted.org/packages/54/db/160dffb57ed9a3705c4cbcbff0ac03bdae45f1ca7d58ab74645550df3fbd/pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf", size = 2107999, upload-time = "2025-11-04T13:42:03.885Z" }, - { url = "https://files.pythonhosted.org/packages/a3/7d/88e7de946f60d9263cc84819f32513520b85c0f8322f9b8f6e4afc938383/pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5", size = 1929745, upload-time = "2025-11-04T13:42:06.075Z" }, - { url = "https://files.pythonhosted.org/packages/d5/c2/aef51e5b283780e85e99ff19db0f05842d2d4a8a8cd15e63b0280029b08f/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d", size = 1920220, upload-time = "2025-11-04T13:42:08.457Z" }, - { url = "https://files.pythonhosted.org/packages/c7/97/492ab10f9ac8695cd76b2fdb24e9e61f394051df71594e9bcc891c9f586e/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60", size = 2067296, upload-time = "2025-11-04T13:42:10.817Z" }, - { url = "https://files.pythonhosted.org/packages/ec/23/984149650e5269c59a2a4c41d234a9570adc68ab29981825cfaf4cfad8f4/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82", size = 2231548, upload-time = "2025-11-04T13:42:13.843Z" }, - { url = "https://files.pythonhosted.org/packages/71/0c/85bcbb885b9732c28bec67a222dbed5ed2d77baee1f8bba2002e8cd00c5c/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5", size = 2362571, upload-time = "2025-11-04T13:42:16.208Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/4a/412d2048be12c334003e9b823a3fa3d038e46cc2d64dd8aab50b31b65499/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3", size = 2068175, upload-time = "2025-11-04T13:42:18.911Z" }, - { url = "https://files.pythonhosted.org/packages/73/f4/c58b6a776b502d0a5540ad02e232514285513572060f0d78f7832ca3c98b/pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425", size = 2177203, upload-time = "2025-11-04T13:42:22.578Z" }, - { url = "https://files.pythonhosted.org/packages/ed/ae/f06ea4c7e7a9eead3d165e7623cd2ea0cb788e277e4f935af63fc98fa4e6/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504", size = 2148191, upload-time = "2025-11-04T13:42:24.89Z" }, - { url = "https://files.pythonhosted.org/packages/c1/57/25a11dcdc656bf5f8b05902c3c2934ac3ea296257cc4a3f79a6319e61856/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5", size = 2343907, upload-time = "2025-11-04T13:42:27.683Z" }, - { url = "https://files.pythonhosted.org/packages/96/82/e33d5f4933d7a03327c0c43c65d575e5919d4974ffc026bc917a5f7b9f61/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3", size = 2322174, upload-time = "2025-11-04T13:42:30.776Z" }, - { url = "https://files.pythonhosted.org/packages/81/45/4091be67ce9f469e81656f880f3506f6a5624121ec5eb3eab37d7581897d/pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460", size = 1990353, upload-time = "2025-11-04T13:42:33.111Z" }, - { url = 
"https://files.pythonhosted.org/packages/44/8a/a98aede18db6e9cd5d66bcacd8a409fcf8134204cdede2e7de35c5a2c5ef/pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b", size = 2015698, upload-time = "2025-11-04T13:42:35.484Z" }, { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, @@ -2605,8 +2241,7 @@ name = "pylance" source = { editable = "." 
} dependencies = [ { name = "lance-namespace" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -2626,9 +2261,8 @@ geo = [ ] tests = [ { name = "boto3" }, - { name = "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2636,8 +2270,7 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.7.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform == 'linux'" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] torch = [ @@ -2655,8 +2288,8 @@ dev = [ ] tests = [ { name = "boto3" }, - { name 
= "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2664,19 +2297,19 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] [package.metadata] requires-dist = [ { name = "boto3", marker = "extra == 'tests'" }, - { name = "datafusion", marker = "python_full_version >= '3.10' and extra == 'tests'", specifier = ">=53,<54" }, + { name = "datafusion", marker = "extra == 'tests'", specifier = ">=53,<54" }, { name = "datasets", marker = "extra == 'tests'" }, { name = "duckdb", marker = "extra == 'tests'" }, { name = "geoarrow-rust-core", marker = "extra == 'geo'" }, { name = "geoarrow-rust-io", marker = "extra == 'geo'" }, - { name = "lance-namespace", specifier = ">=0.8.0,<0.9" }, + { name = "lance-namespace", specifier = ">=0.8.5,<0.9" }, { name = "ml-dtypes", marker = "extra == 'tests'" }, { name = "numpy", specifier = ">=1.22" }, { name = "pandas", marker = "extra == 'tests'" }, @@ -2703,8 +2336,8 @@ dev = [ ] tests = [ { name = "boto3", specifier = "==1.40.43" }, - { name = "datafusion", marker = "python_full_version >= '3.10'", specifier = "==53.0.0" }, - { name = "datasets", marker = "python_full_version >= '3.10'", specifier = "==4.1.1" }, + { name = "datafusion", specifier = "==53.0.0" }, + { name = "datasets", specifier = "==4.1.1" }, { name = "duckdb", specifier = "==1.4.0" }, { name = "ml-dtypes", specifier = "==0.5.3" }, { name = "pandas", specifier = "==2.3.3" }, @@ -2712,59 +2345,19 @@ tests = [ { 
name = "polars", extras = ["pyarrow", "pandas"], specifier = "==1.34.0" }, { name = "psutil", specifier = "==7.1.0" }, { name = "pytest", specifier = "==8.4.2" }, - { name = "tensorflow", marker = "python_full_version >= '3.10' and sys_platform == 'linux'", specifier = "==2.20.0" }, + { name = "tensorflow", marker = "sys_platform == 'linux'", specifier = "==2.20.0" }, { name = "tqdm", specifier = "==4.67.1" }, ] -[[package]] -name = "pyproj" -version = "3.6.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "certifi", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/84/2b39bbf888c753ea48b40d47511548c77aa03445465c35cc4c4e9649b643/pyproj-3.6.1.tar.gz", hash = "sha256:44aa7c704c2b7d8fb3d483bbf75af6cb2350d30a63b144279a09b75fead501bf", size = 225131, upload-time = "2023-09-21T02:07:51.593Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/32/63cf474f4a8d4804b3bdf7c16b8589f38142e8e2f8319dcea27e0bc21a87/pyproj-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab7aa4d9ff3c3acf60d4b285ccec134167a948df02347585fdd934ebad8811b4", size = 6142763, upload-time = "2023-09-21T02:07:12.844Z" }, - { url = "https://files.pythonhosted.org/packages/18/86/2e7cb9de40492f1bafbf11f4c9072edc394509a40b5e4c52f8139546f039/pyproj-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4bc0472302919e59114aa140fd7213c2370d848a7249d09704f10f5b062031fe", size = 4877123, upload-time = "2023-09-21T02:10:37.905Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c5/928d5a26995dbefbebd7507d982141cd9153bc7e4392b334fff722c4af12/pyproj-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5279586013b8d6582e22b6f9e30c49796966770389a9d5b85e25a4223286cd3f", size = 6190576, upload-time = "2023-09-21T02:17:08.637Z" }, - { url = 
"https://files.pythonhosted.org/packages/f6/2b/b60cf73b0720abca313bfffef34e34f7f7dae23852b2853cf0368d49426b/pyproj-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fafd1f3eb421694857f254a9bdbacd1eb22fc6c24ca74b136679f376f97d35", size = 8328075, upload-time = "2023-09-21T02:07:15.353Z" }, - { url = "https://files.pythonhosted.org/packages/d9/a8/7193f46032636be917bc775506ae987aad72c931b1f691b775ca812a2917/pyproj-3.6.1-cp310-cp310-win32.whl", hash = "sha256:c41e80ddee130450dcb8829af7118f1ab69eaf8169c4bf0ee8d52b72f098dc2f", size = 5635713, upload-time = "2023-09-21T02:07:17.548Z" }, - { url = "https://files.pythonhosted.org/packages/89/8f/27350c8fba71a37cd0d316f100fbd96bf139cc2b5ff1ab0dcbc7ac64010a/pyproj-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:db3aedd458e7f7f21d8176f0a1d924f1ae06d725228302b872885a1c34f3119e", size = 6087932, upload-time = "2023-09-21T02:07:19.793Z" }, - { url = "https://files.pythonhosted.org/packages/84/a6/a300c1b14b2112e966e9f90b18f9c13b586bdcf417207cee913ae9005da3/pyproj-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebfbdbd0936e178091309f6cd4fcb4decd9eab12aa513cdd9add89efa3ec2882", size = 6147442, upload-time = "2023-09-21T02:07:21.879Z" }, - { url = "https://files.pythonhosted.org/packages/30/bd/b9bd3761f08754e8dbb34c5a647db2099b348ab5da338e90980caf280e37/pyproj-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:447db19c7efad70ff161e5e46a54ab9cc2399acebb656b6ccf63e4bc4a04b97a", size = 4880331, upload-time = "2023-09-21T02:10:40.828Z" }, - { url = "https://files.pythonhosted.org/packages/f4/0a/d82aeeb605b5d6870bc72307c3b5e044e632eb7720df8885e144f51a8eac/pyproj-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7e13c40183884ec7f94eb8e0f622f08f1d5716150b8d7a134de48c6110fee85", size = 6192425, upload-time = "2023-09-21T02:17:09.049Z" }, - { url = 
"https://files.pythonhosted.org/packages/64/90/dfe5c00de1ca4dbb82606e79790659d4ed7f0ed8d372bccb3baca2a5abe0/pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65ad699e0c830e2b8565afe42bd58cc972b47d829b2e0e48ad9638386d994915", size = 8571478, upload-time = "2023-09-21T02:07:23.771Z" }, - { url = "https://files.pythonhosted.org/packages/14/6d/ae373629a1723f0db80d7b8c93598b00d9ecb930ed9ebf4f35826a33e97c/pyproj-3.6.1-cp311-cp311-win32.whl", hash = "sha256:8b8acc31fb8702c54625f4d5a2a6543557bec3c28a0ef638778b7ab1d1772132", size = 5634575, upload-time = "2023-09-21T02:07:26.535Z" }, - { url = "https://files.pythonhosted.org/packages/79/95/eb68113c5b5737c342bde1bab92705dabe69c16299c5a122616e50f1fbd6/pyproj-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:38a3361941eb72b82bd9a18f60c78b0df8408416f9340521df442cebfc4306e2", size = 6088494, upload-time = "2023-09-21T02:07:28.75Z" }, - { url = "https://files.pythonhosted.org/packages/0b/64/93232511a7906a492b1b7dfdfc17f4e95982d76a24ef4f86d18cfe7ae2c9/pyproj-3.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1e9fbaf920f0f9b4ee62aab832be3ae3968f33f24e2e3f7fbb8c6728ef1d9746", size = 6135280, upload-time = "2023-09-21T02:07:30.911Z" }, - { url = "https://files.pythonhosted.org/packages/10/f2/b550b1f65cc7e51c9116b220b50aade60c439103432a3fd5b12efbc77e15/pyproj-3.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d227a865356f225591b6732430b1d1781e946893789a609bb34f59d09b8b0f8", size = 4880030, upload-time = "2023-09-21T02:10:43.067Z" }, - { url = "https://files.pythonhosted.org/packages/fe/4b/2f8f6f94643b9fe2083338eff294feda84d916409b5840b7a402d2be93f8/pyproj-3.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83039e5ae04e5afc974f7d25ee0870a80a6bd6b7957c3aca5613ccbe0d3e72bf", size = 6184439, upload-time = "2023-09-21T02:17:43.499Z" }, - { url = 
"https://files.pythonhosted.org/packages/19/9b/c57569132174786aa3f72275ac306956859a639dad0ce8d95c8411ce8209/pyproj-3.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb059ba3bced6f6725961ba758649261d85ed6ce670d3e3b0a26e81cf1aa8d", size = 8660747, upload-time = "2023-09-21T02:07:32.586Z" }, - { url = "https://files.pythonhosted.org/packages/0e/ab/1c2159ec757677c5a6b8803f6be45c2b550dc42c84ec4a228dc219849bbb/pyproj-3.6.1-cp312-cp312-win32.whl", hash = "sha256:2d6ff73cc6dbbce3766b6c0bce70ce070193105d8de17aa2470009463682a8eb", size = 5626805, upload-time = "2023-09-21T02:07:35.28Z" }, - { url = "https://files.pythonhosted.org/packages/c7/f3/2f32fe143cd7ba1d4d68f1b6dce9ca402d909cbd5a5830e3a8fa3d1acbbf/pyproj-3.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:7a27151ddad8e1439ba70c9b4b2b617b290c39395fa9ddb7411ebb0eb86d6fb0", size = 6079779, upload-time = "2023-09-21T02:07:37.486Z" }, - { url = "https://files.pythonhosted.org/packages/d7/50/d369bbe62d7a0d1e2cb40bc211da86a3f6e0f3c99f872957a72c3d5492d6/pyproj-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ba1f9b03d04d8cab24d6375609070580a26ce76eaed54631f03bab00a9c737b", size = 6144755, upload-time = "2023-09-21T02:07:39.611Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c2/8d4f61065dfed965e53badd41201ad86a05af0c1bbc75dffb12ef0f5a7dd/pyproj-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18faa54a3ca475bfe6255156f2f2874e9a1c8917b0004eee9f664b86ccc513d3", size = 4879187, upload-time = "2023-09-21T02:10:45.519Z" }, - { url = "https://files.pythonhosted.org/packages/31/38/2cf8777cb2d5622a78195e690281b7029098795fde4751aec8128238b8bb/pyproj-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd43bd9a9b9239805f406fd82ba6b106bf4838d9ef37c167d3ed70383943ade1", size = 6192339, upload-time = "2023-09-21T02:17:09.942Z" }, - { url = 
"https://files.pythonhosted.org/packages/97/0a/b1525be9680369cc06dd288e12c59d24d5798b4afcdcf1b0915836e1caa6/pyproj-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50100b2726a3ca946906cbaa789dd0749f213abf0cbb877e6de72ca7aa50e1ae", size = 8332638, upload-time = "2023-09-21T02:07:41.777Z" }, - { url = "https://files.pythonhosted.org/packages/8d/e8/e826e0a962f36bd925a933829cf6ef218efe2055db5ea292be40974a929d/pyproj-3.6.1-cp39-cp39-win32.whl", hash = "sha256:9274880263256f6292ff644ca92c46d96aa7e57a75c6df3f11d636ce845a1877", size = 5638159, upload-time = "2023-09-21T02:07:43.49Z" }, - { url = "https://files.pythonhosted.org/packages/43/d0/cbe29a4dcf38ee7e72bf695d0d3f2bee21b4f22ee6cf579ad974de9edfc8/pyproj-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:36b64c2cb6ea1cc091f329c5bd34f9c01bb5da8c8e4492c709bda6a09f96808f", size = 6090565, upload-time = "2023-09-21T02:07:45.735Z" }, - { url = "https://files.pythonhosted.org/packages/43/28/e8d2ca71dd56c27cbe668e4226963d61956cded222a2e839e6fec1ab6d82/pyproj-3.6.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fd93c1a0c6c4aedc77c0fe275a9f2aba4d59b8acf88cebfc19fe3c430cfabf4f", size = 6034252, upload-time = "2023-09-21T02:07:47.906Z" }, - { url = "https://files.pythonhosted.org/packages/cb/39/1ce27cb86f51a1f5aed3a1617802a6131b59ea78492141d1fbe36722595e/pyproj-3.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6420ea8e7d2a88cb148b124429fba8cd2e0fae700a2d96eab7083c0928a85110", size = 6386263, upload-time = "2023-09-21T02:07:49.586Z" }, -] - [[package]] name = "pyproj" version = "3.7.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] dependencies = [ - { name = "certifi", marker = "python_full_version == '3.10.*'" }, + { name = "certifi", marker = "python_full_version < '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/67/10/a8480ea27ea4bbe896c168808854d00f2a9b49f95c0319ddcbba693c8a90/pyproj-3.7.1.tar.gz", hash = "sha256:60d72facd7b6b79853f19744779abcd3f804c4e0d4fa8815469db20c9f640a47", size = 226339, upload-time = "2025-02-16T04:28:46.621Z" } wheels = [ @@ -3000,15 +2593,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, - { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" }, - { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" }, - { url = 
"https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" }, - { url = "https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" }, - { url = "https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" }, - { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" }, - { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" }, - { url = 
"https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" }, ] [[package]] @@ -3016,10 +2600,10 @@ name = "requests" version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "certifi", marker = "python_full_version >= '3.10'" }, - { name = "charset-normalizer", marker = "python_full_version >= '3.10'" }, - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } wheels = [ @@ -3031,8 +2615,8 @@ name = "rich" version = "14.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, - { name = "pygments", marker = "python_full_version >= '3.10'" }, + { name = "markdown-it-py" }, + { name = "pygments" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } wheels = [ @@ -3114,13 +2698,11 @@ dependencies = [ { name = "absl-py" }, { name = "grpcio" }, { name = "markdown" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pillow" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "protobuf" }, { name = "setuptools" }, { name = "tensorboard-data-server" }, { name = "werkzeug" }, @@ -3138,74 +2720,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, ] -[[package]] -name = "tensorflow" -version = "2.7.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "absl-py", marker = "python_full_version < '3.10'" }, - { name = "astunparse", marker = "python_full_version < '3.10'" }, - { name = "flatbuffers", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "gast", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "google-pasta", marker = "python_full_version < '3.10'" }, - { name = "grpcio", marker = "python_full_version < '3.10'" }, - { name = "h5py", marker = "python_full_version < 
'3.10'" }, - { name = "keras", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "keras-preprocessing", marker = "python_full_version < '3.10'" }, - { name = "libclang", marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "opt-einsum", marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, - { name = "tensorboard", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-estimator", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-io-gcs-filesystem", marker = "python_full_version < '3.10'" }, - { name = "termcolor", marker = "python_full_version < '3.10'" }, - { name = "typing-extensions", marker = "python_full_version < '3.10'" }, - { name = "wheel", marker = "python_full_version < '3.10'" }, - { name = "wrapt", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/31/d49a3dff9c4ca6e6c09c2c5fea95f58cf59cc3cd4f0d557069c7dccd6f57/tensorflow-2.7.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c4597635dd71fc6809b7fffcb462524d73e2ade09da61844059e6a2fead71140", size = 496066688, upload-time = "2022-09-02T19:11:01.631Z" }, -] - [[package]] name = "tensorflow" version = "2.20.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "astunparse", marker = "python_full_version >= '3.10'" }, - { name = 
"flatbuffers", version = "25.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "gast", version = "0.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "google-pasta", marker = "python_full_version >= '3.10'" }, - { name = "grpcio", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "keras", version = "3.11.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "libclang", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "astunparse" }, + { name = "flatbuffers" }, + { name = "gast" }, + { name = "google-pasta" }, + { name = "grpcio" }, + { name = "h5py" }, + { name = "keras" }, + { name = "libclang" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "opt-einsum", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "setuptools", marker = "python_full_version >= '3.10'" }, - { name = "six", marker = "python_full_version >= '3.10'" }, - { name = "tensorboard", marker = "python_full_version >= '3.10'" }, - { name = "termcolor", marker = "python_full_version >= '3.10'" }, - { name 
= "typing-extensions", marker = "python_full_version >= '3.10'" }, - { name = "wrapt", marker = "python_full_version >= '3.10'" }, + { name = "opt-einsum" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "six" }, + { name = "tensorboard" }, + { name = "termcolor" }, + { name = "typing-extensions" }, + { name = "wrapt" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ff/07/ea91ac67a9fd36d3372099f5a3e69860ded544f877f5f2117802388f4212/tensorflow-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02a0293d94f5c8b7125b66abf622cc4854a33ae9d618a0d41309f95e091bbaea", size = 259307122, upload-time = "2025-08-13T16:50:47.909Z" }, @@ -3216,31 +2757,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547, upload-time = "2025-08-13T16:52:46.396Z" }, { url = "https://files.pythonhosted.org/packages/ea/4c/c1aa90c5cc92e9f7f9c78421e121ef25bae7d378f8d1d4cbad46c6308836/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c88e05a07f1ead4977b4894b3ecd4d8075c40191065afc4fd9355c9db3d926", size = 259663776, upload-time = "2025-08-13T16:53:24.507Z" }, { url = "https://files.pythonhosted.org/packages/43/fb/8be8547c128613d82a2b006004026d86ed0bd672e913029a98153af4ffab/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa3729b0126f75a99882b89fb7d536515721eda8014a63e259e780ba0a37372", size = 620815537, upload-time = "2025-08-13T16:53:42.577Z" }, - { url = "https://files.pythonhosted.org/packages/83/ff/a26d49895586207b2704403366ef976dcaa6ed07514699dae9a4fc3fa1a9/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash 
= "sha256:28bc33759249c98eabcee9debd24e74506bbe29ac139e050cf0c74aa9888ebdf", size = 259307564, upload-time = "2025-08-13T16:54:17.691Z" }, - { url = "https://files.pythonhosted.org/packages/5f/fe/f3d738dc7c93ed5f67f9ace8dd3ed66971dab7c5a47f2d1c504ef0d0cf1d/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0deb5c583dfc53b54fd158a194ce0087b406bb6518af400ca3809735e4548ec3", size = 620427169, upload-time = "2025-08-13T16:54:33.431Z" }, -] - -[[package]] -name = "tensorflow-estimator" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/de/3a71ad41b87f9dd424e3aec3b0794a60f169fa7e9a9a1e3dd44290b86dd6/tensorflow_estimator-2.7.0-py2.py3-none-any.whl", hash = "sha256:325b5a224864379242b7b76c6987ca544239be82579d33e68ec7c2bda57abc9d", size = 463110, upload-time = "2021-10-29T23:02:47.14Z" }, -] - -[[package]] -name = "tensorflow-io-gcs-filesystem" -version = "0.37.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/19/9095c69e22c879cb3896321e676c69273a549a3148c4f62aa4bc5ebdb20f/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70", size = 4842078, upload-time = "2024-07-01T23:44:18.977Z" }, - { url = "https://files.pythonhosted.org/packages/f3/48/47b7d25572961a48b1de3729b7a11e835b888e41e0203cca82df95d23b91/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27", size = 5085736, upload-time = "2024-07-01T23:44:21.034Z" }, - { url = "https://files.pythonhosted.org/packages/de/bf/ba597d3884c77d05a78050f3c178933d69e3f80200a261df6eaa920656cd/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda", size = 4842079, upload-time = "2024-07-01T23:44:26.825Z" }, - { url = "https://files.pythonhosted.org/packages/66/7f/e36ae148c2f03d61ca1bff24bc13a0fef6d6825c966abef73fc6f880a23b/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556", size = 5085736, upload-time = "2024-07-01T23:44:28.618Z" }, - { url = "https://files.pythonhosted.org/packages/d3/46/962f47af08bd39fc9feb280d3192825431a91a078c856d17a78ae4884eb1/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f", size = 4842077, upload-time = "2024-07-01T23:44:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/f0/9b/790d290c232bce9b691391cf16e95a96e469669c56abfb1d9d0f35fa437c/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c", size = 5085733, upload-time = "2024-07-01T23:44:36.663Z" }, - { url = "https://files.pythonhosted.org/packages/66/5f/334a011caa1eb97689274d1141df8e6b7a25e389f0390bdcd90235de9783/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed", size = 4842075, upload-time = "2024-07-01T23:44:42.094Z" }, - { url = "https://files.pythonhosted.org/packages/3d/cb/7dcee55fc5a7d7d8a862e12519322851cd5fe5b086f946fd71e4ae1ef281/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95", size = 5087496, upload-time = "2024-07-01T23:44:43.797Z" }, ] [[package]] @@ -3299,8 +2815,7 @@ dependencies = [ { name = "filelock" }, { 
name = "fsspec" }, { name = "jinja2" }, - { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -3342,10 +2857,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" }, { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" }, { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b0/a321f27270049baa12f5c3fb0d6ceea005634787e3af9a8d75dce8306b0a/torch-2.8.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:da6afa31c13b669d4ba49d8a2169f0db2c3ec6bec4af898aa714f401d4c38904", 
size = 102059214, upload-time = "2025-08-06T14:55:33.433Z" }, - { url = "https://files.pythonhosted.org/packages/fd/dd/1630cb51b10d3d2e97db95e5a84c32def81fc26b005bce6fc880b0e6db81/torch-2.8.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:06fcee8000e5c62a9f3e52a688b9c5abb7c6228d0e56e3452983416025c41381", size = 888024302, upload-time = "2025-08-06T14:57:28.23Z" }, - { url = "https://files.pythonhosted.org/packages/b9/dc/1f1f621afe15e3c496e1e8f94f8903f75f87e7d642d5a985e92210cc208d/torch-2.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:5128fe752a355d9308e56af1ad28b15266fe2da5948660fad44de9e3a9e36e8c", size = 241249338, upload-time = "2025-08-06T14:57:05.669Z" }, - { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" }, ] [[package]] @@ -3365,7 +2876,6 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "setuptools" }, ] wheels = [ @@ -3374,7 +2884,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" }, { url = "https://files.pythonhosted.org/packages/30/7b/0a685684ed5322d2af0bddefed7906674f67974aa88b0fae6e82e3b766f6/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00be2964616f4c619193cb0d1b29a99bd4b001d7dc333816073f92cf2a8ccdeb", size = 155569223, upload-time = "2025-07-30T19:58:44.017Z" }, { url = 
"https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780, upload-time = "2025-07-30T19:58:51.171Z" }, - { url = "https://files.pythonhosted.org/packages/12/34/1251beb5a3cb93f3950ebe68732752014646003ef6eb11eb5f1a37ca78cd/triton-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e5c1442eaeabae2e2452ae765801bd53cd4ce873cab0d1bdd59a32ab2d9397", size = 155430799, upload-time = "2025-07-30T19:58:57.664Z" }, ] [[package]] @@ -3407,29 +2916,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] -[[package]] -name = "urllib3" -version = "1.26.20" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" }, -] - [[package]] name = "urllib3" version = "2.5.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == 
'3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, @@ -3486,10 +2976,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/dd0791943613885f62619f18ee6107e6133237a6b6ed8a9ecfac339d0b4f/wrapt-1.17.3-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7e18f01b0c3e4a07fe6dfdb00e29049ba17eadbc5e7609a2a3a4af83ab7d710a", size = 81745, 
upload-time = "2025-08-12T05:52:49.62Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ec/bb2d19bd1a614cc4f438abac13ae26c57186197920432d2a915183b15a8b/wrapt-1.17.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f5f51a6466667a5a356e6381d362d259125b57f059103dd9fdc8c0cf1d14139", size = 82833, upload-time = "2025-08-12T05:52:27.738Z" }, - { url = "https://files.pythonhosted.org/packages/8d/eb/66579aea6ad36f07617fedca8e282e49c7c9bab64c63b446cfe4f7f47a49/wrapt-1.17.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:59923aa12d0157f6b82d686c3fd8e1166fa8cdfb3e17b42ce3b6147ff81528df", size = 81889, upload-time = "2025-08-12T05:52:29.023Z" }, - { url = "https://files.pythonhosted.org/packages/04/9c/a56b5ac0e2473bdc3fb11b22dd69ff423154d63861cf77911cdde5e38fd2/wrapt-1.17.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46acc57b331e0b3bcb3e1ca3b421d65637915cfcd65eb783cb2f78a511193f9b", size = 81344, upload-time = "2025-08-12T05:52:50.869Z" }, { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, ] @@ -3604,21 +3090,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" }, { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" }, { url = 
"https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" }, - { url = "https://files.pythonhosted.org/packages/03/ff/1b4bb3f397552116c1df6266c1b83a21aeeb26061ab1f462984b499a3870/xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a", size = 32844, upload-time = "2025-10-02T14:36:39.157Z" }, - { url = "https://files.pythonhosted.org/packages/c1/db/27146d0bee4346a9a31f7b498a81fc02747f6f1e6c52a2e7989504278051/xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7", size = 30806, upload-time = "2025-10-02T14:36:40.621Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2b/4896188df564908817a75de19bf7f2384b99a75af2d528f9c49326f76458/xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2", size = 193448, upload-time = "2025-10-02T14:36:41.797Z" }, - { url = "https://files.pythonhosted.org/packages/51/c5/be8953f62e772340319a826ce1e07489935600089756cf83b628cd36ebe3/xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d", size = 212547, upload-time = "2025-10-02T14:36:43.581Z" }, - { url = "https://files.pythonhosted.org/packages/51/1a/1e9f0b911d1cf00dd537c074ae3fae15b535a7f0d9e7edd42a9d2c4f78ce/xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec", size = 211309, upload-time = "2025-10-02T14:36:45.307Z" }, - { url = 
"https://files.pythonhosted.org/packages/63/88/b284c6a128d88dc47f201957f926e707db79fb7415a87072e15c0e490de0/xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222", size = 444480, upload-time = "2025-10-02T14:36:47.226Z" }, - { url = "https://files.pythonhosted.org/packages/87/e4/798293a2bf9e4fac5f6d53ce59cba4739930778dfc6c7c73f40044ab0e6e/xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919", size = 192957, upload-time = "2025-10-02T14:36:48.968Z" }, - { url = "https://files.pythonhosted.org/packages/78/55/bfd0d7db447a927897469048b953caececa3532e743b940dd1f5c1032d24/xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6", size = 209850, upload-time = "2025-10-02T14:36:50.258Z" }, - { url = "https://files.pythonhosted.org/packages/31/06/d08ef9a792bfebfd2fb2bcbf04a541ad283bef74749ead6f089a0809d288/xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360", size = 197342, upload-time = "2025-10-02T14:36:51.651Z" }, - { url = "https://files.pythonhosted.org/packages/7b/1a/aebf90797c94e9ca407c28e23f54d71f7149d91a93406a08a09e44d06994/xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4", size = 209757, upload-time = "2025-10-02T14:36:53.009Z" }, - { url = "https://files.pythonhosted.org/packages/3c/80/799eec3d0a144dc3edf8c19b4f139c27fb923c50b34352796089ca206429/xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86", size = 412773, upload-time = "2025-10-02T14:36:54.691Z" }, - { url = 
"https://files.pythonhosted.org/packages/6a/f9/09df7545699de09219a205123b8463ce9ea83f48acc7aeeba0269507f9d3/xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796", size = 190357, upload-time = "2025-10-02T14:36:56.363Z" }, - { url = "https://files.pythonhosted.org/packages/07/40/2f8327f94e64a3f34d6ce3347c55207c322abbc80ae486ea45df4c62e7b3/xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d", size = 30585, upload-time = "2025-10-02T14:36:57.664Z" }, - { url = "https://files.pythonhosted.org/packages/6a/c8/2ecbc6799be9c02e8bf7b5a66cd94832b6ac13d59808746f0d402481c6ad/xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802", size = 31512, upload-time = "2025-10-02T14:36:58.837Z" }, - { url = "https://files.pythonhosted.org/packages/19/94/1d5459a9c587c94d7b8bcc710bd08bbfa145cbd814ebde41b48494362a21/xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6", size = 27878, upload-time = "2025-10-02T14:37:00.201Z" }, { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" }, { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" }, { url = 
"https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" }, @@ -3631,9 +3102,9 @@ name = "yarl" version = "1.20.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } wheels = [ @@ -3722,31 +3193,5 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, - { url = 
"https://files.pythonhosted.org/packages/01/75/0d37402d208d025afa6b5b8eb80e466d267d3fd1927db8e317d29a94a4cb/yarl-1.20.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e42ba79e2efb6845ebab49c7bf20306c4edf74a0b20fc6b2ccdd1a219d12fad3", size = 134259, upload-time = "2025-06-10T00:45:29.882Z" }, - { url = "https://files.pythonhosted.org/packages/73/84/1fb6c85ae0cf9901046f07d0ac9eb162f7ce6d95db541130aa542ed377e6/yarl-1.20.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:41493b9b7c312ac448b7f0a42a089dffe1d6e6e981a2d76205801a023ed26a2b", size = 91269, upload-time = "2025-06-10T00:45:32.917Z" }, - { url = "https://files.pythonhosted.org/packages/f3/9c/eae746b24c4ea29a5accba9a06c197a70fa38a49c7df244e0d3951108861/yarl-1.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f5a5928ff5eb13408c62a968ac90d43f8322fd56d87008b8f9dabf3c0f6ee983", size = 89995, upload-time = "2025-06-10T00:45:35.066Z" }, - { url = "https://files.pythonhosted.org/packages/fb/30/693e71003ec4bc1daf2e4cf7c478c417d0985e0a8e8f00b2230d517876fc/yarl-1.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30c41ad5d717b3961b2dd785593b67d386b73feca30522048d37298fee981805", size = 325253, upload-time = "2025-06-10T00:45:37.052Z" }, - { url = "https://files.pythonhosted.org/packages/0f/a2/5264dbebf90763139aeb0b0b3154763239398400f754ae19a0518b654117/yarl-1.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:59febc3969b0781682b469d4aca1a5cab7505a4f7b85acf6db01fa500fa3f6ba", size = 320897, upload-time = "2025-06-10T00:45:39.962Z" }, - { url = "https://files.pythonhosted.org/packages/e7/17/77c7a89b3c05856489777e922f41db79ab4faf58621886df40d812c7facd/yarl-1.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2b6fb3622b7e5bf7a6e5b679a69326b4279e805ed1699d749739a61d242449e", size = 340696, upload-time = "2025-06-10T00:45:41.915Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/55/28409330b8ef5f2f681f5b478150496ec9cf3309b149dab7ec8ab5cfa3f0/yarl-1.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:749d73611db8d26a6281086f859ea7ec08f9c4c56cec864e52028c8b328db723", size = 335064, upload-time = "2025-06-10T00:45:43.893Z" }, - { url = "https://files.pythonhosted.org/packages/85/58/cb0257cbd4002828ff735f44d3c5b6966c4fd1fc8cc1cd3cd8a143fbc513/yarl-1.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9427925776096e664c39e131447aa20ec738bdd77c049c48ea5200db2237e000", size = 327256, upload-time = "2025-06-10T00:45:46.393Z" }, - { url = "https://files.pythonhosted.org/packages/53/f6/c77960370cfa46f6fb3d6a5a79a49d3abfdb9ef92556badc2dcd2748bc2a/yarl-1.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff70f32aa316393eaf8222d518ce9118148eddb8a53073c2403863b41033eed5", size = 316389, upload-time = "2025-06-10T00:45:48.358Z" }, - { url = "https://files.pythonhosted.org/packages/64/ab/be0b10b8e029553c10905b6b00c64ecad3ebc8ace44b02293a62579343f6/yarl-1.20.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c7ddf7a09f38667aea38801da8b8d6bfe81df767d9dfc8c88eb45827b195cd1c", size = 340481, upload-time = "2025-06-10T00:45:50.663Z" }, - { url = "https://files.pythonhosted.org/packages/c5/c3/3f327bd3905a4916029bf5feb7f86dcf864c7704f099715f62155fb386b2/yarl-1.20.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:57edc88517d7fc62b174fcfb2e939fbc486a68315d648d7e74d07fac42cec240", size = 336941, upload-time = "2025-06-10T00:45:52.554Z" }, - { url = "https://files.pythonhosted.org/packages/d1/42/040bdd5d3b3bb02b4a6ace4ed4075e02f85df964d6e6cb321795d2a6496a/yarl-1.20.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:dab096ce479d5894d62c26ff4f699ec9072269d514b4edd630a393223f45a0ee", size = 339936, upload-time = "2025-06-10T00:45:54.919Z" }, - { url = 
"https://files.pythonhosted.org/packages/0d/1c/911867b8e8c7463b84dfdc275e0d99b04b66ad5132b503f184fe76be8ea4/yarl-1.20.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14a85f3bd2d7bb255be7183e5d7d6e70add151a98edf56a770d6140f5d5f4010", size = 360163, upload-time = "2025-06-10T00:45:56.87Z" }, - { url = "https://files.pythonhosted.org/packages/e2/31/8c389f6c6ca0379b57b2da87f1f126c834777b4931c5ee8427dd65d0ff6b/yarl-1.20.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2c89b5c792685dd9cd3fa9761c1b9f46fc240c2a3265483acc1565769996a3f8", size = 359108, upload-time = "2025-06-10T00:45:58.869Z" }, - { url = "https://files.pythonhosted.org/packages/7f/09/ae4a649fb3964324c70a3e2b61f45e566d9ffc0affd2b974cbf628957673/yarl-1.20.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:69e9b141de5511021942a6866990aea6d111c9042235de90e08f94cf972ca03d", size = 351875, upload-time = "2025-06-10T00:46:01.45Z" }, - { url = "https://files.pythonhosted.org/packages/8d/43/bbb4ed4c34d5bb62b48bf957f68cd43f736f79059d4f85225ab1ef80f4b9/yarl-1.20.1-cp39-cp39-win32.whl", hash = "sha256:b5f307337819cdfdbb40193cad84978a029f847b0a357fbe49f712063cfc4f06", size = 82293, upload-time = "2025-06-10T00:46:03.763Z" }, - { url = "https://files.pythonhosted.org/packages/d7/cd/ce185848a7dba68ea69e932674b5c1a42a1852123584bccc5443120f857c/yarl-1.20.1-cp39-cp39-win_amd64.whl", hash = "sha256:eae7bfe2069f9c1c5b05fc7fe5d612e5bbc089a39309904ee8b829e322dcad00", size = 87385, upload-time = "2025-06-10T00:46:05.655Z" }, { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] diff --git a/rust/examples/Cargo.toml b/rust/examples/Cargo.toml index 3a1ce0ea03c..80eff457140 100644 --- a/rust/examples/Cargo.toml +++ b/rust/examples/Cargo.toml @@ -38,7 +38,7 @@ arrow-select = { workspace = true } clap = { workspace = true, features = ["derive"] } itertools = { workspace = true } futures = { workspace = true } -lance = { workspace = true, features = ["aws", "azure", "gcp", "oss", "huggingface", "tencent"] } +lance = { workspace = true, features = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "goosefs"] } lance-index = { workspace = true } lance-core = { workspace = true } lance-linalg = { workspace = true } @@ -49,6 +49,6 @@ tokio = { workspace = true } all_asserts = "2.3.1" env_logger = "0.11.7" hf-hub = "0.4.2" -parquet = "58.0.0" +parquet = { version = "58.0.0", default-features = false, features = ["arrow", "async"] } tokenizers = "0.15.2" rand.workspace = true diff --git a/rust/lance-arrow/src/ipc.rs b/rust/lance-arrow/src/ipc.rs index 1c6364c4525..8b6e5cf41fe 100644 --- a/rust/lance-arrow/src/ipc.rs +++ b/rust/lance-arrow/src/ipc.rs @@ -270,7 +270,7 @@ pub fn read_ipc_stream_single_at( /// Modern IPC streams have an 8-byte prefix `[continuation: 4][size: 4]`. /// Legacy streams have a 4-byte prefix `[size: 4]`. Returns `(prefix_len, meta_size)`. 
fn parse_ipc_message_prefix(buf: &Buffer) -> Result<(usize, usize), ArrowError> { - let has_continuation = buf.len() >= 4 && buf[..4] == [0xff; 4]; + let has_continuation = buf.len() >= 4 && buf[..4] == IPC_CONTINUATION; if has_continuation { if buf.len() < 8 { return Err(ArrowError::ParseError( @@ -358,6 +358,134 @@ pub fn read_ipc_stream_single(data: &Bytes) -> Result { } } +// --------------------------------------------------------------------------- +// Aligned IPC sections +// --------------------------------------------------------------------------- + +/// Byte alignment that each IPC section's stream start is padded to. +/// +/// When several IPC streams are concatenated into one larger blob (e.g. a +/// cache entry), a section that starts at an arbitrary offset would leave its +/// array data misaligned. [`FileDecoder`] with `require_alignment = false` +/// then silently copies each buffer into a freshly aligned allocation on +/// every read, defeating zero-copy. Padding each section start to a 64-byte +/// boundary keeps the decoded buffers borrowed directly from the input. +pub const IPC_SECTION_ALIGNMENT: usize = 64; + +/// Number of zero-padding bytes needed to advance `pos` to the next +/// [`IPC_SECTION_ALIGNMENT`] boundary. +fn section_padding(pos: usize) -> usize { + (IPC_SECTION_ALIGNMENT - (pos % IPC_SECTION_ALIGNMENT)) % IPC_SECTION_ALIGNMENT +} + +/// A [`Write`] adapter that counts the bytes written through it. +struct CountingWriter<'a> { + inner: &'a mut dyn Write, + count: usize, +} + +impl Write for CountingWriter<'_> { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + let n = self.inner.write(buf)?; + self.count += n; + Ok(n) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +/// Write zero padding so the next byte lands on an [`IPC_SECTION_ALIGNMENT`] +/// boundary, advancing `pos` past it. 
+fn write_section_padding(writer: &mut dyn Write, pos: &mut usize) -> Result<(), ArrowError> { + let pad = section_padding(*pos); + if pad > 0 { + const ZEROS: [u8; IPC_SECTION_ALIGNMENT] = [0u8; IPC_SECTION_ALIGNMENT]; + writer + .write_all(&ZEROS[..pad]) + .map_err(|e| ArrowError::IoError(e.to_string(), e))?; + *pos += pad; + } + Ok(()) +} + +/// Write `batch` as a 64-byte-aligned single-batch Arrow IPC section. +/// +/// `pos` is the absolute byte offset of `writer` within the enclosing blob. +/// Zero padding is written first so the IPC stream begins on an +/// [`IPC_SECTION_ALIGNMENT`] boundary, then the stream itself. `pos` is +/// advanced past both the padding and the stream so the caller can write +/// further aligned sections. +/// +/// Paired with [`read_ipc_section_at`]. For the decoded buffers to be borrowed +/// zero-copy, the blob must ultimately be read back from a buffer whose base +/// address is at least 64-byte aligned. +pub fn write_ipc_section( + writer: &mut dyn Write, + pos: &mut usize, + batch: &RecordBatch, +) -> Result<(), ArrowError> { + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream(batch, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read a single [`RecordBatch`] from an aligned IPC section at `offset`. +/// +/// Skips the alignment padding written by [`write_ipc_section`], then reads +/// the stream, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section`]). +pub fn read_ipc_section_at(data: &Bytes, offset: &mut usize) -> Result<RecordBatch, ArrowError> { + *offset += section_padding(*offset); + read_ipc_stream_single_at(data, offset) +} + +/// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC section. 
+/// +/// Like [`write_ipc_section`] but emits every batch from `iter` into one IPC +/// stream (schema + N batches + EOS). `iter` must yield at least one batch. +/// Paired with [`read_ipc_section_batches_at`]. +pub fn write_ipc_section_batches<I>( + writer: &mut dyn Write, + pos: &mut usize, + iter: I, +) -> Result<(), ArrowError> +where + I: IntoIterator<Item = RecordBatch>, +{ + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream_batches(iter, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read all [`RecordBatch`]es from an aligned multi-batch IPC section at +/// `offset`, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section_batches`]). +pub fn read_ipc_section_batches_at( + data: &Bytes, + offset: &mut usize, +) -> Result<Vec<RecordBatch>, ArrowError> { + *offset += section_padding(*offset); + read_ipc_stream_at(data, offset) +} + #[cfg(test)] mod tests { use arrow_array::{ArrayRef, record_batch}; @@ -403,4 +531,90 @@ mod tests { assert_col_zero_copy(batch.column(1)); } } + + /// Allocate a [`Bytes`] whose base address is 64-byte aligned, modelling a + /// backend that reads cache entries into an aligned buffer. A plain + /// `Bytes::from(vec)` only guarantees the allocator's alignment for `u8`. + fn aligned_bytes(payload: &[u8]) -> Bytes { + let mut v = vec![0u8; payload.len() + IPC_SECTION_ALIGNMENT]; + let pad = section_padding(v.as_ptr() as usize); + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + #[test] + fn test_aligned_ipc_sections_are_zero_copy() { + // A LargeBinary column exercises the i64-offset buffer whose 8-byte + // alignment requirement triggers a realigning memcpy when misaligned. 
+ let blocks = arrow_array::LargeBinaryArray::from_vec(vec![&b"hello"[..], b"world"]); + let section_a = RecordBatch::try_from_iter([("a", Arc::new(blocks) as ArrayRef)]).unwrap(); + let section_b = record_batch!(("b", Int64, [10i64, 20, 30, 40, 50])).unwrap(); + + let mut buf = Vec::new(); + // Arbitrary, deliberately non-64-aligned preamble so the first section + // must be padded rather than landing at offset 0 by luck. + buf.extend_from_slice(&[0xABu8; 7]); + let mut pos = buf.len(); + // The first section's stream begins after padding the 7-byte preamble + // up to the next 64-byte boundary. + assert_eq!(7 + section_padding(7), IPC_SECTION_ALIGNMENT); + write_ipc_section(&mut buf, &mut pos, &section_a).unwrap(); + write_ipc_section(&mut buf, &mut pos, &section_b).unwrap(); + + let data = aligned_bytes(&buf); + assert_eq!( + section_padding(data.as_ptr() as usize), + 0, + "base not aligned" + ); + + let mut offset = 7; + let read_a = read_ipc_section_at(&data, &mut offset).unwrap(); + let read_b = read_ipc_section_at(&data, &mut offset).unwrap(); + assert_eq!(read_a, section_a); + assert_eq!(read_b, section_b); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for batch in [&read_a, &read_b] { + for buffer in batch.column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "section buffer at {ptr:#x} was realigned out of the input \ + [{data_base:#x}..{data_end:#x}) — misaligned section", + ); + } + } + } + + #[test] + fn test_aligned_multi_batch_section_roundtrip_zero_copy() { + // A multi-batch section (e.g. IVF SQ storage chunks) must round-trip + // every batch and decode the first batch's buffers zero-copy. 
+ let b1 = record_batch!(("v", Int64, [1i64, 2, 3])).unwrap(); + let b2 = record_batch!(("v", Int64, [4i64, 5])).unwrap(); + let b3 = record_batch!(("v", Int64, [6i64])).unwrap(); + + let mut buf = vec![0xCDu8; 5]; + let mut pos = buf.len(); + write_ipc_section_batches(&mut buf, &mut pos, [b1.clone(), b2.clone(), b3.clone()]) + .unwrap(); + + let data = aligned_bytes(&buf); + let mut offset = 5; + let read = read_ipc_section_batches_at(&data, &mut offset).unwrap(); + assert_eq!(read, vec![b1, b2, b3]); + assert_eq!(offset, buf.len(), "offset should land at section end"); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for buffer in read[0].column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "first batch buffer at {ptr:#x} was realigned out of the input", + ); + } + } } diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index b993cf00745..34a67600543 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -52,6 +52,8 @@ pub const BLOB_V2_EXT_NAME: &str = "lance.blob.v2"; /// Metadata key for overriding the dedicated blob size threshold (in bytes) pub const BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-dedicated-size-threshold"; +/// Metadata key for overriding the inline blob size threshold (in bytes) +pub const BLOB_INLINE_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-inline-size-threshold"; type Result = std::result::Result; diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index 9dff4b001a4..7f956c70430 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -14,6 +14,7 @@ rust-version.workspace = true [dependencies] arrow-array.workspace = true arrow-buffer.workspace = true +arrow-data.workspace = true arrow-schema.workspace = true async-trait.workspace = true lance-arrow.workspace = true @@ -21,10 +22,11 @@ byteorder.workspace = true 
bytes.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } -deepsize.workspace = true +lance-derive.workspace = true futures.workspace = true itertools.workspace = true libc.workspace = true +libm.workspace = true moka.workspace = true num_cpus = "1.0" object_store = { workspace = true } @@ -39,6 +41,7 @@ tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true +twox-hash.workspace = true url.workspace = true log.workspace = true diff --git a/rust/lance-core/src/cache/backend.rs b/rust/lance-core/src/cache/backend.rs index 237254c464f..9307868f399 100644 --- a/rust/lance-core/src/cache/backend.rs +++ b/rust/lance-core/src/cache/backend.rs @@ -22,6 +22,9 @@ use super::CacheCodec; /// A type-erased cache entry. pub type CacheEntry = Arc; +/// Iterator over cache keys currently known to a backend. +pub type CacheKeyIterator<'a> = Box + Send + 'a>; + /// Structured cache key passed to [`CacheBackend`] methods. /// /// CacheBackend impls receive these ready-made from [`LanceCache`](super::LanceCache) @@ -116,6 +119,15 @@ pub trait CacheBackend: Send + Sync + std::fmt::Debug { /// Remove all entries. async fn clear(&self); + /// Return an iterator over cache keys currently known to this backend. + /// + /// Backends that cannot enumerate keys cheaply or accurately should return + /// `None`. An empty iterator means key inventory is supported and the + /// cache currently has no entries. + async fn keys(&self) -> Option> { + None + } + /// Number of entries currently stored (may flush pending operations). async fn num_entries(&self) -> usize; diff --git a/rust/lance-core/src/cache/codec.rs b/rust/lance-core/src/cache/codec.rs index 34e5264bb28..bba54840829 100644 --- a/rust/lance-core/src/cache/codec.rs +++ b/rust/lance-core/src/cache/codec.rs @@ -5,12 +5,184 @@ //! //! Implement [`CacheCodecImpl`] on concrete types, then use //! 
[`CacheCodec::from_impl`] to produce a type-erased codec for the cache. +//! +//! # Wire format +//! +//! Every serialized entry begins with a small hand-framed **envelope** so the +//! reader can validate it before trusting the body: +//! +//! ```text +//! [magic: 4B = b"LCE1"] +//! [envelope_version: u8] +//! [type_id_len: u16 LE][type_id: utf8] # stable, author-assigned +//! [type_version: u32 LE] # per-type body schema version +//! +//! ``` +//! +//! The envelope is deliberately *not* protobuf: it is the most +//! stability-critical part, must parse robustly against arbitrary bytes +//! (including data written by older, pre-stabilization builds), and never +//! changes shape. Bodies use protobuf headers, where field-number evolution +//! pays off. +//! +//! # Decode outcome +//! +//! Deserialization never propagates a parse failure as a hard error into the +//! cache path. Anything the reader cannot confidently interpret — absent or +//! wrong magic, an unknown `envelope_version`, a `type_id` mismatch, an +//! unsupported `type_version`, or a body decode error — becomes +//! [`CacheDecode::Miss`]. A backend turns `Miss` into a normal cache miss and +//! recomputes the value. This is what lets data written by an older format +//! self-heal: it simply fails the magic check and is regenerated. +use std::io::Write; use std::sync::Arc; use bytes::Bytes; -use crate::Result; +use crate::{Error, Result}; + +use super::{CacheEntryReader, CacheEntryWriter}; + +// --------------------------------------------------------------------------- +// Envelope +// --------------------------------------------------------------------------- + +/// Magic bytes that prefix every stabilized cache entry. +/// +/// An ASCII tag (`0x4C 0x43 0x45 0x31`) chosen so it cannot collide with any +/// pre-stabilization blob: those began with either a small little-endian +/// length (tens of bytes) or a small tag byte, never these values. 
+/// +/// Exported so backends can cheaply identify Lance cache entries (e.g. when +/// scanning a persistent store at startup) without hardcoding the bytes — +/// prefer [`has_cache_envelope`] over comparing against this directly. +pub const MAGIC: [u8; 4] = *b"LCE1"; + +/// Returns `true` if `data` begins with the cache-entry [`MAGIC`]. +/// +/// A cheap prefix check for backends that need to recognize Lance cache +/// entries without fully [`deserialize`](CacheCodec::deserialize)-ing them. A +/// `true` result only means the framing looks like ours; the entry can still +/// decode to a [`Miss`](CacheDecode::Miss) (e.g. wrong `type_id`). +pub fn has_cache_envelope(data: &[u8]) -> bool { + data.get(..MAGIC.len()) == Some(&MAGIC[..]) +} + +/// Version of the envelope framing itself. Bumped only if the outer frame +/// (magic/version/type_id/type_version layout) ever changes — expected never. +const ENVELOPE_VERSION: u8 = 1; + +/// Parsed envelope borrowed from the input bytes. +struct ParsedEnvelope<'a> { + type_id: &'a str, + type_version: u32, + /// Offset of the first body byte within the input. + body_offset: usize, +} + +/// Parse and validate the envelope at the start of `data`. +/// +/// Returns `None` for anything that is not a well-formed envelope this build +/// understands (wrong/absent magic, unknown `envelope_version`, truncation, +/// non-utf8 `type_id`). Callers translate `None` into [`CacheDecode::Miss`]. +fn parse_envelope(data: &Bytes) -> Option> { + let bytes = data.as_ref(); + let mut off = 0usize; + + let magic = bytes.get(off..off + 4)?; + if magic != MAGIC { + return None; + } + off += 4; + + if *bytes.get(off)? != ENVELOPE_VERSION { + return None; + } + off += 1; + + let type_id_len = u16::from_le_bytes(bytes.get(off..off + 2)?.try_into().ok()?) 
as usize; + off += 2; + + let type_id = std::str::from_utf8(bytes.get(off..off + type_id_len)?).ok()?; + off += type_id_len; + + let type_version = u32::from_le_bytes(bytes.get(off..off + 4)?.try_into().ok()?); + off += 4; + + Some(ParsedEnvelope { + type_id, + type_version, + body_offset: off, + }) +} + +/// Write the envelope for `type_id`/`type_version`, returning the number of +/// bytes written (the body's starting offset). +fn write_envelope(writer: &mut dyn Write, type_id: &str, type_version: u32) -> Result { + let type_id_len = u16::try_from(type_id.len()).map_err(|_| { + Error::io(format!( + "cache codec type_id too long ({} bytes, max {})", + type_id.len(), + u16::MAX + )) + })?; + + writer.write_all(&MAGIC)?; + writer.write_all(&[ENVELOPE_VERSION])?; + writer.write_all(&type_id_len.to_le_bytes())?; + writer.write_all(type_id.as_bytes())?; + writer.write_all(&type_version.to_le_bytes())?; + + Ok(4 + 1 + 2 + type_id.len() + 4) +} + +// --------------------------------------------------------------------------- +// CacheDecode — first-class cache-miss outcome +// --------------------------------------------------------------------------- + +/// Why a cache entry could not be decoded into the expected type. +/// +/// Carried by [`CacheDecode::Miss`] so backends can emit targeted metrics +/// (e.g. distinguish "evicting due to a stale format" from "type collision") +/// without re-parsing. Every reason maps to the same behavior — recompute via +/// the loader — so callers that don't care can ignore it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CacheMissReason { + /// Absent or wrong magic, unknown `envelope_version`, truncated framing, or + /// a non-utf8 `type_id`. Typically an entry written by a pre-stabilization + /// or otherwise foreign build. + InvalidEnvelope, + /// Well-formed envelope, but its `type_id` names a different entry type than + /// the codec reading it. 
+ TypeMismatch, + /// Written by a newer build whose `type_version` this build does not + /// understand and must not attempt to interpret. + VersionTooNew, + /// Envelope validated, but the body failed to decode (truncation, a + /// malformed protobuf header, an IPC error, etc.). + BodyError, +} + +/// Outcome of deserializing a cache entry. +/// +/// `Miss` means the bytes could not be confidently decoded into `T`; the +/// [`CacheMissReason`] says why. A backend treats any `Miss` exactly like a key +/// that was never present: recompute via the loader. +#[derive(Debug)] +pub enum CacheDecode { + Hit(T), + Miss(CacheMissReason), +} + +impl CacheDecode { + pub fn hit(self) -> Option { + match self { + Self::Hit(v) => Some(v), + Self::Miss(_) => None, + } + } +} // --------------------------------------------------------------------------- // CacheCodecImpl — trait for serializable cache entry types @@ -18,31 +190,40 @@ use crate::Result; /// Serialization trait for cache entries. /// -/// **Experimental**: the serialized format is not stable and may change -/// between releases without notice. +/// **Experimental**: the serialized format is not yet covered by a stability +/// guarantee and may change between releases. When it does stabilize, the +/// rules are: `TYPE_ID`, protobuf field numbers, and enum values are +/// append-only forever; format changes that protobuf cannot express +/// transparently bump [`CURRENT_VERSION`](Self::CURRENT_VERSION). /// -/// Implement this on concrete types that need to survive serialization -/// through a persistent cache backend. Then wire it into a [`CacheKey`](super::CacheKey) -/// via [`CacheCodec::from_impl`]: +/// Implement this on concrete types that need to survive serialization through +/// a persistent cache backend, then wire it into a +/// [`CacheKey`](super::CacheKey) via [`CacheCodec::from_impl`]. 
/// -/// ```ignore -/// impl CacheCodecImpl for MyData { -/// fn serialize(&self, w: &mut dyn Write) -> Result<()> { /* ... */ } -/// fn deserialize(data: &Bytes) -> Result { /* ... */ } -/// } -/// -/// impl CacheKey for MyDataKey { -/// type ValueType = MyData; -/// fn codec() -> Option { -/// Some(CacheCodec::from_impl::()) -/// } -/// // ... -/// } -/// ``` +/// The envelope (magic/version/type_id/type_version) is written and validated +/// by the [`CacheCodec`] wrapper. [`serialize`](Self::serialize) writes only +/// the body — a header followed by sections in a fixed, version-keyed order — +/// and [`deserialize`](Self::deserialize) reads them back in that same order. +/// The read sequence mirroring the write sequence for each `type_version` is +/// the invariant the implementor owns. pub trait CacheCodecImpl: Send + Sync { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()>; + /// Stable identity for this entry type. **Must not change once shipped.** + /// This is a deliberate author-assigned string, not `std::any::type_name` + /// (which is not stable across compiler versions). + const TYPE_ID: &'static str; + + /// Body schema version this build writes. Bump when the body layout + /// changes in a way protobuf field additions cannot express transparently + /// (adding/removing/reordering sections, a raw-blob encoding change, etc.). + const CURRENT_VERSION: u32; + + /// Write the body: a header, then sections in a fixed order. + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()>; - fn deserialize(data: &Bytes) -> Result + /// Reconstruct from the body. Branch on + /// [`reader.version()`](CacheEntryReader::version) for backward compat; + /// sections are read in write order. + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result where Self: Sized; } @@ -55,25 +236,31 @@ pub(crate) type ArcAny = Arc; /// Type-erased codec for serializing and deserializing cache entries. 
/// -/// `CacheCodec` is two plain function pointers — it is `Copy` and has no -/// heap allocation. Construct one via [`CacheCodec::from_impl`] for types -/// that implement [`CacheCodecImpl`], or [`CacheCodec::new`] for custom -/// cases (e.g. when the orphan rule prevents a direct impl). +/// `CacheCodec` carries the entry's stable `type_id`/`version` plus two plain +/// function pointers — it is `Copy` and has no heap allocation. Construct one +/// via [`CacheCodec::from_impl`] for types that implement [`CacheCodecImpl`], +/// or [`CacheCodec::new`] for custom cases (e.g. when the orphan rule prevents +/// a direct impl). #[derive(Copy, Clone)] pub struct CacheCodec { - pub(crate) serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - pub(crate) deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, } impl std::fmt::Debug for CacheCodec { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CacheCodec").finish_non_exhaustive() + f.debug_struct("CacheCodec") + .field("type_id", &self.type_id) + .field("version", &self.version) + .finish_non_exhaustive() } } fn serialize_via_impl( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> Result<()> { let val = any .downcast_ref::() @@ -81,44 +268,278 @@ fn serialize_via_impl( val.serialize(writer) } -fn deserialize_via_impl(data: &Bytes) -> Result { - let val = T::deserialize(data)?; +fn deserialize_via_impl( + reader: &mut CacheEntryReader<'_>, +) -> Result { + let val = T::deserialize(reader)?; Ok(Arc::new(val) as ArcAny) } impl CacheCodec { - /// Create a `CacheCodec` from plain function pointers. + /// Create a `CacheCodec` from explicit body function pointers. /// /// Prefer [`from_impl`](Self::from_impl) when the value type implements /// [`CacheCodecImpl`]. 
Use this for types where a direct impl isn't - /// possible (e.g. orphan rule prevents it). + /// possible (e.g. the orphan rule prevents it). `type_id` and `version` + /// play the same role as the corresponding [`CacheCodecImpl`] constants. pub fn new( - serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, ) -> Self { Self { - serialize, - deserialize, + type_id, + version, + serialize_body, + deserialize_body, } } /// Create a `CacheCodec` from a [`CacheCodecImpl`] implementation. - /// - /// For **sized** types stored directly in the cache. The codec - /// downcasts `&dyn Any` to `&T` for serialization and returns `Arc` - /// from deserialization. pub fn from_impl() -> Self { Self { - serialize: serialize_via_impl::, - deserialize: deserialize_via_impl::, + type_id: T::TYPE_ID, + version: T::CURRENT_VERSION, + serialize_body: serialize_via_impl::, + deserialize_body: deserialize_via_impl::, } } - pub fn serialize(&self, value: &ArcAny, writer: &mut dyn std::io::Write) -> Result<()> { - (self.serialize)(value, writer) + /// Serialize `value` into `writer`: envelope first, then the body. + pub fn serialize(&self, value: &ArcAny, writer: &mut dyn Write) -> Result<()> { + let body_offset = write_envelope(writer, self.type_id, self.version)?; + let mut entry_writer = CacheEntryWriter::with_pos(writer, body_offset); + (self.serialize_body)(value, &mut entry_writer) + } + + /// Deserialize an entry from `data`. + /// + /// Never fails: any non-fatal failure to interpret the bytes becomes a + /// [`CacheDecode::Miss`] with the reason why (see [`CacheMissReason`]). + /// Reading from an in-memory [`Bytes`] cannot do I/O, so there is no fault + /// channel — a miss is the only non-`Hit` outcome. 
+ pub fn deserialize(&self, data: &Bytes) -> CacheDecode { + let Some(envelope) = parse_envelope(data) else { + log::debug!("cache entry rejected: missing or invalid envelope"); + return CacheDecode::Miss(CacheMissReason::InvalidEnvelope); + }; + + if envelope.type_id != self.type_id { + log::debug!( + "cache entry type_id mismatch: got {:?}, expected {:?}", + envelope.type_id, + self.type_id + ); + return CacheDecode::Miss(CacheMissReason::TypeMismatch); + } + + // A version newer than this build writes was produced by a newer build + // whose body layout we cannot assume to understand. Older/equal versions + // are the impl's responsibility to handle (branching on reader.version()). + if envelope.type_version > self.version { + log::debug!( + "cache entry {:?} has unsupported type_version {} (this build writes {})", + self.type_id, + envelope.type_version, + self.version + ); + return CacheDecode::Miss(CacheMissReason::VersionTooNew); + } + + let mut reader = CacheEntryReader::new(data, envelope.body_offset, envelope.type_version); + match (self.deserialize_body)(&mut reader) { + Ok(value) => CacheDecode::Hit(value), + Err(e) => { + log::debug!( + "cache entry {:?} v{} failed to decode: {e}", + self.type_id, + envelope.type_version + ); + CacheDecode::Miss(CacheMissReason::BodyError) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A trivial codec used to exercise the envelope and miss semantics + /// without pulling in arrow-backed payloads. 
+ #[derive(Debug, PartialEq)] + struct Widget { + n: u32, + } + + impl CacheCodecImpl for Widget { + const TYPE_ID: &'static str = "test.Widget"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()> { + writer.write_raw(&self.n.to_le_bytes()) + } + + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result { + let bytes = reader.read_raw()?; + let n = u32::from_le_bytes( + bytes + .as_ref() + .try_into() + .map_err(|_| Error::io("bad widget".to_string()))?, + ); + Ok(Self { n }) + } + } + + fn serialize_widget(widget: &Widget) -> Bytes { + let codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(Widget { n: widget.n }); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + Bytes::from(buf) + } + + /// The miss reason, or `None` if the decode was a hit. + fn miss_reason(data: &Bytes) -> Option { + match deserialize_widget(data) { + CacheDecode::Hit(_) => None, + CacheDecode::Miss(reason) => Some(reason), + } } - pub fn deserialize(&self, data: &Bytes) -> Result { - (self.deserialize)(data) + fn deserialize_widget(data: &Bytes) -> CacheDecode { + let codec = CacheCodec::from_impl::(); + match codec.deserialize(data) { + CacheDecode::Hit(any) => { + CacheDecode::Hit(Arc::try_unwrap(any.downcast::().unwrap()).unwrap()) + } + CacheDecode::Miss(reason) => CacheDecode::Miss(reason), + } + } + + #[test] + fn envelope_roundtrip_hits() { + let bytes = serialize_widget(&Widget { n: 0xDEADBEEF }); + // Sanity: the entry starts with the magic. 
+ assert_eq!(&bytes[..4], b"LCE1"); + let decoded = deserialize_widget(&bytes).hit().unwrap(); + assert_eq!(decoded, Widget { n: 0xDEADBEEF }); + } + + #[test] + fn has_cache_envelope_detects_magic() { + let bytes = serialize_widget(&Widget { n: 1 }); + assert!(has_cache_envelope(&bytes)); + assert!(has_cache_envelope(&MAGIC)); // exactly the magic, nothing after + assert!(!has_cache_envelope(b"LCE")); // too short + assert!(!has_cache_envelope(b"JUNK and more")); + assert!(!has_cache_envelope(&[])); + } + + #[test] + fn wrong_magic_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[0] = b'X'; + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn pre_stabilization_blob_is_miss() { + // An old unstable blob led with a small u64 LE length prefix (a JSON + // header of tens of bytes) — no magic. It must self-heal to a miss. + let mut blob = Vec::new(); + blob.extend_from_slice(&(42u64).to_le_bytes()); + blob.extend_from_slice(&[0u8; 42]); + assert_eq!( + miss_reason(&Bytes::from(blob)), + Some(CacheMissReason::InvalidEnvelope) + ); + + // A different unstable shape led with a small u8 tag (0/1/2). + assert_eq!( + miss_reason(&Bytes::from(vec![0u8, 1, 2, 3])), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn unknown_envelope_version_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[4] = 0xFF; // envelope_version byte + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn type_id_mismatch_is_miss() { + // Hand-build an envelope with a foreign type_id but valid framing. 
+ let mut buf = Vec::new(); + write_envelope(&mut buf, "some.OtherType", 1).unwrap(); + buf.extend_from_slice(&(4u64).to_le_bytes()); + buf.extend_from_slice(&99u32.to_le_bytes()); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::TypeMismatch) + ); + } + + #[test] + fn unsupported_future_type_version_is_miss() { + // An entry written by a newer build (higher type_version) must miss + // rather than be misread by this build. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION + 1).unwrap(); + lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &9u32.to_le_bytes()).unwrap(); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::VersionTooNew) + ); + } + + #[test] + fn truncated_envelope_is_miss() { + let bytes = serialize_widget(&Widget { n: 7 }); + for cut in [0, 1, 4, 5, 7, 9] { + assert_eq!( + miss_reason(&bytes.slice(..cut.min(bytes.len()))), + Some(CacheMissReason::InvalidEnvelope), + "truncating to {cut} bytes should miss as InvalidEnvelope" + ); + } + } + + #[test] + fn body_decode_error_is_miss() { + // Valid envelope, but the body is too short for the widget. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION).unwrap(); + buf.extend_from_slice(&(1u64).to_le_bytes()); + buf.push(0u8); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::BodyError) + ); + } + + #[test] + fn reader_exposes_envelope_version() { + // type_version travels through the envelope to reader.version(). + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, 7).unwrap(); + let body_off = buf.len(); + // A widget body so the codec can decode it. 
+ lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &5u32.to_le_bytes()).unwrap(); + let data = Bytes::from(buf); + + let mut r = CacheEntryReader::new(&data, body_off, 7); + assert_eq!(r.version(), 7); + assert_eq!(r.read_raw().unwrap().as_ref(), 5u32.to_le_bytes()); } } diff --git a/rust/lance-core/src/cache/entry_io.rs b/rust/lance-core/src/cache/entry_io.rs new file mode 100644 index 00000000000..fe91b11ca7d --- /dev/null +++ b/rust/lance-core/src/cache/entry_io.rs @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Streaming readers/writers for cache entry bodies. +//! +//! [`CacheCodecImpl`](super::CacheCodecImpl) bodies are written and read +//! through these wrappers. They keep serialization streaming (no buffering of +//! the whole entry) and reads zero-copy (sections borrow from the input +//! [`Bytes`]), while tracking the byte position needed to keep Arrow IPC +//! sections 64-byte aligned (see [`lance_arrow::ipc`]). +//! +//! Body layout primitives: +//! +//! ```text +//! HEADER : [header_len: u32 LE][header proto bytes] +//! ARROW_IPC : [pad to 64B][self-delimiting IPC stream] +//! RAW_BLOB : [len: u64 LE][bytes] +//! ``` + +use std::io::Write; + +use arrow_array::RecordBatch; +use bytes::Bytes; +use prost::Message; + +use crate::{Error, Result}; + +/// Writes a cache entry body: a header followed by sections, streaming +/// directly to the underlying writer. +/// +/// The envelope is written by the [`CacheCodec`](super::CacheCodec) wrapper +/// before this writer is handed to +/// [`CacheCodecImpl::serialize`](super::CacheCodecImpl::serialize). +pub struct CacheEntryWriter<'a> { + writer: &'a mut dyn Write, + /// Absolute byte offset within the entry, used to align IPC sections. + pos: usize, +} + +impl<'a> CacheEntryWriter<'a> { + /// Create a writer positioned at the start of an entry (offset 0). + /// + /// Use this for nested serialization into a standalone buffer. 
The + /// envelope-aware entry point is [`CacheCodec::serialize`](super::CacheCodec::serialize). + pub fn new(writer: &'a mut dyn Write) -> Self { + Self { writer, pos: 0 } + } + + /// Create a writer whose section alignment accounts for `pos` bytes + /// already written ahead of the body (i.e. the envelope). + pub(crate) fn with_pos(writer: &'a mut dyn Write, pos: usize) -> Self { + Self { writer, pos } + } + + /// Write a single discriminant byte (e.g. a variant tag). + pub fn write_u8(&mut self, value: u8) -> Result<()> { + self.writer.write_all(&[value])?; + self.pos += 1; + Ok(()) + } + + /// Write a protobuf header as `[len: u32 LE][bytes]`. + pub fn write_header(&mut self, header: &P) -> Result<()> { + let bytes = header.encode_to_vec(); + let len = u32::try_from(bytes.len()) + .map_err(|_| Error::io(format!("cache header too large: {} bytes", bytes.len())))?; + self.writer.write_all(&len.to_le_bytes())?; + self.writer.write_all(&bytes)?; + self.pos += 4 + bytes.len(); + Ok(()) + } + + /// Write `batch` as a 64-byte-aligned Arrow IPC section. + pub fn write_ipc(&mut self, batch: &RecordBatch) -> Result<()> { + lance_arrow::ipc::write_ipc_section(self.writer, &mut self.pos, batch) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC + /// section. The iterator must yield at least one batch. + pub fn write_ipc_batches(&mut self, batches: I) -> Result<()> + where + I: IntoIterator, + { + lance_arrow::ipc::write_ipc_section_batches(self.writer, &mut self.pos, batches) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write a raw blob as `[len: u64 LE][bytes]`. + /// + /// Only for byte payloads that already have their own stable, portable + /// encoding (e.g. a roaring bitmap, a varint-packed stream). 
+ pub fn write_raw(&mut self, bytes: &[u8]) -> Result<()> { + lance_arrow::ipc::write_len_prefixed_bytes(self.writer, bytes) + .map_err(|e| Error::io(e.to_string()))?; + self.pos += 8 + bytes.len(); + Ok(()) + } + + /// The underlying writer, for a payload that carries its own framing. + /// + /// Use this only when the codec writes a self-delimiting or whole-body + /// payload — e.g. streaming a roaring bitmap as the entire body, where the + /// length prefix of [`write_raw`](Self::write_raw) would be redundant and + /// buffering to measure that length would force an extra copy. For + /// structured bodies prefer [`write_header`](Self::write_header) / + /// [`write_ipc`](Self::write_ipc) / [`write_raw`](Self::write_raw), which + /// give you versioning and 64-byte IPC alignment. + /// + /// Bytes written through this do **not** advance the section-alignment + /// position, so it must not be interleaved with [`write_ipc`](Self::write_ipc). + pub fn raw_writer(&mut self) -> &mut dyn Write { + self.writer + } +} + +/// Reads a cache entry body, tracking an offset into the input and exposing +/// the entry's `type_version` so implementors can branch for backward compat. +/// +/// All reads are zero-copy: returned [`Bytes`] and the buffers behind decoded +/// [`RecordBatch`]es borrow from the input allocation. +pub struct CacheEntryReader<'a> { + data: &'a Bytes, + offset: usize, + version: u32, +} + +impl<'a> CacheEntryReader<'a> { + /// Create a reader over `data`, starting at body byte `offset`, for an + /// entry written at `version`. + pub fn new(data: &'a Bytes, offset: usize, version: u32) -> Self { + Self { + data, + offset, + version, + } + } + + /// The `type_version` from the envelope. Branch on this for backward compat. + pub fn version(&self) -> u32 { + self.version + } + + /// Read a single discriminant byte written by [`CacheEntryWriter::write_u8`]. 
+ pub fn read_u8(&mut self) -> Result { + let bytes = self.data.as_ref(); + let v = *bytes + .get(self.offset) + .ok_or_else(|| Error::io("cache entry: truncated, missing tag byte".to_string()))?; + self.offset += 1; + Ok(v) + } + + /// Read a protobuf header written by [`CacheEntryWriter::write_header`]. + pub fn read_header(&mut self) -> Result

{ + let bytes = self.data.as_ref(); + let len_end = self + .offset + .checked_add(4) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated length prefix".to_string()))?; + let len = u32::from_le_bytes(bytes[self.offset..len_end].try_into().unwrap()) as usize; + let data_end = len_end + .checked_add(len) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated body".to_string()))?; + let msg = P::decode(&bytes[len_end..data_end]) + .map_err(|e| Error::io(format!("cache header decode failed: {e}")))?; + self.offset = data_end; + Ok(msg) + } + + /// Read one [`RecordBatch`] from a 64-byte-aligned IPC section. + pub fn read_ipc(&mut self) -> Result { + lance_arrow::ipc::read_ipc_section_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read all [`RecordBatch`]es from a 64-byte-aligned multi-batch IPC + /// section written by [`CacheEntryWriter::write_ipc_batches`]. + pub fn read_ipc_batches(&mut self) -> Result> { + lance_arrow::ipc::read_ipc_section_batches_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read a raw blob written by [`CacheEntryWriter::write_raw`], zero-copy. + pub fn read_raw(&mut self) -> Result { + lance_arrow::ipc::read_len_prefixed_bytes_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// The not-yet-consumed body bytes as a zero-copy slice. + /// + /// For a payload that carries its own framing and is parsed with the + /// codec's own cursor — the read counterpart of + /// [`CacheEntryWriter::raw_writer`]. For structured bodies prefer + /// [`read_header`](Self::read_header) / [`read_ipc`](Self::read_ipc) / + /// [`read_raw`](Self::read_raw). + pub fn body(&self) -> Bytes { + self.data.slice(self.offset..) 
+ } +} diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs index ee6a728ef73..07038c6e9d5 100644 --- a/rust/lance-core/src/cache/mod.rs +++ b/rust/lance-core/src/cache/mod.rs @@ -47,10 +47,14 @@ pub mod backend; pub mod codec; +mod entry_io; mod moka; -pub use backend::{CacheBackend, CacheEntry, InternalCacheKey}; -pub use codec::{CacheCodec, CacheCodecImpl}; +pub use backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; +pub use codec::{ + CacheCodec, CacheCodecImpl, CacheDecode, CacheMissReason, MAGIC, has_cache_envelope, +}; +pub use entry_io::{CacheEntryReader, CacheEntryWriter}; pub use moka::MokaCacheBackend; use std::borrow::Cow; @@ -63,7 +67,7 @@ use futures::{Future, FutureExt}; use crate::Result; -pub use deepsize::{Context, DeepSizeOf}; +pub use crate::deepsize::{Context, DeepSizeOf}; // --------------------------------------------------------------------------- // CacheKey / UnsizedCacheKey — typed key traits for cache users @@ -245,6 +249,40 @@ impl LanceCache { self.cache.size_bytes().await } + /// Return an iterator over keys currently stored under this cache's prefix. + /// + /// Returns `None` when the backend does not support key inventory. The + /// iterator is intended for diagnostics and may be weakly consistent with + /// concurrent cache mutations. 
+ /// + /// # Examples + /// + /// ``` + /// # use std::{borrow::Cow, sync::Arc}; + /// # use lance_core::cache::{CacheKey, LanceCache}; + /// # struct MyKey; + /// # impl CacheKey for MyKey { + /// # type ValueType = Vec; + /// # fn key(&self) -> Cow<'_, str> { Cow::Borrowed("my-key") } + /// # fn type_name() -> &'static str { "VecI32" } + /// # } + /// # async fn example() { + /// let cache = LanceCache::with_capacity(1024); + /// cache.insert_with_key(&MyKey, Arc::new(vec![1, 2, 3])).await; + /// + /// let mut keys = cache.keys().await.expect("Moka supports key inventory"); + /// assert_eq!(keys.next().unwrap().key(), "my-key"); + /// # } + /// ``` + pub async fn keys(&self) -> Option> { + Some(Box::new( + self.cache + .keys() + .await? + .filter(|key| key.starts_with(&self.prefix)), + )) + } + // -- Sized insert/get (internal, shared by sized and unsized paths) -------- async fn insert_with_id( @@ -557,7 +595,7 @@ impl CacheStats { #[cfg(test)] mod tests { use super::*; - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use std::marker::PhantomData; struct TestKey { @@ -609,6 +647,18 @@ mod tests { } } + fn key_fields(keys: &[InternalCacheKey]) -> BTreeSet<(String, String, &'static str)> { + keys.iter() + .map(|key| { + ( + key.prefix().to_string(), + key.key().to_string(), + key.type_name(), + ) + }) + .collect() + } + #[tokio::test] async fn test_cache_bytes() { let item = Arc::new(vec![1, 2, 3]); @@ -718,6 +768,99 @@ mod tests { assert_eq!(base.stats().await.hits, 1); } + #[tokio::test] + async fn test_cache_keys_with_prefixes() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let nested = prefixed.with_key_prefix("index"); + let other = base.with_key_prefix("ns-other"); + + base.insert_with_key(&TestKey::new("root"), Arc::new(vec![0])) + .await; + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + nested + .insert_with_key(&TestKey::new("nested"), 
Arc::new(vec![2])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![3])) + .await; + + let base_keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&base_keys), + BTreeSet::from([ + ( + "".to_string(), + "root".to_string(), + TestKey::>::type_name() + ), + ( + "ns/".to_string(), + "child".to_string(), + TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ( + "ns-other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + ), + ]) + ); + + let prefixed_keys = prefixed.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&prefixed_keys), + BTreeSet::from([ + ( + "ns/".to_string(), + "child".to_string(), + TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ]) + ); + } + + #[tokio::test] + async fn test_cache_keys_reflect_invalidation_and_clear() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let other = base.with_key_prefix("other"); + + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![2])) + .await; + assert_eq!(base.keys().await.unwrap().count(), 2); + + prefixed.invalidate_prefix("").await; + let keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&keys), + BTreeSet::from([( + "other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + )]) + ); + + base.clear().await; + assert_eq!(base.keys().await.unwrap().count(), 0); + } + #[tokio::test] async fn test_cache_get_or_insert() { let cache = LanceCache::with_capacity(1000); @@ -833,6 +976,7 @@ mod tests { .await .is_none() ); + assert!(cache.keys().await.is_none()); } #[tokio::test] diff --git a/rust/lance-core/src/cache/moka.rs b/rust/lance-core/src/cache/moka.rs index 6be7760458a..a3956c1720c 100644 --- a/rust/lance-core/src/cache/moka.rs 
+++ b/rust/lance-core/src/cache/moka.rs @@ -11,7 +11,7 @@ use futures::Future; use crate::Result; use super::CacheCodec; -use super::backend::{CacheBackend, CacheEntry, InternalCacheKey}; +use super::backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; /// Internal record stored in the moka cache. #[derive(Clone, Debug)] @@ -123,6 +123,13 @@ impl CacheBackend for MokaCacheBackend { self.cache.run_pending_tasks().await; } + async fn keys(&self) -> Option> { + self.cache.run_pending_tasks().await; + Some(Box::new( + self.cache.iter().map(|(key, _)| key.as_ref().clone()), + )) + } + async fn num_entries(&self) -> usize { self.cache.run_pending_tasks().await; self.cache.entry_count() as usize diff --git a/rust/lance-core/src/container/list.rs b/rust/lance-core/src/container/list.rs index 4f1593f4de1..9d8205cb398 100644 --- a/rust/lance-core/src/container/list.rs +++ b/rust/lance-core/src/container/list.rs @@ -3,7 +3,7 @@ use std::collections::LinkedList; -use deepsize::DeepSizeOf; +use crate::deepsize::DeepSizeOf; /// A linked list that grows exponentially. It is used to store a large number of /// elements in a memory-efficient way. 
The list grows by doubling the capacity of @@ -134,7 +134,7 @@ impl ExpLinkedList { } impl DeepSizeOf for ExpLinkedList { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut crate::deepsize::Context) -> usize { self.inner .iter() .map(|v| v.deep_size_of_children(context)) diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 026e6b0bbe9..8837037c308 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -7,9 +7,9 @@ use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::sync::{Arc, LazyLock}; +use crate::deepsize::DeepSizeOf; use arrow_array::ArrayRef; use arrow_schema::{DataType, Field as ArrowField, Fields, TimeUnit}; -use deepsize::DeepSizeOf; use lance_arrow::bfloat16::{BFLOAT16_EXT_NAME, is_bfloat16_field}; use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY}; @@ -25,6 +25,7 @@ pub use field::{ pub use schema::{ BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema, escape_field_path_for_project, format_field_path, parse_field_path, + validate_fixed_size_list_dimensions, }; pub static BLOB_DESC_FIELDS: LazyLock = LazyLock::new(|| { @@ -408,10 +409,10 @@ pub struct Dictionary { } impl DeepSizeOf for Dictionary { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut crate::deepsize::Context) -> usize { self.values .as_ref() - .map(|v| v.get_array_memory_size()) + .map(|v| (v.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context)) .unwrap_or(0) } } diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index b122ce64ac4..9f06d421949 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -10,6 +10,7 @@ use std::{ sync::Arc, }; +use crate::deepsize::DeepSizeOf; use arrow_array::{ ArrayRef, cast::AsArray, @@ -18,7 +19,6 
@@ use arrow_array::{ }, }; use arrow_schema::{DataType, Field as ArrowField}; -use deepsize::DeepSizeOf; use lance_arrow::{ ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, json::{is_arrow_json_field, is_json_field}, @@ -575,6 +575,18 @@ impl Field { } } + /// Convert blob v2 fields in this field tree to their descriptor view. + pub fn unload_blobs_recursive(&mut self) { + if self.is_blob_v2() { + self.unloaded_mut(); + return; + } + + for child in &mut self.children { + child.unload_blobs_recursive(); + } + } + pub fn project(&self, path_components: &[&str]) -> Result { let mut f = Self { name: self.name.clone(), @@ -1864,6 +1876,54 @@ mod tests { assert_eq!(field.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); } + #[test] + fn unload_blobs_recursive_only_unloads_blob_v2() { + let legacy_metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); + let blob_v2_metadata = + HashMap::from([(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]); + + let mut field: Field = ArrowField::new( + "parent", + DataType::Struct(Fields::from(vec![ + ArrowField::new("legacy_blob", DataType::LargeBinary, true) + .with_metadata(legacy_metadata), + ArrowField::new( + "blob_v2", + DataType::Struct( + vec![ + ArrowField::new("data", DataType::LargeBinary, true), + ArrowField::new("uri", DataType::Utf8, true), + ] + .into(), + ), + true, + ) + .with_metadata(blob_v2_metadata), + ])), + true, + ) + .try_into() + .unwrap(); + + field.unload_blobs_recursive(); + + let legacy_blob = field + .children + .iter() + .find(|f| f.name == "legacy_blob") + .unwrap(); + assert_eq!( + legacy_blob.logical_type, + LogicalType::try_from(&DataType::LargeBinary).unwrap() + ); + assert_eq!(legacy_blob.children.len(), 0); + assert!(legacy_blob.metadata.contains_key(BLOB_META_KEY)); + + let blob_v2 = field.children.iter().find(|f| f.name == "blob_v2").unwrap(); + assert_eq!(blob_v2.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); + 
assert_eq!(blob_v2.children.len(), 5); + } + #[test] fn project_by_field_accepts_blob_descriptor_projection() { let metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 9502f1e45a8..d13eb476359 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -9,9 +9,9 @@ use std::{ sync::Arc, }; +use crate::deepsize::DeepSizeOf; use arrow_array::RecordBatch; -use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; -use deepsize::DeepSizeOf; +use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_arrow::*; use super::field::{Field, OnTypeMismatch, SchemaCompareOptions}; @@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } } +/// Reject `FixedSizeList` types whose dimension is not a positive integer. +/// +/// The row count of a fixed-size list is derived by dividing the number of +/// child items by the dimension, so a zero dimension panics with a +/// divide-by-zero further down the write path (see issue #5102). A +/// `FixedSizeList` of a `FixedSizeList` over a primitive collapses into a +/// single leaf field, so the pre-order field walk never visits the inner list; +/// recurse through the nested list types here to catch an inner zero dimension. +/// +/// Shared by [`Schema::validate`] on the write path and the decoder's +/// field-scheduler builders on the read path. 
+pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> { + if let DataType::FixedSizeList(inner, dimension) = data_type { + if *dimension <= 0 { + return Err(Error::schema(format!( + "Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer" + ))); + } + validate_fixed_size_list_dimensions(field_name, inner.data_type())?; + } + Ok(()) +} + impl Schema { /// The unenforced primary key fields in the schema, ordered by position. /// @@ -346,6 +369,10 @@ impl Schema { field.id, self ))); } + // The row count of a fixed-size list is derived by dividing the + // number of items by the dimension, so a zero dimension would + // panic with a divide-by-zero further down the write path. + validate_fixed_size_list_dimensions(&field.name, &field.data_type())?; } Ok(()) @@ -2825,6 +2852,67 @@ mod tests { assert!(paths.contains(&"name".to_string())); } + #[test] + fn test_validate_rejects_zero_dimension_fixed_size_list() { + // A zero dimension divides-by-zero further down the write path (#5102) + let fsl = |dimension: i32| { + ArrowDataType::FixedSizeList( + Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)), + dimension, + ) + }; + + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // Nested inside a struct is rejected too + let arrow_schema = ArrowSchema::new(vec![ArrowField::new( + "outer", + ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new( + "vec", + fsl(0), + true, + )])), + true, + )]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A zero-dimension FixedSizeList nested inside a positive-dimension + // 
FixedSizeList collapses into a single leaf field, so the inner + // dimension is not visited by the pre-order field walk and must still + // be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4). + let nested = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A positive dimension still validates, including nested lists + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + let nested_ok = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + } + #[test] fn test_schema_unenforced_clustering_key() { use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION; diff --git a/rust/lance-core/src/deepsize.rs b/rust/lance-core/src/deepsize.rs new file mode 100644 index 00000000000..b6c145bb504 --- /dev/null +++ b/rust/lance-core/src/deepsize.rs @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +pub use lance_derive::DeepSizeOf; + +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::mem::{size_of, size_of_val}; +use std::sync::atomic::{AtomicU64, AtomicUsize}; +use std::sync::{Arc, Mutex, RwLock}; + +use arrow_array::{Array, RecordBatch}; +use arrow_buffer::ArrowNativeType; +use arrow_data::ArrayData; + +pub struct Context { + seen: HashSet, +} + +impl Default for Context { + fn default() -> Self { + Self::new() + } +} + +impl Context { + pub fn new() -> Self { + Self { + seen: HashSet::new(), + } + } + + /// Returns true if this 
pointer was NOT previously seen (i.e., it's new). + pub fn mark_seen(&mut self, ptr: usize) -> bool { + self.seen.insert(ptr) + } +} + +pub trait DeepSizeOf { + fn deep_size_of(&self) -> usize { + size_of_val(self) + self.deep_size_of_children(&mut Context::new()) + } + + fn deep_size_of_children(&self, context: &mut Context) -> usize; +} + +// Primitives — no heap children +macro_rules! impl_deep_size_primitive { + ($($t:ty),*) => { + $( + impl DeepSizeOf for $t { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } + } + )* + }; +} + +impl_deep_size_primitive!( + u8, + u16, + u32, + u64, + u128, + usize, + i8, + i16, + i32, + i64, + i128, + isize, + f32, + f64, + bool, + () +); + +impl DeepSizeOf for str { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for String { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + self.capacity() + } +} + +impl DeepSizeOf for AtomicU64 { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for AtomicUsize { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for [T; N] { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.iter() + .map(|item| item.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for [T] { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // The slice's own element bytes are accounted for by the owner (e.g. the + // `size_of_val` in the `Arc`/`Box` impls); here we only sum the heap + // children of each element. 
+ self.iter() + .map(|item| item.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for RwLock { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.read() + .map(|val| val.deep_size_of_children(context)) + .unwrap_or(0) + } +} + +impl DeepSizeOf for Mutex { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.lock() + .map(|val| val.deep_size_of_children(context)) + .unwrap_or(0) + } +} + +// Tuples +macro_rules! impl_deep_size_tuple { + ($($name:ident),+) => { + impl<$($name: DeepSizeOf),+> DeepSizeOf for ($($name,)+) { + #[allow(non_snake_case)] + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let ($($name,)+) = self; + 0 $(+ $name.deep_size_of_children(context))+ + } + } + }; +} + +impl_deep_size_tuple!(A, B); +impl_deep_size_tuple!(A, B, C); +impl_deep_size_tuple!(A, B, C, D); +impl_deep_size_tuple!(A, B, C, D, E); +impl_deep_size_tuple!(A, B, C, D, E, F); + +impl DeepSizeOf for Vec { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.capacity() * size_of::() + + self + .iter() + .map(|item| item.deep_size_of_children(context)) + .sum::() + } +} + +impl DeepSizeOf for Box { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + size_of_val(&**self) + (**self).deep_size_of_children(context) + } +} + +impl DeepSizeOf for Arc { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + if context.mark_seen(Self::as_ptr(self) as *const () as usize) { + size_of_val(&**self) + (**self).deep_size_of_children(context) + } else { + 0 + } + } +} + +impl DeepSizeOf for Option { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + match self { + Some(val) => val.deep_size_of_children(context), + None => 0, + } + } +} + +impl DeepSizeOf for HashMap { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // Each bucket holds a key-value pair plus hash metadata (~1 byte control per bucket). 
+ // Robin hood / Swiss table capacity is always a power of 2. + let capacity_bytes = self.capacity() * (size_of::() + size_of::() + 1); + let children: usize = self + .iter() + .map(|(k, v)| k.deep_size_of_children(context) + v.deep_size_of_children(context)) + .sum(); + capacity_bytes + children + } +} + +impl DeepSizeOf for HashSet { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let capacity_bytes = self.capacity() * (size_of::() + 1); + let children: usize = self.iter().map(|k| k.deep_size_of_children(context)).sum(); + capacity_bytes + children + } +} + +impl DeepSizeOf for BTreeMap { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // BTreeMap nodes have ~11 entries each. Rough estimate: per-entry overhead ~3 pointers. + let per_entry = size_of::() + size_of::() + 3 * size_of::(); + let overhead = self.len() * per_entry; + let children: usize = self + .iter() + .map(|(k, v)| k.deep_size_of_children(context) + v.deep_size_of_children(context)) + .sum(); + overhead + children + } +} + +impl DeepSizeOf for BTreeSet { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let per_entry = size_of::() + 3 * size_of::(); + let overhead = self.len() * per_entry; + let children: usize = self.iter().map(|k| k.deep_size_of_children(context)).sum(); + overhead + children + } +} + +// Arrow types + +fn record_array_data(context: &mut Context, data: &ArrayData) -> usize { + let mut total = 0; + for buffer in data.buffers() { + if context.mark_seen(buffer.as_ptr() as usize) { + total += buffer.capacity(); + } + } + if let Some(nulls) = data.nulls() { + let null_buf = nulls.inner().inner(); + if context.mark_seen(null_buf.as_ptr() as usize) { + total += null_buf.capacity(); + } + } + for child in data.child_data() { + total += record_array_data(context, child); + } + total +} + +impl DeepSizeOf for dyn Array { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // `to_data()` only clones Arc refs 
(no data copy) and allocates a small + // ArrayData metadata struct. This lets us walk buffer pointers for dedup. + // Cost is O(number_of_buffers), not O(data_size). + let data = self.to_data(); + record_array_data(context, &data) + } +} + +impl DeepSizeOf for RecordBatch { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.columns() + .iter() + .map(|col| col.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for arrow_buffer::ScalarBuffer +where + T: ArrowNativeType, +{ + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // Track the underlying buffer pointer to avoid double-counting shared allocations. + // Use capacity() rather than len() * size_of::() because sliced buffers retain + // their full original allocation. + let buf = self.inner(); + if context.mark_seen(buf.as_ptr() as usize) { + buf.capacity() + } else { + 0 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray, StructArray}; + use arrow_schema::{DataType, Field, Fields, Schema}; + + #[test] + fn test_basic_record_batch() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let size = batch.deep_size_of(); + // Should at least include the buffer for 3 i32s + assert!(size >= 3 * size_of::()); + } + + #[test] + fn test_same_batch_dedup() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + ) + .unwrap(); + + let mut ctx = Context::new(); + let size_a = batch.deep_size_of_children(&mut ctx); + let size_b = batch.deep_size_of_children(&mut ctx); + + // First measurement should report buffer sizes + assert!(size_a > 0); + // Second measurement of the same batch should add nothing (buffers already seen) + assert_eq!(size_b, 0); + } + + #[test] + fn 
test_arc_dedup() { + let batch = Arc::new( + RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(), + ); + let clone = Arc::clone(&batch); + + let mut ctx = Context::new(); + let size_a = batch.deep_size_of_children(&mut ctx); + let size_b = clone.deep_size_of_children(&mut ctx); + + assert!(size_a > 0); + assert_eq!(size_b, 0); + } + + #[test] + fn test_multi_column_shared_array() { + // Two columns pointing to the same Arc + let array: Arc = Arc::new(Int32Array::from(vec![10, 20, 30])); + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + + // Single-column batch for reference + let one_col = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![array.clone()], + ) + .unwrap(); + + // Two-column batch with the same Arc shared + let two_col = RecordBatch::try_new(schema, vec![array.clone(), array]).unwrap(); + + let mut ctx1 = Context::new(); + let size_one = one_col.deep_size_of_children(&mut ctx1); + + let mut ctx2 = Context::new(); + let size_two = two_col.deep_size_of_children(&mut ctx2); + + // Both should report the same size since the second column's Arc is + // already seen and contributes nothing + assert_eq!(size_one, size_two); + } + + #[test] + fn test_nested_struct_array() { + let int_array = Int32Array::from(vec![1, 2, 3]); + let str_array = StringArray::from(vec!["a", "b", "c"]); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("x", DataType::Int32, false)), + Arc::new(int_array) as Arc, + ), + ( + Arc::new(Field::new("y", DataType::Utf8, false)), + Arc::new(str_array) as Arc, + ), + ]); + + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "s", + DataType::Struct(Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + 
])), + false, + )])), + vec![Arc::new(struct_array)], + ) + .unwrap(); + + let size = batch.deep_size_of(); + // Should include buffers for both child arrays + assert!(size > 3 * size_of::()); + } + + #[test] + fn test_std_types() { + assert_eq!(42u32.deep_size_of(), size_of::()); + + let s = String::from("hello"); + assert!(s.deep_size_of() >= size_of::() + 5); + + let v = vec![1u32, 2, 3]; + assert!(v.deep_size_of() >= size_of::>() + 3 * size_of::()); + + let a = Arc::new(42u32); + let b = Arc::clone(&a); + let mut ctx = Context::new(); + let size_a = a.deep_size_of_children(&mut ctx); + let size_b = b.deep_size_of_children(&mut ctx); + assert_eq!(size_a, size_of::()); + assert_eq!(size_b, 0); + } + + #[test] + fn test_derive_macro() { + use lance_derive::DeepSizeOf; + + #[derive(DeepSizeOf)] + struct Outer { + count: u64, + label: String, + inner: Inner, + } + + #[derive(DeepSizeOf)] + struct Inner { + values: Vec, + } + + let val = Outer { + count: 7, + label: String::from("hello"), + inner: Inner { + values: vec![1, 2, 3], + }, + }; + + let size = val.deep_size_of(); + // Must be at least the stack size + heap allocations for label + values + assert!(size >= std::mem::size_of::() + 5 + 3 * std::mem::size_of::()); + } +} diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 173c7d0ceaa..8379fa74c4d 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -2,12 +2,16 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors #![cfg_attr(coverage, feature(coverage_attribute))] +// Allow the derive macro to reference `lance_core::deepsize` from within this crate. 
+extern crate self as lance_core; + use arrow_schema::{DataType, Field as ArrowField}; use std::sync::LazyLock; pub mod cache; pub mod container; pub mod datatypes; +pub mod deepsize; pub mod error; pub mod levenshtein; pub mod traits; diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs index a7ac74a5b27..c202329838c 100644 --- a/rust/lance-core/src/utils.rs +++ b/rust/lance-core/src/utils.rs @@ -7,10 +7,12 @@ pub mod assume; pub mod backoff; pub mod bit; pub mod blob; +pub mod bloomfilter; pub mod cpu; pub mod deletion; pub mod futures; pub mod hash; +pub mod io_stats; pub mod parse; pub mod path; pub mod tempfile; diff --git a/rust/lance-core/src/utils/bloomfilter.rs b/rust/lance-core/src/utils/bloomfilter.rs new file mode 100644 index 00000000000..46cc272a694 --- /dev/null +++ b/rust/lance-core/src/utils/bloomfilter.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Generic bloom filter primitives. +//! +//! These are storage-agnostic data structures with no Lance semantics, used by +//! higher-level crates (e.g. the bloom filter scalar index in `lance-index`). + +pub mod as_bytes; +pub mod sbbf; diff --git a/rust/lance-index/src/scalar/bloomfilter/as_bytes.rs b/rust/lance-core/src/utils/bloomfilter/as_bytes.rs similarity index 98% rename from rust/lance-index/src/scalar/bloomfilter/as_bytes.rs rename to rust/lance-core/src/utils/bloomfilter/as_bytes.rs index 22df8d6af7c..86b9632ce39 100644 --- a/rust/lance-index/src/scalar/bloomfilter/as_bytes.rs +++ b/rust/lance-core/src/utils/bloomfilter/as_bytes.rs @@ -7,7 +7,7 @@ //! similar to parquet::data_type::AsBytes but without the external dependency. 
/// Trait to convert primitive types to byte slices -/// Reference: https://arrow.apache.org/rust/src/parquet/data_type.rs.html +/// Reference: pub trait AsBytes { /// Convert the value to a byte slice fn as_bytes(&self) -> impl AsRef<[u8]>; diff --git a/rust/lance-index/src/scalar/bloomfilter/sbbf.rs b/rust/lance-core/src/utils/bloomfilter/sbbf.rs similarity index 99% rename from rust/lance-index/src/scalar/bloomfilter/sbbf.rs rename to rust/lance-core/src/utils/bloomfilter/sbbf.rs index cbb4eb76b12..06df2641008 100644 --- a/rust/lance-index/src/scalar/bloomfilter/sbbf.rs +++ b/rust/lance-core/src/utils/bloomfilter/sbbf.rs @@ -28,7 +28,7 @@ //! removed from Lance. //! -use crate::scalar::bloomfilter::as_bytes::AsBytes; +use super::as_bytes::AsBytes; use libm::lgamma; use std::error::Error; use std::fmt; diff --git a/rust/lance-core/src/utils/deletion.rs b/rust/lance-core/src/utils/deletion.rs index 5ddfc3348e5..c7f8b142464 100644 --- a/rust/lance-core/src/utils/deletion.rs +++ b/rust/lance-core/src/utils/deletion.rs @@ -3,8 +3,8 @@ use std::{collections::HashSet, ops::Range, sync::Arc}; +use crate::deepsize::{Context, DeepSizeOf}; use arrow_array::BooleanArray; -use deepsize::{Context, DeepSizeOf}; use roaring::RoaringBitmap; /// Threshold for when a DeletionVector::Set should be promoted to a DeletionVector::Bitmap. 
@@ -296,7 +296,7 @@ impl From for DeletionVector { #[cfg_attr(coverage, coverage(off))] mod test { use super::*; - use deepsize::DeepSizeOf; + use crate::deepsize::DeepSizeOf; use rstest::rstest; fn set_dv(vals: impl IntoIterator) -> DeletionVector { diff --git a/rust/lance-core/src/utils/io_stats.rs b/rust/lance-core/src/utils/io_stats.rs new file mode 100644 index 00000000000..e2169d71ae3 --- /dev/null +++ b/rust/lance-core/src/utils/io_stats.rs @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::ops::Range; + +/// A sink that records I/O requests as they are submitted to storage. +/// +/// This lives in `lance-core` so that the encoding layer (`lance-encoding`) and +/// the I/O layer (`lance-io`) can both refer to it without depending on one +/// another. It lets a caller attach a lightweight counter to a file reader and +/// measure the exact bytes/IOPS performed for a bounded scope (e.g. a single +/// query); see `lance_io::scheduler::IoStats` for the concrete implementation. +/// +/// # When to use this +/// +/// Lance also exposes two *process-wide, cumulative* I/O accounting facilities: +/// the global scheduler counters (`lance_io::scheduler::iops_counter` / +/// `bytes_read_counter`) and the object-store `IOTracker` wrapper used in tests. +/// Both aggregate every read in the process and cannot attribute I/O to a single +/// bounded scope. Prefer an `IoStatsRecorder` when you need the *exact* I/O of +/// one operation (e.g. a single query): attach it to a reader with +/// `with_io_stats`, then read the snapshot when the scope ends. It re-uses the +/// reader's cached metadata, so measuring costs no extra file opens and does not +/// disturb the global counters. +pub trait IoStatsRecorder: std::fmt::Debug + Send + Sync { + /// Record one completed request, given the byte ranges as actually + /// submitted to storage (i.e. 
after any coalescing/splitting), so the + /// counts reflect physical I/O. + fn record_request(&self, ranges: &[Range]); +} diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index 5d7c5465132..8f346f45612 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -28,6 +28,7 @@ use datafusion::{ physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, analyze::AnalyzeExec, + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, execution_plan::{Boundedness, CardinalityEffect, EmissionType}, metrics::MetricValue, @@ -606,9 +607,15 @@ pub fn execute_plan( let session_ctx = get_session_context(&options); - // NOTE: we are only executing the first partition here. Therefore, if - // the plan has more than one partition, we will be missing data. - assert_eq!(plan.properties().partitioning.partition_count(), 1); + // Coalesce to a single partition if the optimizer left more than one. + // EnforceDistribution may remove RepartitionExec(1) nodes when the parent + // declares UnspecifiedDistribution, leaving multi-partition plans here. + let plan: Arc = if plan.properties().partitioning.partition_count() == 1 { + plan + } else { + Arc::new(CoalescePartitionsExec::new(plan)) + }; + let stream = plan.execute(0, get_task_context(&session_ctx, &options))?; let schema = stream.schema(); diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs index 79650f6775e..a0da34ba2bb 100644 --- a/rust/lance-datafusion/src/expr.rs +++ b/rust/lance-datafusion/src/expr.rs @@ -17,6 +17,18 @@ const MS_PER_DAY: i64 = 86400000; // will always yield "x = 7_u64" regardless of the type of the column "x". As a result, we // need to do that literal coercion ourselves. 
pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option { + // A dictionary target coerces the value to the dictionary's value type and + // re-wraps it as a dictionary literal. Only an untyped `ScalarValue::Null` + // keeps its untyped form, matching the behavior for all other targets; a + // *typed* null (e.g. `Utf8(None)`) is coerced and wrapped like any other + // value so it produces a `Dictionary(..)` literal that matches the column. + if let DataType::Dictionary(key_type, value_type) = ty { + if matches!(value, ScalarValue::Null) { + return Some(value.clone()); + } + let inner = safe_coerce_scalar(value, value_type)?; + return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner))); + } match value { ScalarValue::Int8(val) => match ty { DataType::Int8 => Some(value.clone()), @@ -436,6 +448,9 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option Some(value.clone()), _ => None, }, + // A dictionary-encoded literal (e.g. produced by DataFusion's dictionary + // cast in the scalar-index path) coerces by unwrapping its underlying value. + ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty), _ => None, } } @@ -775,4 +790,97 @@ mod tests { Some(ScalarValue::BinaryView(Some(vec![1, 2, 3]))) ); } + + #[test] + fn test_dictionary_coerce() { + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + + // A string literal coerces to a dictionary target by wrapping the + // coerced value in a dictionary scalar. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // The inner value is coerced through to the dictionary value type, so a + // LargeUtf8 literal lands as a Utf8 value inside the dictionary. 
+ assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // A dictionary literal round-trips back to its value type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &DataType::Utf8, + ), + Some(ScalarValue::Utf8(Some("com".to_string()))) + ); + + // A dictionary literal coerces to a dictionary target, adopting the + // target's key type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &dict_ty, + ), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // An untyped null keeps its untyped form for a dictionary target, just + // like for every other target type. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Null, &dict_ty), + Some(ScalarValue::Null) + ); + + // A *typed* null (e.g. an API-built `Utf8(None)` literal, or an IN value + // already typed as Utf8) is still wrapped in the dictionary type so it + // matches the dictionary column. Returning a bare `Utf8(None)` here would + // leave `resolve_value` with a literal whose type does not line up with + // the column, breaking planning/evaluation the same way non-null strings + // used to break. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // The inner null is coerced through to the dictionary value type as well, + // so a LargeUtf8 typed null lands as a Utf8 null inside the dictionary. 
+ assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // A value that cannot be coerced to the dictionary value type fails. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Utf8(Some("com".to_string())), + &DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)), + ), + None + ); + } } diff --git a/rust/lance-datafusion/src/logical_expr.rs b/rust/lance-datafusion/src/logical_expr.rs index ab0936d31da..0eed438dae7 100644 --- a/rust/lance-datafusion/src/logical_expr.rs +++ b/rust/lance-datafusion/src/logical_expr.rs @@ -463,4 +463,58 @@ mod tests { _ => unreachable!("Expected BinaryExpr"), } } + + #[test] + fn test_resolve_typed_null_against_dictionary_column() { + // A dictionary-encoded string column, e.g. a categorical field. + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + let arrow_schema = ArrowSchema::new(vec![Field::new("etld", dict_ty, true)]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + // A typed null must be wrapped in the dictionary type, not left as a bare + // `Utf8(None)` literal sitting next to a `Dictionary(...)` column. + let expected_null = Expr::Literal( + ScalarValue::Dictionary(Box::new(DataType::Int16), Box::new(ScalarValue::Utf8(None))), + None, + ); + + // `etld = ` built directly via the API, as opposed to coming + // through SQL parsing. + let expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column("etld".to_string().into())), + op: Operator::Eq, + right: Box::new(Expr::Literal(ScalarValue::Utf8(None), None)), + }); + match resolve_expr(&expr, &schema).unwrap() { + Expr::BinaryExpr(be) => assert_eq!(be.right.as_ref(), &expected_null), + other => unreachable!("Expected BinaryExpr, got {other:?}"), + } + + // `etld IN ('a', )` — a typed value mixed with a typed null, + // both already typed as Utf8. 
Every list element is wrapped in the + // dictionary type. + let expr = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None), + Expr::Literal(ScalarValue::Utf8(None), None), + ], + false, + ); + let expected = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal( + ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("a".to_string()))), + ), + None, + ), + expected_null, + ], + false, + ); + assert_eq!(resolve_expr(&expr, &schema).unwrap(), expected); + } } diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml index eae1e3086b6..83b5aba3689 100644 --- a/rust/lance-datagen/Cargo.toml +++ b/rust/lance-datagen/Cargo.toml @@ -21,7 +21,6 @@ hex = "0.4.3" rand = { workspace = true } rand_distr = { workspace = true } rand_xoshiro = { workspace = true } -random_word = { version = "0.5", features = ["en"] } [dev-dependencies] criterion = { workspace = true } diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 3756e354bea..39da4734619 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -21,7 +21,6 @@ use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, Sc use futures::{StreamExt, stream::BoxStream}; use rand::{Rng, RngCore, SeedableRng, distr::Uniform}; use rand_distr::Zipf; -use random_word; use self::array::rand_with_distribution; @@ -1172,24 +1171,223 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator { } } -// Common English stop words placed at the front to be sampled more frequently +// Common English stop words placed at the front to be sampled more frequently. 
const STOP_WORDS: &[&str] = &[ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", ]; +const ENGLISH_WORDS: &[&str] = &[ + "ability", + "able", + "about", + "above", + "accept", + "access", + "account", + "across", + "action", + "active", + "activity", + "actual", + "address", + "adjust", + "admin", + "advance", + "agent", + "align", + "allow", + "amount", + "analysis", + "answer", + "application", + "archive", + "array", + "asset", + "async", + "attribute", + "available", + "balance", + "batch", + "binary", + "bitmap", + "block", + "branch", + "buffer", + "build", + "cache", + "capacity", + "catalog", + "change", + "chunk", + "client", + "cluster", + "column", + "commit", + "common", + "compare", + "compile", + "compute", + "condition", + "config", + "connect", + "content", + "context", + "control", + "convert", + "copy", + "core", + "count", + "create", + "current", + "cursor", + "data", + "dataset", + "decode", + "default", + "delete", + "delta", + "depend", + "derive", + "design", + "detail", + "detect", + "device", + "direct", + "display", + "document", + "domain", + "drive", + "dynamic", + "encode", + "engine", + "error", + "event", + "example", + "execute", + "expand", + "expect", + "export", + "extend", + "feature", + "field", + "filter", + "final", + "finish", + "format", + "fragment", + "future", + "generate", + "global", + "group", + "handle", + "header", + "index", + "input", + "insert", + "inspect", + "instance", + "integer", + "internal", + "item", + "join", + "kernel", + "large", + "layer", + "layout", + "length", + "level", + "limit", + "linear", + "local", + "logical", + "lookup", + "manage", + "manifest", + "memory", + "merge", + "metric", + "model", + "module", + "namespace", + "native", + "node", + "normal", + "number", + "object", + "offset", + "option", + "output", + 
"package", + "page", + "parallel", + "parse", + "partition", + "pattern", + "physical", + "plan", + "policy", + "prefix", + "prepare", + "primary", + "process", + "profile", + "project", + "property", + "query", + "range", + "reader", + "record", + "region", + "registry", + "request", + "resolve", + "resource", + "result", + "return", + "row", + "runtime", + "scalar", + "scan", + "schema", + "search", + "segment", + "select", + "session", + "setting", + "source", + "stable", + "stage", + "state", + "static", + "storage", + "stream", + "string", + "struct", + "table", + "target", + "task", + "thread", + "token", + "trace", + "transform", + "type", + "update", + "upload", + "value", + "vector", + "version", + "view", + "write", + "writer", +]; + /// Word list with stop words at the front for Zipf sampling, computed once. static SENTENCE_WORDS: LazyLock> = LazyLock::new(|| { - let all_words = random_word::all(random_word::Lang::En); - let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len()); + let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len()); words.extend(STOP_WORDS.iter().copied()); - words.extend( - all_words - .iter() - .filter(|w| !STOP_WORDS.contains(w)) - .copied(), - ); + words.extend(ENGLISH_WORDS.iter().copied()); words }); @@ -1279,7 +1477,7 @@ struct RandomWordGenerator { impl RandomWordGenerator { pub fn new(is_large: bool) -> Self { - let words = random_word::all(random_word::Lang::En); + let words = ENGLISH_WORDS; Self { words, is_large } } } @@ -3190,9 +3388,9 @@ mod tests { assert_eq!( *genn.generate(RowCount::from(3), &mut rng).unwrap(), arrow_array::BinaryArray::from_iter_values([ - vec![174, 178], - vec![64, 122, 207, 248], - vec![124, 3, 58] + vec![111, 9, 80], + vec![86, 118, 13, 209], + vec![68, 33, 202] ]) ); } diff --git a/rust/lance-derive/Cargo.toml b/rust/lance-derive/Cargo.toml new file mode 100644 index 00000000000..4bb99d3ac93 --- /dev/null +++ b/rust/lance-derive/Cargo.toml @@ -0,0 +1,22 @@ 
+[package] +name = "lance-derive" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true +readme.workspace = true +description = "Derive macros for Lance" +keywords.workspace = true +categories.workspace = true + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = "1.0.67" +quote = "1.0.33" +syn = { version = "2.0.37", features = ["full"] } + +[lints] +workspace = true diff --git a/rust/lance-derive/src/lib.rs b/rust/lance-derive/src/lib.rs new file mode 100644 index 00000000000..d0486133ddc --- /dev/null +++ b/rust/lance-derive/src/lib.rs @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use proc_macro::TokenStream; +use quote::quote; +use syn::{Data, DeriveInput, Fields, parse_macro_input}; + +/// Derive macro for the `DeepSizeOf` trait. +/// +/// Generates an implementation that sums the `deep_size_of_children` of all +/// fields (for structs) or the active variant's fields (for enums). +#[proc_macro_derive(DeepSizeOf)] +pub fn derive_deep_size_of(input: TokenStream) -> TokenStream { + let input = parse_macro_input!(input as DeriveInput); + let name = &input.ident; + let generics = &input.generics; + + // Add DeepSizeOf bounds to all type parameters + let mut bounded_generics = generics.clone(); + for param in &mut bounded_generics.params { + if let syn::GenericParam::Type(ref mut type_param) = *param { + type_param + .bounds + .push(syn::parse_quote!(lance_core::deepsize::DeepSizeOf)); + } + } + let (impl_generics, _, where_clause) = bounded_generics.split_for_impl(); + let (_, ty_generics, _) = generics.split_for_impl(); + + let body = match &input.data { + Data::Struct(data) => generate_struct_body(&data.fields), + Data::Enum(data) => { + let arms: Vec<_> = data + .variants + .iter() + .map(|variant| { + let variant_ident = &variant.ident; + match &variant.fields { + Fields::Unit => { + quote! 
{ Self::#variant_ident => 0 } + } + Fields::Unnamed(fields) => { + let bindings: Vec<_> = (0..fields.unnamed.len()) + .map(|i| { + syn::Ident::new( + &format!("__field_{}", i), + proc_macro2::Span::call_site(), + ) + }) + .collect(); + let sum = bindings.iter().map(|b| { + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(#b, __context) } + }); + quote! { + Self::#variant_ident(#(#bindings),*) => { + 0 #(+ #sum)* + } + } + } + Fields::Named(fields) => { + let field_names: Vec<_> = + fields.named.iter().map(|f| &f.ident).collect(); + let sum = field_names.iter().map(|f| { + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(#f, __context) } + }); + quote! { + Self::#variant_ident { #(#field_names),* } => { + 0 #(+ #sum)* + } + } + } + } + }) + .collect(); + quote! { + match self { + #(#arms),* + } + } + } + Data::Union(_) => { + return syn::Error::new_spanned(&input, "DeepSizeOf cannot be derived for unions") + .to_compile_error() + .into(); + } + }; + + let expanded = quote! { + impl #impl_generics lance_core::deepsize::DeepSizeOf for #name #ty_generics #where_clause { + fn deep_size_of_children(&self, __context: &mut lance_core::deepsize::Context) -> usize { + #body + } + } + }; + + TokenStream::from(expanded) +} + +fn generate_struct_body(fields: &Fields) -> proc_macro2::TokenStream { + match fields { + Fields::Named(fields) => { + let field_sizes = fields.named.iter().map(|f| { + let name = &f.ident; + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(&self.#name, __context) } + }); + quote! { 0 #(+ #field_sizes)* } + } + Fields::Unnamed(fields) => { + let field_sizes = (0..fields.unnamed.len()).map(|i| { + let index = syn::Index::from(i); + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(&self.#index, __context) } + }); + quote! { 0 #(+ #field_sizes)* } + } + Fields::Unit => { + quote! 
{ 0 } + } + } +} diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index 59886d337d1..a30d5ed93a9 100644 --- a/rust/lance-encoding/src/decoder.rs +++ b/rust/lance-encoding/src/decoder.rs @@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream}; use futures::{FutureExt, StreamExt}; use lance_arrow::DataTypeExt; use lance_core::cache::LanceCache; -use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema}; +use lance_core::datatypes::{ + BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions, +}; use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt}; use lance_core::utils::parse::parse_env_as_bool; use log::{debug, trace, warn}; @@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy { column_infos: &mut ColumnInfoIter, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_structural_primitive(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new( @@ -832,6 +835,7 @@ impl CoreFieldDecoderStrategy { buffers: FileBuffers, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_primitive_legacy(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?; @@ -2887,6 +2891,52 @@ pub async fn decode_batch( mod tests { use super::*; + #[test] + fn test_read_zero_dimension_fsl_errors_instead_of_panicking() { + // Simulates reading a column whose stored schema declares a + // zero-dimension FixedSizeList, as old writers (before #5102) could + // persist. The read plan is built by the field-scheduler factories, + // which run the dimension guard before touching any column data, so + // an empty column iterator is sufficient to reach the guard. 
The read + // must surface a clean error rather than a divide-by-zero panic. + use arrow_schema::Field as ArrowField; + + let zero_dim = DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 0, + ); + let field = Field::try_from(&ArrowField::new("vec", zero_dim, true)).unwrap(); + let strategy = CoreFieldDecoderStrategy::default(); + + let mut structural_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_structural_field_scheduler(&field, &mut structural_columns) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + let mut legacy_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_legacy_field_scheduler( + &field, + &mut legacy_columns, + FileBuffers { + positions_and_sizes: &[], + }, + ) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + } + #[test] fn test_coalesce_indices_to_ranges_with_single_index() { let indices = vec![1]; diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 78bc45d4d9a..9b506359e55 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -3701,12 +3701,7 @@ struct SerializedFullZip { // // If we directly record the size in bytes with 12 bits we would be limited to // 4KiB which is too small. Since we know each mini-block consists of 8 byte -// words we can store the # of words instead which gives us 32KiB. We want -// at least 24KiB so we can handle even the worst case of -// - 4Ki values compressed into an 8186 byte buffer -// - 4 bytes to describe rep & def lengths -// - 16KiB of rep & def buffer (this will almost never happen but life is easier if we -// plan for it) +// words we can store the # of words instead which gives us 32KiB. 
// // Second, each chunk in a mini-block is aligned to 8 bytes. This allows multi-byte // values like offsets to be stored in a mini-block and safely read back out. It also @@ -3906,9 +3901,9 @@ impl PrimitiveStructuralEncoder { // 0xA) All blocks except the last must have power-of-two number of values. // This not only makes metadata smaller but it makes decoding easier since // batch sizes are typically a power of 2. 4 bits would allow us to express - // up to 16Ki values but we restrict this further to 4Ki values. + // up to 32Ki values. // - // This means blocks can have 1 to 4Ki values and 8 - 32Ki bytes. + // This means blocks can have 1 to 32Ki values and 8 - 32Ki bytes. // // All metadata words are serialized (as little endian) into a single buffer // of metadata values. @@ -4007,7 +4002,13 @@ impl PrimitiveStructuralEncoder { } } else { for &buffer_size in &chunk.buffer_sizes { - data_buffer.extend_from_slice(&(buffer_size as u16).to_le_bytes()); + let buffer_size = u16::try_from(buffer_size).map_err(|_| { + Error::internal(format!( + "Mini-block buffer size ({} bytes) too large for 16-bit metadata", + buffer_size + )) + })?; + data_buffer.extend_from_slice(&buffer_size.to_le_bytes()); } } @@ -4041,15 +4042,28 @@ impl PrimitiveStructuralEncoder { let chunk_bytes = data_buffer.len() - start_pos; let max_chunk_size = if support_large_chunk { - 4 * 1024 * 1024 * 1024 // 4GB limit with u32 metadata + 1_u64 << 31 // 28 bits of 8-byte words in u32 metadata } else { 32 * 1024 // 32KiB limit with u16 metadata }; - assert!(chunk_bytes <= max_chunk_size); - assert!(chunk_bytes > 0); - assert_eq!(chunk_bytes % 8, 0); - // 4Ki values max - assert!(chunk.log_num_values <= 12); + if chunk_bytes == 0 || chunk_bytes as u64 > max_chunk_size { + return Err(Error::internal(format!( + "Mini-block chunk size {} bytes exceeds the {} byte metadata limit", + chunk_bytes, max_chunk_size + ))); + } + if chunk_bytes % MINIBLOCK_ALIGNMENT != 0 { + return Err(Error::internal(format!( 
+ "Mini-block chunk size {} bytes is not aligned to {} bytes", + chunk_bytes, MINIBLOCK_ALIGNMENT + ))); + } + if chunk.log_num_values > 15 { + return Err(Error::internal(format!( + "Mini-block log_num_values {} exceeds the 4-bit metadata limit", + chunk.log_num_values + ))); + } // We subtract 1 here from chunk_bytes because we want to be able to express // a size of 32KiB and not (32Ki - 8)B which is what we'd get otherwise with // 0xFFF @@ -5081,13 +5095,19 @@ impl PrimitiveStructuralEncoder { let max_encoded_size = (data_size as f64 * threshold_ratio) as u64; let max_encoded_size = usize::try_from(max_encoded_size).ok()?; - // Avoid probing dictionary encoding on data that appears to be near-unique. - if Self::sample_is_near_unique( - data_block, - DEFAULT_SAMPLE_SIZE, - DEFAULT_SAMPLE_UNIQUE_RATIO, - )? { - return None; + // Avoid probing dictionary encoding on data that appears to be near-unique + // or likely to exceed the dictionary budget. + if let Some(sample_unique_ratio) = + Self::sample_unique_ratio(data_block, DEFAULT_SAMPLE_SIZE)? + { + if sample_unique_ratio >= DEFAULT_SAMPLE_UNIQUE_RATIO { + return None; + } + + let projected_cardinality = (sample_unique_ratio * num_values as f64).ceil() as u64; + if projected_cardinality > threshold_cardinality { + return None; + } } let max_dict_entries = u32::try_from(threshold_cardinality.min(i32::MAX as u64)).ok()?; @@ -5097,66 +5117,79 @@ impl PrimitiveStructuralEncoder { }) } - /// Probe whether a page looks near-unique before attempting dictionary encoding. + /// Samples whether a page looks near-unique before attempting dictionary encoding. /// - /// The probe uses deterministic stride sampling (not RNG sampling), which keeps + /// The probe uses deterministic block sampling (not RNG sampling), which keeps /// the check cheap and reproducible across runs. The result is only a gate for /// whether we try dictionary encoding, not a cardinality statistic. 
- fn sample_is_near_unique( - data_block: &DataBlock, - max_samples: usize, - unique_ratio_threshold: f64, - ) -> Option { + /// Returns `Some(None)` when there are too few reliable samples or the block type does not + /// support dictionary encoding. Returns `None` for malformed data. + fn sample_unique_ratio(data_block: &DataBlock, max_samples: usize) -> Option> { use std::collections::HashSet; - if unique_ratio_threshold <= 0.0 || unique_ratio_threshold > 1.0 { - return None; - } + const NUM_SAMPLE_BLOCKS: usize = 32; + const MIN_RELIABLE_SAMPLES: usize = 1024; let num_values = usize::try_from(data_block.num_values()).ok()?; if num_values == 0 { - return Some(false); + return Some(None); } let sample_count = num_values.min(max_samples).max(1); - // Uniform stride sampling across the page. - let step = (num_values / sample_count).max(1); + if sample_count < MIN_RELIABLE_SAMPLES { + return Some(None); + } - match data_block { + let block_count = NUM_SAMPLE_BLOCKS.min(sample_count).min(num_values).max(1); + let samples_per_block = (sample_count / block_count).max(1); + let mut indices = Vec::with_capacity(sample_count); + for block_idx in 0..block_count { + let block_start = block_idx * num_values / block_count; + let next_block_start = ((block_idx + 1) * num_values / block_count).min(num_values); + let block_len = next_block_start.saturating_sub(block_start); + let samples_in_block = samples_per_block.min(block_len); + indices.extend((0..samples_in_block).map(|offset| block_start + offset)); + } + + if indices.len() < MIN_RELIABLE_SAMPLES { + return Some(None); + } + + let ratio = match data_block { DataBlock::FixedWidth(fixed) => match fixed.bits_per_value { 64 => { let values = fixed.data.borrow_to_typed_slice::(); let values = values.as_ref(); - let mut unique: HashSet = HashSet::with_capacity(sample_count.min(1024)); - for idx in (0..num_values).step_by(step).take(sample_count) { + let mut unique: HashSet = + 
HashSet::with_capacity(indices.len().min(MIN_RELIABLE_SAMPLES)); + for idx in indices.iter().copied() { unique.insert(values.get(idx).copied()?); } - let ratio = unique.len() as f64 / sample_count as f64; - // Avoid overreacting to tiny pages with too few samples. - Some(sample_count >= 1024 && ratio >= unique_ratio_threshold) + unique.len() as f64 / indices.len() as f64 } 128 => { let values = fixed.data.borrow_to_typed_slice::(); let values = values.as_ref(); - let mut unique: HashSet = HashSet::with_capacity(sample_count.min(1024)); - for idx in (0..num_values).step_by(step).take(sample_count) { + let mut unique: HashSet = + HashSet::with_capacity(indices.len().min(MIN_RELIABLE_SAMPLES)); + for idx in indices.iter().copied() { unique.insert(values.get(idx).copied()?); } - let ratio = unique.len() as f64 / sample_count as f64; - Some(sample_count >= 1024 && ratio >= unique_ratio_threshold) + unique.len() as f64 / indices.len() as f64 } - _ => Some(false), + _ => return Some(None), }, DataBlock::VariableWidth(var) => { use xxhash_rust::xxh3::xxh3_64; // Hash variable-width slices instead of storing borrowed slice keys. 
- let mut unique: HashSet = HashSet::with_capacity(sample_count.min(1024)); + let mut unique: HashSet = + HashSet::with_capacity(indices.len().min(MIN_RELIABLE_SAMPLES)); match var.bits_per_offset { 32 => { let offsets_ref = var.offsets.borrow_to_typed_slice::(); let offsets: &[u32] = offsets_ref.as_ref(); - for i in (0..num_values).step_by(step).take(sample_count) { + for i in indices.iter().copied() { let start = usize::try_from(*offsets.get(i)?).ok()?; let end = usize::try_from(*offsets.get(i + 1)?).ok()?; if start > end || end > var.data.len() { @@ -5168,7 +5201,7 @@ impl PrimitiveStructuralEncoder { 64 => { let offsets_ref = var.offsets.borrow_to_typed_slice::(); let offsets: &[u64] = offsets_ref.as_ref(); - for i in (0..num_values).step_by(step).take(sample_count) { + for i in indices.iter().copied() { let start = usize::try_from(*offsets.get(i)?).ok()?; let end = usize::try_from(*offsets.get(i + 1)?).ok()?; if start > end || end > var.data.len() { @@ -5177,13 +5210,14 @@ impl PrimitiveStructuralEncoder { unique.insert(xxh3_64(&var.data[start..end])); } } - _ => return Some(false), + _ => return Some(None), } - let ratio = unique.len() as f64 / sample_count as f64; - Some(sample_count >= 1024 && ratio >= unique_ratio_threshold) + unique.len() as f64 / indices.len() as f64 } - _ => Some(false), - } + _ => return Some(None), + }; + + Some(Some(ratio)) } fn slice_repdef(repdef: &SerializedRepDefs, range: Range) -> SerializedRepDefs { @@ -5748,8 +5782,9 @@ mod tests { use super::{ ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor, FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipReadSource, - FullZipRepIndexDetails, FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor, - PreambleAction, StructuralPageScheduler, VariableFullZipDecoder, + FullZipRepIndexDetails, FullZipScheduler, MiniBlockChunk, MiniBlockCompressed, + MiniBlockRepIndex, PerValueDecompressor, PreambleAction, StructuralPageScheduler, + 
VariableFullZipDecoder, }; use crate::buffer::LanceBuffer; use crate::compression::DefaultDecompressionStrategy; @@ -6947,7 +6982,7 @@ mod tests { #[tokio::test] async fn test_binary_large_minichunk_size_over_max_miniblock_values() { let mut string_data = Vec::new(); - // 128kb/chunk / 6 bytes (t_9999) = 21845 > max 4096 items per chunk + // 128kb/chunk / 6 bytes (t_9999) = 21845 items per chunk for i in 0..10000 { string_data.push(Some(format!("t_{}", i))); } @@ -7364,6 +7399,24 @@ mod tests { DataBlock::from_array(Arc::new(array) as ArrayRef) } + fn create_sorted_string_array(num_values: u64, cardinality: u64) -> ArrayRef { + use arrow_array::StringArray; + + assert!(cardinality <= num_values && cardinality > 0); + + let mut values = Vec::with_capacity(num_values as usize); + for i in 0..num_values { + let value_idx = i * cardinality / num_values; + values.push(format!("value_{:016}", value_idx)); + } + + Arc::new(StringArray::from(values)) as ArrayRef + } + + fn create_sorted_variable_width_block(num_values: u64, cardinality: u64) -> DataBlock { + DataBlock::from_array(create_sorted_string_array(num_values, cardinality)) + } + #[test] fn test_should_dictionary_encode() { use crate::constants::DICT_SIZE_RATIO_META_KEY; @@ -7390,6 +7443,93 @@ mod tests { ); } + #[test] + fn test_block_sampling_detects_low_cardinality_in_short_sorted_runs() { + let sample_count: usize = 4096; + let num_values: u64 = 200_000; + let cardinality: u64 = 8_000; + let run_length = num_values / cardinality; + let stride = num_values as usize / sample_count; + assert!( + stride > run_length as usize, + "test must construct the stride > run_length case" + ); + + let block = create_sorted_variable_width_block(num_values, cardinality); + let sample_unique_ratio = + PrimitiveStructuralEncoder::sample_unique_ratio(&block, sample_count).unwrap(); + + assert!( + sample_unique_ratio.is_some_and(|ratio| ratio < 0.98), + "sorted low-cardinality data must not be classified as near-unique" + ); + } + 
+ #[test] + fn test_should_dictionary_encode_sorted_low_cardinality() { + use crate::constants::DICT_SIZE_RATIO_META_KEY; + use lance_core::datatypes::Field as LanceField; + + let block = create_sorted_variable_width_block(200_000, 8_000); + + let mut metadata = HashMap::new(); + metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string()); + let arrow_field = + arrow_schema::Field::new("test", DataType::Utf8, false).with_metadata(metadata); + let field = LanceField::try_from(&arrow_field).unwrap(); + + let result = PrimitiveStructuralEncoder::should_dictionary_encode( + &block, + &field, + LanceFileVersion::V2_2, + ); + + assert!( + result.is_some(), + "sorted low-cardinality data should reach dictionary encoding" + ); + } + + #[test] + fn test_should_not_dictionary_encode_sorted_high_cardinality_short_runs() { + use crate::constants::DICT_SIZE_RATIO_META_KEY; + use lance_core::datatypes::Field as LanceField; + + let num_values = 200_002; + let cardinality = 100_001; + let block = create_sorted_variable_width_block(num_values, cardinality); + + let mut metadata = HashMap::new(); + metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string()); + let arrow_field = + arrow_schema::Field::new("test", DataType::Utf8, false).with_metadata(metadata); + let field = LanceField::try_from(&arrow_field).unwrap(); + + let result = PrimitiveStructuralEncoder::should_dictionary_encode( + &block, + &field, + LanceFileVersion::V2_2, + ); + + assert!( + result.is_none(), + "sorted high-cardinality short runs should not trigger a full dictionary probe" + ); + } + + #[tokio::test] + async fn test_encode_sorted_low_cardinality_uses_dictionary_layout() { + use crate::constants::DICT_SIZE_RATIO_META_KEY; + + let mut metadata = HashMap::new(); + metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string()); + let field = arrow_schema::Field::new("test", DataType::Utf8, false).with_metadata(metadata); + let array = create_sorted_string_array(200_000, 
8_000); + + let page = encode_first_page(field, array, LanceFileVersion::V2_2).await; + let _ = dictionary_encoding_from_page(&page); + } + #[test] fn test_should_not_dictionary_encode_unsupported_bits() { use crate::constants::DICT_SIZE_RATIO_META_KEY; @@ -7441,6 +7581,36 @@ mod tests { ); } + #[test] + fn test_v2_1_miniblock_serializes_log_num_values_15() { + let miniblocks = MiniBlockCompressed { + data: vec![LanceBuffer::from(vec![1_u8; 16])], + chunks: vec![ + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 15, + }, + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 0, + }, + ], + num_values: 32_769, + }; + + let serialized = + PrimitiveStructuralEncoder::serialize_miniblocks(miniblocks, None, None, false) + .unwrap(); + + let chunk_metadata = serialized.metadata.borrow_to_typed_slice::(); + assert_eq!(chunk_metadata.len(), 2); + assert_eq!( + chunk_metadata[0] & 0x0F, + 15, + "V2.1 metadata should use all 4 bits for log_num_values" + ); + } + async fn encode_first_page( field: arrow_schema::Field, array: ArrayRef, diff --git a/rust/lance-encoding/src/encodings/logical/primitive/blob.rs b/rust/lance-encoding/src/encodings/logical/primitive/blob.rs index 614dcb81ac2..eed3e584b7e 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/blob.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/blob.rs @@ -205,9 +205,9 @@ struct BlobCacheableState { } impl DeepSizeOf for BlobCacheableState { - fn deep_size_of_children(&self, context: &mut lance_core::cache::Context) -> usize { - self.positions.get_array_memory_size() - + self.sizes.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + (self.positions.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + + (self.sizes.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + self.inner_state.deep_size_of_children(context) } } diff --git 
a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs index de3227b2a39..1cf3b9bf581 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs @@ -19,13 +19,14 @@ use lance_core::Result; pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6; const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096; +const MAX_CONFIGURABLE_MINIBLOCK_VALUES: u64 = 32768; fn parse_max_miniblock_values() -> u64 { let val = std::env::var("LANCE_MINIBLOCK_MAX_VALUES") .ok() .and_then(|v| v.parse().ok()) .unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES); - val.clamp(1, DEFAULT_MAX_MINIBLOCK_VALUES) + val.clamp(1, MAX_CONFIGURABLE_MINIBLOCK_VALUES) } pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock = @@ -58,9 +59,9 @@ pub struct MiniBlockCompressed { /// and contain a power-of-two number of values (except for the last chunk) /// /// By default we limit a chunk to 4Ki values and slightly less than -/// 8KiB of compressed data. This means that even in the extreme case -/// where we have 4 bytes of rep/def then we will have at most 24KiB of -/// data (values, repetition, and definition) per mini-block. +/// 8KiB of compressed value data. The byte budget remains the primary +/// constraint, so only encodings that compress many values into that +/// budget can use larger value counts when explicitly configured. /// /// The maximum number of values per chunk can be configured via the /// `LANCE_MINIBLOCK_MAX_VALUES` environment variable. This is only @@ -77,8 +78,8 @@ pub struct MiniBlockChunk { // then this should be 0 (the number of values will be calculated by subtracting the // size of all other chunks from the total size of the page) // - // For example, 1 would mean there are 2 values in the chunk and 12 would mean there - // are 4Ki values in the chunk. 
+ // For example, 1 would mean there are 2 values in the chunk and 15 would mean there + // are 32Ki values in the chunk. // // This must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096) pub log_num_values: u8, @@ -135,6 +136,14 @@ mod tests { unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } + #[test] + #[serial] + fn test_parse_can_raise_to_32k() { + unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "32768") }; + assert_eq!(parse_max_miniblock_values(), 32768); + unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; + } + #[test] #[serial] fn test_parse_clamps_zero_to_one() { @@ -147,7 +156,10 @@ mod tests { #[serial] fn test_parse_clamps_above_max() { unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "99999") }; - assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES); + assert_eq!( + parse_max_miniblock_values(), + MAX_CONFIGURABLE_MINIBLOCK_VALUES + ); unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } diff --git a/rust/lance-encoding/src/lib.rs b/rust/lance-encoding/src/lib.rs index cb4062d3220..a58e0a14c59 100644 --- a/rust/lance-encoding/src/lib.rs +++ b/rust/lance-encoding/src/lib.rs @@ -86,6 +86,22 @@ pub trait EncodingsIo: std::fmt::Debug + Send + Sync { fn with_bypass_backpressure(&self) -> Option> { None } + + /// Returns a version of this I/O service that additionally records the I/O it + /// performs into `stats`, on top of any global accounting. This is the seam + /// used to measure exact per-scope (e.g. per-query) I/O without re-opening + /// files: wrap a reader's I/O service, perform the reads, then inspect the + /// recorder. + /// + /// Returns `None` if this implementation does not support per-scope I/O + /// statistics (e.g. in-memory or test schedulers), in which case the caller + /// should fall back to using self (and no statistics are recorded). 
+ fn with_io_stats( + &self, + _stats: Arc, + ) -> Option> { + None + } } /// An implementation of EncodingsIo that serves data from an in-memory buffer diff --git a/rust/lance-file/Cargo.toml b/rust/lance-file/Cargo.toml index c79ffcdb57c..f08cd3457aa 100644 --- a/rust/lance-file/Cargo.toml +++ b/rust/lance-file/Cargo.toml @@ -27,7 +27,6 @@ async-trait.workspace = true byteorder.workspace = true bytes.workspace = true datafusion-common.workspace = true -deepsize.workspace = true futures.workspace = true log.workspace = true num-traits.workspace = true diff --git a/rust/lance-file/src/io.rs b/rust/lance-file/src/io.rs index c09e9d8d372..1a8edf92b08 100644 --- a/rust/lance-file/src/io.rs +++ b/rust/lance-file/src/io.rs @@ -38,6 +38,16 @@ impl EncodingsIo for LanceEncodingsIo { })) } + fn with_io_stats( + &self, + stats: Arc, + ) -> Option> { + Some(Arc::new(Self { + scheduler: self.scheduler.with_io_stats(stats), + read_chunk_size: self.read_chunk_size, + })) + } + fn submit_request( &self, ranges: Vec>, diff --git a/rust/lance-file/src/previous/format/metadata.rs b/rust/lance-file/src/previous/format/metadata.rs index 7e4046be893..11ba00c3243 100644 --- a/rust/lance-file/src/previous/format/metadata.rs +++ b/rust/lance-file/src/previous/format/metadata.rs @@ -6,8 +6,8 @@ use std::ops::Range; use crate::datatypes::{Fields, FieldsWithMeta}; use crate::format::pb; -use deepsize::DeepSizeOf; use lance_core::datatypes::Schema; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::traits::ProtoStruct; diff --git a/rust/lance-file/src/previous/page_table.rs b/rust/lance-file/src/previous/page_table.rs index 9a3c0d71437..cc246caa585 100644 --- a/rust/lance-file/src/previous/page_table.rs +++ b/rust/lance-file/src/previous/page_table.rs @@ -4,7 +4,7 @@ use arrow_array::builder::Int64Builder; use arrow_array::{Array, Int64Array}; use arrow_schema::DataType; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use 
lance_io::encodings::Decoder; use lance_io::encodings::plain::PlainDecoder; use std::collections::BTreeMap; diff --git a/rust/lance-file/src/previous/reader.rs b/rust/lance-file/src/previous/reader.rs index cf30d30a547..1ab861985e1 100644 --- a/rust/lance-file/src/previous/reader.rs +++ b/rust/lance-file/src/previous/reader.rs @@ -19,11 +19,11 @@ use arrow_buffer::ArrowNativeType; use arrow_schema::{DataType, FieldRef, Schema as ArrowSchema}; use arrow_select::concat::{self, concat_batches}; use async_recursion::async_recursion; -use deepsize::DeepSizeOf; use futures::{Future, FutureExt, StreamExt, TryStreamExt, stream}; use lance_arrow::*; use lance_core::cache::{CacheKey, LanceCache}; use lance_core::datatypes::{Field, Schema}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::encodings::AsyncIndex; use lance_io::encodings::dictionary::DictionaryDecoder; diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs index 3d1d8e7c361..c454f73819e 100644 --- a/rust/lance-file/src/reader.rs +++ b/rust/lance-file/src/reader.rs @@ -13,8 +13,8 @@ use arrow_array::RecordBatchReader; use arrow_schema::Schema as ArrowSchema; use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; use bytes::{Bytes, BytesMut}; -use deepsize::{Context, DeepSizeOf}; use futures::{Stream, StreamExt, stream::BoxStream}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_encoding::{ EncodingsIo, decoder::{ @@ -470,6 +470,23 @@ impl FileReader { } } + /// Returns a clone of this reader whose I/O is additionally recorded into + /// `stats`, on top of the scheduler's global accounting. + /// + /// All cached metadata is shared with `self`, so no file is re-opened and + /// only a few `Arc` clones are performed. If the underlying I/O service + /// does not support per-scope statistics (e.g. an in-memory scheduler), the + /// returned reader is an ordinary, uninstrumented clone. 
+ pub fn with_io_stats( + &self, + stats: Arc, + ) -> Self { + match self.scheduler.with_io_stats(stats) { + Some(scheduler) => self.with_scheduler(scheduler), + None => self.clone(), + } + } + pub fn num_rows(&self) -> u64 { self.num_rows } @@ -2511,7 +2528,7 @@ mod tests { // column_metadatas and column_infos, otherwise the moka cache weigher // dramatically underestimates entry sizes and never evicts, causing // unbounded memory growth on random-access workloads. - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; let fs = FsFixture::default(); let _written = create_some_file(&fs, version).await; diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 902c462c820..12bd50df6fe 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -49,6 +49,15 @@ const PAD_BUFFER: [u8; PAGE_BUFFER_ALIGNMENT] = [72; PAGE_BUFFER_ALIGNMENT]; const MAX_PAGE_BYTES: usize = 32 * 1024 * 1024; const ENV_LANCE_FILE_WRITER_MAX_PAGE_BYTES: &str = "LANCE_FILE_WRITER_MAX_PAGE_BYTES"; +/// Summary of a completed Lance file write. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FileWriteSummary { + /// The number of rows written to the file. + pub num_rows: u64, + /// The final size of the file in bytes. + pub size_bytes: u64, +} + #[derive(Debug, Clone, Default)] pub struct FileWriterOptions { /// How many bytes to use for buffering column data @@ -303,7 +312,7 @@ impl FileWriter { for batch in batches { writer.write_batch(&batch).await?; } - Ok(writer.finish().await? as usize) + Ok(writer.finish().await?.num_rows as usize) } async fn do_write_buffer(writer: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> Result<()> { @@ -624,14 +633,11 @@ impl FileWriter { async fn write_global_buffers(&mut self) -> Result> { let schema = self.schema.as_mut().ok_or(Error::invalid_input("No schema provided on writer open and no data provided. 
Schema is unknown and file cannot be created"))?; schema.metadata = std::mem::take(&mut self.schema_metadata); - // Use descriptor layout for blob v2 in the footer to avoid exposing logical child fields. - // - // TODO(xuanwo): this doesn't work on nested struct, need better solution like fields_per_order_mut? - schema.fields.iter_mut().for_each(|f| { - if f.is_blob_v2() { - f.unloaded_mut(); - } - }); + // Use descriptor layout for blob v2 fields in the footer to avoid exposing logical child fields. + schema + .fields + .iter_mut() + .for_each(|f| f.unload_blobs_recursive()); let file_descriptor = Self::make_file_descriptor(schema, self.rows_written)?; let file_descriptor_bytes = file_descriptor.encode_to_vec(); @@ -755,8 +761,8 @@ impl FileWriter { /// will write the file metadata and the footer. It will not return until all /// data has been flushed and the file has been closed. /// - /// Returns the total number of rows written - pub async fn finish(&mut self) -> Result { + /// Returns a summary of the completed file write. + pub async fn finish(&mut self) -> Result { // 1. flush any remaining data and write out those pages let mut external_buffers = OutOfLineBuffers::new(self.tell().await?, PAGE_BUFFER_ALIGNMENT as u64); @@ -812,9 +818,12 @@ impl FileWriter { self.writer.write_all(MAGIC).await?; // 7. 
close the writer - Writer::shutdown(self.writer.as_mut()).await?; + let write_result = Writer::shutdown(self.writer.as_mut()).await?; - Ok(self.rows_written) + Ok(FileWriteSummary { + num_rows: self.rows_written, + size_bytes: write_result.size as u64, + }) } pub async fn abort(&mut self) { @@ -1581,8 +1590,12 @@ mod tests { .unwrap(); writer.write_batch(&batch).await.unwrap(); - let num_rows = writer.finish().await.unwrap(); - assert_eq!(num_rows, 2); + let write_summary = writer.finish().await.unwrap(); + assert_eq!(write_summary.num_rows, 2); + assert_eq!( + write_summary.size_bytes, + fs.object_store.size(&path).await.unwrap() + ); // Read back with split configuration let file_scheduler = fs diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 5ff94574a70..85de43c0f9b 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -26,7 +26,6 @@ datafusion-common.workspace = true datafusion-expr.workspace = true datafusion-physical-expr.workspace = true datafusion.workspace = true -deepsize.workspace = true dirs.workspace = true fst.workspace = true futures.workspace = true @@ -45,11 +44,11 @@ lance-encoding.workspace = true lance-file.workspace = true lance-geo = { workspace = true, optional = true } lance-io.workspace = true +libsais-rs = "0.2" lance-linalg.workspace = true lance-select.workspace = true lance-tokenizer.workspace = true lance-table.workspace = true -libm.workspace = true log.workspace = true ndarray.workspace = true num-traits.workspace = true @@ -57,6 +56,7 @@ object_store.workspace = true prost.workspace = true prost-types.workspace = true rand.workspace = true +regex-syntax.workspace = true roaring.workspace = true rayon.workspace = true serde_json.workspace = true @@ -69,7 +69,6 @@ crossbeam-queue.workspace = true bytes.workspace = true chrono.workspace = true uuid.workspace = true -twox-hash = "2.0" async-channel = "2.3.1" bitpacking = { version = "0.9.2", features = ["bitpacker4x"] } 
rand_distr.workspace = true diff --git a/rust/lance-index/benches/hnsw.rs b/rust/lance-index/benches/hnsw.rs index 1aafd30188c..0a9b10bf42c 100644 --- a/rust/lance-index/benches/hnsw.rs +++ b/rust/lance-index/benches/hnsw.rs @@ -95,6 +95,67 @@ fn bench_hnsw(c: &mut Criterion) { }); } +fn bench_hnsw_load(c: &mut Criterion) { + const DIMENSION: usize = 128; + const TOTAL: usize = 100_000; + const SEED: [u8; 32] = [42; 32]; + const K: usize = 100; + + let rt = tokio::runtime::Runtime::new().unwrap(); + + let data = generate_random_array_with_seed::(TOTAL * DIMENSION, SEED); + let fsl = FixedSizeListArray::try_new_from_values(data, DIMENSION as i32).unwrap(); + let vectors = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); + + let search_build_pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap(); + let hnsw = search_build_pool + .install(|| HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default())) + .unwrap(); + let batch = hnsw.to_batch().unwrap(); + + // Load cost -- the path #6746 targets. `RecordBatch::clone` is an Arrow + // refcount bump (what production does anyway: each partition-cache IPC + // read yields a fresh batch), so it does not mask the load work measured. + c.bench_function(format!("load_hnsw({TOTAL}x{DIMENSION})").as_str(), |b| { + b.iter(|| { + let loaded = HNSW::load(batch.clone()).unwrap(); + assert_eq!(loaded.len(), TOTAL); + }) + }); + + // Search on the Arrow-backed loaded graph -- same TOTAL/DIMENSION/K/ef as + // the `search_hnsw` bench, so the two are directly comparable and confirm + // the new backend keeps search latency unchanged (issue #6746). 
+ let loaded = HNSW::load(batch).unwrap(); + let query = fsl.value(0); + c.bench_function( + format!("search_hnsw_loaded{TOTAL}x{DIMENSION}").as_str(), + |b| { + b.to_async(&rt).iter(|| async { + let uids: HashSet = loaded + .search_basic( + query.clone(), + K, + &HnswQueryParams { + ef: 300, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }, + None, + vectors.as_ref(), + ) + .unwrap() + .iter() + .map(|node| node.id) + .collect(); + + assert_eq!(uids.len(), K); + }) + }, + ); +} + fn bench_hnsw_sq(c: &mut Criterion) { const DIMENSION: usize = 128; const TOTAL: usize = 100_000; @@ -291,7 +352,7 @@ criterion_group!( .measurement_time(Duration::from_secs(10)) .sample_size(10) .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); - targets = bench_hnsw, bench_hnsw_sq, bench_hnsw_pq); + targets = bench_hnsw, bench_hnsw_load, bench_hnsw_sq, bench_hnsw_pq); // Non-linux version does not support pprof. #[cfg(not(target_os = "linux"))] @@ -300,6 +361,6 @@ criterion_group!( config = Criterion::default() .measurement_time(Duration::from_secs(10)) .sample_size(10); - targets = bench_hnsw, bench_hnsw_sq, bench_hnsw_pq); + targets = bench_hnsw, bench_hnsw_load, bench_hnsw_sq, bench_hnsw_pq); criterion_main!(benches); diff --git a/rust/lance-index/benches/rq.rs b/rust/lance-index/benches/rq.rs index 4a7364d1313..72e0c49820d 100644 --- a/rust/lance-index/benches/rq.rs +++ b/rust/lance-index/benches/rq.rs @@ -17,11 +17,16 @@ use lance_datagen::array::rand_type; use lance_datagen::{BatchGeneratorBuilder, RowCount}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{ + blocked_ex_code_bytes, ex_dot_kernel, pack_blocked_row, packed_ex_code_value, +}; use lance_index::vector::bq::storage::*; use lance_index::vector::bq::transform::{ADD_FACTORS_COLUMN, SCALE_FACTORS_COLUMN}; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use 
lance_index::vector::storage::{DistCalculator, VectorStore}; use lance_linalg::distance::DistanceType; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; const DIM: usize = 128; const TOTAL: usize = 16 * 1000; @@ -119,16 +124,526 @@ fn compute_distances(c: &mut Criterion) { } } -#[cfg(target_os = "linux")] -criterion_group!( - name=benches; - config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); +/// The table-gather ex distance used before the dedicated ex-dot kernels, +/// kept here as the baseline: per dim, extract the packed code and gather +/// `query[d] * code` from a `dim * 2^ex_bits` table. +fn gather_ex_distance(row_codes: &[u8], dim: usize, ex_bits: u8, ex_dist_table: &[f32]) -> f32 { + let entries_per_dim = 1usize << ex_bits; + (0..dim) + .map(|dim_idx| { + let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize; + ex_dist_table[dim_idx * entries_per_dim + code] + }) + .sum() +} + +fn ex_dot_kernels(c: &mut Criterion) { + for ex_dim in [1536usize, 2048] { + ex_dot_kernels_for_dim(c, ex_dim); + } +} + +fn ex_dot_kernels_for_dim(c: &mut Criterion, ex_dim: usize) { + const NUM_ROWS: usize = 1024; + + let mut rng = SmallRng::seed_from_u64(42); + let query = (0..ex_dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + + for ex_bits in 1..=8u8 { + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..NUM_ROWS * ex_dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + + // The gather baseline reads the legacy sequential layout it shipped + // with; the kernel reads the blocked layout. 
+ let seq_code_len = (ex_dim * ex_bits as usize).div_ceil(8); + let mut seq_codes = vec![0u8; NUM_ROWS * seq_code_len]; + for (row, row_values) in seq_codes + .chunks_exact_mut(seq_code_len) + .zip(values.chunks_exact(ex_dim)) + { + for (dim, &value) in row_values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + row[bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + row[bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + } + + let kernel_code_len = blocked_ex_code_bytes(ex_dim, ex_bits); + let mut kernel_codes = vec![0u8; NUM_ROWS * kernel_code_len]; + for (row, row_values) in kernel_codes + .chunks_exact_mut(kernel_code_len) + .zip(values.chunks_exact(ex_dim)) + { + pack_blocked_row(row_values, ex_bits, row); + } + + // ex_dim is block-aligned here, so the kernels read the query as-is. + let ex_query = &query; + let kernel = ex_dot_kernel(ex_bits); + c.bench_function( + format!("RQ ex_dot kernel: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}").as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in kernel_codes.chunks_exact(kernel_code_len) { + sum += kernel(ex_query, row); + } + black_box(sum) + }) + }, + ); + + let entries_per_dim = 1usize << ex_bits; + let mut ex_dist_table = vec![0.0f32; ex_dim * entries_per_dim]; + for (dim, table) in ex_dist_table.chunks_exact_mut(entries_per_dim).enumerate() { + for (code, value) in table.iter_mut().enumerate() { + *value = query[dim] * code as f32; + } + } + c.bench_function( + format!("RQ ex_dot table-gather: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}") + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in seq_codes.chunks_exact(seq_code_len) { + sum += gather_ex_distance(row, ex_dim, ex_bits, &ex_dist_table); + } + black_box(sum) + }) + }, + ); + } +} + +/// Storage load cost per format: blocked-format ex codes are aliased as-is, +/// legacy sequential ex codes are repacked row by row. 
+fn ex_code_storage_load(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::bq::ex_dot::repack_sequential_row; + use lance_index::vector::bq::rabit_ex_code_bytes; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use std::sync::Arc; + + const LOAD_DIM: usize = 1536; + const LOAD_ROWS: usize = 8192; + const NUM_BITS: u8 = 4; // ex_bits=3, a bit-plane width + + let ex_bits = NUM_BITS - 1; + let mut rng = SmallRng::seed_from_u64(7); + let metadata = RabitQuantizationMetadata { + rotate_mat: None, + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Fast, + code_dim: LOAD_DIM as u32, + num_bits: NUM_BITS, + packed: true, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let code_len = LOAD_DIM / 8; + let binary_codes = (0..LOAD_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let seq_code_len = rabit_ex_code_bytes(LOAD_DIM, ex_bits).unwrap(); + let seq_codes = (0..LOAD_ROWS * seq_code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let blocked_code_len = blocked_ex_code_bytes(LOAD_DIM, ex_bits); + let mut blocked_codes = vec![0u8; LOAD_ROWS * blocked_code_len]; + for (seq_row, blocked_row) in seq_codes + .chunks_exact(seq_code_len) + .zip(blocked_codes.chunks_exact_mut(blocked_code_len)) + { + repack_sequential_row(seq_row, LOAD_DIM, ex_bits, blocked_row); + } + + let make_batch = |ex_column: &str, ex_values: Vec, ex_code_len: usize| { + arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..LOAD_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes.clone()), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + 
Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + ex_column, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_values), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ]) + .unwrap() + }; + + let blocked_batch = make_batch( + RABIT_BLOCKED_EX_CODE_COLUMN, + blocked_codes, + blocked_code_len, + ); + c.bench_function( + format!("RQ storage load (blocked ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + blocked_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); + + let legacy_batch = make_batch(RABIT_EX_CODE_COLUMN, seq_codes, seq_code_len); + c.bench_function( + format!("RQ storage load (legacy ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + legacy_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); +} + +/// Bulk-scoring cost of the ex stage: the quantized ex-FastScan LUT path +/// (inside `distance_all`) vs the exact per-row ex-dot kernel. The +/// binary-only run isolates the shared binary stage so the ex cost is the +/// difference from the full run. 
+fn ex_bulk_paths(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::ex_dot::pad_query_into; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::sync::Arc; + + const BULK_DIM: usize = 1536; + const BULK_ROWS: usize = 16384; + + let mut rng = SmallRng::seed_from_u64(13); + for num_bits in [3u8, 5, 9] { + let ex_bits = num_bits - 1; + let max_code = ((1u16 << ex_bits) - 1) as u8; + + let rq = RabitQuantizer::new_with_rotation::( + num_bits, + BULK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = BULK_DIM / 8; + let binary_codes = (0..BULK_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(BULK_DIM, ex_bits); + let mut ex_codes = vec![0u8; BULK_ROWS * ex_code_len]; + let values = (0..BULK_DIM) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + for row in ex_codes.chunks_exact_mut(ex_code_len) { + pack_blocked_row(&values, ex_bits, row); + } + + // No error factors: `distance_all` takes the FastScan ex bulk branch. 
+ let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..BULK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes.clone()), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![1.0f32; BULK_ROWS])) as ArrayRef, + ), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + + let query: ArrayRef = Arc::new(Float32Array::from( + (0..BULK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("full distance_all (binary + ex LUT)", ApproxMode::Normal), + ("binary-only distance_all (fast mode)", ApproxMode::Fast), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!("RQ bulk {label}: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}") + .as_str(), + |b| { + b.iter(|| { + calc.distance_all_with_scratch( + 0, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(dists.len()) + }) + }, + ); + } + + let 
kernel = ex_dot_kernel(ex_bits); + let mut ex_query = vec![0.0f32; BULK_DIM]; + pad_query_into( + query + .as_any() + .downcast_ref::() + .unwrap() + .values(), + &mut ex_query, + ); + c.bench_function( + format!( + "RQ bulk ex kernel loop: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}" + ) + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in ex_codes.chunks_exact(ex_code_len) { + sum += kernel(&ex_query, row); + } + black_box(sum) + }) + }, + ); + } +} + +/// Top-k accumulation through the gated raw-query multi-bit path: binary +/// FastScan, the per-row lower-bound pruning scan, and the exact rerank of +/// the surviving rows. Error factors are present so the gating is enabled. +fn heap_topk(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::transform::{ + ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, + }; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::collections::BinaryHeap; + use std::sync::Arc; + + const TOPK_DIM: usize = 1536; + const TOPK_ROWS: usize = 4096; + const TOPK_K: usize = 10; + const NUM_BITS: u8 = 5; + let ex_bits = NUM_BITS - 1; + + let mut rng = SmallRng::seed_from_u64(99); + let rq = RabitQuantizer::new_with_rotation::( + NUM_BITS, + TOPK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = TOPK_DIM / 8; + let binary_codes = (0..TOPK_ROWS * code_len) + .map(|_| rng.random()) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(TOPK_DIM, ex_bits); + let ex_codes = (0..TOPK_ROWS * ex_code_len) + .map(|_| rng.random()) + .collect::>(); + // Factor magnitudes chosen so the lower bounds spread mostly with the add + // factors; once the heap is full the threshold prunes the vast majority + // of rows, like a production multi-partition scan. 
+ let mut rand_factors = |low: f32, high: f32| { + Arc::new(Float32Array::from( + (0..TOPK_ROWS) + .map(|_| rng.random_range(low..high)) + .collect::>(), + )) as ArrayRef + }; + let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..TOPK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (SCALE_FACTORS_COLUMN, rand_factors(0.0005, 0.0015)), + (ERROR_FACTORS_COLUMN, rand_factors(0.0, 0.01)), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (EX_ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (EX_SCALE_FACTORS_COLUMN, rand_factors(0.00003, 0.0001)), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None).unwrap(); + let query: ArrayRef = Arc::new(Float32Array::from( + (0..TOPK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("normal", ApproxMode::Normal), + ("accurate", ApproxMode::Accurate), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 1.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut heap = BinaryHeap::with_capacity(TOPK_K + 1); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!( + "RQ heap topk ({label}): num_bits={NUM_BITS}, DIM={TOPK_DIM}, rows={TOPK_ROWS}, k={TOPK_K}" + ) + .as_str(), + |b| { + b.iter(|| { + heap.clear(); + calc.accumulate_topk_with_scratch( + TOPK_K, + None, + None, + |id| id as u64, + &mut heap, + &mut dists, + 
&mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(heap.len()) + }) + }, + ); + } +} -#[cfg(not(target_os = "linux"))] criterion_group!( name=benches; config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); + targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths, heap_topk); criterion_main!(benches); diff --git a/rust/lance-index/build.rs b/rust/lance-index/build.rs index 0617de8c806..b47744f7b5a 100644 --- a/rust/lance-index/build.rs +++ b/rust/lance-index/build.rs @@ -6,6 +6,9 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + // Cache-entry protos are library-internal serialization, not part of the + // on-disk format spec, so they live here rather than in the shared `protos/`. + println!("cargo:rerun-if-changed=protos-cache"); #[cfg(feature = "protoc")] // Use vendored protobuf compiler if requested. @@ -17,8 +20,12 @@ fn main() -> Result<()> { prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.enable_type_names(); prost_build.compile_protos( - &["./protos/index.proto", "./protos/index_old.proto"], - &["./protos"], + &[ + "./protos/index.proto", + "./protos/index_old.proto", + "./protos-cache/cache.proto", + ], + &["./protos", "./protos-cache"], )?; let rust_toolchain = env::var("RUSTUP_TOOLCHAIN") diff --git a/rust/lance-index/protos-cache/cache.proto b/rust/lance-index/protos-cache/cache.proto new file mode 100644 index 00000000000..b24a27055d7 --- /dev/null +++ b/rust/lance-index/protos-cache/cache.proto @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// Protobuf headers for serialized index cache entries. 
+// +// These messages describe the *cache* serialization format, not the on-disk +// Lance format spec, so they live with the library (lance-index) rather than in +// the top-level `protos/` spec folder. +// +// Field numbers and enum values are append-only across all messages here: never +// renumber or reuse them. A change the proto cannot express transparently +// (adding/removing/reordering the IPC/raw sections that follow a header) must +// bump the relevant codec's `CURRENT_VERSION` instead. + +syntax = "proto3"; + +package lance.index.cache; + +// --------------------------------------------------------------------------- +// Full-text search (FTS) posting lists +// --------------------------------------------------------------------------- + +// Header for a serialized `CompressedPostingList` cache entry. +message CompressedPostingHeader { + float max_score = 1; + uint32 length = 2; + PostingTailCodec posting_tail_codec = 3; + PositionStorage position_storage = 4; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 5; +} + +// Header for a serialized `PlainPostingList` cache entry. Followed by an Arrow +// IPC section of (row_ids: UInt64, frequencies: Float32), then — when +// position_storage == POSITION_STORAGE_LEGACY — an IPC section of the per-doc +// position list. Plain postings never carry a shared position stream. +message PlainPostingHeader { + // Absent when the posting has no precomputed block-max score (the in-memory + // `max_score` is `None`); present otherwise. + optional float max_score = 1; + // POSITION_STORAGE_NONE or POSITION_STORAGE_LEGACY only. + PositionStorage position_storage = 2; +} + +// Header for a serialized standalone `Positions` cache entry. Followed by the +// position sections framed per `position_storage`, which is never +// POSITION_STORAGE_NONE for a standalone entry. 
+message PositionsHeader { + PositionStorage position_storage = 1; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 2; +} + +// Header for a serialized `PostingListGroup`: a member count followed by that +// many `PostingList` bodies written inline. Each member body is +// self-delimiting, so members need no length prefixes, and writing them inline +// keeps their Arrow IPC sections 64-byte aligned within the group entry. +message PostingListGroupHeader { + uint32 count = 1; +} + +// Tail-block encoding of a compressed posting list. +enum PostingTailCodec { + POSTING_TAIL_CODEC_FIXED32 = 0; + POSTING_TAIL_CODEC_VARINT_DELTA = 1; +} + +// Encoding of a shared position stream's byte buffer. +enum PositionStreamCodec { + POSITION_STREAM_CODEC_VARINT_DOC_DELTA = 0; + POSITION_STREAM_CODEC_PACKED_DELTA = 1; +} + +// Which (if any) positions accompany the posting list, and how they are framed +// in the sections after the header. +enum PositionStorage { + POSITION_STORAGE_NONE = 0; + // Legacy per-doc positions as a single Arrow IPC section. + POSITION_STORAGE_LEGACY = 1; + // Shared stream: an Arrow IPC section of block offsets, then a raw blob of + // the (codec-encoded) position bytes. + POSITION_STORAGE_SHARED = 2; +} + +// --------------------------------------------------------------------------- +// Scalar indices +// --------------------------------------------------------------------------- + +// Header for a serialized `BTreeIndexState` cache entry, followed by a single +// Arrow IPC section holding the page-lookup batch. +message BTreeIndexHeader { + uint64 batch_size = 1; + // Whether an explicit page-range -> file mapping is present. Distinguishes a + // non-range-partitioned index (false) from a range-partitioned one whose map + // happens to be empty (true with no entries). 
+ bool has_ranges_to_files = 2; + repeated RangeToFile ranges_to_files = 3; +} + +// One entry of a `BTreeIndexState` page-range -> file mapping. The range is +// inclusive on both ends (a `RangeInclusive`). +message RangeToFile { + uint32 start = 1; + uint32 end = 2; + uint32 page_offset = 3; + string path = 4; +} + +// --------------------------------------------------------------------------- +// Vector indices (IVF partitions) +// --------------------------------------------------------------------------- + +// Headers for serialized IVF partition cache entries (`PartitionEntry`). +// +// Each header is followed by 64-byte-aligned Arrow IPC sections in a fixed, +// version-keyed order (sub-index, then any quantizer-specific arrays, then the +// quantizer storage batches). + +// Distance metric a quantizer's storage was built for. +enum DistanceType { + DISTANCE_TYPE_L2 = 0; + DISTANCE_TYPE_COSINE = 1; + DISTANCE_TYPE_DOT = 2; + DISTANCE_TYPE_HAMMING = 3; +} + +// Rotation applied by a RabitQ quantizer. +enum RotationType { + ROTATION_TYPE_MATRIX = 0; + ROTATION_TYPE_FAST = 1; +} + +// Estimator a RabitQ quantizer uses at query time. +enum RabitQueryEstimator { + RABIT_QUERY_ESTIMATOR_RESIDUAL_QUERY = 0; + RABIT_QUERY_ESTIMATOR_RAW_QUERY = 1; +} + +// Product quantizer. Sections: sub-index IPC, codebook IPC, storage IPC. +message PqPartitionHeader { + DistanceType distance_type = 1; + uint32 nbits = 2; + uint64 num_sub_vectors = 3; + uint64 dimension = 4; + bool transposed = 5; +} + +// Flat (float) and flat-binary quantizers. Sections: sub-index IPC, storage IPC. +message FlatPartitionHeader { + DistanceType distance_type = 1; + uint64 dim = 2; +} + +// Scalar quantizer. Sections: sub-index IPC, storage IPC (possibly multi-batch). 
+message SqPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint64 dim = 3; + double bounds_start = 4; + double bounds_end = 5; +} + +// Header for a serialized IVF index state (`IvfIndexState`), followed by +// three raw blobs: the IVF model protobuf, the quantizer's extra-metadata +// buffer (may be empty), and the auxiliary IVF model protobuf. +message IvfStateHeader { + string index_file_path = 1; + string uuid = 2; + string distance_type = 3; + repeated string sub_index_metadata = 4; + string sub_index_type = 5; + string quantization_type = 6; + // Per-quantizer `Q::Metadata` as JSON. Kept as a string because the metadata + // type is generic over the quantizer; the proto envelope still provides + // additive evolution for the surrounding fields. + string quantizer_metadata_json = 7; + string cache_key_prefix = 8; + uint64 index_file_size = 9; + uint64 aux_file_size = 10; +} + +// RabitQ quantizer. Sections: sub-index IPC, rotate-matrix IPC (Matrix rotation +// only), storage IPC. +message RabitPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint32 code_dim = 3; + RotationType rotation_type = 4; + // Fast-rotation sign vector; present only when rotation_type == + // ROTATION_TYPE_FAST (the Matrix case stores its rotation as an IPC section). + optional bytes fast_rotation_signs = 5; + // Estimator the RabitQ storage uses at query time (residual vs raw query). 
+ RabitQueryEstimator query_estimator = 6; +} diff --git a/rust/lance-index/src/frag_reuse.rs b/rust/lance-index/src/frag_reuse.rs index d145108d3c0..d42b41ca9f0 100644 --- a/rust/lance-index/src/frag_reuse.rs +++ b/rust/lance-index/src/frag_reuse.rs @@ -1,357 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use crate::{Index, IndexType}; -use arrow_array::cast::AsArray; -use arrow_array::types::UInt64Type; -use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array}; -use async_trait::async_trait; -use deepsize::{Context, DeepSizeOf}; -use itertools::Itertools; -use lance_core::{Error, Result}; -use lance_select::RowAddrTreeMap; -use lance_table::format::pb::fragment_reuse_index_details::InlineContent; -use lance_table::format::{ExternalFile, Fragment, pb}; -use roaring::{RoaringBitmap, RoaringTreemap}; -use serde::{Deserialize, Serialize}; -use std::{any::Any, collections::HashMap, sync::Arc}; -use uuid::Uuid; - -pub const FRAG_REUSE_INDEX_NAME: &str = "__lance_frag_reuse"; -pub const FRAG_REUSE_DETAILS_FILE_NAME: &str = "details.binpb"; - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragDigest { - pub id: u64, - pub physical_rows: usize, - pub num_deleted_rows: usize, -} - -impl From<&FragDigest> for pb::fragment_reuse_index_details::FragmentDigest { - fn from(digest: &FragDigest) -> Self { - Self { - id: digest.id, - physical_rows: digest.physical_rows as u64, - num_deleted_rows: digest.num_deleted_rows as u64, - } - } -} - -impl From<&Fragment> for FragDigest { - fn from(fragment: &Fragment) -> Self { - Self { - id: fragment.id, - physical_rows: fragment - .physical_rows - .expect("Fragment doesn't have physical rows recorded"), - num_deleted_rows: fragment - .deletion_file - .as_ref() - .and_then(|d| d.num_deleted_rows) - .unwrap_or(0), - } - } -} - -impl TryFrom for FragDigest { - type Error = Error; - - fn try_from(digest: 
pb::fragment_reuse_index_details::FragmentDigest) -> Result { - Ok(Self { - id: digest.id, - physical_rows: digest.physical_rows as usize, - num_deleted_rows: digest.num_deleted_rows as usize, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseGroup { - pub changed_row_addrs: Vec, - pub old_frags: Vec, - pub new_frags: Vec, -} - -impl From<&FragReuseGroup> for pb::fragment_reuse_index_details::Group { - fn from(group: &FragReuseGroup) -> Self { - Self { - changed_row_addrs: group.changed_row_addrs.clone(), - old_fragments: group.old_frags.iter().map(|f| f.into()).collect(), - new_fragments: group.new_frags.iter().map(|f| f.into()).collect(), - } - } -} - -impl TryFrom for FragReuseGroup { - type Error = Error; - - fn try_from(group: pb::fragment_reuse_index_details::Group) -> Result { - Ok(Self { - changed_row_addrs: group.changed_row_addrs, - old_frags: group - .old_fragments - .into_iter() - .map(FragDigest::try_from) - .collect::>()?, - new_frags: group - .new_fragments - .into_iter() - .map(FragDigest::try_from) - .collect::>()?, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseVersion { - pub dataset_version: u64, - pub groups: Vec, -} - -impl From<&FragReuseVersion> for pb::fragment_reuse_index_details::Version { - fn from(version: &FragReuseVersion) -> Self { - Self { - dataset_version: version.dataset_version, - groups: version.groups.iter().map(|g| g.into()).collect(), - } - } -} - -impl TryFrom for FragReuseVersion { - type Error = Error; - - fn try_from(version: pb::fragment_reuse_index_details::Version) -> Result { - Ok(Self { - dataset_version: version.dataset_version, - groups: version - .groups - .into_iter() - .map(FragReuseGroup::try_from) - .collect::>()?, - }) - } -} - -impl FragReuseVersion { - pub fn old_frag_ids(&self) -> Vec { - self.groups - .iter() - .flat_map(|g| g.old_frags.iter().map(|f| f.id)) - .collect::>() - } - - 
pub fn new_frag_ids(&self) -> Vec { - self.groups - .iter() - .flat_map(|g| g.new_frags.iter().map(|f| f.id)) - .collect::>() - } - - pub fn new_frag_bitmap(&self) -> RoaringBitmap { - RoaringBitmap::from_iter(self.new_frag_ids().iter().map(|&id| id as u32)) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub enum FragReuseIndexDetailsContentType { - Inline(FragReuseIndexDetails), - External(ExternalFile), -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseIndexDetails { - pub versions: Vec, -} - -impl From<&FragReuseIndexDetails> for InlineContent { - fn from(details: &FragReuseIndexDetails) -> Self { - Self { - versions: details - .versions - .iter() - .map(|m| m.into()) - // sort from oldest to latest version - .sorted_by_key(|v: &pb::fragment_reuse_index_details::Version| v.dataset_version) - .collect(), - } - } -} - -impl TryFrom for FragReuseIndexDetails { - type Error = Error; - - fn try_from(content: InlineContent) -> Result { - Ok(Self { - versions: content - .versions - .into_iter() - .map(|m| m.try_into()) - .collect::>>()?, - }) - } -} - -impl FragReuseIndexDetails { - pub fn new_frag_bitmap(&self) -> RoaringBitmap { - RoaringBitmap::from_iter( - self.versions - .iter() - .flat_map(|v| v.new_frag_ids().into_iter().map(|id| id as u32)), - ) - } -} - -/// An index that stores row ID maps. -/// A row ID map describes the mapping from old row address to new address after compactions. -/// Each version contains the mapping for one round of compaction. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct FragReuseIndex { - pub uuid: Uuid, - pub row_id_maps: Vec>>, - pub details: FragReuseIndexDetails, -} - -impl DeepSizeOf for FragReuseIndex { - fn deep_size_of_children(&self, cx: &mut Context) -> usize { - self.row_id_maps.deep_size_of_children(cx) + self.details.deep_size_of_children(cx) - } -} - -impl FragReuseIndex { - pub fn new( - uuid: Uuid, - row_id_maps: Vec>>, - details: FragReuseIndexDetails, - ) -> Self { - Self { - uuid, - row_id_maps, - details, - } - } - - pub fn remap_row_id(&self, row_id: u64) -> Option { - let mut mapped_value = Some(row_id); - for row_id_map in self.row_id_maps.iter() { - if mapped_value.is_some() { - mapped_value = row_id_map - .get(&mapped_value.unwrap()) - .copied() - .unwrap_or(mapped_value); - } - } - - mapped_value - } - - pub fn remap_row_addrs_tree_map(&self, row_addrs: &RowAddrTreeMap) -> RowAddrTreeMap { - RowAddrTreeMap::from_iter(row_addrs.row_addrs().unwrap().filter_map(|addr| { - let addr_as_u64 = u64::from(addr); - self.remap_row_id(addr_as_u64) - })) - } - - pub fn remap_row_ids_roaring_tree_map(&self, row_ids: &RoaringTreemap) -> RoaringTreemap { - RoaringTreemap::from_iter(row_ids.iter().filter_map(|addr| self.remap_row_id(addr))) - } - - /// Remap a record batch that contains a row_id column at index `row_id_idx` - /// Currently this assumes there are only 2 columns in the schema, - /// which is the case for all indexes. - /// For example, for btree, the schema is (value, row_id). - /// For vector index storage, the schema is (row_id, vector). 
- pub fn remap_row_ids_record_batch( - &self, - batch: RecordBatch, - row_id_idx: usize, - ) -> Result { - assert_eq!(batch.schema().fields().len(), 2); - let other_column_idx = 1 - row_id_idx; - let row_ids = batch.column(row_id_idx).as_primitive::(); - let (val_indices, new_row_ids): (Vec, Vec) = row_ids - .values() - .iter() - .enumerate() - .filter_map(|(idx, old_id)| { - self.remap_row_id(*old_id) - .map(|new_id| (idx as u64, new_id)) - }) - .unzip(); - let new_val_indices = UInt64Array::from_iter_values(val_indices); - let new_vals = - arrow_select::take::take(batch.column(other_column_idx), &new_val_indices, None)?; - - let mut batch_data: Vec<(usize, ArrayRef)> = vec![ - ( - row_id_idx, - Arc::new(UInt64Array::from_iter_values(new_row_ids)) as ArrayRef, - ), - (other_column_idx, Arc::new(new_vals)), - ]; - batch_data.sort_by_key(|(i, _)| *i); - Ok(RecordBatch::try_new( - batch.schema(), - batch_data.into_iter().map(|(_, item)| item).collect(), - )?) - } +//! `Index`-trait adapter for the fragment-reuse system index. +//! +//! The data structures and table-format logic live in +//! [`lance_table::system_index::frag_reuse`]; this module re-exports them and +//! implements the local [`Index`] trait for [`FragReuseIndex`]. 
- pub fn remap_row_ids_array(&self, array: ArrayRef) -> PrimitiveArray { - let primitive_array = array - .as_any() - .downcast_ref::>() - .expect("expected row IDs to be uint64 array"); - (0..primitive_array.len()) - .map(|i| { - if primitive_array.is_null(i) { - None - } else { - self.remap_row_id(primitive_array.value(i)) - } - }) - .collect() - } +use std::any::Any; +use std::sync::Arc; - pub fn remap_fragment_bitmap(&self, fragment_bitmap: &mut RoaringBitmap) -> Result<()> { - for version in self.details.versions.iter() { - for group in version.groups.iter() { - let mut removed = 0; - for old_frag in group.old_frags.iter() { - if fragment_bitmap.remove(old_frag.id as u32) { - removed += 1; - } - } +use async_trait::async_trait; +use lance_core::{Error, Result}; +use roaring::RoaringBitmap; +use serde::Serialize; - if removed > 0 { - if removed != group.old_frags.len() { - // Straddle: the index covered only part of this rewrite - // group. Caused by the bug fixed in - // . - // We've already removed the indexed old_frags from the - // bitmap above; deliberately do NOT insert new_frags, - // since the merged fragment also contains rows that - // were never indexed. Affected rows fall through to - // flat scan until the next optimize_indices. The fix - // is persisted on the next write via build_manifest. - tracing::warn!( - "Healing straddling fragment-reuse rewrite group in index bitmap: \ - group {:?} was only partially indexed ({} of {} old fragments). 
\ - Affected rows will use flat scan until the next optimize_indices.", - group.old_frags, - removed, - group.old_frags.len(), - ); - continue; - } +pub use lance_table::system_index::frag_reuse::*; - for new_frag in group.new_frags.iter() { - fragment_bitmap.insert(new_frag.id as u32); - } - } - } - } - Ok(()) - } -} +use crate::{Index, IndexType}; #[derive(Serialize)] struct FragReuseStatistics { @@ -368,12 +34,6 @@ impl Index for FragReuseIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "FragReuseIndex is not a vector index".into(), - )) - } - fn statistics(&self) -> Result { let stats = FragReuseStatistics { num_versions: self.details.versions.len(), @@ -398,134 +58,3 @@ impl Index for FragReuseIndex { unimplemented!() } } - -#[cfg(test)] -mod tests { - - use super::*; - - #[tokio::test] - async fn test_serialize_deserialize_index_details() { - // Create sample FragReuseVersions with different dataset versions - let version1 = FragReuseVersion { - dataset_version: 2, - groups: vec![FragReuseGroup { - changed_row_addrs: vec![1, 2, 3], - old_frags: vec![FragDigest { - id: 1, - physical_rows: 1, - num_deleted_rows: 0, - }], - new_frags: vec![ - FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 3, - physical_rows: 1, - num_deleted_rows: 0, - }, - ], - }], - }; - - let version2 = FragReuseVersion { - dataset_version: 1, - groups: vec![FragReuseGroup { - changed_row_addrs: vec![4, 5, 6], - old_frags: vec![FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }], - new_frags: vec![ - FragDigest { - id: 4, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 5, - physical_rows: 1, - num_deleted_rows: 0, - }, - ], - }], - }; - - // Create FragReuseIndexDetails with versions in reverse order - let details = FragReuseIndexDetails { - versions: vec![version1, version2], - }; - - // Convert to protobuf format - let inline_content: InlineContent = 
(&details).into(); - - // Convert back to FragReuseIndexDetails - let roundtrip_details = FragReuseIndexDetails::try_from(inline_content).unwrap(); - - // Verify the roundtrip - assert_eq!(roundtrip_details.versions.len(), 2); - - // Verify versions are sorted by dataset_version (oldest to latest) - assert_eq!(roundtrip_details.versions[0].dataset_version, 1); - assert_eq!( - roundtrip_details.versions[0].groups[0].changed_row_addrs, - vec![4, 5, 6] - ); - assert_eq!( - roundtrip_details.versions[0].groups[0].new_frags, - vec![ - FragDigest { - id: 4, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 5, - physical_rows: 1, - num_deleted_rows: 0, - } - ] - ); - assert_eq!( - roundtrip_details.versions[0].groups[0].old_frags, - vec![FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }] - ); - - assert_eq!(roundtrip_details.versions[1].dataset_version, 2); - assert_eq!( - roundtrip_details.versions[1].groups[0].changed_row_addrs, - vec![1, 2, 3] - ); - assert_eq!( - roundtrip_details.versions[1].groups[0].new_frags, - vec![ - FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 3, - physical_rows: 1, - num_deleted_rows: 0, - } - ] - ); - assert_eq!( - roundtrip_details.versions[1].groups[0].old_frags, - vec![FragDigest { - id: 1, - physical_rows: 1, - num_deleted_rows: 0, - }] - ); - } -} diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 0ed6ddd4e2d..61b45550367 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -14,7 +14,7 @@ use std::{any::Any, sync::Arc}; use crate::frag_reuse::FRAG_REUSE_INDEX_NAME; use crate::mem_wal::MEM_WAL_INDEX_NAME; use async_trait::async_trait; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -68,6 +68,13 @@ pub mod pbold { include!(concat!(env!("OUT_DIR"), "/lance.table.rs")); } +/// Protobuf 
headers for serialized index cache entries (FTS posting lists, +/// scalar indices, and IVF vector partitions). +pub mod cache_pb { + #![allow(clippy::use_self)] + include!(concat!(env!("OUT_DIR"), "/lance.index.cache.rs")); +} + /// Generic methods common across all types of secondary indices /// #[async_trait] @@ -78,9 +85,6 @@ pub trait Index: Send + Sync + DeepSizeOf { /// Cast to [Index] fn as_index(self: Arc) -> Arc; - /// Cast to [vector::VectorIndex] - fn as_vector_index(self: Arc) -> Result>; - /// Retrieve index statistics as a JSON Value fn statistics(&self) -> Result; @@ -125,6 +129,8 @@ pub enum IndexType { RTree = 10, // RTree + Fm = 11, // FM-Index + // 100+ and up for vector index. /// Flat vector index. Vector = 100, // Legacy vector index, alias to IvfPq @@ -150,6 +156,7 @@ impl std::fmt::Display for IndexType { Self::ZoneMap => write!(f, "ZoneMap"), Self::BloomFilter => write!(f, "BloomFilter"), Self::RTree => write!(f, "RTree"), + Self::Fm => write!(f, "Fm"), Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"), Self::IvfFlat => write!(f, "IVF_FLAT"), Self::IvfSq => write!(f, "IVF_SQ"), @@ -177,6 +184,7 @@ impl TryFrom for IndexType { v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap), v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter), v if v == Self::RTree as i32 => Ok(Self::RTree), + v if v == Self::Fm as i32 => Ok(Self::Fm), v if v == Self::Vector as i32 => Ok(Self::Vector), v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat), v if v == Self::IvfSq as i32 => Ok(Self::IvfSq), @@ -205,6 +213,7 @@ impl TryFrom<&str> for IndexType { "ZoneMap" | "ZONEMAP" => Ok(Self::ZoneMap), "BloomFilter" | "BLOOMFILTER" | "BLOOM_FILTER" => Ok(Self::BloomFilter), "RTree" | "RTREE" | "R_TREE" => Ok(Self::RTree), + "Fm" | "FM" => Ok(Self::Fm), "Vector" | "VECTOR" => Ok(Self::Vector), "IVF_FLAT" => Ok(Self::IvfFlat), "IVF_SQ" => Ok(Self::IvfSq), @@ -235,7 +244,8 @@ impl IndexType { | Self::NGram | Self::ZoneMap | Self::BloomFilter - | Self::RTree, + | 
Self::RTree + | Self::Fm, ) } @@ -275,6 +285,7 @@ impl IndexType { Self::ZoneMap => 0, Self::BloomFilter => 0, Self::RTree => 0, + Self::Fm => 0, // IMPORTANT: if any vector index subtype needs a format bump that is // not backward compatible, its new version must be set to @@ -305,6 +316,7 @@ impl IndexType { Self::IvfFlat => 4096, Self::IvfSq => 8192, Self::IvfPq => 8192, + Self::IvfRq => 4096, Self::IvfHnswFlat => 1 << 20, Self::IvfHnswSq => 1 << 20, Self::IvfHnswPq => 1 << 20, @@ -375,6 +387,11 @@ mod tests { assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION); } + #[test] + fn test_ivf_rq_target_partition_size() { + assert_eq!(IndexType::IvfRq.target_partition_size(), 4096); + } + #[test] fn test_index_type_try_from_i32_covers_all_variants() { let all = [ @@ -389,6 +406,7 @@ mod tests { IndexType::ZoneMap, IndexType::BloomFilter, IndexType::RTree, + IndexType::Fm, IndexType::Vector, IndexType::IvfFlat, IndexType::IvfSq, @@ -430,6 +448,8 @@ mod tests { ("RTree", IndexType::RTree), ("RTREE", IndexType::RTree), ("R_TREE", IndexType::RTree), + ("Fm", IndexType::Fm), + ("FM", IndexType::Fm), ("Vector", IndexType::Vector), ("VECTOR", IndexType::Vector), ("IVF_FLAT", IndexType::IvfFlat), diff --git a/rust/lance-index/src/mem_wal.rs b/rust/lance-index/src/mem_wal.rs index 4310db88908..9bd72ff7866 100644 --- a/rust/lance-index/src/mem_wal.rs +++ b/rust/lance-index/src/mem_wal.rs @@ -1,408 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +//! `Index`-trait adapter for the MemWAL system index. +//! +//! The data structures and table-format logic live in +//! [`lance_table::system_index::mem_wal`]; this module re-exports them and +//! implements the local [`Index`] trait for [`MemWalIndex`]. 
+ use std::any::Any; -use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; -use deepsize::DeepSizeOf; use lance_core::Error; -use lance_table::format::pb; use roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -use crate::{Index, IndexType}; - -pub const MEM_WAL_INDEX_NAME: &str = "__lance_mem_wal"; - -/// Type alias for shard identifier (UUID v4). -pub type ShardId = Uuid; - -/// A flushed MemTable generation and its storage location. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FlushedGeneration { - pub generation: u64, - pub path: String, -} - -impl From<&FlushedGeneration> for pb::FlushedGeneration { - fn from(fg: &FlushedGeneration) -> Self { - Self { - generation: fg.generation, - path: fg.path.clone(), - } - } -} - -impl From for FlushedGeneration { - fn from(fg: pb::FlushedGeneration) -> Self { - Self { - generation: fg.generation, - path: fg.path, - } - } -} - -/// A shard's merged generation, used in MemWalIndexDetails. 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, Serialize, Deserialize)] -pub struct MergedGeneration { - pub shard_id: Uuid, - pub generation: u64, -} - -impl DeepSizeOf for MergedGeneration { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { - 0 // UUID is 16 bytes fixed size, no heap allocations - } -} - -impl MergedGeneration { - pub fn new(shard_id: Uuid, generation: u64) -> Self { - Self { - shard_id, - generation, - } - } -} - -impl From<&MergedGeneration> for pb::MergedGeneration { - fn from(mg: &MergedGeneration) -> Self { - Self { - shard_id: Some((&mg.shard_id).into()), - generation: mg.generation, - } - } -} - -impl TryFrom for MergedGeneration { - type Error = Error; - - fn try_from(mg: pb::MergedGeneration) -> lance_core::Result { - let shard_id = mg - .shard_id - .as_ref() - .map(Uuid::try_from) - .ok_or_else(|| Error::invalid_input("Missing shard_id in MergedGeneration"))??; - Ok(Self { - shard_id, - generation: mg.generation, - }) - } -} - -/// Tracks which merged generation a base table index has been rebuilt to cover. -/// Used to determine whether to read from flushed MemTable indexes or base table. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct IndexCatchupProgress { - pub index_name: String, - pub caught_up_generations: Vec, -} - -impl IndexCatchupProgress { - pub fn new(index_name: String, caught_up_generations: Vec) -> Self { - Self { - index_name, - caught_up_generations, - } - } - - /// Get the caught up generation for a specific shard. - /// Returns None if the shard is not present (assumed fully caught up). 
- pub fn caught_up_generation_for_shard(&self, shard_id: &Uuid) -> Option { - self.caught_up_generations - .iter() - .find(|mg| &mg.shard_id == shard_id) - .map(|mg| mg.generation) - } -} - -impl From<&IndexCatchupProgress> for pb::IndexCatchupProgress { - fn from(icp: &IndexCatchupProgress) -> Self { - Self { - index_name: icp.index_name.clone(), - caught_up_generations: icp - .caught_up_generations - .iter() - .map(|mg| mg.into()) - .collect(), - } - } -} - -impl TryFrom for IndexCatchupProgress { - type Error = Error; - - fn try_from(icp: pb::IndexCatchupProgress) -> lance_core::Result { - Ok(Self { - index_name: icp.index_name, - caught_up_generations: icp - .caught_up_generations - .into_iter() - .map(MergedGeneration::try_from) - .collect::>()?, - }) - } -} - -/// Shard manifest containing epoch-based fencing and WAL state. -/// Each shard has exactly one active writer at any time. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ShardManifest { - pub shard_id: Uuid, - pub version: u64, - pub shard_spec_id: u32, - /// Computed shard field values as raw Arrow scalar bytes, keyed by field id. - /// The byte encoding follows Arrow's little-endian convention: int32 is 4 LE - /// bytes, utf8 is raw UTF-8 bytes, etc. The result_type in the corresponding - /// ShardingField from the ShardingSpec determines how to interpret each value. - pub shard_field_values: HashMap>, - pub writer_epoch: u64, - /// The most recent WAL entry position flushed to a MemTable. - /// Recovery replays from `replay_after_wal_entry_position + 1`. The - /// default value 0 means "no flush has ever stamped this shard" — WAL - /// positions themselves are 1-based, so 0 is never a valid covered - /// position. - pub replay_after_wal_entry_position: u64, - /// The most recent WAL entry position observed at manifest write time. - /// Default 0 means "no entry has been written yet"; WAL positions are - /// 1-based. 
- pub wal_entry_position_last_seen: u64, - pub current_generation: u64, - pub flushed_generations: Vec, -} - -impl DeepSizeOf for ShardManifest { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.shard_field_values.deep_size_of_children(context) - + self.flushed_generations.deep_size_of_children(context) - } -} - -impl From<&ShardManifest> for pb::ShardManifest { - fn from(rm: &ShardManifest) -> Self { - Self { - shard_id: Some((&rm.shard_id).into()), - version: rm.version, - shard_spec_id: rm.shard_spec_id, - shard_field_entries: rm - .shard_field_values - .iter() - .map(|(k, v)| pb::ShardFieldEntry { - field_id: k.clone(), - value: v.clone(), - }) - .collect(), - writer_epoch: rm.writer_epoch, - replay_after_wal_entry_position: rm.replay_after_wal_entry_position, - wal_entry_position_last_seen: rm.wal_entry_position_last_seen, - current_generation: rm.current_generation, - flushed_generations: rm.flushed_generations.iter().map(|fg| fg.into()).collect(), - } - } -} - -impl TryFrom for ShardManifest { - type Error = Error; - - fn try_from(rm: pb::ShardManifest) -> lance_core::Result { - let shard_id = rm - .shard_id - .as_ref() - .map(Uuid::try_from) - .ok_or_else(|| Error::invalid_input("Missing shard_id in ShardManifest"))??; - let shard_field_values = rm - .shard_field_entries - .into_iter() - .map(|e| (e.field_id, e.value)) - .collect(); - Ok(Self { - shard_id, - version: rm.version, - shard_spec_id: rm.shard_spec_id, - shard_field_values, - writer_epoch: rm.writer_epoch, - replay_after_wal_entry_position: rm.replay_after_wal_entry_position, - wal_entry_position_last_seen: rm.wal_entry_position_last_seen, - current_generation: rm.current_generation, - flushed_generations: rm - .flushed_generations - .into_iter() - .map(FlushedGeneration::from) - .collect(), - }) - } -} - -/// Sharding field definition. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct ShardingField { - pub field_id: String, - pub source_ids: Vec, - pub transform: Option, - pub expression: Option, - pub result_type: String, - pub parameters: HashMap, -} - -impl From<&ShardingField> for pb::ShardingField { - fn from(rf: &ShardingField) -> Self { - Self { - field_id: rf.field_id.clone(), - source_ids: rf.source_ids.clone(), - transform: rf.transform.clone(), - expression: rf.expression.clone(), - result_type: rf.result_type.clone(), - parameters: rf.parameters.clone(), - } - } -} - -impl From for ShardingField { - fn from(rf: pb::ShardingField) -> Self { - Self { - field_id: rf.field_id, - source_ids: rf.source_ids, - transform: rf.transform, - expression: rf.expression, - result_type: rf.result_type, - parameters: rf.parameters, - } - } -} - -/// Sharding spec definition. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct ShardingSpec { - pub spec_id: u32, - pub fields: Vec, -} - -impl From<&ShardingSpec> for pb::ShardingSpec { - fn from(rs: &ShardingSpec) -> Self { - Self { - spec_id: rs.spec_id, - fields: rs.fields.iter().map(|f| f.into()).collect(), - } - } -} - -impl From for ShardingSpec { - fn from(rs: pb::ShardingSpec) -> Self { - Self { - spec_id: rs.spec_id, - fields: rs.fields.into_iter().map(ShardingField::from).collect(), - } - } -} - -/// Index details for MemWAL Index, stored in IndexMetadata.index_details. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct MemWalIndexDetails { - pub snapshot_ts_millis: i64, - pub num_shards: u32, - pub inline_snapshots: Option>, - pub sharding_specs: Vec, - pub maintained_indexes: Vec, - pub merged_generations: Vec, - pub index_catchup: Vec, - /// Default `ShardWriter` configuration values for this MemWAL index. 
- /// - /// Persisted so every writer — across processes and restarts — starts - /// from the same default writer configuration. These are defaults only; - /// an individual writer may still override any value at runtime in its - /// own (non-persisted) `ShardWriterConfig`. - pub writer_config_defaults: HashMap, -} - -impl From<&MemWalIndexDetails> for pb::MemWalIndexDetails { - fn from(details: &MemWalIndexDetails) -> Self { - Self { - snapshot_ts_millis: details.snapshot_ts_millis, - num_shards: details.num_shards, - inline_snapshots: details.inline_snapshots.clone(), - sharding_specs: details.sharding_specs.iter().map(|rs| rs.into()).collect(), - maintained_indexes: details.maintained_indexes.clone(), - merged_generations: details - .merged_generations - .iter() - .map(|mg| mg.into()) - .collect(), - index_catchup: details.index_catchup.iter().map(|icp| icp.into()).collect(), - writer_config_defaults: details.writer_config_defaults.clone(), - } - } -} - -impl TryFrom for MemWalIndexDetails { - type Error = Error; - - fn try_from(details: pb::MemWalIndexDetails) -> lance_core::Result { - Ok(Self { - snapshot_ts_millis: details.snapshot_ts_millis, - num_shards: details.num_shards, - inline_snapshots: details.inline_snapshots, - sharding_specs: details - .sharding_specs - .into_iter() - .map(ShardingSpec::from) - .collect(), - maintained_indexes: details.maintained_indexes, - merged_generations: details - .merged_generations - .into_iter() - .map(MergedGeneration::try_from) - .collect::>()?, - index_catchup: details - .index_catchup - .into_iter() - .map(IndexCatchupProgress::try_from) - .collect::>()?, - writer_config_defaults: details.writer_config_defaults, - }) - } -} - -/// MemWAL Index provides access to MemWAL configuration and state. 
-#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)] -pub struct MemWalIndex { - pub details: MemWalIndexDetails, -} - -impl MemWalIndex { - pub fn new(details: MemWalIndexDetails) -> Self { - Self { details } - } - - pub fn merged_generation_for_shard(&self, shard_id: &Uuid) -> Option { - self.details - .merged_generations - .iter() - .find(|mg| &mg.shard_id == shard_id) - .map(|mg| mg.generation) - } - - /// Get the caught up generation for a specific index and shard. - /// Returns None if the index is not tracked (assumed fully caught up). - pub fn index_caught_up_generation(&self, index_name: &str, shard_id: &Uuid) -> Option { - self.details - .index_catchup - .iter() - .find(|icp| icp.index_name == index_name) - .and_then(|icp| icp.caught_up_generation_for_shard(shard_id)) - } +use serde::Serialize; - /// Check if an index is fully caught up for a shard. - /// Returns true if the index covers all merged data for the shard. - pub fn is_index_caught_up(&self, index_name: &str, shard_id: &Uuid) -> bool { - let merged_gen = self.merged_generation_for_shard(shard_id).unwrap_or(0); - let caught_up_gen = self.index_caught_up_generation(index_name, shard_id); +pub use lance_table::system_index::mem_wal::*; - // If not tracked in index_catchup, assumed fully caught up - caught_up_gen.is_none_or(|generation| generation >= merged_gen) - } -} +use crate::{Index, IndexType}; #[derive(Serialize)] struct MemWalStatistics { @@ -423,12 +38,6 @@ impl Index for MemWalIndex { self } - fn as_vector_index(self: Arc) -> lance_core::Result> { - Err(Error::not_supported_source( - "MemWalIndex is not a vector index".into(), - )) - } - fn statistics(&self) -> lance_core::Result { let stats = MemWalStatistics { num_shards: self.details.num_shards, diff --git a/rust/lance-index/src/metrics.rs b/rust/lance-index/src/metrics.rs index 9e2161ae8f9..8c0c119a3c3 100644 --- a/rust/lance-index/src/metrics.rs +++ b/rust/lance-index/src/metrics.rs @@ -3,6 +3,11 @@ use 
std::sync::atomic::{AtomicUsize, Ordering}; +pub const AND_CANDIDATES_SEEN_METRIC: &str = "and_candidates_seen"; +pub const AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC: &str = "and_candidates_pruned_before_return"; +pub const AND_FULL_SCORES_METRIC: &str = "and_full_scores"; +pub const FREQS_COLLECTED_METRIC: &str = "freqs_collected"; + /// A trait used by the index to report metrics /// /// Callers can implement this trait to collect metrics @@ -43,6 +48,33 @@ pub trait MetricsCollector: Send + Sync { /// /// The goal is to provide some visibility into the compute cost of the search fn record_comparisons(&self, num_comparisons: usize); + + /// Record AND candidates returned from WAND alignment to the scoring loop. + /// + /// This excludes candidates pruned before `next()` returns. Use this with + /// `record_and_candidates_pruned_before_return` to recover total aligned + /// AND candidates. + fn record_and_candidates_seen(&self, _num_candidates: usize) {} + + /// Record AND candidates pruned during WAND alignment before `next()` returns. + fn record_and_candidates_pruned_before_return(&self, _num_candidates: usize) {} + + fn record_and_full_scores(&self, _num_scores: usize) {} + + fn record_freqs_collected(&self, _num_collections: usize) {} + + /// Returns an optional sink for recording exact I/O statistics (bytes read, + /// IOPS, and requests) performed on behalf of this collector. + /// + /// Index implementations that read from a + /// [`lance_io::scheduler::ScanScheduler`] can attach the returned handle to + /// their file readers so the I/O performed for a single query is measured + /// and attributed here. The default returns `None`, meaning the caller does + /// not want I/O measured (and index implementations should then take their + /// normal, uninstrumented read path). 
+ fn io_stats(&self) -> Option { + None + } } /// A no-op metrics collector that does nothing diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 8ab65d38896..1abab781635 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -10,11 +10,27 @@ use crate::{ pb, pbold, scalar::{ bitmap::BitmapIndexPlugin, bloomfilter::BloomFilterIndexPlugin, btree::BTreeIndexPlugin, - inverted::InvertedIndexPlugin, json::JsonIndexPlugin, label_list::LabelListIndexPlugin, - ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, zonemap::ZoneMapIndexPlugin, + fmindex::FMIndexPlugin, inverted::InvertedIndexPlugin, json::JsonIndexPlugin, + label_list::LabelListIndexPlugin, ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, + zonemap::ZoneMapIndexPlugin, }, }; +/// Derive a human-readable index type name from a details type URL. +/// +/// The display name is the final `.`-separated segment of the type URL with any +/// trailing `IndexDetails` removed. For example, `/lance.index.pb.VectorIndexDetails` +/// yields `Vector`. Used as a best-effort fallback when no plugin is registered +/// for the type URL, so the index type is never reported as opaque "Unknown" +/// while valid index details exist. 
+pub fn display_type_from_url(type_url: &str) -> &str { + let segment = type_url.rsplit('.').next().unwrap_or(type_url); + segment + .strip_suffix("IndexDetails") + .filter(|stripped| !stripped.is_empty()) + .unwrap_or(segment) +} + /// A registry of index plugins pub struct IndexPluginRegistry { plugins: HashMap>, @@ -28,7 +44,12 @@ impl IndexPluginRegistry { fn get_plugin_name_from_details_name(&self, details_name: &str) -> String { let details_name = Self::normalize_plugin_name(details_name); if details_name.ends_with("indexdetails") { - details_name.replace("indexdetails", "") + let plugin_name = details_name.replace("indexdetails", ""); + if plugin_name == "fmindex" { + "fm".to_string() + } else { + plugin_name + } } else { details_name } @@ -66,6 +87,7 @@ impl IndexPluginRegistry { registry.add_plugin::(); registry.add_plugin::(); registry.add_plugin::(); + registry.add_plugin::(); #[cfg(feature = "geo")] registry.add_plugin::(); @@ -110,6 +132,24 @@ impl IndexPluginRegistry { mod tests { use super::*; + #[test] + fn test_display_type_from_url() { + assert_eq!( + display_type_from_url("/lance.index.pb.VectorIndexDetails"), + "Vector" + ); + assert_eq!(display_type_from_url("BTreeIndexDetails"), "BTree"); + // Segment without the IndexDetails suffix is returned verbatim. + assert_eq!( + display_type_from_url("/lance.pb.SomethingElse"), + "SomethingElse" + ); + // A bare "IndexDetails" segment has nothing left after stripping, so it + // is returned as-is rather than an empty string. 
+ assert_eq!(display_type_from_url("IndexDetails"), "IndexDetails"); + assert_eq!(display_type_from_url(""), ""); + } + #[test] fn test_get_plugin_by_name_accepts_case_insensitive_builtin_names() { let registry = IndexPluginRegistry::with_default_plugins(); @@ -121,6 +161,7 @@ mod tests { ("NGRAM", "NGram"), ("ZONEMAP", "ZoneMap"), ("BLOOMFILTER", "BloomFilter"), + ("FM", "Fm"), ("JSON", "Json"), ] { let plugin = registry.get_plugin_by_name(requested_name).unwrap(); diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 5ab138ff481..3a6834129b3 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -8,6 +8,7 @@ use arrow_array::{BooleanArray, ListArray, RecordBatch, UInt64Array}; use arrow_schema::{Field, Schema}; use async_trait::async_trait; use bytes::Bytes; +use datafusion::functions::regex::regexplike::RegexpLikeFunc; use datafusion::functions::string::contains::ContainsFunc; use datafusion::functions_nested::array_has; use datafusion::physical_plan::SendableRecordBatchStream; @@ -18,8 +19,8 @@ use std::pin::Pin; use std::{any::Any, ops::Bound, sync::Arc}; use datafusion_expr::{Expr, expr::ScalarFunction}; -use deepsize::DeepSizeOf; use inverted::query::{FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, fill_fts_query_column}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; @@ -35,6 +36,7 @@ pub mod bitmap; pub mod bloomfilter; pub mod btree; pub mod expression; +pub mod fmindex; pub mod inverted; pub mod json; pub mod label_list; @@ -67,6 +69,7 @@ pub enum BuiltinIndexType { BloomFilter, RTree, Inverted, + Fm, } impl BuiltinIndexType { @@ -80,6 +83,7 @@ impl BuiltinIndexType { Self::Inverted => "inverted", Self::BloomFilter => "bloomfilter", Self::RTree => "rtree", + Self::Fm => "fm", } } } @@ -97,6 +101,7 @@ impl TryFrom for BuiltinIndexType 
{ IndexType::Inverted => Ok(Self::Inverted), IndexType::BloomFilter => Ok(Self::BloomFilter), IndexType::RTree => Ok(Self::RTree), + IndexType::Fm => Ok(Self::Fm), _ => Err(Error::index("Invalid index type".to_string())), } } @@ -182,9 +187,12 @@ pub trait IndexWriter: Send { )) } /// Finishes writing the file and closes the file - async fn finish(&mut self) -> Result<()>; + async fn finish(&mut self) -> Result; /// Finishes writing the file and closes the file with additional metadata - async fn finish_with_metadata(&mut self, metadata: HashMap) -> Result<()>; + async fn finish_with_metadata( + &mut self, + metadata: HashMap, + ) -> Result; } /// Trait for reading an index (or parts of an index) from storage @@ -207,6 +215,23 @@ pub trait IndexReader: Send + Sync { range: std::ops::Range, projection: Option<&[&str]>, ) -> Result; + /// Read multiple ranges and concatenate into a single batch. + /// Default impl runs `read_range`s in parallel via `try_join_all`. + async fn read_ranges( + &self, + ranges: &[std::ops::Range], + projection: Option<&[&str]>, + ) -> Result { + if ranges.is_empty() { + return self.read_range(0..0, projection).await; + } + let futures = ranges + .iter() + .map(|r| self.read_range(r.clone(), projection)); + let batches = futures::future::try_join_all(futures).await?; + let schema = batches[0].schema(); + Ok(arrow_select::concat::concat_batches(&schema, &batches)?) + } /// Read a range of rows as a stream of record batches. /// /// This allows the caller to process rows incrementally without loading the @@ -232,6 +257,11 @@ pub trait IndexReader: Send + Sync { fn num_rows(&self) -> usize; /// Return the metadata of the file fn schema(&self) -> &lance_core::datatypes::Schema; + /// Best-effort on-disk byte size of the file when the reader already knows it + /// without extra I/O, else `None`. Used to size prewarm chunks. 
+ fn file_size_bytes(&self) -> Option { + None + } } /// Trait abstracting I/O away from index logic @@ -257,10 +287,26 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf { /// Copy a range of batches from an index file from this store to another /// /// This is often useful when remapping or updating - async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()>; + async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result; + + /// Copy an index file from this store to a new name in another store, leaving the source intact + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + if name == new_name { + self.copy_index_file(name, dest_store).await + } else { + Err(Error::not_supported(format!( + "copying index file {name} to {new_name} is not supported by this index store" + ))) + } + } /// Rename an index file - async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()>; + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result; /// Delete an index file (used in the tmp spill store to keep tmp size down) async fn delete_index_file(&self, name: &str) -> Result<()>; @@ -604,9 +650,15 @@ impl AnyQuery for LabelListQuery { pub enum TextQuery { /// Retrieve all row ids where the text contains the given string StringContains(String), - // TODO: In the future we should be able to do string-insensitive contains - // as well as partial matches (e.g. LIKE 'foo%') and potentially even - // some regular expressions + /// Retrieve all row ids whose text matches the given regular expression. + /// + /// The pattern is a full regular expression (as accepted by `regexp_like`). + /// The index returns a candidate superset that the scan rechecks, so any + /// pattern is sound; patterns with no usable trigram structure simply fall + /// back to rechecking every row. 
+ Regex(String), + // TODO: In the future we should be able to do case-insensitive contains + // as well as partial matches (e.g. LIKE 'foo%'). } impl AnyQuery for TextQuery { @@ -627,6 +679,17 @@ impl AnyQuery for TextQuery { Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None), ], }), + // `regexp_like` returns Boolean directly, so the reconstructed + // expression can be used as-is for the recheck filter (no IsNotNull + // wrapper, unlike `regexp_match`). It is the semantic equivalent of + // the original predicate for the "does it match" question. + Self::Regex(pattern) => Expr::ScalarFunction(ScalarFunction { + func: Arc::new(RegexpLikeFunc::new().into()), + args: vec![ + Expr::Column(Column::new_unqualified(col)), + Expr::Literal(ScalarValue::Utf8(Some(pattern.clone())), None), + ], + }), } } @@ -848,7 +911,7 @@ pub struct CreatedIndex { /// /// This enables skipping HEAD calls when opening indices and provides /// visibility into index storage size via describe_indices(). - pub files: Option>, + pub files: Vec, } /// The criteria that specifies how to update an index @@ -897,6 +960,17 @@ impl OldIndexDataFilter { .collect(), } } + + /// Apply this filter in place to a set of existing (old) row ids/addresses, + /// retaining only the rows the filter selects to keep. Used by index types + /// that merge old postings directly (e.g. bitmap) instead of re-scanning a + /// row-id array through [`Self::filter_row_ids`]. + pub fn retain_old_rows(&self, rows: &mut RowAddrTreeMap) { + match self { + Self::Fragments { to_keep, .. 
} => rows.retain_fragments(to_keep.iter()), + Self::RowIds(valid_row_ids) => *rows &= valid_row_ids, + } + } } impl UpdateCriteria { diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index eb0276dcb9f..23b300a2d73 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -17,15 +17,14 @@ use async_trait::async_trait; use bytes::Bytes; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_common::ScalarValue; -use deepsize::DeepSizeOf; use futures::{StreamExt, TryStreamExt, stream}; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::tokio::get_num_compute_intensive_cpus, }; @@ -36,7 +35,7 @@ use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use tracing::{instrument, warn}; -use super::{AnyQuery, IndexStore, ScalarIndex}; +use super::{AnyQuery, IndexFile, IndexStore, ScalarIndex}; use super::{ BuiltinIndexType, SargableQuery, ScalarIndexParams, SearchResult, btree::OrderableScalarValue, }; @@ -116,7 +115,7 @@ pub struct BitmapIndex { /// Maps each unique value to its bitmap location in the index file /// The usize value is the row offset in the bitmap_page_lookup.lance file /// for quickly locating the row and reading it out - index_map: BTreeMap, + index_map: Arc>, null_map: Arc, @@ -173,11 +172,17 @@ pub struct BitmapIndexState { /// Cached separately from the schema for the empty-index case where the /// `lookup_batch` is empty but we still need to remember the column type. value_type: DataType, + /// Parsed form of `lookup_batch`. 
Not serialized — populated eagerly in + /// both [`BitmapIndexState::from_index`] and [`CacheCodecImpl::deserialize`]. + /// Stored as `Arc` so cloning into a new [`BitmapIndex`] is O(1). + index_map: Arc>, } impl DeepSizeOf for BitmapIndexState { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.lookup_batch.get_array_memory_size() + self.null_map.deep_size_of_children(context) + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.lookup_batch.get_array_memory_size() + + self.null_map.deep_size_of_children(context) + + self.index_map.deep_size_of_children(context) } } @@ -187,25 +192,51 @@ impl BitmapIndexState { lookup_batch: build_lookup_batch(&index.index_map, &index.value_type)?, null_map: index.null_map.clone(), value_type: index.value_type.clone(), + index_map: index.index_map.clone(), }) } - pub(crate) fn into_bitmap_index( - self, + pub(crate) fn to_bitmap_index( + &self, store: Arc, index_cache: &LanceCache, frag_reuse_index: Option>, ) -> Result> { - let index_map = parse_lookup_batch(&self.lookup_batch)?; Ok(Arc::new(BitmapIndex::new( - index_map, - self.null_map, - self.value_type, + self.index_map.clone(), + self.null_map.clone(), + self.value_type.clone(), store, WeakLanceCache::from(index_cache), frag_reuse_index, ))) } + + /// Build a state directly from its parts, for codec tests in sibling + /// modules (e.g. the label-list index, which nests a bitmap state). 
+ #[cfg(test)] + pub(crate) fn new_for_test( + index_map: BTreeMap, + null_map: RowAddrTreeMap, + value_type: DataType, + ) -> Result { + Ok(Self { + lookup_batch: build_lookup_batch(&index_map, &value_type)?, + null_map: Arc::new(null_map), + value_type, + index_map: Arc::new(index_map), + }) + } + + #[cfg(test)] + pub(crate) fn lookup_batch(&self) -> &RecordBatch { + &self.lookup_batch + } + + #[cfg(test)] + pub(crate) fn null_map(&self) -> &RowAddrTreeMap { + &self.null_map + } } fn build_lookup_batch( @@ -245,30 +276,34 @@ fn parse_lookup_batch(batch: &RecordBatch) -> Result, offsets: UInt64)] + /// RAW_BLOB : null_map (roaring tree map, portable encoding) + /// ARROW_IPC : (keys: , offsets: UInt64) /// ``` - /// The value type is recovered from the IPC stream schema. - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + /// The value type is recovered from the IPC section schema. + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let mut null_bytes = Vec::with_capacity(self.null_map.serialized_size()); self.null_map.serialize_into(&mut null_bytes)?; - write_len_prefixed_bytes(writer, &null_bytes)?; - write_ipc_stream(&self.lookup_batch, writer)?; + w.write_raw(&null_bytes)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let null_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let null_bytes = r.read_raw()?; let null_map = Arc::new(RowAddrTreeMap::deserialize_from(null_bytes.as_ref())?); - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; let value_type = lookup_batch.schema().field(0).data_type().clone(); + let index_map = Arc::new(parse_lookup_batch(&lookup_batch)?); Ok(Self { lookup_batch, null_map, value_type, + index_map, }) } } @@ -295,7 +330,7 @@ impl CacheKey for BitmapIndexStateKey { impl BitmapIndex { fn new( - index_map: 
BTreeMap, + index_map: Arc>, null_map: Arc, value_type: DataType, store: Arc, @@ -326,7 +361,7 @@ impl BitmapIndex { let schema = page_lookup_file.schema(); let data_type = schema.fields[0].data_type(); return Ok(Arc::new(Self::new( - BTreeMap::new(), + Arc::new(BTreeMap::new()), Arc::new(RowAddrTreeMap::default()), data_type, store, @@ -381,7 +416,7 @@ impl BitmapIndex { } Ok(Arc::new(Self::new( - index_map, + Arc::new(index_map), null_map, value_type, store, @@ -465,13 +500,8 @@ impl BitmapIndex { } impl DeepSizeOf for BitmapIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - let mut total_size = 0; - - total_size += self.index_map.deep_size_of_children(context); - total_size += self.store.deep_size_of_children(context); - - total_size + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.index_map.deep_size_of_children(context) + self.store.deep_size_of_children(context) } } @@ -521,12 +551,6 @@ impl Index for BitmapIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "BitmapIndex is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { let page_lookup_file = self.lazy_reader.get().await?; let total_rows = page_lookup_file.num_rows(); @@ -765,13 +789,15 @@ impl ScalarIndex for BitmapIndex { ) -> Result { let state = self.load_bitmap_index_state().await?; let remapped_state = BitmapIndexPlugin::remap_bitmap_state(state, mapping); - BitmapIndexPlugin::write_bitmap_index(remapped_state, dest_store, &self.value_type).await?; + let file = + BitmapIndexPlugin::write_bitmap_index(remapped_state, dest_store, &self.value_type) + .await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -780,13 +806,14 @@ impl ScalarIndex for BitmapIndex { 
&self, new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, - _old_data_filter: Option, + old_data_filter: Option, ) -> Result { - BitmapIndexPlugin::streaming_build_and_write( + let file = BitmapIndexPlugin::streaming_build_and_write( new_data, Some(self), dest_store, BITMAP_LOOKUP_NAME, + old_data_filter.as_ref(), ) .await?; @@ -794,7 +821,7 @@ impl ScalarIndex for BitmapIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -867,7 +894,7 @@ impl BitmapBatchWriter { } /// Flush any remaining data, write index statistics, and finalize the file. - async fn finish(mut self) -> Result<()> { + async fn finish(mut self) -> Result { self.flush().await?; let stats_json = serde_json::to_string(&BitmapStatistics { num_bitmaps: self.num_bitmaps, @@ -875,8 +902,7 @@ impl BitmapBatchWriter { .map_err(|e| Error::internal(format!("failed to serialize bitmap statistics: {e}")))?; let mut metadata = HashMap::new(); metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json); - self.file.finish_with_metadata(metadata).await?; - Ok(()) + self.file.finish_with_metadata(metadata).await } } @@ -1166,6 +1192,19 @@ async fn cleanup_bitmap_shard_files(store: &dyn IndexStore, shard_files: &[Strin #[derive(Debug, Default)] pub struct BitmapIndexPlugin; +/// Drop the rows an old posting should no longer expose -- rows whose fragment +/// was removed, or (under stable row ids) rows rewritten by an update -- keeping +/// only those `filter` still considers valid. A no-op when `filter` is `None`. 
+fn retain_valid( + mut bitmap: RowAddrTreeMap, + filter: Option<&super::OldIndexDataFilter>, +) -> RowAddrTreeMap { + if let Some(filter) = filter { + filter.retain_old_rows(&mut bitmap); + } + bitmap +} + impl BitmapIndexPlugin { fn get_batch_from_arrays( keys: Arc, @@ -1185,7 +1224,7 @@ impl BitmapIndexPlugin { state: HashMap, index_store: &dyn IndexStore, value_type: &DataType, - ) -> Result<()> { + ) -> Result { Self::write_bitmap_index_with_extras( state, index_store, @@ -1203,7 +1242,7 @@ impl BitmapIndexPlugin { value_type: &DataType, mut metadata: HashMap, global_buffers: Vec<(String, Bytes)>, - ) -> Result<()> { + ) -> Result { let num_bitmaps = state.len(); let schema = Arc::new(Schema::new(vec![ Field::new("keys", value_type.clone(), true), @@ -1267,9 +1306,7 @@ impl BitmapIndexPlugin { .map_err(|e| Error::internal(format!("failed to serialize bitmap statistics: {e}")))?; metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json); - bitmap_index_file.finish_with_metadata(metadata).await?; - - Ok(()) + bitmap_index_file.finish_with_metadata(metadata).await } /// Builds bitmap index state from a `(value, row_id)` stream without writing it. 
@@ -1298,8 +1335,8 @@ impl BitmapIndexPlugin { pub async fn train_bitmap_index( data: SendableRecordBatchStream, index_store: &dyn IndexStore, - ) -> Result<()> { - Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME).await + ) -> Result { + Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME, None).await } async fn train_bitmap_shard( @@ -1308,15 +1345,16 @@ impl BitmapIndexPlugin { fragment_ids: &[u32], shard_id: Option, progress: Arc, - ) -> Result<()> { + ) -> Result { let partition_id = bitmap_shard_partition_id(fragment_ids, shard_id)?; let file_name = bitmap_shard_file_name(partition_id); progress .stage_start("build_bitmap_shard", None, "rows") .await?; - Self::streaming_build_and_write(data, None, index_store, &file_name).await?; + let file = + Self::streaming_build_and_write(data, None, index_store, &file_name, None).await?; progress.stage_complete("build_bitmap_shard").await?; - Ok(()) + Ok(file) } /// Builds and writes a bitmap index in a streaming fashion from value-sorted @@ -1331,7 +1369,8 @@ impl BitmapIndexPlugin { old_index: Option<&BitmapIndex>, index_store: &dyn IndexStore, output_file_name: &str, - ) -> Result<()> { + old_data_filter: Option<&super::OldIndexDataFilter>, + ) -> Result { let value_type = data_source.schema().field(0).data_type().clone(); let mut writer = @@ -1377,6 +1416,7 @@ impl BitmapIndexPlugin { &mut old_pos, &mut emitted_null, &mut writer, + old_data_filter, ) .await?; } @@ -1399,6 +1439,7 @@ impl BitmapIndexPlugin { &mut old_pos, &mut emitted_null, &mut writer, + old_data_filter, ) .await?; } @@ -1406,7 +1447,13 @@ impl BitmapIndexPlugin { // Emit any remaining old-only entries. if let Some(idx) = old_index { while old_pos < old_keys.len() { - let old_bitmap = idx.load_bitmap(&old_keys[old_pos], None).await?; + let old_bitmap = retain_valid( + idx.load_bitmap(&old_keys[old_pos], None) + .await? 
+ .as_ref() + .clone(), + old_data_filter, + ); writer .emit(old_keys[old_pos].0.clone(), &old_bitmap) .await?; @@ -1421,17 +1468,17 @@ impl BitmapIndexPlugin { { let null_key = new_null_array(&value_type, 1); let null_key = ScalarValue::try_from_array(null_key.as_ref(), 0)?; - writer.emit(null_key, &idx.null_map).await?; + let null_bitmap = retain_valid((*idx.null_map).clone(), old_data_filter); + writer.emit(null_key, &null_bitmap).await?; } - writer.finish().await?; - - Ok(()) + writer.finish().await } /// Flush a completed value-run from the new data stream, emitting any /// old-only entries that sort before it and merging the old bitmap if the /// key exists in both old and new. + #[allow(clippy::too_many_arguments)] async fn finish_run( key: ScalarValue, bitmap: &mut RowAddrTreeMap, @@ -1440,13 +1487,14 @@ impl BitmapIndexPlugin { old_pos: &mut usize, emitted_null: &mut bool, writer: &mut BitmapBatchWriter, + old_data_filter: Option<&super::OldIndexDataFilter>, ) -> Result<()> { if key.is_null() { // Null values are stored separately in the old index's null_map. if let Some(idx) = old_index && !idx.null_map.is_empty() { - *bitmap |= &*idx.null_map; + *bitmap |= &retain_valid((*idx.null_map).clone(), old_data_filter); } *emitted_null = true; writer.emit(key, bitmap).await?; @@ -1455,7 +1503,13 @@ impl BitmapIndexPlugin { // Emit old-only entries that sort before this key. while *old_pos < old_keys.len() && old_keys[*old_pos] < orderable { - let old_bitmap = idx.load_bitmap(&old_keys[*old_pos], None).await?; + let old_bitmap = retain_valid( + idx.load_bitmap(&old_keys[*old_pos], None) + .await? + .as_ref() + .clone(), + old_data_filter, + ); writer .emit(old_keys[*old_pos].0.clone(), &old_bitmap) .await?; @@ -1464,8 +1518,13 @@ impl BitmapIndexPlugin { // If the old index also has this key, merge its bitmap. 
if *old_pos < old_keys.len() && old_keys[*old_pos] == orderable { - let old_bitmap = idx.load_bitmap(&old_keys[*old_pos], None).await?; - *bitmap |= &*old_bitmap; + *bitmap |= &retain_valid( + idx.load_bitmap(&old_keys[*old_pos], None) + .await? + .as_ref() + .clone(), + old_data_filter, + ); *old_pos += 1; } @@ -1517,7 +1576,7 @@ impl BitmapIndexPlugin { store: &dyn IndexStore, shard_files: &[String], progress: Arc, - ) -> Result<()> { + ) -> Result { progress .stage_start("merge_bitmap_shards", None, "bitmaps") .await?; @@ -1567,10 +1626,10 @@ impl BitmapIndexPlugin { progress .stage_start("write_bitmap_index", Some(1), "files") .await?; - writer.finish().await?; + let file = writer.finish().await?; progress.stage_progress("write_bitmap_index", 1).await?; progress.stage_complete("write_bitmap_index").await?; - Ok(()) + Ok(file) } } @@ -1591,6 +1650,63 @@ pub async fn merge_index_files( Ok(()) } +pub async fn merge_bitmap_indices( + source_indices: &[Arc], + dest_store: &dyn IndexStore, + progress: Arc, +) -> Result { + if source_indices.is_empty() { + return Err(Error::invalid_input( + "Bitmap segment merge requires at least one source segment".to_string(), + )); + } + + let value_type = source_indices[0].value_type().clone(); + let mut merged_state = HashMap::::new(); + + progress + .stage_start( + "merge_bitmap_segments", + Some(source_indices.len() as u64), + "segments", + ) + .await?; + for (idx, source_index) in source_indices.iter().enumerate() { + if source_index.value_type() != &value_type { + return Err(Error::invalid_input(format!( + "Bitmap segment has value type {:?}, expected {:?}", + source_index.value_type(), + value_type + ))); + } + + let state = source_index.load_bitmap_index_state().await?; + for (key, bitmap) in state { + merged_state + .entry(key) + .and_modify(|existing| *existing |= &bitmap) + .or_insert(bitmap); + } + progress + .stage_progress("merge_bitmap_segments", (idx + 1) as u64) + .await?; + } + 
progress.stage_complete("merge_bitmap_segments").await?; + + progress + .stage_start("write_bitmap_index", Some(1), "files") + .await?; + let file = BitmapIndexPlugin::write_bitmap_index(merged_state, dest_store, &value_type).await?; + progress.stage_progress("write_bitmap_index", 1).await?; + progress.stage_complete("write_bitmap_index").await?; + + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()).unwrap(), + index_version: BITMAP_INDEX_VERSION, + files: vec![file], + }) +} + #[async_trait] impl ScalarIndexPlugin for BitmapIndexPlugin { fn name(&self) -> &str { @@ -1652,7 +1768,7 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { .to_string(), ) })?; - if let Some(fragment_ids) = fragment_ids.as_ref() { + let file = if let Some(fragment_ids) = fragment_ids.as_ref() { Self::train_bitmap_shard( data, index_store, @@ -1660,20 +1776,20 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { request.parameters.shard_id, progress, ) - .await?; + .await? } else if request.parameters.shard_id.is_some() { return Err(Error::invalid_input( "Bitmap shard_id requires fragment_ids and is only supported for distributed shard builds" .to_string(), )); } else { - Self::train_bitmap_index(data, index_store).await?; - } + Self::train_bitmap_index(data, index_store).await? 
+ }; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -1697,8 +1813,7 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { let Some(state) = cache.get_with_key(&BitmapIndexStateKey).await else { return Ok(None); }; - let state = (*state).clone(); - let index = state.into_bitmap_index(index_store, cache, frag_reuse_index)?; + let index = state.to_bitmap_index(index_store, cache, frag_reuse_index)?; Ok(Some(index as Arc)) } @@ -1765,8 +1880,12 @@ mod tests { fn assert_state_roundtrips(state: &BitmapIndexState) { let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BitmapIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BitmapIndexState::CURRENT_VERSION); + let restored = BitmapIndexState::deserialize(&mut reader).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(&*restored.null_map, &*state.null_map); assert_eq!(restored.value_type, state.value_type); @@ -1786,6 +1905,7 @@ mod tests { lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(), null_map: Arc::new(null_map), value_type: DataType::Int32, + index_map: Arc::new(index_map), }; assert_state_roundtrips(&state); @@ -1794,10 +1914,58 @@ mod tests { lookup_batch: build_lookup_batch(&BTreeMap::new(), &DataType::Utf8).unwrap(), null_map: Arc::new(RowAddrTreeMap::new()), value_type: DataType::Utf8, + index_map: Arc::new(BTreeMap::new()), }; assert_state_roundtrips(&empty_state); } + /// The lookup batch must decode zero-copy through the full envelope-bearing + /// [`CacheCodec`] even though the envelope pushes the IPC section to a + /// non-aligned starting offset. 
+ #[test] + fn test_bitmap_index_state_lookup_is_zero_copy() { + const ALIGN: usize = 64; + let mut index_map = BTreeMap::new(); + for k in 0..32i32 { + index_map.insert( + OrderableScalarValue(ScalarValue::Int32(Some(k))), + k as usize, + ); + } + let state = BitmapIndexState { + lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(), + null_map: Arc::new(RowAddrTreeMap::new()), + value_type: DataType::Int32, + index_map: Arc::new(index_map), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + // Model a backend reading into a 64-byte-aligned buffer. + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_bitmap_lazy_loading_and_cache() { // Create a temporary directory for the index @@ -1933,6 +2101,85 @@ mod tests { } } + // Regression test for the O(N log N) warm-cache rebuild introduced in + // commit 4de5ce67d. BitmapIndexState now caches the parsed Arc + // so that get_from_cache skips parse_lookup_batch on warm hits. + // IS NULL is the worst case: the actual bitmap lookup is O(1) but + // reconstruction of the BTreeMap touched every row in the lookup batch. 
+ #[tokio::test] + async fn test_bitmap_cache_fast_path() { + use arrow_array::Int32Array; + + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // High-cardinality: 1 000 unique integers + 5 null rows. + const N: u64 = 1_000; + const NULL_COUNT: u64 = 5; + // nulls first (sorted batch: nulls precede values) + let null_values: Vec> = + std::iter::repeat_n(None, NULL_COUNT as usize).collect(); + let non_null_values: Vec> = (0..N as i32).map(Some).collect(); + let all_values: Vec> = null_values.into_iter().chain(non_null_values).collect(); + let all_row_ids: Vec = (0..N + NULL_COUNT).collect(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new("_rowid", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(all_values)), + Arc::new(UInt64Array::from(all_row_ids)), + ], + ) + .unwrap(); + let stream = stream::once(async move { Ok(batch) }); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + BitmapIndexPlugin::train_bitmap_index(stream, store.as_ref()) + .await + .unwrap(); + + let cache = LanceCache::with_capacity(16 * 1024 * 1024); + let index = BitmapIndex::load(store.clone(), None, &cache) + .await + .unwrap(); + + let plugin = BitmapIndexPlugin; + let index_arc: Arc = index.clone() as Arc; + plugin.put_in_cache(&cache, index_arc).await.unwrap(); + + // get_from_cache must return Some, and the BitmapIndexState's OnceLock + // must have been populated by put_in_cache so no parse_lookup_batch occurs. + let cached = plugin + .get_from_cache(store.clone(), None, &cache) + .await + .unwrap() + .expect("get_from_cache must return Some after put_in_cache"); + + // IS NULL: trivial work once the index is in hand. 
+ let query = SargableQuery::IsNull(); + match cached.search(&query, &NoOpMetricsCollector).await.unwrap() { + SearchResult::Exact(row_set) => { + let mut null_rows: Vec = row_set + .true_rows() + .row_addrs() + .unwrap() + .map(u64::from) + .collect(); + null_rows.sort(); + let expected: Vec = (0..NULL_COUNT).collect(); + assert_eq!(null_rows, expected); + } + _ => panic!("Expected Exact result for IS NULL"), + } + } + #[tokio::test] #[ignore] async fn test_big_bitmap_index() { diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index af37f982d1c..596ea4cc989 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -7,20 +7,19 @@ //! It is a space-efficient data structure that can be used to test whether an element is a member of a set. //! It's an inexact filter - they may include false positives that require rechecking. -use crate::scalar::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use crate::scalar::expression::{BloomFilterQueryParser, ScalarQueryParser}; use crate::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, }; use crate::scalar::{ - BloomFilterQuery, BuiltinIndexType, CreatedIndex, ScalarIndexParams, UpdateCriteria, + BloomFilterQuery, BuiltinIndexType, CreatedIndex, IndexFile, ScalarIndexParams, UpdateCriteria, }; use crate::{Any, pb}; use arrow_array::{Array, UInt64Array}; -mod as_bytes; -pub mod sbbf; use arrow_schema::{DataType, Field}; use lance_arrow_stats::StatisticsAccumulator; +use lance_core::utils::bloomfilter::as_bytes; +use lance_core::utils::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use serde::{Deserialize, Serialize}; use std::sync::LazyLock; @@ -30,14 +29,13 @@ use std::{collections::HashMap, sync::Arc}; use crate::scalar::FragReuseIndex; use crate::scalar::{AnyQuery, IndexStore, MetricsCollector, ScalarIndex, SearchResult}; -use crate::vector::VectorIndex; use crate::{Index, IndexType}; use 
arrow_array::{ArrayRef, RecordBatch}; use async_trait::async_trait; -use deepsize::DeepSizeOf; use lance_core::Error; use lance_core::Result; use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; use roaring::RoaringBitmap; use super::zoned::{ZoneBound, ZoneProcessor, ZoneTrainer, rebuild_zones, search_zones}; @@ -59,7 +57,7 @@ struct BloomFilterStatistics { } impl DeepSizeOf for BloomFilterStatistics { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // Estimate the size of the bloom filter // We could try to get the actual size from the Sbbf if it has a method for that, // but for now we'll estimate based on the number of bytes it serializes to @@ -83,7 +81,7 @@ pub struct BloomFilterIndex { } impl DeepSizeOf for BloomFilterIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.zones.deep_size_of_children(context) } } @@ -378,12 +376,6 @@ impl Index for BloomFilterIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input_source( - "BloomFilter is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } @@ -459,13 +451,13 @@ impl ScalarIndex for BloomFilterIndex { // Write the combined zones back to storage let mut builder = BloomFilterIndexBuilder::try_new(params)?; builder.blocks = updated_blocks; - builder.write_index(dest_store).await?; + let file = builder.write_index(dest_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default()) .unwrap(), index_version: BLOOMFILTER_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -621,7 +613,7 @@ impl BloomFilterIndexBuilder { Ok(RecordBatch::try_new(schema, columns)?) 
} - pub async fn write_index(self, index_store: &dyn IndexStore) -> Result<()> { + pub async fn write_index(self, index_store: &dyn IndexStore) -> Result { let record_batch = self.bloomfilter_stats_as_batch()?; let mut file_schema = record_batch.schema().as_ref().clone(); @@ -639,8 +631,7 @@ impl BloomFilterIndexBuilder { .new_index_file(BLOOMFILTER_FILENAME, Arc::new(file_schema)) .await?; index_file.write_record_batch(record_batch).await?; - index_file.finish().await?; - Ok(()) + index_file.finish().await } } @@ -987,13 +978,12 @@ impl BloomFilterIndexPlugin { batches_source: SendableRecordBatchStream, index_store: &dyn IndexStore, options: Option, - ) -> Result<()> { + ) -> Result { let mut builder = BloomFilterIndexBuilder::try_new(options.unwrap_or_default())?; builder.train(batches_source).await?; - builder.write_index(index_store).await?; - Ok(()) + builder.write_index(index_store).await } } @@ -1077,12 +1067,12 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { "must provide training request created by new_training_request".into(), ) })?; - Self::train_bloomfilter_index(data, index_store, Some(request.params)).await?; + let file = Self::train_bloomfilter_index(data, index_store, Some(request.params)).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default()) .unwrap(), index_version: BLOOMFILTER_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index ba6d3dd142d..7668973b44e 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -4,17 +4,18 @@ use std::{ any::Any, cmp::Ordering, - collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, + collections::{HashMap, HashSet}, fmt::{Debug, Display}, ops::Bound, sync::Arc, }; use super::{ - AnyQuery, BuiltinIndexType, IndexReader, IndexStore, IndexWriter, MetricsCollector, + AnyQuery, 
BuiltinIndexType, IndexFile, IndexReader, IndexStore, IndexWriter, MetricsCollector, OldIndexDataFilter, SargableQuery, ScalarIndex, ScalarIndexParams, SearchResult, compute_next_prefix, }; +use crate::cache_pb::{BTreeIndexHeader, RangeToFile}; use crate::{Index, IndexType}; use crate::{ frag_reuse::FragReuseIndex, @@ -28,26 +29,40 @@ use crate::{ use crate::{metrics::NoOpMetricsCollector, scalar::registry::TrainingCriteria}; use crate::{pbold, scalar::btree::flat::FlatIndex}; use arrow_arith::numeric::add; -use arrow_array::{Array, RecordBatch, UInt32Array, new_empty_array}; -use arrow_schema::{DataType, Field, Schema, SortOptions}; +use arrow_array::{ + Array, ArrayAccessor, ArrowNativeTypeOp, PrimitiveArray, RecordBatch, UInt32Array, + cast::AsArray, + new_empty_array, + types::{ + ArrowPrimitiveType, Decimal128Type, Decimal256Type, Float16Type, Float32Type, Float64Type, + Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type, + }, +}; +use arrow_ord::ord::make_comparator; +use arrow_schema::{DataType, Field, IntervalUnit, Schema, SortOptions}; use async_trait::async_trait; use datafusion::physical_plan::{ ExecutionPlan, SendableRecordBatchStream, sorts::sort_preserving_merge::SortPreservingMergeExec, stream::RecordBatchStreamAdapter, union::UnionExec, }; -use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_physical_expr::{PhysicalSortExpr, expressions::Column}; -use deepsize::DeepSizeOf; +use datafusion_common::{DFSchema, DataFusionError, ScalarValue}; +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_physical_expr::{ + PhysicalExpr, PhysicalSortExpr, create_physical_expr, expressions::Column, +}; use futures::{ FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, future::BoxFuture, stream::{self}, }; -use lance_arrow::ipc::{read_ipc_stream_single_at, write_ipc_stream}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, 
CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::{ tokio::get_num_compute_intensive_cpus, @@ -58,7 +73,7 @@ use lance_datafusion::{ chunker::chunk_concat_stream, exec::{LanceExecutionOptions, OneShotExec, execute_plan}, }; -use lance_select::NullableRowAddrSet; +use lance_select::{NullableRowAddrSet, RowSetOps}; use log::{debug, warn}; use object_store::Error as ObjectStoreError; use rangemap::RangeInclusiveMap; @@ -68,7 +83,7 @@ use tracing::{info, instrument}; mod flat; -const BTREE_LOOKUP_NAME: &str = "page_lookup.lance"; +pub const BTREE_LOOKUP_NAME: &str = "page_lookup.lance"; const BTREE_PAGES_NAME: &str = "page_data.lance"; pub const DEFAULT_BTREE_BATCH_SIZE: u64 = 4096; const BATCH_SIZE_META_KEY: &str = "batch_size"; @@ -84,7 +99,7 @@ pub(crate) const BTREE_IDS_COLUMN: &str = "ids"; pub struct OrderableScalarValue(pub ScalarValue); impl DeepSizeOf for OrderableScalarValue { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // deepsize and size both factor in the size of the ScalarValue self.0.size() - std::mem::size_of::() } @@ -580,7 +595,7 @@ impl Ord for OrderableScalarValue { } } (Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"), - (Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(), + (Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())), (Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)), (Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"), // What would a btree of unions even look like? May not be possible. 
@@ -594,44 +609,114 @@ impl Ord for OrderableScalarValue { } } -#[derive(Debug, DeepSizeOf, PartialEq, Eq)] -struct PageRecord { - max: OrderableScalarValue, - page_number: u32, +/// Returns the first index `i` in `[lo, hi)` for which `pred(i)` is `false`. +/// +/// `pred` must be `true` for a (possibly empty) prefix of the range and `false` +/// for the rest, i.e. the range is partitioned by `pred`. +fn partition_point(lo: usize, hi: usize, mut pred: impl FnMut(usize) -> bool) -> usize { + let mut lo = lo; + let mut hi = hi; + while lo < hi { + let mid = lo + (hi - lo) / 2; + if pred(mid) { + lo = mid + 1; + } else { + hi = mid; + } + } + lo } -trait BTreeMapExt { - fn largest_node_less(&self, key: &K) -> Option<(&K, &V)>; +/// Builds a comparator over two array accessors of the same `Ord` item type, +/// matching arrow's NULLs-first ascending order (`null < non-null`, `null == null`). +/// +/// Unlike [`make_comparator`], the returned closure is generic (not boxed), so the +/// element comparison inlines into the scan instead of dispatching through a vtable +/// on every call. +fn accessor_cmp<'a, T, L, R>(left: L, right: R) -> impl Fn(usize, usize) -> Ordering + 'a +where + T: Ord, + L: ArrayAccessor + 'a, + R: ArrayAccessor + 'a, +{ + move |i, j| match (left.is_null(i), right.is_null(j)) { + (true, true) => Ordering::Equal, + (true, false) => Ordering::Less, + (false, true) => Ordering::Greater, + (false, false) => left.value(i).cmp(&right.value(j)), + } +} + +/// Views `arr` as `PrimitiveArray` for comparison. Zero-copy (shared buffers) +/// when `arr` already has type `K`; otherwise — a logical type whose physical +/// storage is `K::Native`, e.g. `Date32`/`Time32` over `i32` or `Timestamp`/ +/// `Duration` over `i64` — the array data is relabeled to `K` without copying the +/// values, so all such logical types share one comparison path. 
+fn reinterpret_primitive(arr: &dyn Array) -> Result> { + if let Some(arr) = arr.as_primitive_opt::() { + return Ok(arr.clone()); + } + let data = arr + .to_data() + .into_builder() + .data_type(K::DATA_TYPE) + .build() + .map_err(|e| { + Error::internal(format!( + "failed to reinterpret {} as {}: {e}", + arr.data_type(), + K::DATA_TYPE + )) + })?; + Ok(PrimitiveArray::::from(data)) } -impl BTreeMapExt for BTreeMap { - fn largest_node_less(&self, key: &K) -> Option<(&K, &V)> { - self.range((Bound::Unbounded, Bound::Excluded(key))) - .next_back() +/// Like [`accessor_cmp`] but for primitive columns, comparing native values with +/// [`ArrowNativeTypeOp::compare`] (total order, so floats match arrow's NaN-last +/// `make_comparator` ordering). +fn primitive_cmp<'a, T>( + left: &'a PrimitiveArray, + right: &'a PrimitiveArray, +) -> impl Fn(usize, usize) -> Ordering + 'a +where + T: ArrowPrimitiveType, +{ + move |i, j| match (left.is_null(i), right.is_null(j)) { + (true, true) => Ordering::Equal, + (true, false) => Ordering::Less, + (false, true) => Ordering::Greater, + (false, false) => left.value(i).compare(right.value(j)), } } -/// An in-memory structure that can quickly satisfy scalar queries using a btree of ScalarValue -#[derive(Debug, DeepSizeOf, PartialEq, Eq)] +/// Satisfies scalar queries by searching the `page_lookup.lance` batch directly. +/// +/// The batch holds one row per page with columns `min | max | null_count | page_idx`, +/// sorted ascending by `min` with NULLs first (the order the index is trained in). +/// Both query paths binary-search the sorted `min` column for a starting row and +/// scan forward filtering by `max`: +/// +/// - Equality / `IN` (`candidate_pages_for_values`) dispatch on the query's +/// *physical storage type* to a monomorphized, inlined comparator: numerics go +/// through `scan_native` (logical types sharing a native — e.g. `Date32` and +/// `Int32` — fold to one path), byte-likes through `scan_accessor`. 
Only types +/// without a native fast path (struct-backed intervals, booleans) fall back to the +/// boxed [`make_comparator`] via `scan_fallback`. +/// - Range searches (`pages_between`) currently use [`make_comparator`] directly. +#[derive(Debug, PartialEq, DeepSizeOf)] pub struct BTreeLookup { - tree: BTreeMap>, - /// Pages where the value may be null (does not include all_null_pages) + /// One row per page (`min | max | null_count | page_idx`), sorted by `min`. + batch: RecordBatch, + /// Pages with at least one null value (does not include `all_null_pages`). null_pages: Vec, - /// Pages that are entirely null + /// Pages that are entirely null. all_null_pages: Vec, + /// Index of the first row whose `max` is non-null. Entirely-null pages sort to + /// the front (NULLs first) and are skipped when searching value ranges. + search_start: usize, } -impl BTreeLookup { - fn empty() -> Self { - Self { - tree: BTreeMap::new(), - null_pages: Vec::new(), - all_null_pages: Vec::new(), - } - } -} - -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] enum Matches { Some(u32), All(u32), @@ -647,184 +732,460 @@ impl Matches { } impl BTreeLookup { - fn new( - tree: BTreeMap>, - null_pages: Vec, - all_null_pages: Vec, - ) -> Self { - Self { - tree, + /// Build a lookup over the `page_lookup.lance` batch. The batch is retained as + /// the source of truth; only the small null-page index lists are precomputed. 
+ fn try_new(batch: RecordBatch) -> Result { + let mut null_pages = Vec::new(); + let mut all_null_pages = Vec::new(); + let mut search_start = batch.num_rows(); + + if batch.num_rows() > 0 { + let maxs = batch.column(1); + let null_counts = batch + .column(2) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup null_count column must be UInt32"))?; + let page_numbers = batch + .column(3) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup page_idx column must be UInt32"))?; + + for idx in 0..batch.num_rows() { + let page_number = page_numbers.values()[idx]; + // An entirely-null page has a null `max`; it is never searched by value. + if maxs.is_null(idx) { + all_null_pages.push(page_number); + continue; + } + if search_start == batch.num_rows() { + search_start = idx; + } + if null_counts.values()[idx] > 0 { + null_pages.push(page_number); + } + } + } else { + search_start = 0; + } + + Ok(Self { + batch, null_pages, all_null_pages, - } + search_start, + }) + } + + fn page_numbers(&self) -> Result<&UInt32Array> { + self.batch + .column(3) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup page_idx column must be UInt32")) } // All pages that could have a value equal to val - fn pages_eq(&self, query: &OrderableScalarValue) -> Vec { + fn pages_eq(&self, query: &OrderableScalarValue) -> Result> { if query.0.is_null() { - self.pages_null() + Ok(self.pages_null()) } else { - self.pages_between((Bound::Included(query), Bound::Excluded(query))) + let query_arr = query.0.to_array_of_size(1)?; + let pages = self.candidate_pages_for_values(query_arr.as_ref())?; + Ok(pages.into_iter().map(Matches::Some).collect()) } } // All pages that could have a value equal to one of the values - fn pages_in(&self, values: impl IntoIterator) -> Vec { - // TODO: Right now we convert all Matches::All into Matches::Some. We could refine this. - // It would improve performance on low cardinality data. 
- let page_lists = values - .into_iter() - .map(|val| { - self.pages_eq(&val) - .into_iter() - .map(|matches| matches.page_id()) - }) - .collect::>(); - let total_size = page_lists.iter().map(|set| set.len()).sum(); - let mut heap = BinaryHeap::with_capacity(total_size); - for page_list in page_lists { - heap.extend(page_list); + fn pages_in( + &self, + values: impl IntoIterator, + ) -> Result> { + // Equality lookups never produce a full-page (`Matches::All`) match because a + // single value cannot cover an entire page's range, so every candidate is + // `Matches::Some`. Refining this for low-cardinality data is the TODO in + // `pages_between`. + let values = values.into_iter(); + let mut has_null = false; + let mut non_null = Vec::with_capacity(values.size_hint().0); + for val in values { + if val.0.is_null() { + has_null = true; + } else { + non_null.push(val.0); + } } - let mut all_pages = heap.into_sorted_vec(); + + // Build a single array holding every queried value so the comparators are + // constructed once and reused across all of them, rather than per value. + let mut all_pages = if non_null.is_empty() { + Vec::new() + } else { + let query_arr = ScalarValue::iter_to_array(non_null)?; + self.candidate_pages_for_values(query_arr.as_ref())? + }; + if has_null { + all_pages.extend(self.pages_null().into_iter().map(|m| m.page_id())); + } + all_pages.sort_unstable(); all_pages.dedup(); - all_pages.into_iter().map(Matches::Some).collect() + Ok(all_pages.into_iter().map(Matches::Some).collect()) + } + + /// Candidate page numbers (deduped, ascending) for an equality search against + /// every value in `query`. A page is a candidate when its `[min, max]` range + /// could contain the value, i.e. `min <= value <= max`. + /// + /// The comparators are built once over the whole `query` array and reused for + /// each value, so an N-value `IN` costs three comparator constructions instead + /// of three per value. 
+ fn candidate_pages_for_values(&self, query: &dyn Array) -> Result> { + let num_rows = self.batch.num_rows(); + if self.search_start >= num_rows || query.is_empty() { + return Ok(vec![]); + } + + let mins = self.batch.column(0).as_ref(); + let maxs = self.batch.column(1).as_ref(); + let page_ids = self.page_numbers()?.values(); + + // Compare against the page columns with a native, monomorphized comparator + // that inlines, rather than the boxed `DynComparator` from `make_comparator` + // (one vtable call per comparison). Logical types that share a physical + // storage type route to one path via a zero-copy reinterpret, so e.g. every + // date/time/timestamp/duration type reuses the `i32`/`i64` path instead of + // generating its own. Types with no native path (intervals with struct + // natives, booleans, ...) take the `make_comparator` fallback. The query + // array always matches the column type, so its type selects the branch. + use DataType::*; + match query.data_type() { + Int8 => self.scan_native::(mins, maxs, query, page_ids), + Int16 => self.scan_native::(mins, maxs, query, page_ids), + // i32-backed: Int32, Date32, Time32, Decimal32, year-month intervals. + Int32 | Date32 | Time32(_) | Decimal32(_, _) | Interval(IntervalUnit::YearMonth) => { + self.scan_native::(mins, maxs, query, page_ids) + } + // i64-backed: Int64, Date64, Time64, Timestamp, Duration, Decimal64. 
+ Int64 | Date64 | Time64(_) | Timestamp(_, _) | Duration(_) | Decimal64(_, _) => { + self.scan_native::(mins, maxs, query, page_ids) + } + UInt8 => self.scan_native::(mins, maxs, query, page_ids), + UInt16 => self.scan_native::(mins, maxs, query, page_ids), + UInt32 => self.scan_native::(mins, maxs, query, page_ids), + UInt64 => self.scan_native::(mins, maxs, query, page_ids), + Float16 => self.scan_native::(mins, maxs, query, page_ids), + Float32 => self.scan_native::(mins, maxs, query, page_ids), + Float64 => self.scan_native::(mins, maxs, query, page_ids), + Decimal128(_, _) => self.scan_native::(mins, maxs, query, page_ids), + Decimal256(_, _) => self.scan_native::(mins, maxs, query, page_ids), + Utf8 => Ok(self.scan_accessor( + mins.as_string::(), + maxs.as_string::(), + query.as_string::(), + page_ids, + )), + LargeUtf8 => Ok(self.scan_accessor( + mins.as_string::(), + maxs.as_string::(), + query.as_string::(), + page_ids, + )), + Binary => Ok(self.scan_accessor( + mins.as_binary::(), + maxs.as_binary::(), + query.as_binary::(), + page_ids, + )), + LargeBinary => Ok(self.scan_accessor( + mins.as_binary::(), + maxs.as_binary::(), + query.as_binary::(), + page_ids, + )), + FixedSizeBinary(_) => Ok(self.scan_accessor( + mins.as_fixed_size_binary(), + maxs.as_fixed_size_binary(), + query.as_fixed_size_binary(), + page_ids, + )), + _ => self.scan_fallback(mins, maxs, query, page_ids), + } + } + + /// Native-comparator equality scan for a primitive physical type `K`. The page + /// columns and `query` are reinterpreted to `PrimitiveArray` (zero-copy when + /// already that type) and compared with [`primitive_cmp`]. 
+ fn scan_native( + &self, + mins: &dyn Array, + maxs: &dyn Array, + query: &dyn Array, + page_ids: &[u32], + ) -> Result> { + let mins = reinterpret_primitive::(mins)?; + let maxs = reinterpret_primitive::(maxs)?; + let query = reinterpret_primitive::(query)?; + Ok(self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + primitive_cmp(&mins, &query), + primitive_cmp(&maxs, &query), + primitive_cmp(&mins, &mins), + )) + } + + /// Native-comparator equality scan for byte-like columns (`Utf8`/`Binary`/ + /// `FixedSizeBinary` and their large variants), compared lexicographically via + /// [`accessor_cmp`]. + fn scan_accessor(&self, mins: A, maxs: A, query: A, page_ids: &[u32]) -> Vec + where + T: Ord, + A: ArrayAccessor + Copy, + { + self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + accessor_cmp(mins, query), + accessor_cmp(maxs, query), + accessor_cmp(mins, mins), + ) + } + + /// Fallback equality scan for types without a native path (intervals with struct + /// natives, booleans, ...), using arrow's boxed `make_comparator`. + fn scan_fallback( + &self, + mins: &dyn Array, + maxs: &dyn Array, + query: &dyn Array, + page_ids: &[u32], + ) -> Result> { + // The batch is sorted ascending by `min` with NULLs first; compare the query + // values the same way so the binary searches stay consistent. + let opts = SortOptions { + descending: false, + nulls_first: true, + }; + let cmp_min = make_comparator(mins, query, opts)?; + let cmp_max = make_comparator(maxs, query, opts)?; + let cmp_min_min = make_comparator(mins, mins, opts)?; + Ok(self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + cmp_min, + cmp_max, + cmp_min_min, + )) + } + + /// Binary-search + forward-scan the page batch for equality candidates. + /// + /// Monomorphized over the comparator closures so a typed-native comparator + /// inlines (no per-call vtable dispatch). 
The closures encode NULLs-first, + /// ascending order: + /// * `max_is_null(i)` — whether page `i`'s `max` is null (an all-null page) + /// * `cmp_min(i, j)` — page `i`'s `min` vs query value `j` + /// * `cmp_max(i, j)` — page `i`'s `max` vs query value `j` + /// * `cmp_min_min(i, anchor)` — two page `min`s, to expand left onto a straddle + fn scan_equality_pages( + &self, + num_query: usize, + page_ids: &[u32], + max_is_null: impl Fn(usize) -> bool, + cmp_min: impl Fn(usize, usize) -> Ordering, + cmp_max: impl Fn(usize, usize) -> Ordering, + cmp_min_min: impl Fn(usize, usize) -> Ordering, + ) -> Vec { + let num_rows = self.batch.num_rows(); + // High-cardinality lookups hit ~one page per value; presize to avoid the + // element-by-element `RawVec` growth that profiling flagged. + let mut pages = Vec::with_capacity(num_query); + for j in 0..num_query { + // Start row: peek a little to the left of the value. A query for 7 must + // still reach a page like [5, 10], so we include every page whose `min` + // equals the largest `min` strictly less than the value. + let p = partition_point(0, num_rows, |i| cmp_min(i, j) == Ordering::Less); + let start = if p == 0 { + self.search_start + } else { + let anchor = p - 1; + partition_point(0, p, |i| cmp_min_min(i, anchor) == Ordering::Less) + } + .max(self.search_start); + + // End row: pages whose `min` exceeds the value cannot match. + let end = partition_point(start, num_rows, |i| cmp_min(i, j) != Ordering::Greater); + + // The window splits at `p` (first row with `min >= value`): + // * `[start, p)` — the peek-left/straddle region (`min < value`). A page + // here matches only if its `max` reaches the value, so it needs the + // filter, and it may include a null-`min`/null-`max` straddle page. + // * `[p, end)` — rows with `min == value`. 
These always match (`max >= + // min == value`) and can't have a null `max` (all-null pages sort to + // the front, before `search_start <= start`), so we copy them in one + // slice instead of pushing per row. + let bulk_start = p.max(start); + for (offset, &page_id) in page_ids[start..bulk_start].iter().enumerate() { + let idx = start + offset; + // All-null pages are only matched by IS NULL queries. + if max_is_null(idx) { + continue; + } + // Candidate when the page's `max` reaches the value (`max >= value`). + if cmp_max(idx, j) != Ordering::Less { + pages.push(page_id); + } + } + pages.extend_from_slice(&page_ids[bulk_start..end]); + } + + pages.sort_unstable(); + pages.dedup(); + pages } // All pages that could have a value in the range fn pages_between( &self, range: (Bound<&OrderableScalarValue>, Bound<&OrderableScalarValue>), - ) -> Vec { - // We need to grab a little bit left of the given range because the query might be 7 - // and the first page might be something like 5-10. - let lower_bound = match range.0 { - Bound::Unbounded => Bound::Unbounded, - // It doesn't matter if the bound is exclusive or inclusive. We are going to grab - // the first node whose min is strictly less than the given bound. Then we grab - // all nodes greater than or equal to that - // - // We have to peek a bit to the left because we might have something like a lower - // bound of 7 and there is a page [5-10] we want to search for. - Bound::Included(lower) => self - .tree - .largest_node_less(lower) - .map(|val| Bound::Included(val.0)) - .unwrap_or(Bound::Unbounded), - Bound::Excluded(lower) => self - .tree - .largest_node_less(lower) - .map(|val| Bound::Included(val.0)) - .unwrap_or(Bound::Unbounded), + ) -> Result> { + let num_rows = self.batch.num_rows(); + // No searchable (non-all-null) pages. 
+ if self.search_start >= num_rows { + return Ok(vec![]); + } + + let mins = self.batch.column(0).as_ref(); + let maxs = self.batch.column(1).as_ref(); + let page_numbers = self.page_numbers()?; + + // The batch is sorted ascending by `min` with NULLs first; compare bounds + // the same way so the binary searches and the null `min` of a straddling + // page are handled consistently. + let opts = SortOptions { + descending: false, + nulls_first: true, + }; + // Bounds become 1-row arrays of the column type so arrow's type-dispatched + // comparator can compare them against the `min`/`max` columns. + let lower_arr = match range.0 { + Bound::Unbounded => None, + Bound::Included(v) | Bound::Excluded(v) => Some(v.0.to_array_of_size(1)?), }; - let upper_bound = match range.1 { - Bound::Unbounded => Bound::Unbounded, - Bound::Included(upper) => Bound::Included(upper), - // Even if the upper bound is excluded we need to include it on an [x, x) query. This is because the - // query might be [x, x). Our lower bound might find some [a-x] bucket and we still - // want to include any [x, z] bucket. - // - // We could be slightly more accurate here and only include the upper bound if the lower bound - // is defined, inclusive, and equal to the upper bound. However, let's keep it simple for now. This - // should only affect the probably rare case that our query is a true range query and the value - // matches an upper bound. This will all be moot if/when we merge pages. 
- Bound::Excluded(upper) => Bound::Included(upper), + let upper_arr = match range.1 { + Bound::Unbounded => None, + Bound::Included(v) | Bound::Excluded(v) => Some(v.0.to_array_of_size(1)?), }; - match (lower_bound, upper_bound) { - (Bound::Excluded(lower), Bound::Excluded(upper)) - | (Bound::Excluded(lower), Bound::Included(upper)) - | (Bound::Included(lower), Bound::Excluded(upper)) => { - // It's not really clear what (Included(5), Excluded(5)) would mean so we - // interpret it as an empty range which matches rust's BTreeMap behavior - if lower >= upper { - return vec![]; + // Start row: peek a little to the left of the lower bound. A query for 7 + // must still reach a page like [5, 10], so we include every page whose + // `min` equals the largest `min` strictly less than the lower bound. + let start = match &lower_arr { + None => self.search_start, + Some(lower) => { + let cmp = make_comparator(mins, lower.as_ref(), opts)?; + // first row with min >= lower + let p = partition_point(0, num_rows, |i| cmp(i, 0) == Ordering::Less); + if p == 0 { + self.search_start + } else { + // first row sharing the straddling page's `min` + let straddle = mins.slice(p - 1, 1); + let cmp = make_comparator(mins, straddle.as_ref(), opts)?; + partition_point(0, p, |i| cmp(i, 0) == Ordering::Less) } } - (Bound::Included(lower), Bound::Included(upper)) => { - if lower > upper { - return vec![]; - } + } + .max(self.search_start); + + // End row: pages whose `min` exceeds the upper bound cannot match. The + // upper bound is treated as inclusive even when the query bound is + // exclusive, so an [x, x) query still reaches a page whose `min` == x. + let end = match &upper_arr { + None => num_rows, + Some(upper) => { + let cmp = make_comparator(mins, upper.as_ref(), opts)?; + partition_point(start, num_rows, |i| cmp(i, 0) != Ordering::Greater) } - _ => {} + }; + + if start >= end { + return Ok(vec![]); } + // Comparators reused across the candidate rows. 
+ let cmp_max_lower = lower_arr + .as_ref() + .map(|l| make_comparator(maxs, l.as_ref(), opts)) + .transpose()?; + let cmp_min_lower = lower_arr + .as_ref() + .map(|l| make_comparator(mins, l.as_ref(), opts)) + .transpose()?; + let cmp_max_upper = upper_arr + .as_ref() + .map(|u| make_comparator(maxs, u.as_ref(), opts)) + .transpose()?; + let mut matches = Vec::new(); + for idx in start..end { + // All-null pages are only matched by IS NULL queries. + if maxs.is_null(idx) { + continue; + } - for (min, page_records) in self.tree.range((lower_bound, upper_bound)) { - for page_record in page_records { - match lower_bound { - Bound::Unbounded => {} - Bound::Included(lower) => { - if page_record.max.cmp(lower) == Ordering::Less { - continue; - } - } - Bound::Excluded(lower) => { - if page_record.max.cmp(lower) != Ordering::Greater { - continue; - } - } - } - // At this point we know the page record matches at least some values. - // We should test to see if ALL values are a match. + // Candidate filter: the page's `max` reaches the lower bound. 
+ let lower_ok = match (range.0, &cmp_max_lower) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Less, // max >= lower + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Greater, // max > lower + _ => unreachable!("lower bound and its comparator are constructed together"), + }; + if !lower_ok { + continue; + } - if min.0.is_null() || page_record.max.0.is_null() { - // If there are nulls then we just use Matches::Some - matches.push(Matches::Some(page_record.page_number)); - continue; - } + let page_number = page_numbers.values()[idx]; - match range.0 { - // range.0 < X therefore if the smallest value is not strictly greater than - // the lower bound we only have partial match - Bound::Excluded(lower) => { - if min.cmp(lower) != Ordering::Greater { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - // range.0 <= X therefore if the smallest value is not greater than or equal - // to the lower bound we only have partial match - Bound::Included(lower) => { - if min.cmp(lower) == Ordering::Less { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - Bound::Unbounded => {} - } - match range.1 { - // X < range.1 therefore if the largest value is not strictly less than - // the upper bound we only have partial match - Bound::Excluded(upper) => { - if page_record.max.cmp(upper) != Ordering::Less { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - // X <= range.1 therefore if the largest value is not less than or equal to - // the upper bound we only have partial match - Bound::Included(upper) => { - if page_record.max.cmp(upper) == Ordering::Greater { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - Bound::Unbounded => {} - } - // The min is greater than the lower bound and the max is less than the upper bound - // so we have a full match - matches.push(Matches::All(page_record.page_number)); + // A page with 
a null `min` straddles the NULL/non-NULL boundary, so it + // is only ever a partial match. + if mins.is_null(idx) { + matches.push(Matches::Some(page_number)); + continue; + } + + // Full match requires the page to sit entirely within the query range. + let lower_full = match (range.0, &cmp_min_lower) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Less, // min >= lower + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Greater, // min > lower + _ => unreachable!("lower bound and its comparator are constructed together"), + }; + let upper_full = match (range.1, &cmp_max_upper) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Greater, // max <= upper + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Less, // max < upper + _ => unreachable!("upper bound and its comparator are constructed together"), + }; + if lower_full && upper_full { + matches.push(Matches::All(page_number)); + } else { + matches.push(Matches::Some(page_number)); } } - matches + Ok(matches) } fn pages_null(&self) -> Vec { self.null_pages .iter() - .map(|page_id| Matches::Some(*page_id)) + .copied() + .map(Matches::Some) .chain(self.all_null_pages.iter().copied().map(Matches::All)) .collect() } @@ -1013,17 +1374,17 @@ impl CacheKey for BTreePageKey { /// `BTreeIndex::try_from_serialized` reconstructs the in-memory lookup with /// no IO) plus the page batch size and range-partition map. 
#[derive(Debug, Clone)] -pub struct BTreeIndexState { +struct BTreeIndexState { lookup_batch: RecordBatch, batch_size: u64, ranges_to_files: Option>>, } impl DeepSizeOf for BTreeIndexState { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // `ranges_to_files` is tiny and `RangeInclusiveMap` is not `DeepSizeOf`; // the lookup batch dominates, matching how `BTreeIndex` accounts for itself. - self.lookup_batch.get_array_memory_size() + self.lookup_batch.deep_size_of_children(context) } } @@ -1047,106 +1408,58 @@ impl BTreeIndexState { } impl CacheCodecImpl for BTreeIndexState { - /// Wire format (no stability guarantees yet — the cache is rebuilt from - /// source on any version mismatch): + const TYPE_ID: &'static str = "lance.scalar.BTreeIndexState"; + const CURRENT_VERSION: u32 = 1; + + /// Wire format: /// ```text - /// u64 batch_size (LE) - /// u8 has_ranges (0 = None, 1 = Some) - /// if has_ranges: - /// u32 entry_count (LE) - /// per entry: u32 start | u32 end | u32 offset | u32 path_len | path bytes - /// lookup batch (Arrow IPC stream) + /// HEADER : BTreeIndexHeader proto (batch_size + page-range mapping) + /// ARROW_IPC : page-lookup batch /// ``` - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { - writer.write_all(&self.batch_size.to_le_bytes())?; - match &self.ranges_to_files { - None => writer.write_all(&[0u8])?, - Some(ranges) => { - writer.write_all(&[1u8])?; - let count = u32::try_from(ranges.len()).map_err(|_| { - Error::io("BTreeIndexState: ranges_to_files exceeds u32::MAX entries") - })?; - writer.write_all(&count.to_le_bytes())?; - for (range, (path, page_offset)) in ranges.iter() { - writer.write_all(&range.start().to_le_bytes())?; - writer.write_all(&range.end().to_le_bytes())?; - writer.write_all(&page_offset.to_le_bytes())?; - let path_len = u32::try_from(path.len()).map_err(|_| { - 
Error::io("BTreeIndexState: ranges_to_files path exceeds u32::MAX bytes") - })?; - writer.write_all(&path_len.to_le_bytes())?; - writer.write_all(path.as_bytes())?; - } - } - } - write_ipc_stream(&self.lookup_batch, writer)?; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let ranges_to_files = match &self.ranges_to_files { + None => Vec::new(), + Some(ranges) => ranges + .iter() + .map(|(range, (path, page_offset))| RangeToFile { + start: *range.start(), + end: *range.end(), + page_offset: *page_offset, + path: path.clone(), + }) + .collect(), + }; + let header = BTreeIndexHeader { + batch_size: self.batch_size, + has_ranges_to_files: self.ranges_to_files.is_some(), + ranges_to_files, + }; + w.write_header(&header)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let batch_size = read_u64_le(data, &mut offset)?; - let has_ranges = read_u8(data, &mut offset)?; - let ranges_to_files = match has_ranges { - 0 => None, - 1 => { - let count = read_u32_le(data, &mut offset)? as usize; - let mut entries = Vec::with_capacity(count); - for _ in 0..count { - let start = read_u32_le(data, &mut offset)?; - let end = read_u32_le(data, &mut offset)?; - let page_offset = read_u32_le(data, &mut offset)?; - let path_len = read_u32_le(data, &mut offset)? as usize; - let path = read_bytes(data, &mut offset, path_len)?; - let path = std::str::from_utf8(&path) - .map_err(|e| Error::io(format!("BTreeIndexState path: {e}")))? 
- .to_string(); - entries.push((start..=end, (path, page_offset))); - } - Some(Arc::new(entries.into_iter().collect())) - } - other => { - return Err(Error::io(format!( - "BTreeIndexState: invalid has_ranges tag {other}" - ))); - } + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: BTreeIndexHeader = r.read_header()?; + let ranges_to_files = if header.has_ranges_to_files { + let map: RangeInclusiveMap = header + .ranges_to_files + .into_iter() + .map(|entry| (entry.start..=entry.end, (entry.path, entry.page_offset))) + .collect(); + Some(Arc::new(map)) + } else { + None }; - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; Ok(Self { lookup_batch, - batch_size, + batch_size: header.batch_size, ranges_to_files, }) } } -fn read_bytes(data: &bytes::Bytes, offset: &mut usize, len: usize) -> Result { - if data.len() < *offset + len { - return Err(Error::io(format!( - "BTreeIndexState: short read of {len} bytes at offset {offset} (have {})", - data.len() - ))); - } - let slice = data.slice(*offset..*offset + len); - *offset += len; - Ok(slice) -} - -fn read_u8(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 1)?; - Ok(bytes[0]) -} - -fn read_u32_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 4)?; - Ok(u32::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - -fn read_u64_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 8)?; - Ok(u64::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - /// Cache key for a [`BTreeIndexState`]. The cache it is used with is already /// namespaced per-index, so the key string is a constant. struct BTreeIndexStateKey; @@ -1207,26 +1520,14 @@ pub struct BTreeIndex { /// - The system now knows to read page `42` from the file `part_2_page_file.lance`. 
ranges_to_files: Option>>, frag_reuse_index: Option>, - - /// The raw lookup batch this index was built from (the contents of - /// `page_lookup.lance`). Retained so the index can be serialized into a - /// cache as a [`BTreeIndexState`] without re-reading it from storage. - /// - /// TODO: this duplicates the min/max values already held in `page_lookup`. - /// A follow-up could rewrite `BTreeLookup` to query this batch directly - /// (binary search on the sorted `min` column + linear scan, type-dispatched - /// per column type), eliminating the duplication and making this batch the - /// single source of truth. - lookup_batch: RecordBatch, } impl DeepSizeOf for BTreeIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // We don't include the index cache, or anything stored in it. For example: - // sub_index and fri. - self.page_lookup.deep_size_of_children(context) - + self.store.deep_size_of_children(context) - + self.lookup_batch.get_array_memory_size() + // sub_index and fri. `page_lookup` owns the lookup batch (the single source + // of truth), so accounting for it covers the lookup data. + self.page_lookup.deep_size_of_children(context) + self.store.deep_size_of_children(context) } } @@ -1240,7 +1541,6 @@ impl BTreeIndex { batch_size: u64, ranges_to_files: Option>>, frag_reuse_index: Option>, - lookup_batch: RecordBatch, ) -> Self { Self { page_lookup, @@ -1250,10 +1550,69 @@ impl BTreeIndex { batch_size, ranges_to_files, frag_reuse_index, - lookup_batch, } } + /// For each key in `keys`, whether this index contains it — a batched + /// existence check returning a mask aligned to `keys`. 
+ /// + /// The per-key sibling of `search(Equals(..))`, but one call replaces N + /// probes: keys are grouped by page using the same page resolution as + /// [`ScalarIndex::search`] (`pages_eq`), each touched page is loaded once + /// (session-cached), and membership is tested against the page's values via + /// `FlatIndex::contains_values`. Avoids the per-key `SearchResult` / + /// `RowAddrTreeMap` allocation when the caller only wants a yes/no. + /// + /// Intended for primary-key dedup, where keys are non-null; a null key maps + /// to `false`. + pub async fn contains_keys( + &self, + keys: &[ScalarValue], + metrics: &dyn MetricsCollector, + ) -> Result> { + // Group each key (by input position) under every page whose value range + // could hold it. Mirrors `search`'s page selection so the two agree. + let mut by_page: HashMap> = HashMap::new(); + for (idx, key) in keys.iter().enumerate() { + if key.is_null() { + continue; + } + let ov = OrderableScalarValue(key.clone()); + for matches in self.page_lookup.pages_eq(&ov)? 
{ + by_page + .entry(matches.page_id()) + .or_default() + .push((idx, ov.clone())); + } + } + + let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); + let page_tasks = by_page.into_iter().map(|(page_number, entries)| { + let index_reader = index_reader.clone(); + async move { + let page = self.lookup_page(page_number, index_reader, metrics).await?; + let needles: Vec = + entries.iter().map(|(_, ov)| ov.clone()).collect(); + let present = page.contains_values(&needles)?; + Result::Ok((entries, present)) + } + }); + + let mut result = vec![false; keys.len()]; + let page_results: Vec<_> = stream::iter(page_tasks) + .buffer_unordered(get_num_compute_intensive_cpus()) + .try_collect() + .await?; + for (entries, present) in page_results { + for (idx, ov) in entries { + if present.contains(&ov) { + result[idx] = true; + } + } + } + Ok(result) + } + async fn lookup_page( &self, page_number: u32, @@ -1287,11 +1646,28 @@ impl BTreeIndex { FlatIndex::try_new(serialized_page) } + /// Compile a sargable predicate into a physical expr against the per-page + /// schema ([values, ids]). Built once in `search` and shared across pages so + /// a large IN-list is not re-materialized for every page. + fn compile_predicate(&self, query: &SargableQuery) -> Result> { + let schema = Arc::new(Schema::new(vec![ + Field::new(BTREE_VALUES_COLUMN, self.data_type.clone(), true), + Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false), + ])); + let df_schema = DFSchema::try_from(schema)?; + Ok(create_physical_expr( + &query.to_expr(BTREE_VALUES_COLUMN.to_string()), + &df_schema, + &ExecutionProps::default(), + )?) 
+ } + async fn search_page( &self, query: &SargableQuery, matches: Matches, index_reader: LazyIndexReader, + prebuilt: Option<&Arc>, metrics: &dyn MetricsCollector, ) -> Result { let subindex = self @@ -1299,13 +1675,12 @@ impl BTreeIndex { .await?; match matches { - Matches::Some(_) => { - // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the - // values that might be in the page. E.g. if we are searching for X IN [5, 3, 7] and five is in pages - // 1 and 2 and three is in page 2 and seven is in pages 8 and 9, then when searching page 2 we only need - // to search for X IN [5, 3] - subindex.search(query, metrics) - } + // For a large IsIn the predicate is compiled once (see `search`) and + // reused here, instead of rebuilding the whole IN-list per page. + Matches::Some(_) => match prebuilt { + Some(expr) => subindex.search_prebuilt(expr, metrics), + None => subindex.search(query, metrics), + }, Matches::All(_) => Ok(match query { // This means we hit an all-null page so just grab all row ids as true SargableQuery::IsNull() => subindex.all_ignore_nulls(), @@ -1323,68 +1698,8 @@ impl BTreeIndex { ranges_to_files: Option>>, frag_reuse_index: Option>, ) -> Result { - let mut map = BTreeMap::>::new(); - // Pages that have at least one null value - let mut null_pages = Vec::::new(); - // Pages that are entirely null - let mut all_null_pages = Vec::::new(); - - if data.num_rows() == 0 { - let data_type = data.column(0).data_type().clone(); - let page_lookup = Arc::new(BTreeLookup::empty()); - return Ok(Self::new( - page_lookup, - store, - data_type, - WeakLanceCache::from(index_cache), - batch_size, - ranges_to_files, - frag_reuse_index, - data, - )); - } - - let mins = data.column(0); - let maxs = data.column(1); - let null_counts = data - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - let page_numbers = data - .column(3) - .as_any() - .downcast_ref::() - .unwrap(); - - for idx in 0..data.num_rows() { - let min = 
OrderableScalarValue(ScalarValue::try_from_array(&mins, idx)?); - let max = OrderableScalarValue(ScalarValue::try_from_array(&maxs, idx)?); - let null_count = null_counts.values()[idx]; - let page_number = page_numbers.values()[idx]; - - // If the page is entirely null don't even bother putting it in the tree - if max.0.is_null() { - all_null_pages.push(page_number); - // continue so we don't add it to the null_pages - continue; - } else { - map.entry(min) - .or_default() - .push(PageRecord { max, page_number }); - } - - if null_count > 0 { - null_pages.push(page_number); - } - } - - let last_max = ScalarValue::try_from_array(&maxs, data.num_rows() - 1)?; - map.entry(OrderableScalarValue(last_max)).or_default(); - - let data_type = mins.data_type().clone(); - - let page_lookup = Arc::new(BTreeLookup::new(map, null_pages, all_null_pages)); + let data_type = data.column(0).data_type().clone(); + let page_lookup = Arc::new(BTreeLookup::try_new(data)?); Ok(Self::new( page_lookup, @@ -1394,7 +1709,6 @@ impl BTreeIndex { batch_size, ranges_to_files, frag_reuse_index, - data, )) } @@ -1489,7 +1803,7 @@ impl BTreeIndex { } /// Create a stream of all the data in the index, in the same format used to train the index - async fn into_data_stream(self) -> Result { + async fn data_stream(&self) -> Result { let lazy_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let reader = lazy_reader.get().await?; let new_schema = Arc::new(self.train_schema()); @@ -1512,25 +1826,66 @@ impl BTreeIndex { ))) } - async fn combine_old_new( - self, + /// Merge N source BTree segments plus an additional `new_data` stream into + /// a single BTree under `dest_store`, without re-reading the dataset. 
+ pub async fn merge_segments( + segments: &[Arc], new_data: SendableRecordBatchStream, - chunk_size: u64, - old_data_filter: Option, - ) -> Result { - let value_column_index = new_data.schema().index_of(VALUE_COLUMN_NAME)?; - - let new_input = Arc::new(OneShotExec::new(new_data)); - let old_stream = self.into_data_stream().await?; - let old_stream = match old_data_filter { - Some(filter) => filter_row_ids(old_stream, filter), - None => old_stream, + dest_store: &dyn IndexStore, + old_data_filters: &[Option], + ) -> Result { + let Some(first) = segments.first() else { + return Err(Error::invalid_input( + "cannot merge BTree index without at least one source segment".to_string(), + )); }; - let old_input = Arc::new(OneShotExec::new(old_stream)); - debug_assert_eq!( - old_input.schema().flattened_fields().len(), - new_input.schema().flattened_fields().len() - ); + + if old_data_filters.len() != segments.len() { + return Err(Error::invalid_input(format!( + "BTree merge: expected one old-data filter per source segment \ + (segments={}, filters={})", + segments.len(), + old_data_filters.len() + ))); + } + + for segment in segments.iter().skip(1) { + if segment.data_type != first.data_type { + return Err(Error::index(format!( + "cannot merge BTree segments with different value types ({:?} vs {:?})", + first.data_type, segment.data_type + ))); + } + } + + let new_schema = new_data.schema(); + let value_column_index = new_schema.index_of(VALUE_COLUMN_NAME)?; + let new_value_type = new_schema.field(value_column_index).data_type(); + if new_value_type != &first.data_type { + return Err(Error::invalid_input(format!( + "BTree merge: new_data value column type {:?} does not match \ + segment value type {:?}", + new_value_type, first.data_type + ))); + } + + let mut inputs: Vec> = Vec::with_capacity(segments.len() + 1); + for (segment, old_data_filter) in segments.iter().zip(old_data_filters) { + if filter_keeps_nothing(old_data_filter) { + continue; + } + let stream = 
segment.data_stream().await?; + let stream = match segment.frag_reuse_index.clone() { + Some(frag_reuse_index) => remap_row_ids(stream, frag_reuse_index), + None => stream, + }; + let stream = match old_data_filter.clone() { + Some(filter) => filter_row_ids(stream, filter), + None => stream, + }; + inputs.push(Arc::new(OneShotExec::new(stream))); + } + inputs.push(Arc::new(OneShotExec::new(new_data))); let sort_expr = PhysicalSortExpr { expr: Arc::new(Column::new(VALUE_COLUMN_NAME, value_column_index)), @@ -1539,11 +1894,10 @@ impl BTreeIndex { nulls_first: true, }, }; - // The UnionExec creates multiple partitions but the SortPreservingMergeExec merges - // them back into a single partition. - let all_data = UnionExec::try_new(vec![old_input, new_input])?; - let ordered = Arc::new(SortPreservingMergeExec::new([sort_expr].into(), all_data)); - + // UnionExec yields multiple partitions; SortPreservingMergeExec merges + // them back into a single partition while preserving value-ordering. + let unioned = UnionExec::try_new(inputs)?; + let ordered = Arc::new(SortPreservingMergeExec::new([sort_expr].into(), unioned)); let unchunked = execute_plan( ordered, LanceExecutionOptions { @@ -1551,7 +1905,17 @@ impl BTreeIndex { ..Default::default() }, )?; - Ok(chunk_concat_stream(unchunked, chunk_size as usize)) + let merged_stream = chunk_concat_stream(unchunked, first.batch_size as usize); + + let files = + train_btree_index(merged_stream, dest_store, first.batch_size, None, None).await?; + + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) + .unwrap(), + index_version: BTREE_INDEX_VERSION, + files, + }) } } @@ -1574,6 +1938,28 @@ fn filter_row_ids( Box::pin(RecordBatchStreamAdapter::new(schema, filtered)) } +/// True if `filter` would keep no rows at all (its keep-set is empty), letting +/// the merge skip reading the segment entirely. 
+fn filter_keeps_nothing(filter: &Option) -> bool { + match filter { + Some(OldIndexDataFilter::Fragments { to_keep, .. }) => to_keep.is_empty(), + Some(OldIndexDataFilter::RowIds(valid)) => valid.is_empty(), + None => false, + } +} + +fn remap_row_ids( + stream: SendableRecordBatchStream, + frag_reuse_index: Arc, +) -> SendableRecordBatchStream { + let schema = stream.schema(); + let remapped = stream.map(move |batch_result| { + let batch = batch_result?; + Ok(frag_reuse_index.remap_row_ids_record_batch(batch, 1)?) + }); + Box::pin(RecordBatchStreamAdapter::new(schema, remapped)) +} + fn wrap_bound(bound: &Bound) -> Bound { match bound { Bound::Unbounded => Bound::Unbounded, @@ -1612,12 +1998,6 @@ impl Index for BTreeIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "BTreeIndex is not vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let reader = index_reader.get().await?; @@ -1660,18 +2040,25 @@ impl Index for BTreeIndex { } fn statistics(&self) -> Result { - let min = self - .page_lookup - .tree - .first_key_value() - .map(|(k, _)| k.clone()); - let max = self - .page_lookup - .tree - .last_key_value() - .map(|(k, _)| k.clone()); + let lookup = &self.page_lookup; + let batch = &lookup.batch; + let num_rows = batch.num_rows(); + // The batch is sorted by `min`, so the smallest searchable value is the + // `min` of the first non-all-null page and the largest is the `max` of the + // last page. 
+ let (min, max) = if lookup.search_start >= num_rows { + (None, None) + } else { + let min = OrderableScalarValue(ScalarValue::try_from_array( + batch.column(0), + lookup.search_start, + )?); + let max = + OrderableScalarValue(ScalarValue::try_from_array(batch.column(1), num_rows - 1)?); + (Some(min), Some(max)) + }; serde_json::to_value(&BTreeStatistics { - num_pages: self.page_lookup.tree.len() as u32, + num_pages: num_rows as u32, min, max, }) @@ -1718,7 +2105,7 @@ impl ScalarIndex for BTreeIndex { "full text search is not supported for BTree index, build a inverted index for it", )); } - SargableQuery::IsNull() => self.page_lookup.pages_null(), + SargableQuery::IsNull() => Ok(self.page_lookup.pages_null()), SargableQuery::LikePrefix(prefix) => { // Convert LikePrefix to a range query: [prefix, next_prefix) match prefix { @@ -1752,7 +2139,7 @@ impl ScalarIndex for BTreeIndex { } } } - }; + }?; // For non-IsNull queries, also include null pages so that null row IDs // are tracked in the result. Any comparison with NULL yields NULL, and @@ -1763,6 +2150,11 @@ impl ScalarIndex for BTreeIndex { // We add them as Matches::Some (not Matches::All) so that // FlatIndex::search() evaluates the predicate and correctly marks // the rows as NULL rather than TRUE. + // + // TODO: the lookup batch retains a per-page `null_count`. A fully-covered + // page with zero nulls is a true Matches::All, while one with nulls needs + // Matches::Some only to track the null rows; surfacing `null_count` here + // could refine that classification (see #6802). if !matches!(query, SargableQuery::IsNull()) { let existing: HashSet = pages.iter().map(|m| m.page_id()).collect(); for &page_id in self @@ -1777,13 +2169,27 @@ impl ScalarIndex for BTreeIndex { } } + // Compile a large IsIn predicate once and reuse it across every page; + // rebuilding the full IN-list per page is O(pages * values) and dominates + // the lookup for sets with many values. 
+ let prebuilt = match query { + SargableQuery::IsIn(_) => Some(self.compile_predicate(query)?), + _ => None, + }; + let lazy_index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let page_tasks = pages .into_iter() .map(|page_index| { - self.search_page(query, page_index, lazy_index_reader.clone(), metrics) - .boxed() + self.search_page( + query, + page_index, + lazy_index_reader.clone(), + prebuilt.as_ref(), + metrics, + ) + .boxed() }) .collect::>(); debug!("Searching {} btree pages", page_tasks.len()); @@ -1830,6 +2236,7 @@ impl ScalarIndex for BTreeIndex { let mapping = Arc::new(mapping.clone()); let train_schema = Arc::new(self.train_schema()); + let mut remapped_files = Vec::new(); // TODO: Could potentially parallelize this across parts, unclear it would be worth it for (part_id, page_file) in part_page_files { @@ -1860,7 +2267,10 @@ impl ScalarIndex for BTreeIndex { remapped_stream, )); - train_btree_index(remapped_stream, dest_store, self.batch_size, None, part_id).await?; + let mut files = + train_btree_index(remapped_stream, dest_store, self.batch_size, None, part_id) + .await?; + remapped_files.append(&mut files); } if let Some(ranges_to_files) = &self.ranges_to_files { @@ -1872,7 +2282,7 @@ impl ScalarIndex for BTreeIndex { let lookup_files = (0..num_parts) .map(|part_id| part_lookup_file_path((part_id as u64) << 32)) .collect::>(); - merge_metadata_files( + let merged_files = merge_metadata_files( dest_store, &page_files, &lookup_files, @@ -1880,13 +2290,15 @@ impl ScalarIndex for BTreeIndex { noop_progress(), ) .await?; + remapped_files.retain(|file| file.path.ends_with("_page_data.lance")); + remapped_files.extend(merged_files); } Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) .unwrap(), index_version: BTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: remapped_files, }) } @@ -1896,19 +2308,15 @@ impl ScalarIndex for BTreeIndex { 
dest_store: &dyn IndexStore, old_data_filter: Option, ) -> Result { - // Merge the existing index data with the new data and then retrain the index on the merged stream - let merged_data_source = self - .clone() - .combine_old_new(new_data, self.batch_size, old_data_filter) - .await?; - train_btree_index(merged_data_source, dest_store, self.batch_size, None, None).await?; - - Ok(CreatedIndex { - index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) - .unwrap(), - index_version: BTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), - }) + // Updating is the single-segment case of a segment merge: union this + // index's data with `new_data`, re-sort on value, and retrain. + Self::merge_segments( + &[Arc::new(self.clone())], + new_data, + dest_store, + &[old_data_filter], + ) + .await } fn update_criteria(&self) -> UpdateCriteria { @@ -2045,7 +2453,7 @@ pub async fn train_btree_index( batch_size: u64, fragment_ids: Option>, range_id: Option, -) -> Result<()> { +) -> Result> { // Create `partition_id` for distributed index building. // This ID serves as a high-level mask (first 32 bits of a u64) to ensure // that index partitions generated by different workers do not conflict. 
@@ -2110,7 +2518,7 @@ pub async fn train_btree_index( ); batch_idx += 1; } - sub_index_file.finish().await?; + let pages_file = sub_index_file.finish().await?; let record_batch = btree_stats_as_batch(encoded_batches, &value_type)?; let mut file_schema = record_batch.schema().as_ref().clone(); file_schema @@ -2136,8 +2544,8 @@ pub async fn train_btree_index( } }; btree_index_file.write_record_batch(record_batch).await?; - btree_index_file.finish().await?; - Ok(()) + let lookup_file = btree_index_file.finish().await?; + Ok(vec![pages_file, lookup_file]) } fn find_single_partition_files( @@ -2195,7 +2603,7 @@ async fn merge_metadata_files( part_lookup_files: &[String], batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result> { if part_lookup_files.is_empty() || part_page_files.is_empty() { return Err(Error::internal( "No partition files provided for merging".to_string(), @@ -2271,6 +2679,7 @@ async fn merge_metadata_files( progress, ) .await + .map(|file| vec![file]) } else { merge_pages_and_lookups( store, @@ -2324,7 +2733,7 @@ async fn merge_range_partitioned_lookups( batch_size: u64, batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result { let sorted_part_lookup_files = sort_files_by_partition_id(part_lookup_files)?; let mut lookup_file = store .new_index_file(BTREE_LOOKUP_NAME, lookup_schema) @@ -2364,12 +2773,12 @@ async fn merge_range_partitioned_lookups( serde_json::to_string(&pages_per_file)?, ); - lookup_file.finish_with_metadata(metadata).await?; + let lookup_file = lookup_file.finish_with_metadata(metadata).await?; progress.stage_complete("merge_lookups").await?; // In this mode, we only clean up lookup files, and page files are untouched. cleanup_partition_files(store, part_lookup_files, &[]).await; - Ok(()) + Ok(lookup_file) } /// Merges partition files using a K-way sort-merge algorithm. 
@@ -2388,7 +2797,7 @@ async fn merge_pages_and_lookups( batch_size: u64, batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result> { // Create a new global page file let partition_id = extract_partition_id(part_lookup_files[0].as_str())?; let page_file = page_files_map.get(&partition_id).unwrap(); @@ -2411,7 +2820,7 @@ async fn merge_pages_and_lookups( progress.clone(), ) .await?; - page_file.finish().await?; + let page_file = page_file.finish().await?; progress.stage_complete("merge_pages").await?; let lookup_batch = RecordBatch::try_new( @@ -2436,7 +2845,7 @@ async fn merge_pages_and_lookups( .stage_start("write_lookup_file", Some(1), "files") .await?; lookup_file.write_record_batch(lookup_batch).await?; - lookup_file.finish_with_metadata(metadata).await?; + let lookup_file = lookup_file.finish_with_metadata(metadata).await?; progress.stage_progress("write_lookup_file", 1).await?; progress.stage_complete("write_lookup_file").await?; @@ -2444,7 +2853,7 @@ async fn merge_pages_and_lookups( // Only perform deletion after files are successfully written, ensuring debug information is not lost in case of failure cleanup_partition_files(store, part_lookup_files, part_page_files).await; - Ok(()) + Ok(vec![page_file, lookup_file]) } // Adjust local_page_idx_ in each look-up file to create a contiguous global_page_idx @@ -2853,7 +3262,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { The `range_id` field will be removed in a future release." 
); } - train_btree_index( + let files = train_btree_index( data, index_store, request @@ -2868,7 +3277,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) .unwrap(), index_version: BTREE_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files, }) } @@ -2903,7 +3312,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { Error::internal("BTreeIndexPlugin::put_in_cache called with a non-BTree index") })?; let state = BTreeIndexState { - lookup_batch: btree.lookup_batch.clone(), + lookup_batch: btree.page_lookup.batch.clone(), batch_size: btree.batch_size, ranges_to_files: btree.ranges_to_files.clone(), }; @@ -2927,10 +3336,10 @@ mod tests { }; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_physical_expr::{PhysicalSortExpr, expressions::col}; - use deepsize::DeepSizeOf; use futures::TryStreamExt; use futures::stream; use lance_core::cache::LanceCache; + use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tempfile::TempObjDir; use lance_datafusion::{chunker::break_stream, datagen::DatafusionDatagenExt}; use lance_datagen::{ArrayGeneratorExt, BatchCount, RowCount, array, gen_batch}; @@ -2950,12 +3359,29 @@ mod tests { }; use super::{ - BTreeIndexPlugin, BTreeIndexState, BTreePageKey, DEFAULT_BTREE_BATCH_SIZE, - OrderableScalarValue, part_lookup_file_path, part_page_data_file_path, train_btree_index, + BTreeIndexPlugin, BTreeIndexState, BTreeLookup, BTreePageKey, DEFAULT_BTREE_BATCH_SIZE, + Matches, OrderableScalarValue, part_lookup_file_path, part_page_data_file_path, + train_btree_index, }; use crate::scalar::registry::ScalarIndexPlugin; use arrow_array::RecordBatch; - use lance_core::cache::{CacheCodecImpl, CacheKey}; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey}; + + /// Serialize a `BTreeIndexState` body (no envelope) for tests. 
+ fn serialize_state(state: &BTreeIndexState) -> Vec { + let mut buf = Vec::new(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a `BTreeIndexState` body (no envelope) for tests. + fn deserialize_state(buf: Vec) -> lance_core::Result { + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BTreeIndexState::CURRENT_VERSION); + BTreeIndexState::deserialize(&mut reader) + } use rangemap::RangeInclusiveMap; lance_testing::define_stage_event_progress!( @@ -2979,6 +3405,37 @@ mod tests { assert!(size_of_many_i32 > 128 * 4); } + #[test] + fn test_orderable_dictionary_cmp() { + use arrow_schema::DataType; + use std::cmp::Ordering; + + let dict = |s: &str, key: DataType| { + OrderableScalarValue(ScalarValue::Dictionary( + Box::new(key), + Box::new(ScalarValue::Utf8(Some(s.to_string()))), + )) + }; + + // Dictionary scalars are ordered by their underlying value, regardless + // of the key type. This is exercised when loading a scalar index built + // on a dictionary-encoded column into a BTreeMap. + assert_eq!( + dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)), + Ordering::Less + ); + assert_eq!( + dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)), + Ordering::Equal + ); + + // A non-null dictionary value sorts after null. + assert_eq!( + dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)), + Ordering::Greater + ); + } + #[tokio::test] async fn test_null_ids() { let tmpdir = TempObjDir::default(); @@ -3096,6 +3553,86 @@ mod tests { } } + #[tokio::test] + async fn test_contains_keys_matches_search() { + let tmpdir = TempObjDir::default(); + let test_store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // 1000 distinct Int32 values [0, 1000), spread across many small pages + // (batch_size 64) so the keys below exercise multi-page grouping. 
+ let data = gen_batch() + .col("value", array::step::()) + .col("_rowid", array::step::()) + .into_df_exec(RowCount::from(100), BatchCount::from(10)); + let schema = data.schema(); + let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap()); + let plan = Arc::new(SortExec::new([sort_expr].into(), data)); + let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap(); + let stream = break_stream(stream, 64); + let stream = stream.map_err(DataFusionError::from); + let stream = + Box::pin(RecordBatchStreamAdapter::new(schema, stream)) as SendableRecordBatchStream; + + train_btree_index(stream, test_store.as_ref(), 64, None, None) + .await + .unwrap(); + let index = BTreeIndex::load(test_store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + // Present (range ends, mid, and adjacent values that straddle page + // boundaries), interleaved with absent (below/above range, and a gap). + let keys: Vec = vec![0, 999, 500, 1, 998, -1, 1000, 1500, 250, 251, 7, 64, 63, 65]; + let scalar_keys: Vec = + keys.iter().map(|k| ScalarValue::Int32(Some(*k))).collect(); + + let batched = index + .contains_keys(&scalar_keys, &NoOpMetricsCollector) + .await + .unwrap(); + + // Oracle: the per-key Equals search the batched path replaces. + let mut oracle = Vec::with_capacity(keys.len()); + for k in &scalar_keys { + let result = index + .search(&SargableQuery::Equals(k.clone()), &NoOpMetricsCollector) + .await + .unwrap(); + oracle.push(!result.row_addrs().is_empty()); + } + assert_eq!( + batched, oracle, + "contains_keys must agree with per-key Equals search; keys={keys:?}" + ); + + // And both must match ground truth: [0, 1000) present, others absent. + let expected: Vec = keys.iter().map(|k| (0..1000).contains(k)).collect(); + assert_eq!(batched, expected); + + // Empty input → empty mask. 
+ assert!( + index + .contains_keys(&[], &NoOpMetricsCollector) + .await + .unwrap() + .is_empty() + ); + + // A null key maps to false (and must not panic). + let with_null = vec![ScalarValue::Int32(Some(5)), ScalarValue::Int32(None)]; + assert_eq!( + index + .contains_keys(&with_null, &NoOpMetricsCollector) + .await + .unwrap(), + vec![true, false] + ); + } + #[tokio::test] async fn test_page_cache() { let tmpdir = TempObjDir::default(); @@ -4943,10 +5480,621 @@ mod tests { .unwrap() } + fn osv(v: i32) -> OrderableScalarValue { + OrderableScalarValue(ScalarValue::Int32(Some(v))) + } + + /// The rewritten [`BTreeLookup`] searches the lookup batch directly, so this + /// exercises the binary-search bounds, duplicate `min` values, a partial-null + /// (null `min`) straddling page, and the `Matches::Some`/`All` classification. + #[test] + fn test_btree_lookup_pages_between() { + // Pages sorted by `min`, NULLs first. Page 0 straddles the NULL/non-NULL + // boundary; pages 2 and 3 share a `min` of 20. + let batch = record_batch!( + ("min", Int32, [None, Some(10), Some(20), Some(20), Some(40)]), + ( + "max", + Int32, + [Some(5), Some(20), Some(20), Some(30), Some(50)] + ), + ("null_count", UInt32, [2, 0, 0, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.null_pages, vec![0]); + assert!(lookup.all_null_pages.is_empty()); + assert_eq!(lookup.search_start, 0); + + let between = |lo: i32, hi: i32| { + let mut m = lookup + .pages_between(( + std::ops::Bound::Included(&osv(lo)), + std::ops::Bound::Included(&osv(hi)), + )) + .unwrap(); + m.sort_by_key(|m| m.page_id()); + m + }; + + // Equality only ever yields partial (Some) matches. 
+ assert_eq!(lookup.pages_eq(&osv(15)).unwrap(), vec![Matches::Some(1)]); + assert_eq!( + lookup.pages_eq(&osv(20)).unwrap(), + vec![Matches::Some(1), Matches::Some(2), Matches::Some(3)] + ); + assert!(lookup.pages_eq(&osv(35)).unwrap().is_empty()); + + // [20, 25]: page 2 ([20, 20]) sits entirely inside -> All; pages 1 and 3 + // only partially overlap -> Some. The null-min page 0 (max 5) is excluded. + assert_eq!( + between(20, 25), + vec![Matches::Some(1), Matches::All(2), Matches::Some(3)] + ); + + // A query below all non-null data still reaches the straddling page 0, + // which is only ever a partial match because its `min` is NULL. + assert_eq!(between(0, 5), vec![Matches::Some(0)]); + + // Unbounded above: page 4 ([40, 50]) is fully covered from 40 onward. + assert_eq!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(40)), + std::ops::Bound::Unbounded + )) + .unwrap(), + vec![Matches::All(4)] + ); + + // Empty / inverted ranges select nothing. + assert!(between(31, 39).is_empty()); + assert!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(25)), + std::ops::Bound::Included(&osv(15)) + )) + .unwrap() + .is_empty() + ); + } + + /// Exercises the native byte comparator path (`accessor_cmp`) for + /// variable-length `Binary` and fixed-width `FixedSizeBinary` (e.g. UUID) + /// columns, including the null-min straddle page and duplicate `min`s. + #[test] + fn test_btree_lookup_pages_eq_bytes() { + use arrow_array::{ + ArrayRef, BinaryArray, FixedSizeBinaryArray, LargeBinaryArray, LargeStringArray, + UInt32Array, + }; + use arrow_schema::{DataType, Field, Schema}; + + // 2-byte big-endian keys, so lexicographic byte order matches numeric + // order. Same layout as the int test: page 0 is a null-min straddle, + // pages 2 and 3 share `min` 20, and 35 falls in a gap. 
+ fn be(v: u16) -> [u8; 2] { + v.to_be_bytes() + } + let mins = [None, Some(10u16), Some(20), Some(20), Some(40)]; + let maxs = [Some(5u16), Some(20), Some(20), Some(30), Some(50)]; + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + + let assert_byte_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(u16) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + + let eq = |v: u16| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + + // IN merges and dedups across values. 
+ let mut in_pages: Vec = lookup + .pages_in([5u16, 15].into_iter().map(|v| OrderableScalarValue(sv(v)))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + in_pages.sort_unstable(); + assert_eq!(in_pages, vec![0, 1]); + }; + + let fsb = |arr: &[Option]| -> ArrayRef { + Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + arr.iter().copied().map(|o| o.map(be)), + 2, + ) + .unwrap(), + ) + }; + assert_byte_lookup(fsb(&mins), fsb(&maxs), &|v| { + ScalarValue::FixedSizeBinary(2, Some(be(v).to_vec())) + }); + + let bin = |arr: &[Option]| -> ArrayRef { + Arc::new(BinaryArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| be(v).to_vec())), + )) + }; + assert_byte_lookup(bin(&mins), bin(&maxs), &|v| { + ScalarValue::Binary(Some(be(v).to_vec())) + }); + + let lbin = |arr: &[Option]| -> ArrayRef { + Arc::new(LargeBinaryArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| be(v).to_vec())), + )) + }; + assert_byte_lookup(lbin(&mins), lbin(&maxs), &|v| { + ScalarValue::LargeBinary(Some(be(v).to_vec())) + }); + + // `LargeUtf8` over zero-padded decimal strings, whose lexicographic order + // matches the numeric order of the keys. + let lstr = |arr: &[Option]| -> ArrayRef { + Arc::new(LargeStringArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| format!("{v:02}"))), + )) + }; + assert_byte_lookup(lstr(&mins), lstr(&maxs), &|v| { + ScalarValue::LargeUtf8(Some(format!("{v:02}"))) + }); + } + + /// Exercises the physical-type reinterpret path: temporal columns (`Date32` + /// over `i32`, `Timestamp` over `i64`) are compared through the integer native + /// path without a dedicated per-type branch. 
+ #[test] + fn test_btree_lookup_pages_eq_temporal() { + use arrow_array::{ArrayRef, Date32Array, TimestampMicrosecondArray, UInt32Array}; + use arrow_schema::{DataType, Field, Schema}; + + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + + let assert_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(i64) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + let eq = |v: i64| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + }; + + // Timestamp (i64-backed) → Int64 native path. + assert_lookup( + Arc::new(TimestampMicrosecondArray::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(TimestampMicrosecondArray::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::TimestampMicrosecond(Some(v), None), + ); + + // Date32 (i32-backed) → Int32 native path. 
+ assert_lookup( + Arc::new(Date32Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Date32Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Date32(Some(v as i32)), + ); + } + + /// Exercises the remaining physical-type dispatch arms that the temporal and + /// byte tests don't reach: every integer width and signedness, `Float16`, and + /// the 128-/256-bit decimal paths. All share the temporal test's numeric layout + /// (mins `[_, 10, 20, 20, 40]`, maxs `[5, 20, 20, 30, 50]`) so the assertions are + /// identical; only the array/scalar type varies. + #[test] + fn test_btree_lookup_pages_eq_numeric_widths() { + use arrow::datatypes::i256; + use arrow_array::{ + ArrayRef, Decimal128Array, Decimal256Array, Float16Array, Int8Array, Int16Array, + UInt8Array, UInt16Array, UInt32Array, UInt64Array, + }; + use arrow_schema::{DataType, Field, Schema}; + use half::f16; + + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + let assert_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(i64) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + let eq = |v: i64| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap 
between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + }; + + assert_lookup( + Arc::new(Int8Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Int8Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Int8(Some(v as i8)), + ); + assert_lookup( + Arc::new(Int16Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Int16Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Int16(Some(v as i16)), + ); + assert_lookup( + Arc::new(UInt8Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt8Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt8(Some(v as u8)), + ); + assert_lookup( + Arc::new(UInt16Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt16Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt16(Some(v as u16)), + ); + assert_lookup( + Arc::new(UInt32Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt32Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt32(Some(v as u32)), + ); + assert_lookup( + Arc::new(UInt64Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt64Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt64(Some(v as u64)), + ); + + let f = |v: f64| f16::from_f64(v); + assert_lookup( + Arc::new(Float16Array::from(vec![ + None, + Some(f(10.0)), + Some(f(20.0)), + Some(f(20.0)), + Some(f(40.0)), + ])), + Arc::new(Float16Array::from(vec![ + Some(f(5.0)), + Some(f(20.0)), + Some(f(20.0)), + Some(f(30.0)), + Some(f(50.0)), + ])), + &|v| 
ScalarValue::Float16(Some(f(v as f64))), + ); + + // Decimal128 (i128 native path). Comparison is on the raw integer, so a + // scale of 0 lets the values double as plain integers. + let dec128 = |vals: Vec>| -> ArrayRef { + Arc::new( + Decimal128Array::from(vals) + .with_precision_and_scale(18, 0) + .unwrap(), + ) + }; + assert_lookup( + dec128(vec![None, Some(10), Some(20), Some(20), Some(40)]), + dec128(vec![Some(5), Some(20), Some(20), Some(30), Some(50)]), + &|v| ScalarValue::Decimal128(Some(v as i128), 18, 0), + ); + + // Decimal256 (i256 native path). + let dec256 = |vals: Vec>| -> ArrayRef { + Arc::new( + Decimal256Array::from( + vals.into_iter() + .map(|o| o.map(i256::from_i128)) + .collect::>(), + ) + .with_precision_and_scale(40, 0) + .unwrap(), + ) + }; + assert_lookup( + dec256(vec![None, Some(10), Some(20), Some(20), Some(40)]), + dec256(vec![Some(5), Some(20), Some(20), Some(30), Some(50)]), + &|v| ScalarValue::Decimal256(Some(i256::from_i128(v as i128)), 40, 0), + ); + } + + /// Exercises the NULL paths of the lookup directly: `pages_eq(NULL)` and + /// `pages_in` with a NULL in the value list (and a NULL-only list), including + /// the partial-null (`Some`) vs entirely-null (`All`) page classification. + #[test] + fn test_btree_lookup_pages_null() { + // Page 0 is entirely null (null max -> All); page 1 is a partial-null + // straddle (max 5, null_count > 0 -> Some); page 2 also carries a null. + let batch = record_batch!( + ("min", Int32, [None, None, Some(10), Some(20), Some(40)]), + ("max", Int32, [None, Some(5), Some(20), Some(30), Some(50)]), + ("null_count", UInt32, [3, 2, 1, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.all_null_pages, vec![0]); + assert_eq!(lookup.null_pages, vec![1, 2]); + + // pages_eq(NULL) short-circuits to the null pages: partial-null pages are + // `Some`, the entirely-null page is `All`. 
+ assert_eq!( + lookup + .pages_eq(&OrderableScalarValue(ScalarValue::Int32(None))) + .unwrap(), + vec![Matches::Some(1), Matches::Some(2), Matches::All(0)] + ); + + let in_ids = |vals: Vec>| { + let mut p: Vec = lookup + .pages_in( + vals.into_iter() + .map(|v| OrderableScalarValue(ScalarValue::Int32(v))), + ) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + // Baseline: a non-null value only -> just its value page. + assert_eq!(in_ids(vec![Some(45)]), vec![4]); + // A NULL in the list unions in every null page (0, 1, 2). + assert_eq!(in_ids(vec![Some(45), None]), vec![0, 1, 2, 4]); + // A NULL-only list (empty non-null set) returns exactly the null pages. + assert_eq!(in_ids(vec![None]), vec![0, 1, 2]); + } + + /// A 0-row page_lookup batch (an index over an empty dataset) must yield no + /// candidates for any query rather than panicking on the binary-search bounds. + #[test] + fn test_btree_lookup_empty_batch() { + use arrow_schema::{DataType, Field, Schema}; + + let schema = Arc::new(Schema::new(vec![ + Field::new("min", DataType::Int32, true), + Field::new("max", DataType::Int32, true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])); + let lookup = BTreeLookup::try_new(RecordBatch::new_empty(schema)).unwrap(); + assert_eq!(lookup.search_start, 0); + assert!(lookup.null_pages.is_empty()); + assert!(lookup.all_null_pages.is_empty()); + + assert!(lookup.pages_eq(&osv(5)).unwrap().is_empty()); + assert!(lookup.pages_in([osv(5)]).unwrap().is_empty()); + assert!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(0)), + std::ops::Bound::Included(&osv(100)), + )) + .unwrap() + .is_empty() + ); + assert!(lookup.pages_null().is_empty()); + } + + /// A straddle page (null `min`, non-null `max`) can sort ahead of an entirely- + /// null page within the leading NULL-`min` group. 
When it does, `search_start` + /// points at the straddle and the all-null page falls inside the forward-scan + /// window, so both the equality and range scans must skip it (it matches only + /// IS NULL). + #[test] + fn test_btree_lookup_skips_all_null_page_in_scan_window() { + // Page 0: straddle (null min, max 5). Page 1: entirely null (null min/max). + let batch = record_batch!( + ("min", Int32, [None, None, Some(10), Some(20), Some(40)]), + ("max", Int32, [Some(5), None, Some(20), Some(30), Some(50)]), + ("null_count", UInt32, [2, 3, 0, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.search_start, 0); // straddle page 0 has a non-null max + assert_eq!(lookup.all_null_pages, vec![1]); + assert_eq!(lookup.null_pages, vec![0]); + + // Equality for 5 peeks left across the all-null page 1 (index 1, inside the + // scan window) and must skip it, reaching only the straddle page 0. + assert_eq!( + lookup + .pages_eq(&osv(5)) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect::>(), + vec![0] + ); + + // The same all-null page sits inside the range scan window and is skipped: + // page 0 (straddle) is a partial match, pages 2-4 are fully covered. 
+ let mut between = lookup + .pages_between(( + std::ops::Bound::Included(&osv(0)), + std::ops::Bound::Included(&osv(100)), + )) + .unwrap(); + between.sort_by_key(|m| m.page_id()); + assert_eq!( + between, + vec![ + Matches::Some(0), + Matches::All(2), + Matches::All(3), + Matches::All(4), + ] + ); + } + fn assert_state_roundtrips(state: &BTreeIndexState) { - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(state)).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(restored.batch_size, state.batch_size); assert_eq!(restored.ranges_to_files, state.ranges_to_files); @@ -5011,13 +6159,11 @@ mod tests { // Round-trip the state through the codec and reconstruct an index from it. let state = BTreeIndexState { - lookup_batch: index.lookup_batch.clone(), + lookup_batch: index.page_lookup.batch.clone(), batch_size: index.batch_size, ranges_to_files: index.ranges_to_files.clone(), }; - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(&state)).unwrap(); let reconstructed = restored .reconstruct(test_store.clone(), &LanceCache::no_cache(), None) .unwrap(); @@ -5053,18 +6199,57 @@ mod tests { assert_eq!(expected, actual); } + /// The lookup batch must decode zero-copy through the full envelope even + /// though the proto header pushes the IPC section to a non-aligned offset. 
+ #[test] + fn test_btree_index_state_lookup_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let ranges: RangeInclusiveMap = + [(0..=99, ("part_0_page_file.lance".to_string(), 0))] + .into_iter() + .collect(); + let state = BTreeIndexState { + lookup_batch: sample_lookup_batch(), + batch_size: 8192, + ranges_to_files: Some(Arc::new(ranges)), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[test] - fn test_btree_index_state_rejects_invalid_has_ranges_tag() { - // u64 batch_size (any) then a bad has_ranges tag. + fn test_btree_index_state_rejects_truncated_header() { + // A header length prefix that overruns the buffer must error rather + // than panic or silently misread it. 
let mut buf = Vec::new(); - buf.extend_from_slice(&1000u64.to_le_bytes()); - buf.push(7u8); - let err = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap_err(); - let msg = err.to_string(); - assert!( - msg.contains("has_ranges") && msg.contains("7"), - "expected error to mention the bad has_ranges tag, got: {msg}" - ); + buf.extend_from_slice(&100u32.to_le_bytes()); // claims a 100-byte header + buf.extend_from_slice(&[0u8; 4]); // but only 4 bytes follow + assert!(deserialize_state(buf).is_err()); } #[tokio::test] @@ -5093,7 +6278,7 @@ mod tests { .await .unwrap(); let state = BTreeIndexState { - lookup_batch: index.lookup_batch.clone(), + lookup_batch: index.page_lookup.batch.clone(), batch_size: index.batch_size, ranges_to_files: index.ranges_to_files.clone(), }; diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 4240753772b..744f6a3cb3c 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::{ops::Bound, sync::Arc}; use arrow_array::Array; @@ -11,19 +11,20 @@ use arrow_array::{ use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_physical_expr::create_physical_expr; -use deepsize::DeepSizeOf; +use datafusion_physical_expr::{PhysicalExpr, create_physical_expr}; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::{read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream}; use lance_core::Result; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; use roaring::RoaringBitmap; use 
tracing::instrument; +use datafusion_common::ScalarValue; + use crate::metrics::MetricsCollector; -use crate::scalar::btree::BTREE_VALUES_COLUMN; +use crate::scalar::btree::{BTREE_VALUES_COLUMN, OrderableScalarValue}; use crate::scalar::{AnyQuery, SargableQuery}; const VALUES_COL_IDX: usize = 0; @@ -43,7 +44,7 @@ pub struct FlatIndex { } impl DeepSizeOf for FlatIndex { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.data.get_array_memory_size() } } @@ -83,6 +84,46 @@ impl FlatIndex { self.data.column(IDS_COL_IDX) } + fn values(&self) -> &ArrayRef { + self.data.column(VALUES_COL_IDX) + } + + /// Which of `needles` are present in this page. + /// + /// Batched existence sibling of [`Self::search`]: it runs the same `IsIn` + /// predicate over the page's `values` column, but returns the matched + /// *values* rather than row addresses — so the caller can map each result + /// back to the input key it asked about. The page scan stays vectorized; + /// only the (small) matched subset is lifted into `ScalarValue`. + /// + /// Nulls: a null `values` entry never matches a (non-null) primary-key + /// needle, so it is simply absent from the result. 
+ pub(crate) fn contains_values( + &self, + needles: &[OrderableScalarValue], + ) -> Result> { + if needles.is_empty() { + return Ok(BTreeSet::new()); + } + let query = SargableQuery::IsIn(needles.iter().map(|v| v.0.clone()).collect()); + let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); + let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + let predicate = expr.evaluate(&self.data)?; + let predicate = predicate.into_array(self.data.num_rows())?; + let predicate = predicate + .as_any() + .downcast_ref::() + .expect("Predicate should return boolean array"); + let matched = arrow_select::filter::filter(self.values(), predicate)?; + (0..matched.len()) + .map(|i| { + Ok(OrderableScalarValue(ScalarValue::try_from_array( + &matched, i, + )?)) + }) + .collect() + } + pub fn all(&self) -> NullableRowAddrSet { // Some rows will be in both sets but that is ok, null trumps true NullableRowAddrSet::new(self.all_addrs_map.clone(), self.null_addrs_map.clone()) @@ -196,7 +237,22 @@ impl FlatIndex { // No shortcut possible, need to actually evaluate the query let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + self.eval_expr(&expr) + } + /// Evaluate a predicate compiled once by the caller. Lets a large IsIn that + /// spans many pages build the physical expr a single time instead of + /// rebuilding the whole IN-list per page (the dominant cost of a big lookup). 
+ pub fn search_prebuilt( + &self, + expr: &Arc, + metrics: &dyn MetricsCollector, + ) -> Result { + metrics.record_comparisons(self.data.num_rows()); + self.eval_expr(expr) + } + + fn eval_expr(&self, expr: &Arc) -> Result { let predicate = expr.evaluate(&self.data)?; let predicate = predicate.into_array(self.data.num_rows())?; let predicate = predicate @@ -236,32 +292,38 @@ impl FlatIndex { } impl CacheCodecImpl for FlatIndex { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.scalar.FlatIndex"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { // Format: - // [len-prefixed all_addrs_map][len-prefixed null_addrs_map][batch IPC stream] - writer.write_all(&(self.all_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.all_addrs_map.serialize_into(&mut *writer)?; + // RAW_BLOB : all_addrs_map (roaring tree map) + // RAW_BLOB : null_addrs_map (roaring tree map) + // ARROW_IPC : data batch + let mut all_addrs_bytes = Vec::with_capacity(self.all_addrs_map.serialized_size()); + self.all_addrs_map.serialize_into(&mut all_addrs_bytes)?; + w.write_raw(&all_addrs_bytes)?; - writer.write_all(&(self.null_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.null_addrs_map.serialize_into(&mut *writer)?; + let mut null_addrs_bytes = Vec::with_capacity(self.null_addrs_map.serialized_size()); + self.null_addrs_map.serialize_into(&mut null_addrs_bytes)?; + w.write_raw(&null_addrs_bytes)?; - write_ipc_stream(self.data.as_ref(), writer)?; + w.write_ipc(self.data.as_ref())?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result where Self: Sized, { - let mut offset = 0; - let all_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let all_addrs_bytes = r.read_raw()?; let all_addrs_map = RowAddrTreeMap::deserialize_from(all_addrs_bytes.as_ref())?; - let null_addrs_bytes = 
read_len_prefixed_bytes_at(data, &mut offset)?; + let null_addrs_bytes = r.read_raw()?; let null_addrs_map = RowAddrTreeMap::deserialize_from(null_addrs_bytes.as_ref())?; - let batch = read_ipc_stream_single_at(data, &mut offset)?; + let batch = r.read_ipc()?; let df_schema = DFSchema::try_from(batch.schema())?; @@ -309,8 +371,12 @@ mod tests { fn assert_roundtrips(index: &FlatIndex) { let mut buf = Vec::new(); - index.serialize(&mut buf).unwrap(); - let restored = FlatIndex::deserialize(&bytes::Bytes::from(buf)).unwrap(); + index + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, FlatIndex::CURRENT_VERSION); + let restored = FlatIndex::deserialize(&mut reader).unwrap(); assert_eq!(restored.data, index.data); assert_eq!(restored.all_addrs_map, index.all_addrs_map); @@ -335,6 +401,41 @@ mod tests { assert_roundtrips(&FlatIndex::try_new(empty).unwrap()); } + /// The data batch must decode zero-copy through the full envelope-bearing + /// [`CacheCodec`], even though the two roaring blobs and the envelope push + /// the IPC section to a non-aligned starting offset. 
+ #[test] + fn test_flat_index_data_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let index = example_index(); + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(index); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.data.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "data batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_equality() { check_index(&SargableQuery::Equals(ScalarValue::from(100)), &[0]).await; diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 187d5be999f..053da5ae5e7 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -22,6 +22,7 @@ use super::{GeoQuery, RelationQuery}; use lance_core::{Error, Result}; use lance_datafusion::{expr::safe_coerce_scalar, planner::Planner}; use lance_select::{IndexExprResult, NullableIndexExprResult, NullableRowAddrMask}; +use roaring::RoaringBitmap; use tracing::instrument; const MAX_DEPTH: usize = 500; @@ -178,6 +179,18 @@ impl MultiQueryParser { pub fn add(&mut self, other: Box) { self.parsers.push(other); } + + /// Pick the first underlying parser whose `is_valid_reference` accepts `expr`. 
+ pub fn select( + &self, + expr: &Expr, + data_type: &DataType, + ) -> Option<(&dyn ScalarQueryParser, DataType)> { + self.parsers.iter().find_map(|p| { + p.is_valid_reference(expr, data_type) + .map(|dt| (p.as_ref(), dt)) + }) + } } impl ScalarQueryParser for MultiQueryParser { @@ -436,6 +449,7 @@ impl ScalarQueryParser for SargableQueryParser { index_type: self.index_type.clone(), query: Arc::new(query), needs_recheck: self.needs_recheck, + fragment_bitmap: None, })); // If the pattern has wildcards beyond simple prefix, add refine expression @@ -779,20 +793,28 @@ impl ScalarQueryParser for LabelListQueryParser { } } -/// A parser for indices that handle string contains queries +/// A parser for indices that handle string `contains` queries, and -- when +/// `supports_regex` is set -- `regexp_like` / `regexp_match` queries. #[derive(Debug, Clone)] pub struct TextQueryParser { index_name: String, index_type: String, needs_recheck: bool, + supports_regex: bool, } impl TextQueryParser { - pub fn new(index_name: String, index_type: String, needs_recheck: bool) -> Self { + pub fn new( + index_name: String, + index_type: String, + needs_recheck: bool, + supports_regex: bool, + ) -> Self { Self { index_name, index_type, needs_recheck, + supports_regex, } } } @@ -835,31 +857,156 @@ impl ScalarQueryParser for TextQueryParser { func: &ScalarUDF, args: &[Expr], ) -> Option { - if args.len() != 2 { + // The first argument is the indexed column; the second is the substring + // / pattern. `contains` takes exactly two arguments; the regex functions + // optionally take a third flags argument. 
+ if args.len() < 2 { return None; } - let scalar = maybe_scalar(&args[1], data_type)?; - match scalar { - ScalarValue::Utf8(Some(scalar_str)) | ScalarValue::LargeUtf8(Some(scalar_str)) => { - if func.name() == "contains" { - let query = TextQuery::StringContains(scalar_str); - Some(IndexedExpression::index_query_with_recheck( - column.to_string(), - self.index_name.clone(), - self.index_type.clone(), - Arc::new(query), - self.needs_recheck, - )) - } else { + // A non-string pattern cannot be handled. + let (ScalarValue::Utf8(Some(pattern)) | ScalarValue::LargeUtf8(Some(pattern))) = + maybe_scalar(&args[1], data_type)? + else { + return None; + }; + + let query = match func.name() { + "contains" if args.len() == 2 => TextQuery::StringContains(pattern), + "regexp_like" | "regexp_match" if self.supports_regex => { + let pattern = match args.get(2) { + Some(flags_expr) => apply_regex_flags(&pattern, flags_expr)?, + None => pattern, + }; + // If the pattern yields no usable trigram (e.g. `a.b`), leave it + // to a full scan instead of routing it to the index, which could + // only answer with an unsupported "recheck everything" result. + if !crate::scalar::ngram::regex_can_use_index(&pattern) { + return None; + } + TextQuery::Regex(pattern) + } + _ => return None, + }; + + Some(IndexedExpression::index_query_with_recheck( + column.to_string(), + self.index_name.clone(), + self.index_type.clone(), + Arc::new(query), + self.needs_recheck, + )) + } + + fn visit_like( + &self, + column: &str, + like: &Like, + pattern: &ScalarValue, + ) -> Option { + // Infix LIKE is accelerated only by the ngram index (via its regex + // machinery). A plain-literal `regexp_like(col, 'foo')` is rewritten to + // `col LIKE '%foo%'` before it reaches the index, so this is the path + // that accelerates those. ILIKE is skipped because its case folding does + // not match the index's normalization. 
+ if !self.supports_regex || like.case_insensitive { + return None; + } + let pattern_str = match pattern { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => s.as_str(), + _ => return None, + }; + // Translate the LIKE pattern into a loose regex used only for candidate + // generation; the original LIKE stays as the recheck filter, so the + // regex only needs to be a sound superset. + let regex = like_to_regex(pattern_str, like.escape_char)?; + if !crate::scalar::ngram::regex_can_use_index(&regex) { + return None; + } + Some(IndexedExpression { + scalar_query: Some(ScalarIndexExpr::Query(ScalarIndexSearch { + column: column.to_string(), + index_name: self.index_name.clone(), + index_type: self.index_type.clone(), + query: Arc::new(TextQuery::Regex(regex)), + needs_recheck: self.needs_recheck, + fragment_bitmap: None, + })), + refine_expr: Some(Expr::Like(like.clone())), + }) + } +} + +/// Translate a LIKE pattern into a regular expression used purely for ngram +/// candidate generation: `%` becomes `.*`, `_` becomes `.`, and literal +/// characters are regex-escaped. Returns `None` when no literal run is long +/// enough to yield a trigram (the index could not help, so a full scan is left +/// to handle it). +fn like_to_regex(pattern: &str, escape: Option<char>) -> Option<String> { + let mut regex = String::new(); + let mut run = 0usize; + let mut longest_run = 0usize; + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + let literal = if Some(c) == escape { + // The next character is escaped, i.e. a literal. + chars.next() + } else { + match c { + '%' => { + regex.push_str(".*"); + run = 0; None } + '_' => { + regex.push('.'); + run = 0; + None + } + other => Some(other), } - _ => { - // If the scalar is not a string, we cannot handle it - None + }; + if let Some(lit) = literal { + if regex_syntax::is_meta_character(lit) { + regex.push('\\'); + } + regex.push(lit); + // Only runs of alphanumeric characters can produce a trigram. 
+ if lit.is_alphanumeric() { + run += 1; + longest_run = longest_run.max(run); + } else { + run = 0; } } } + (longest_run >= 3).then_some(regex) +} + +/// Fold the supported `regexp_like` / `regexp_match` flags into an inline prefix +/// on the pattern (e.g. flags `"i"` -> `"(?i)pattern"`). Returns `None` for a +/// non-literal flags argument or an unrecognized flag, so the caller leaves the +/// predicate to a full recheck rather than risk changing its semantics. +fn apply_regex_flags(pattern: &str, flags_expr: &Expr) -> Option { + let (Expr::Literal(ScalarValue::Utf8(Some(flags)), _) + | Expr::Literal(ScalarValue::LargeUtf8(Some(flags)), _)) = flags_expr + else { + return None; + }; + let mut inline = String::new(); + for flag in flags.chars() { + // Only flags expressible as an inline `(?...)` group in the regex crate + // (which the recheck uses) are safe to fold. + if ['i', 's', 'm', 'x'].contains(&flag) { + inline.push(flag); + } else { + return None; + } + } + if inline.is_empty() { + Some(pattern.to_string()) + } else { + Some(format!("(?{inline}){pattern}")) + } } /// A parser for indices that handle queries with the contains_tokens function @@ -1074,7 +1221,8 @@ impl IndexedExpression { index_name, index_type, query, - needs_recheck: false, // Default to false, will be set by parser + needs_recheck: false, // Default to false, will be set by parser + fragment_bitmap: None, // Filled in by `apply_scalar_indices` })), refine_expr: None, } @@ -1095,6 +1243,7 @@ impl IndexedExpression { index_type, query, needs_recheck, + fragment_bitmap: None, // Filled in by `apply_scalar_indices` })), refine_expr: None, } @@ -1236,10 +1385,21 @@ pub struct ScalarIndexSearch { pub query: Arc, /// If true, the query results are inexact and will need a recheck pub needs_recheck: bool, + /// The fragments the underlying index has entries for. + /// + /// `None` means coverage is unknown (e.g. constructed outside of scanner + /// planning, or from a legacy code path). 
Optimizer rules that need to + /// decide whether the index covers the dataset must treat `None` as + /// "refuse to use" — the bitmap is the only way to safely answer that + /// question synchronously without an async metadata load. + pub fragment_bitmap: Option, } impl PartialEq for ScalarIndexSearch { fn eq(&self, other: &Self) -> bool { + // `fragment_bitmap` is metadata derived from the dataset state, not + // part of the query identity, so it intentionally does not participate + // in equality. self.column == other.column && self.index_name == other.index_name && self.query.as_ref().eq(other.query.as_ref()) @@ -1437,8 +1597,8 @@ fn maybe_indexed_column<'b>( ) -> Option<(String, DataType, &'b dyn ScalarQueryParser)> { // First try to extract the full nested column path for get_field expressions if let Some(nested_path) = extract_nested_column_path(expr) - && let Some((data_type, parser)) = index_info.get_index(&nested_path) - && let Some(data_type) = parser.is_valid_reference(expr, data_type) + && let Some((data_type, multi)) = index_info.get_index(&nested_path) + && let Some((parser, data_type)) = multi.select(expr, data_type) { return Some((nested_path, data_type, parser)); } @@ -1446,12 +1606,9 @@ fn maybe_indexed_column<'b>( match expr { Expr::Column(col) => { let col = col.name.as_str(); - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } Expr::ScalarFunction(udf) => { if udf.args.is_empty() { @@ -1459,12 +1616,9 @@ fn maybe_indexed_column<'b>( } // For non-get_field functions, fall back to old behavior let col = maybe_column(&udf.args[0])?; - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, 
data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } _ => None, } @@ -1798,7 +1952,18 @@ fn visit_node( Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)), Expr::IsTrue(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, true)), Expr::IsNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, false)), - Expr::IsNotNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, true)), + Expr::IsNotNull(expr) => { + // `regexp_match(col, pat)` returns a list and is coerced to + // `IsNotNull(regexp_match(...))` before it reaches here. Unwrap that + // so the regex acceleration applies; everything else is a genuine + // IS NOT NULL check. + if let Expr::ScalarFunction(scalar_fn) = expr.as_ref() + && scalar_fn.func.name() == "regexp_match" + { + return Ok(visit_scalar_fn(scalar_fn, index_info)); + } + Ok(visit_is_null(expr.as_ref(), index_info, true)) + } Expr::Not(expr) => visit_not(expr.as_ref(), index_info, depth), Expr::BinaryExpr(binary_expr) => visit_binary_expr(binary_expr, index_info, depth), Expr::ScalarFunction(scalar_fn) => Ok(visit_scalar_fn(scalar_fn, index_info)), @@ -1818,7 +1983,17 @@ fn visit_node( pub trait IndexInformationProvider { /// Check if an index exists for `col` and, if so, return the data type of col /// as well as a query parser that can parse queries for that column - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)>; + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)>; + + /// The set of fragments covered by `(column, index_name)`. + /// + /// Returns `None` when the provider doesn't know — callers must treat + /// that as "coverage unknown" rather than "covers everything". 
The + /// default implementation always returns `None`, so providers that + /// haven't been updated cannot accidentally claim full coverage. + fn fragment_bitmap(&self, _column: &str, _index_name: &str) -> Option { + None + } } /// Attempt to split a filter expression into a search of scalar indexes and an @@ -1827,7 +2002,31 @@ pub fn apply_scalar_indices( expr: Expr, index_info: &dyn IndexInformationProvider, ) -> Result { - Ok(visit_node(&expr, index_info, 0)?.unwrap_or(IndexedExpression::refine_only(expr))) + let mut result = + visit_node(&expr, index_info, 0)?.unwrap_or(IndexedExpression::refine_only(expr)); + if let Some(query) = result.scalar_query.as_mut() { + populate_fragment_bitmaps(query, index_info); + } + Ok(result) +} + +/// Walk a [`ScalarIndexExpr`] and fill in `fragment_bitmap` on each leaf from +/// the `index_info` provider. Leaves the bitmap as `None` if the provider +/// can't answer. +fn populate_fragment_bitmaps( + expr: &mut ScalarIndexExpr, + index_info: &dyn IndexInformationProvider, +) { + match expr { + ScalarIndexExpr::Not(inner) => populate_fragment_bitmaps(inner, index_info), + ScalarIndexExpr::And(lhs, rhs) | ScalarIndexExpr::Or(lhs, rhs) => { + populate_fragment_bitmaps(lhs, index_info); + populate_fragment_bitmaps(rhs, index_info); + } + ScalarIndexExpr::Query(search) => { + search.fragment_bitmap = index_info.fragment_bitmap(&search.column, &search.index_name); + } + } } #[derive(Clone, Default, Debug)] @@ -1966,11 +2165,18 @@ mod tests { struct ColInfo { data_type: DataType, - parser: Box, + parser: Box, } impl ColInfo { fn new(data_type: DataType, parser: Box) -> Self { + Self { + data_type, + parser: Box::new(MultiQueryParser::single(parser)), + } + } + + fn with_multi(data_type: DataType, parser: Box) -> Self { Self { data_type, parser } } } @@ -1992,7 +2198,7 @@ mod tests { } impl IndexInformationProvider for MockIndexInfoProvider { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn 
get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) .map(|col_info| (&col_info.data_type, col_info.parser.as_ref())) @@ -2422,6 +2628,7 @@ mod tests { index_type: "BTree".to_string(), query: Arc::new(SargableQuery::Equals(ScalarValue::UInt32(Some(10)))), needs_recheck: false, + fragment_bitmap: None, })); let right = Box::new(ScalarIndexExpr::Query(ScalarIndexSearch { column: "color".to_string(), @@ -2431,6 +2638,7 @@ mod tests { "blue".to_string(), )))), needs_recheck: false, + fragment_bitmap: None, })); check( &index_info, @@ -2639,6 +2847,59 @@ mod tests { assert!(matches!(negated.upper, NullableRowAddrMask::BlockList(_))); } + #[test] + fn test_like_to_regex() { + // `%` -> `.*`, `_` -> `.`, with a literal run of at least three chars. + assert_eq!(like_to_regex("%foo%", None).as_deref(), Some(".*foo.*")); + assert_eq!(like_to_regex("foo%bar", None).as_deref(), Some("foo.*bar")); + assert_eq!(like_to_regex("foo_bar", None).as_deref(), Some("foo.bar")); + assert_eq!(like_to_regex("foobar", None).as_deref(), Some("foobar")); + + // Regex metacharacters in the literal portion are escaped. + assert_eq!( + like_to_regex("%a.bcd%", None).as_deref(), + Some(".*a\\.bcd.*") + ); + + // No literal run of three alphanumeric characters -> no index help. + assert_eq!(like_to_regex("%ab%", None), None); + assert_eq!(like_to_regex("%a%b%c%", None), None); + assert_eq!(like_to_regex("%", None), None); + + // The escape character makes the following character a literal. + assert_eq!( + like_to_regex(r"%foo\%bar%", Some('\\')).as_deref(), + Some(".*foo%bar.*") + ); + } + + #[test] + fn test_apply_regex_flags() { + fn flags(s: &str) -> Expr { + Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None) + } + + // Empty flags leave the pattern untouched (no inline group emitted). + assert_eq!(apply_regex_flags("foo", &flags("")).as_deref(), Some("foo")); + // Supported flags are folded into an inline `(?...)` prefix. 
+ assert_eq!( + apply_regex_flags("foo", &flags("i")).as_deref(), + Some("(?i)foo") + ); + assert_eq!( + apply_regex_flags("foo", &flags("is")).as_deref(), + Some("(?is)foo") + ); + // An unrecognized flag bails out so the caller leaves the predicate to a + // full recheck rather than risk changing its semantics. + assert_eq!(apply_regex_flags("foo", &flags("g")), None); + // A non-string (hence non-literal-flags) argument cannot be folded. + assert_eq!( + apply_regex_flags("foo", &Expr::Literal(ScalarValue::Int32(Some(1)), None)), + None + ); + } + #[test] fn test_extract_like_leading_prefix() { // Simple prefix patterns (no recheck needed) @@ -3106,4 +3367,75 @@ mod tests { assert_eq!(round_tripped.upper, RowAddrMask::from_allowed(upper_addrs)); assert_eq!(round_tripped_frags, fragments_covered); } + + /// Regression test: when two JSON indices target different paths on the same + /// column, a query against one path must be routed to its own index instead + /// of being intercepted by whichever parser was registered first. + #[test] + fn test_multi_json_indices_route_by_path() { + // Build a MultiQueryParser containing two JSON sub-parsers: one for + // path "$.a" and one for path "$.b". + let mut multi = MultiQueryParser::single(Box::new(JsonQueryParser::new( + "$.a".to_string(), + Box::new(SargableQueryParser::new( + "json_a_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + multi.add(Box::new(JsonQueryParser::new( + "$.b".to_string(), + Box::new(SargableQueryParser::new( + "json_b_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + + let index_info = MockIndexInfoProvider::new(vec![( + "json", + ColInfo::with_multi(DataType::LargeBinary, Box::new(multi)), + )]); + + // Query against path "$.b" must hit the "$.b" index. 
+ let expected_b = IndexedExpression::index_query( + "json".to_string(), + "json_b_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.b".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.b') = 'foo'", + Some(expected_b), + false, + ); + + // Query against path "$.a" must hit the "$.a" index. + let expected_a = IndexedExpression::index_query( + "json".to_string(), + "json_a_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.a".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.a') = 'foo'", + Some(expected_a), + false, + ); + + // Query against an unindexed path must not bind to either index. + check_no_index(&index_info, "json_extract(json, '$.c') = 'foo'"); + } } diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs new file mode 100644 index 00000000000..79c949a5426 --- /dev/null +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -0,0 +1,2368 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! FM-Index for exact substring search (following the Infini-gram Mini paper) +//! +//! The FM-Index is a compressed full-text index based on the Burrows-Wheeler Transform (BWT). +//! It supports exact substring matching via backward search and returns exact row ids. +//! +//! Architecture (matching the paper): +//! - Huffman-shaped Wavelet Tree over BWT for entropy-compressed rank queries (~0.26N) +//! - Sampled Suffix Array every D-th position for locate (~N/D × 8 bytes) +//! - doc_start_positions for mapping text positions to documents (tiny) +//! - No doc_array — documents are resolved via SA sampling + LF-mapping + binary search +//! +//! Total index size: ~0.44N (matching paper's claim) +//! +//! 
Storage layout (v10 - blocked, partitioned): +//! - BWT wavelet tree bitvectors in blocks of BLOCK_WORDS (32KB each) +//! - SA samples stored as packed binary blocks after wavelet blocks +//! - Row IDs and doc_start_positions in metadata +//! - File metadata: c_table, huffman_codes, tree topology + +use std::cmp::Reverse; +use std::collections::{BinaryHeap, HashMap}; +use std::sync::{Arc, OnceLock}; + +use arrow_array::RecordBatch; +use arrow_schema::{DataType, Field}; +use async_trait::async_trait; +use datafusion::execution::SendableRecordBatchStream; +use futures::StreamExt; +use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; +use lance_core::{Error, ROW_ADDR, Result}; +use roaring::RoaringBitmap; + +use crate::frag_reuse::FragReuseIndex; +use crate::metrics::MetricsCollector; +use crate::pb; +use crate::scalar::expression::{ScalarQueryParser, TextQueryParser}; +use crate::scalar::registry::{ + DefaultTrainingRequest, ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, + VALUE_COLUMN_NAME, +}; +use crate::scalar::{ + AnyQuery, BuiltinIndexType, CreatedIndex, IndexFile, IndexStore, OldIndexDataFilter, + ScalarIndex, ScalarIndexParams, SearchResult, TextQuery, UpdateCriteria, +}; +use crate::{Index, IndexType}; + +const FMINDEX_INDEX_VERSION: u32 = 10; +const BLOCK_WORDS: usize = 4096; +const PARTITION_SIZE: usize = 10_000; +const SENTINEL_BYTE: u8 = 0xFF; + +/// SA sampling rate. Store every D-th SA entry. Locate walks at most D LF steps. 
+const SA_SAMPLE_RATE: usize = 32; + +fn fmindex_partition_path(partition_id: u64) -> String { + format!("part_{partition_id}_fm.lance") +} + +// ── Bitvector with O(1) rank ───────────────────────────────────────────────── + +const SUPERBLOCK_BITS: usize = 512; +const WORDS_PER_SUPERBLOCK: usize = SUPERBLOCK_BITS / 64; + +#[derive(Debug, Clone)] +struct RankBitVec { + words: Vec, + superblocks: Vec, + len: usize, +} + +#[allow(dead_code)] +impl RankBitVec { + fn new(len: usize) -> Self { + Self { + words: vec![0u64; len.div_ceil(64)], + superblocks: Vec::new(), + len, + } + } + + #[inline] + fn set(&mut self, pos: usize) { + self.words[pos / 64] |= 1u64 << (pos % 64); + } + + #[inline] + fn get(&self, pos: usize) -> bool { + (self.words[pos / 64] >> (pos % 64)) & 1 != 0 + } + + fn build_rank_index(&mut self) { + let num_sb = self.words.len().div_ceil(WORDS_PER_SUPERBLOCK) + 1; + self.superblocks = Vec::with_capacity(num_sb); + let mut cum = 0u32; + for (i, chunk) in self.words.chunks(WORDS_PER_SUPERBLOCK).enumerate() { + self.superblocks.push(if i == 0 { 0 } else { cum }); + for &w in chunk { + cum += w.count_ones(); + } + } + self.superblocks.push(cum); + } + + #[inline] + fn rank1(&self, pos: usize) -> usize { + if pos == 0 { + return 0; + } + let word_idx = pos / 64; + let bit_idx = pos % 64; + let sb_idx = word_idx / WORDS_PER_SUPERBLOCK; + let mut count = self.superblocks[sb_idx] as usize; + for i in (sb_idx * WORDS_PER_SUPERBLOCK)..word_idx { + count += self.words[i].count_ones() as usize; + } + if bit_idx > 0 { + count += (self.words[word_idx] & ((1u64 << bit_idx) - 1)).count_ones() as usize; + } + count + } + + #[inline] + fn rank0(&self, pos: usize) -> usize { + pos - self.rank1(pos) + } + + fn deep_size(&self) -> usize { + self.words.len() * 8 + self.superblocks.len() * 4 + } +} + +// ── Huffman-shaped Wavelet Tree ────────────────────────────────────────────── + +#[derive(Debug, Clone, Default)] +struct HuffmanCode { + bits: u32, + length: u8, + 
node_path: Vec, +} + +#[derive(Debug, Clone)] +enum WaveletChild { + Node(usize), + Leaf(u8), +} + +#[derive(Debug, Clone)] +struct HuffmanWaveletTree { + nodes: Vec, + codes: [HuffmanCode; 256], + children: Vec<(WaveletChild, WaveletChild)>, + len: usize, +} + +#[derive(Debug)] +enum HuffNode { + Leaf(u8), + Internal { left: Box, right: Box }, +} + +impl PartialEq for HuffNode { + fn eq(&self, _: &Self) -> bool { + true + } +} +impl Eq for HuffNode {} +impl PartialOrd for HuffNode { + fn partial_cmp(&self, o: &Self) -> Option { + Some(self.cmp(o)) + } +} +impl Ord for HuffNode { + fn cmp(&self, _: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +#[allow(dead_code)] +impl HuffmanWaveletTree { + fn build(data: &[u8]) -> Self { + let n = data.len(); + if n == 0 { + return Self { + nodes: Vec::new(), + codes: std::array::from_fn(|_| HuffmanCode::default()), + children: Vec::new(), + len: 0, + }; + } + + let mut freq = [0u64; 256]; + for &b in data { + freq[b as usize] += 1; + } + + let mut heap: BinaryHeap<(Reverse, Reverse, Box)> = BinaryHeap::new(); + let mut tie = 0; + for (v, &f) in freq.iter().enumerate() { + if f > 0 { + heap.push((Reverse(f), Reverse(tie), Box::new(HuffNode::Leaf(v as u8)))); + tie += 1; + } + } + if heap.len() == 1 { + let (f, _, node) = heap.pop().unwrap(); + heap.push((Reverse(0), Reverse(tie), Box::new(HuffNode::Leaf(255)))); + tie += 1; + heap.push((f, Reverse(tie), node)); + tie += 1; + } + while heap.len() > 1 { + let (Reverse(f1), _, l) = heap.pop().unwrap(); + let (Reverse(f2), _, r) = heap.pop().unwrap(); + heap.push(( + Reverse(f1 + f2), + Reverse(tie), + Box::new(HuffNode::Internal { left: l, right: r }), + )); + tie += 1; + } + let root = heap.pop().unwrap().2; + + let mut codes: [HuffmanCode; 256] = std::array::from_fn(|_| HuffmanCode::default()); + let mut node_count = 0; + let mut children_map: Vec<(WaveletChild, WaveletChild)> = Vec::new(); + + fn assign( + node: &HuffNode, + bits: u32, + len: u8, + path: 
&mut Vec, + nid: &mut usize, + codes: &mut [HuffmanCode; 256], + cm: &mut Vec<(WaveletChild, WaveletChild)>, + ) -> WaveletChild { + match node { + HuffNode::Leaf(b) => { + codes[*b as usize] = HuffmanCode { + bits, + length: len, + node_path: path.clone(), + }; + WaveletChild::Leaf(*b) + } + HuffNode::Internal { left, right } => { + let my = *nid; + *nid += 1; + path.push(my); + cm.push((WaveletChild::Leaf(0), WaveletChild::Leaf(0))); + let lc = assign(left, bits << 1, len + 1, path, nid, codes, cm); + let rc = assign(right, (bits << 1) | 1, len + 1, path, nid, codes, cm); + cm[my] = (lc, rc); + path.pop(); + WaveletChild::Node(my) + } + } + } + assign( + &root, + 0, + 0, + &mut Vec::new(), + &mut node_count, + &mut codes, + &mut children_map, + ); + + let mut node_sizes = vec![0usize; node_count]; + for &b in data { + for &nid in &codes[b as usize].node_path { + node_sizes[nid] += 1; + } + } + let mut nodes: Vec = node_sizes.iter().map(|&sz| RankBitVec::new(sz)).collect(); + let mut cursors = vec![0usize; node_count]; + for &b in data { + let code = &codes[b as usize]; + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 1 { + nodes[nid].set(cursors[nid]); + } + cursors[nid] += 1; + } + } + for n in &mut nodes { + n.build_rank_index(); + } + Self { + nodes, + codes, + children: children_map, + len: n, + } + } + + /// Retrieve the byte at position `pos` in the original BWT. 
+ #[inline] + fn access(&self, mut pos: usize) -> u8 { + if self.nodes.is_empty() { + return 0; + } + let mut node_idx = 0; + loop { + let bit = self.nodes[node_idx].get(pos); + let (ref left, ref right) = self.children[node_idx]; + if bit { + pos = self.nodes[node_idx].rank1(pos); + match right { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } else { + pos = self.nodes[node_idx].rank0(pos); + match left { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } + } + } + + /// Count occurrences of byte `c` in positions `[0, pos)`. + #[inline] + fn rank(&self, c: u8, pos: usize) -> usize { + let code = &self.codes[c as usize]; + if code.length == 0 { + return 0; + } + let (mut lo, mut hi) = (0, pos); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + lo = self.nodes[nid].rank0(lo); + hi = self.nodes[nid].rank0(hi); + } else { + lo = self.nodes[nid].rank1(lo); + hi = self.nodes[nid].rank1(hi); + } + } + hi - lo + } + + #[inline] + fn rank_pair(&self, c: u8, lo: usize, hi: usize) -> (usize, usize) { + let code = &self.codes[c as usize]; + if code.length == 0 { + return (0, 0); + } + let (mut s, mut l, mut h) = (0, lo, hi); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + s = self.nodes[nid].rank0(s); + l = self.nodes[nid].rank0(l); + h = self.nodes[nid].rank0(h); + } else { + s = self.nodes[nid].rank1(s); + l = self.nodes[nid].rank1(l); + h = self.nodes[nid].rank1(h); + } + } + (l - s, h - s) + } + + fn deep_size(&self) -> usize { + self.nodes.iter().map(|n| n.deep_size()).sum::() + + self + .codes + .iter() + .map(|c| c.node_path.len() * 8) + .sum::() + + self.children.len() * 24 + } +} + +// ── Suffix Array ───────────────────────────────────────────────────────────── + +fn build_suffix_array(text: &[u8]) -> Vec { + let n = text.len(); + 
if n == 0 { + return Vec::new(); + } + if n > i32::MAX as usize { + let mut sa = vec![0i64; n]; + assert_eq!(libsais_rs::libsais64(text, &mut sa, 0, None), 0); + sa.iter().map(|&x| x as usize).collect() + } else { + let mut sa = vec![0i32; n]; + assert_eq!(libsais_rs::libsais(text, &mut sa, 0, None), 0); + sa.iter().map(|&x| x as usize).collect() + } +} + +// ── Lazy Block Loading ─────────────────────────────────────────────────────── + +const BLOCK_BITS: usize = BLOCK_WORDS * 64; + +struct LazyRankBitVec { + prefix_ranks: Vec, + blocks: Vec>>, + reader: Arc, + block_row_offset: usize, + len: usize, +} + +impl std::fmt::Debug for LazyRankBitVec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LazyRankBitVec") + .field("len", &self.len) + .finish() + } +} + +impl LazyRankBitVec { + fn new( + prefix_ranks: Vec, + num_blocks: usize, + reader: Arc, + offset: usize, + len: usize, + ) -> Self { + Self { + prefix_ranks, + blocks: (0..num_blocks).map(|_| OnceLock::new()).collect(), + reader, + block_row_offset: offset, + len, + } + } + + /// Pre-load all blocks into memory. Call this before sync rank/access operations + /// to avoid the need for `block_in_place` during queries. 
+ async fn load_all_blocks(&self) -> Result<()> { + for (idx, lock) in self.blocks.iter().enumerate() { + if lock.get().is_none() { + let words = self.load_block(idx).await?; + let _ = lock.set(words); + } + } + Ok(()) + } + + #[inline] + fn ensure_block(&self, idx: usize) -> &[u64] { + self.blocks[idx].get_or_init(|| { + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current().block_on(self.load_block(idx)) + }) + .unwrap_or_else(|e| panic!("FM-Index block load failed: {e}")) + }) + } + + async fn load_block(&self, idx: usize) -> Result> { + let row = self.block_row_offset + idx; + let batch = self + .reader + .read_range(row..row + 1, Some(&["words"])) + .await?; + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("expected LargeBinary words column"))?; + Ok(col + .value(0) + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) + .collect()) + } + + #[inline] + fn rank1(&self, pos: usize) -> usize { + if pos == 0 { + return 0; + } + let bi = pos / BLOCK_BITS; + let local = pos % BLOCK_BITS; + if local == 0 { + return self.prefix_ranks[bi] as usize; + } + let mut count = self.prefix_ranks[bi] as usize; + let block = self.ensure_block(bi); + let wi = local / 64; + let bit = local % 64; + for w in &block[..wi] { + count += w.count_ones() as usize; + } + if bit > 0 { + count += (block[wi] & ((1u64 << bit) - 1)).count_ones() as usize; + } + count + } + + #[inline] + fn rank0(&self, pos: usize) -> usize { + pos - self.rank1(pos) + } + + #[inline] + fn get(&self, pos: usize) -> bool { + let bi = pos / BLOCK_BITS; + let local = pos % BLOCK_BITS; + let block = self.ensure_block(bi); + (block[local / 64] >> (local % 64)) & 1 != 0 + } + + fn deep_size(&self) -> usize { + let loaded: usize = self + .blocks + .iter() + .filter_map(|b| b.get()) + .map(|w| w.len() * 8) + .sum(); + self.prefix_ranks.len() * 8 + loaded + } +} + +struct LazyHuffmanWaveletTree { + nodes: Vec, + codes: [HuffmanCode; 256], 
+ children: Vec<(WaveletChild, WaveletChild)>, + len: usize, +} + +impl std::fmt::Debug for LazyHuffmanWaveletTree { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LazyHuffmanWaveletTree") + .field("len", &self.len) + .finish() + } +} + +impl LazyHuffmanWaveletTree { + /// Pre-load all wavelet tree blocks into memory. + async fn load_all(&self) -> Result<()> { + for node in &self.nodes { + node.load_all_blocks().await?; + } + Ok(()) + } + + #[inline] + fn access(&self, mut pos: usize) -> u8 { + if self.nodes.is_empty() { + return 0; + } + let mut node_idx = 0; + loop { + let bit = self.nodes[node_idx].get(pos); + let (ref left, ref right) = self.children[node_idx]; + if bit { + pos = self.nodes[node_idx].rank1(pos); + match right { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } else { + pos = self.nodes[node_idx].rank0(pos); + match left { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } + } + } + + #[inline] + fn rank(&self, c: u8, pos: usize) -> usize { + let code = &self.codes[c as usize]; + if code.length == 0 { + return 0; + } + let (mut lo, mut hi) = (0, pos); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + lo = self.nodes[nid].rank0(lo); + hi = self.nodes[nid].rank0(hi); + } else { + lo = self.nodes[nid].rank1(lo); + hi = self.nodes[nid].rank1(hi); + } + } + hi - lo + } + + #[inline] + fn rank_pair(&self, c: u8, lo: usize, hi: usize) -> (usize, usize) { + let code = &self.codes[c as usize]; + if code.length == 0 { + return (0, 0); + } + let (mut s, mut l, mut h) = (0, lo, hi); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + s = self.nodes[nid].rank0(s); + l = self.nodes[nid].rank0(l); + h = self.nodes[nid].rank0(h); + } else { + s = self.nodes[nid].rank1(s); + l = 
self.nodes[nid].rank1(l); + h = self.nodes[nid].rank1(h); + } + } + (l - s, h - s) + } + + fn deep_size(&self) -> usize { + self.nodes.iter().map(|n| n.deep_size()).sum::() + + self + .codes + .iter() + .map(|c| c.node_path.len() * 8) + .sum::() + } +} + +// ── FM-Index (in-memory, build-time) ───────────────────────────────────────── + +#[derive(Debug, Clone)] +pub struct FMIndex { + wavelet: HuffmanWaveletTree, + row_ids: Vec, + /// Sampled SA: sa_samples[i] = SA[i * SA_SAMPLE_RATE]. Size: N/D × 8 bytes. + sa_samples: Vec, + /// Starting byte offset of each document in the concatenated text. + doc_start_positions: Vec, + c_table: Vec, + alphabet_size: usize, +} + +impl DeepSizeOf for FMIndex { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { + self.wavelet.deep_size() + + self.row_ids.len() * 8 + + self.sa_samples.len() * 8 + + self.doc_start_positions.len() * 8 + + self.c_table.len() * std::mem::size_of::() + } +} + +#[allow(dead_code)] +impl FMIndex { + fn build(texts: &[(u64, &[u8])]) -> Result { + if texts.is_empty() { + return Ok(Self { + wavelet: HuffmanWaveletTree { + nodes: Vec::new(), + codes: std::array::from_fn(|_| HuffmanCode::default()), + children: Vec::new(), + len: 0, + }, + row_ids: Vec::new(), + sa_samples: Vec::new(), + doc_start_positions: Vec::new(), + c_table: vec![0; 257], + alphabet_size: 256, + }); + } + + let mut concat = Vec::new(); + let mut doc_row_ids = Vec::new(); + let mut doc_starts: Vec = Vec::new(); + for (row_id, text) in texts { + doc_starts.push(concat.len() as u64); + doc_row_ids.push(*row_id); + concat.extend_from_slice(text); + concat.push(SENTINEL_BYTE); // \xFF separator between documents + } + // Append unique terminator \x00 so SA-IS produces a proper suffix array + // with a single-cycle LF-mapping permutation. 
+ concat.push(0x00); + let n = concat.len(); + let sa = build_suffix_array(&concat); + + let bwt: Vec = sa + .iter() + .map(|&pos| { + if pos == 0 { + concat[n - 1] + } else { + concat[pos - 1] + } + }) + .collect(); + + let mut counts = vec![0usize; 257]; + for &b in &concat { + counts[b as usize + 1] += 1; + } + for i in 1..257 { + counts[i] += counts[i - 1]; + } + + // Sampled SA: store every D-th entry + let sa_samples: Vec = sa + .iter() + .step_by(SA_SAMPLE_RATE) + .map(|&pos| pos as u64) + .collect(); + + let wavelet = HuffmanWaveletTree::build(&bwt); + + Ok(Self { + wavelet, + row_ids: doc_row_ids, + sa_samples, + doc_start_positions: doc_starts, + c_table: counts, + alphabet_size: 256, + }) + } + + /// Locate: resolve SA[pos] by walking LF-mapping until hitting a sampled position. + /// For large data (N >> SA_SAMPLE_RATE), converges within SA_SAMPLE_RATE steps. + /// For small data with short LF cycles, may need up to N steps. + #[inline] + fn locate(&self, mut pos: usize) -> usize { + let mut steps = 0; + let n = self.wavelet.len; + loop { + if pos.is_multiple_of(SA_SAMPLE_RATE) && (pos / SA_SAMPLE_RATE) < self.sa_samples.len() + { + return (self.sa_samples[pos / SA_SAMPLE_RATE] as usize + steps) % n; + } + let c = self.wavelet.access(pos); + pos = self.c_table[c as usize] + self.wavelet.rank(c, pos); + steps += 1; + if steps >= n { + log::warn!("FM-Index SA locate exceeded {n} steps, possible index corruption"); + return 0; + } + } + } + + /// Map a text position to document index via binary search on doc_start_positions. 
+ #[inline] + fn doc_for_position(&self, text_pos: usize) -> usize { + let tp = text_pos as u64; + match self.doc_start_positions.binary_search(&tp) { + Ok(idx) => idx, + Err(idx) => idx - 1, + } + } + + fn backward_search(&self, pattern: &[u8]) -> (usize, usize) { + if pattern.is_empty() || self.wavelet.len == 0 { + return (0, 0); + } + let (mut lo, mut hi) = (0, self.wavelet.len); + for &b in pattern.iter().rev() { + let c = self.c_table[b as usize]; + let (occ_lo, occ_hi) = self.wavelet.rank_pair(b, lo, hi); + lo = c + occ_lo; + hi = c + occ_hi; + if lo >= hi { + return (0, 0); + } + } + (lo, hi) + } + + #[cfg(test)] + fn search(&self, pattern: &[u8]) -> RoaringBitmap { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return RoaringBitmap::new(); + } + let mut result = RoaringBitmap::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + result.insert(self.row_ids[doc_idx] as u32); + } + result + } + + /// Search returning full u64 row addresses (preserving fragment ID in upper bits). 
+ fn search_row_addrs(&self, pattern: &[u8]) -> Vec { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return Vec::new(); + } + let mut seen = std::collections::HashSet::new(); + let mut result = Vec::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + let row_addr = self.row_ids[doc_idx]; + if seen.insert(row_addr) { + result.push(row_addr); + } + } + result + } + + fn serialize_huffman_codes(&self) -> Vec { + let mut buf = Vec::new(); + for code in &self.wavelet.codes { + buf.extend_from_slice(&code.bits.to_le_bytes()); + buf.push(code.length); + buf.extend_from_slice(&(code.node_path.len() as u16).to_le_bytes()); + for &nid in &code.node_path { + buf.extend_from_slice(&(nid as u32).to_le_bytes()); + } + } + buf + } + + fn deserialize_huffman_codes(data: &[u8]) -> [HuffmanCode; 256] { + let mut codes: [HuffmanCode; 256] = std::array::from_fn(|_| HuffmanCode::default()); + let mut cur = 0; + for code in &mut codes { + let bits = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()); + cur += 4; + let length = data[cur]; + cur += 1; + let plen = u16::from_le_bytes(data[cur..cur + 2].try_into().unwrap()) as usize; + cur += 2; + let mut node_path = Vec::with_capacity(plen); + for _ in 0..plen { + node_path.push(u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()) as usize); + cur += 4; + } + *code = HuffmanCode { + bits, + length, + node_path, + }; + } + codes + } + + fn serialize_tree_topology(&self) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&(self.wavelet.children.len() as u32).to_le_bytes()); + for (left, right) in &self.wavelet.children { + for child in [left, right] { + match child { + WaveletChild::Node(id) => { + buf.push(0); + buf.extend_from_slice(&(*id as u32).to_le_bytes()); + } + WaveletChild::Leaf(b) => { + buf.push(1); + buf.extend_from_slice(&(*b as u32).to_le_bytes()); + } + } + } + } + buf + } + + fn deserialize_tree_topology(data: &[u8]) -> 
Vec<(WaveletChild, WaveletChild)> { + let mut cur = 0; + let count = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()) as usize; + cur += 4; + let mut children = Vec::with_capacity(count); + for _ in 0..count { + let mut read_child = || { + let t = data[cur]; + cur += 1; + let v = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()); + cur += 4; + if t == 0 { + WaveletChild::Node(v as usize) + } else { + WaveletChild::Leaf(v as u8) + } + }; + let l = read_child(); + let r = read_child(); + children.push((l, r)); + } + children + } + + fn serialize_c_table(&self) -> Vec { + self.c_table + .iter() + .flat_map(|&v| (v as u64).to_le_bytes()) + .collect() + } + + fn deserialize_c_table(data: &[u8]) -> Vec { + data.chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap()) as usize) + .collect() + } + + fn u64_to_bytes(data: &[u64]) -> Vec { + data.iter().flat_map(|v| v.to_le_bytes()).collect() + } + + fn build_wavelet_batch(&self) -> Result { + use arrow_array::{LargeBinaryArray, UInt32Array, UInt64Array}; + let mut nid_b = Vec::new(); + let mut bid_b = Vec::new(); + let mut words_b: Vec> = Vec::new(); + let mut pr_b = Vec::new(); + let mut bl_b = Vec::new(); + + for (i, node) in self.wavelet.nodes.iter().enumerate() { + let mut pr: u64 = 0; + if node.words.is_empty() { + nid_b.push(i as u32); + bid_b.push(0u32); + words_b.push(Vec::new()); + pr_b.push(0u64); + bl_b.push(node.len as u64); + } else { + for (bi, chunk) in node.words.chunks(BLOCK_WORDS).enumerate() { + nid_b.push(i as u32); + bid_b.push(bi as u32); + words_b.push(Self::u64_to_bytes(chunk)); + pr_b.push(pr); + bl_b.push(node.len as u64); + pr += chunk.iter().map(|w| w.count_ones() as u64).sum::(); + } + } + } + let refs: Vec<&[u8]> = words_b.iter().map(|v| v.as_slice()).collect(); + let schema = Arc::new(Self::block_schema()); + Ok(RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt32Array::from(nid_b)), + Arc::new(UInt32Array::from(bid_b)), + 
Arc::new(LargeBinaryArray::from(refs)), + Arc::new(UInt64Array::from(pr_b)), + Arc::new(UInt64Array::from(bl_b)), + ], + )?) + } + + fn block_schema() -> arrow_schema::Schema { + arrow_schema::Schema::new(vec![ + Field::new("node_id", DataType::UInt32, false), + Field::new("block_id", DataType::UInt32, false), + Field::new("words", DataType::LargeBinary, false), + Field::new("prefix_rank", DataType::UInt64, false), + Field::new("bit_len", DataType::UInt64, false), + ]) + } +} + +// ── Lazy FM-Index ──────────────────────────────────────────────────────────── + +#[derive(Debug)] +struct LazyFMIndex { + wavelet: LazyHuffmanWaveletTree, + row_ids: Vec, + sa_samples: Vec, + doc_start_positions: Vec, + c_table: Vec, +} + +impl LazyFMIndex { + /// Pre-load all wavelet tree blocks before sync search operations. + async fn prewarm(&self) -> Result<()> { + self.wavelet.load_all().await + } + + fn backward_search(&self, pattern: &[u8]) -> (usize, usize) { + if pattern.is_empty() || self.wavelet.len == 0 { + return (0, 0); + } + let (mut lo, mut hi) = (0, self.wavelet.len); + for &b in pattern.iter().rev() { + let c = self.c_table[b as usize]; + let (occ_lo, occ_hi) = self.wavelet.rank_pair(b, lo, hi); + lo = c + occ_lo; + hi = c + occ_hi; + if lo >= hi { + return (0, 0); + } + } + (lo, hi) + } + + #[inline] + fn locate(&self, mut pos: usize) -> usize { + let mut steps = 0; + let n = self.wavelet.len; + loop { + if pos.is_multiple_of(SA_SAMPLE_RATE) && (pos / SA_SAMPLE_RATE) < self.sa_samples.len() + { + return (self.sa_samples[pos / SA_SAMPLE_RATE] as usize + steps) % n; + } + let c = self.wavelet.access(pos); + pos = self.c_table[c as usize] + self.wavelet.rank(c, pos); + steps += 1; + if steps >= n { + log::warn!("FM-Index SA locate exceeded {n} steps, possible index corruption"); + return 0; + } + } + } + + #[inline] + fn doc_for_position(&self, text_pos: usize) -> usize { + let tp = text_pos as u64; + match self.doc_start_positions.binary_search(&tp) { + Ok(idx) => idx, 
+ Err(idx) => idx - 1, + } + } + + #[cfg(test)] + fn search(&self, pattern: &[u8]) -> RoaringBitmap { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return RoaringBitmap::new(); + } + let mut result = RoaringBitmap::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + result.insert(self.row_ids[doc_idx] as u32); + } + result + } + + /// Search returning full u64 row addresses (preserving fragment ID in upper bits). + fn search_row_addrs(&self, pattern: &[u8]) -> Vec { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return Vec::new(); + } + let mut seen = std::collections::HashSet::new(); + let mut result = Vec::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + let row_addr = self.row_ids[doc_idx]; + if seen.insert(row_addr) { + result.push(row_addr); + } + } + result + } + + #[allow(clippy::too_many_arguments)] + async fn from_reader( + reader: Arc, + num_bwt_nodes: usize, + huffman_codes: [HuffmanCode; 256], + children: Vec<(WaveletChild, WaveletChild)>, + c_table: Vec, + bwt_len: usize, + total_wavelet_rows: usize, + num_sa_blocks: usize, + sa_samples_len: usize, + row_ids: Vec, + doc_start_positions: Vec, + ) -> Result { + use arrow_array::UInt64Array; + + let meta = reader + .read_range( + 0..total_wavelet_rows, + Some(&["node_id", "prefix_rank", "bit_len"]), + ) + .await?; + let nid_col = meta + .column_by_name("node_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let pr_col = meta + .column_by_name("prefix_rank") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let bl_col = meta + .column_by_name("bit_len") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + struct NM { + prs: Vec, + offset: usize, + blen: usize, + } + let mut nms: Vec = (0..num_bwt_nodes) + .map(|_| NM { + prs: Vec::new(), + offset: 0, + blen: 0, + }) + .collect(); + for row in 0..meta.num_rows() { + let 
nid = nid_col.value(row) as usize; + if nid >= num_bwt_nodes { + continue; + } + let nm = &mut nms[nid]; + if nm.prs.is_empty() { + nm.offset = row; + } + nm.prs.push(pr_col.value(row)); + nm.blen = bl_col.value(row) as usize; + } + + let mut bwt_nodes = Vec::with_capacity(num_bwt_nodes); + for nm in &nms { + bwt_nodes.push(LazyRankBitVec::new( + nm.prs.clone(), + nm.prs.len(), + reader.clone(), + nm.offset, + nm.blen, + )); + } + let wavelet = LazyHuffmanWaveletTree { + nodes: bwt_nodes, + codes: huffman_codes, + children, + len: bwt_len, + }; + + // Read SA samples from packed binary blocks + let mut sa_samples = Vec::with_capacity(sa_samples_len); + let sa_batch = reader + .read_range( + total_wavelet_rows..total_wavelet_rows + num_sa_blocks, + Some(&["words"]), + ) + .await?; + let words_col = sa_batch + .column_by_name("words") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..sa_batch.num_rows() { + let raw = words_col.value(i); + for chunk in raw.chunks_exact(8) { + sa_samples.push(u64::from_le_bytes(chunk.try_into().unwrap())); + } + } + sa_samples.truncate(sa_samples_len); + + Ok(Self { + wavelet, + row_ids, + sa_samples, + doc_start_positions, + c_table, + }) + } + + fn deep_size(&self) -> usize { + self.wavelet.deep_size() + + self.row_ids.len() * 8 + + self.sa_samples.len() * 8 + + self.doc_start_positions.len() * 8 + + self.c_table.len() * std::mem::size_of::() + } +} + +// ── FMIndexScalarIndex ─────────────────────────────────────────────────────── + +#[derive(Debug)] +struct FMIndexPartition { + #[allow(dead_code)] + id: u64, + fm: LazyFMIndex, +} + +#[derive(Debug)] +pub struct FMIndexScalarIndex { + partitions: Vec>, +} + +impl DeepSizeOf for FMIndexScalarIndex { + fn deep_size_of_children(&self, _ctx: &mut lance_core::deepsize::Context) -> usize { + self.partitions.iter().map(|p| p.fm.deep_size()).sum() + } +} + +impl FMIndexScalarIndex { + async fn load_partition( + store: &dyn IndexStore, + filename: &str, + pid: u64, + ) 
-> Result { + let reader = store.open_index_file(filename).await?; + let md = &reader.schema().metadata; + + let parse = |key: &str| -> Result { + md.get(key) + .ok_or_else(|| Error::invalid_input(format!("missing {key}")))? + .parse() + .map_err(|e| Error::invalid_input(format!("invalid {key}: {e}"))) + }; + + let num_bwt_nodes = parse("num_bwt_nodes")?; + let bwt_len = parse("bwt_len")?; + let num_sa_blocks = parse("num_sa_blocks")?; + let sa_samples_len = parse("sa_samples_len")?; + let total_wavelet_rows = parse("total_wavelet_rows")?; + + let c_table = FMIndex::deserialize_c_table(&hex_decode( + md.get("c_table") + .ok_or_else(|| Error::invalid_input("missing c_table"))?, + )?); + let huffman_codes = FMIndex::deserialize_huffman_codes(&hex_decode( + md.get("huffman_codes") + .ok_or_else(|| Error::invalid_input("missing huffman_codes"))?, + )?); + let children = FMIndex::deserialize_tree_topology(&hex_decode( + md.get("tree_topology") + .ok_or_else(|| Error::invalid_input("missing tree_topology"))?, + )?); + + // row_ids and doc_start_positions stored in metadata (small) + let row_ids_hex = md + .get("row_ids") + .ok_or_else(|| Error::invalid_input("missing row_ids"))?; + let row_ids_bytes = hex_decode(row_ids_hex)?; + let row_ids: Vec = row_ids_bytes + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) + .collect(); + + let doc_starts_hex = md + .get("doc_start_positions") + .ok_or_else(|| Error::invalid_input("missing doc_start_positions"))?; + let doc_starts_bytes = hex_decode(doc_starts_hex)?; + let doc_start_positions: Vec = doc_starts_bytes + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) + .collect(); + + let fm = Box::pin(LazyFMIndex::from_reader( + reader, + num_bwt_nodes, + huffman_codes, + children, + c_table, + bwt_len, + total_wavelet_rows, + num_sa_blocks, + sa_samples_len, + row_ids, + doc_start_positions, + )) + .await?; + Ok(FMIndexPartition { id: pid, fm }) + } + + async fn load( + store: Arc, + 
_fri: Option>, + _cache: &LanceCache, + ) -> Result> { + let files = store.list_files_with_sizes().await?; + let mut pfiles: Vec<(u64, String)> = Vec::new(); + for f in &files { + if let Some(id) = f + .path + .strip_prefix("part_") + .and_then(|r| r.strip_suffix("_fm.lance")) + .and_then(|s| s.parse::().ok()) + { + pfiles.push((id, f.path.clone())); + } + } + if pfiles.is_empty() { + return Err(Error::invalid_input("no FM-Index partition files found")); + } + pfiles.sort_by_key(|(id, _)| *id); + let mut parts = Vec::with_capacity(pfiles.len()); + for (id, name) in &pfiles { + parts.push(Arc::new( + Self::load_partition(store.as_ref(), name, *id).await?, + )); + } + Ok(Arc::new(Self { partitions: parts })) + } +} + +#[async_trait] +impl Index for FMIndexScalarIndex { + fn as_any(&self) -> &dyn std::any::Any { + self + } + fn as_index(self: Arc) -> Arc { + self + } + async fn prewarm(&self) -> Result<()> { + Ok(()) + } + fn statistics(&self) -> Result { + Ok(serde_json::json!({ + "type": "Fm", + "num_partitions": self.partitions.len(), + "total_bwt_len": self.partitions.iter().map(|p| p.fm.wavelet.len).sum::(), + "total_docs": self.partitions.iter().map(|p| p.fm.row_ids.len()).sum::(), + })) + } + fn index_type(&self) -> IndexType { + IndexType::Fm + } + async fn calculate_included_frags(&self) -> Result { + let mut frags = RoaringBitmap::new(); + for p in &self.partitions { + for &rid in &p.fm.row_ids { + frags.insert((rid >> 32) as u32); + } + } + Ok(frags) + } +} + +#[async_trait] +impl ScalarIndex for FMIndexScalarIndex { + async fn search( + &self, + query: &dyn AnyQuery, + _metrics: &dyn MetricsCollector, + ) -> Result { + let tq = query + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Fm only supports TextQuery"))?; + match tq { + TextQuery::StringContains(pattern) => { + let pb = pattern.as_bytes(); + use lance_select::RowAddrTreeMap; + let mut tree = RowAddrTreeMap::new(); + for p in &self.partitions { + p.fm.prewarm().await?; + for 
row_addr in p.fm.search_row_addrs(pb) { + tree.insert(row_addr); + } + } + Ok(SearchResult::Exact(lance_select::NullableRowAddrSet::new( + tree, + Default::default(), + ))) + } + // Regex queries are routed only to the ngram index (the FM-index's + // query parser advertises `supports_regex = false`), so this is + // unreachable in practice; reject it explicitly rather than silently. + TextQuery::Regex(_) => Err(Error::invalid_input( + "FMIndex does not support regular expression queries", + )), + } + } + fn can_remap(&self) -> bool { + false + } + async fn remap( + &self, + _: &HashMap>, + _: &dyn IndexStore, + ) -> Result { + Err(Error::not_supported("Fm does not support remap")) + } + async fn update( + &self, + new_data: SendableRecordBatchStream, + dest: &dyn IndexStore, + _old_data_filter: Option, + ) -> Result { + let files = write_partitioned_fmindex_stream(new_data, dest).await?; + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), + index_version: FMINDEX_INDEX_VERSION, + files, + }) + } + fn update_criteria(&self) -> UpdateCriteria { + UpdateCriteria::requires_old_data( + TrainingCriteria::new(TrainingOrdering::None).with_row_addr(), + ) + } + fn derive_index_params(&self) -> Result { + Ok(ScalarIndexParams::for_builtin(BuiltinIndexType::Fm)) + } +} + +// ── Helpers ────────────────────────────────────────────────────────────────── + +async fn write_partitioned_fmindex_stream( + mut stream: SendableRecordBatchStream, + store: &dyn IndexStore, +) -> Result> { + let mut files = Vec::new(); + let mut partition = Vec::with_capacity(PARTITION_SIZE); + let mut partition_id = 0; + + while let Some(batch) = stream.next().await { + let batch = batch?; + // Prefer _rowaddr (global row address) over _rowid to ensure stable, + // globally unique identifiers across segments. 
+ let row_addrs: &arrow_array::UInt64Array = batch + .column_by_name(ROW_ADDR) + .or_else(|| batch.column_by_name("_rowid")) + .and_then(|c| c.as_any().downcast_ref()) + .ok_or_else(|| { + Error::invalid_input("Fm training data must include _rowaddr or _rowid column") + })?; + // Use the named value column; fall back to column(0) for legacy streams + let value_col = batch + .column_by_name(VALUE_COLUMN_NAME) + .unwrap_or_else(|| batch.column(0)); + for i in 0..batch.num_rows() { + let rid = row_addrs.value(i); + if let Some(bytes) = extract_sanitized_text_bytes(value_col.as_ref(), i)? { + partition.push((rid, bytes)); + if partition.len() == PARTITION_SIZE { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + partition.clear(); + partition_id += 1; + } + } + } + } + + if !partition.is_empty() { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + } else if files.is_empty() { + files.push(write_empty_fmindex_partition(store).await?); + } + + Ok(files) +} + +fn sanitize_text_bytes(bytes: &[u8]) -> Vec { + bytes + .iter() + .map(|&b| { + if b == SENTINEL_BYTE || b == 0x00 { + b' ' + } else { + b + } + }) + .collect() +} + +fn extract_sanitized_text_bytes( + array: &dyn arrow_array::Array, + index: usize, +) -> Result>> { + if array.is_null(index) { + return Ok(None); + } + match array.data_type() { + DataType::Utf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::LargeUtf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::Binary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + DataType::LargeBinary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + _ => Err(Error::invalid_input(format!( + "Fm does not support data type: {:?}", + 
array.data_type() + ))), + } +} + +#[cfg(test)] +fn extract_text_bytes(array: &dyn arrow_array::Array, index: usize) -> Result>> { + if array.is_null(index) { + return Ok(None); + } + match array.data_type() { + DataType::Utf8 => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes() + .to_vec(), + )), + DataType::LargeUtf8 => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes() + .to_vec(), + )), + DataType::Binary => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .to_vec(), + )), + DataType::LargeBinary => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .to_vec(), + )), + _ => Err(Error::invalid_input(format!( + "Fm does not support data type: {:?}", + array.data_type() + ))), + } +} + +fn hex_encode(data: &[u8]) -> String { + data.iter().map(|b| format!("{b:02x}")).collect() +} +fn hex_decode(s: &str) -> Result> { + if !s.len().is_multiple_of(2) { + return Err(Error::invalid_input("invalid hex length")); + } + (0..s.len()) + .step_by(2) + .map(|i| { + u8::from_str_radix(&s[i..i + 2], 16) + .map_err(|e| Error::invalid_input(format!("invalid hex: {e}"))) + }) + .collect() +} + +/// Write an FM-Index partition to storage. +/// +/// Layout: +/// - Wavelet block rows (BWT nodes) +/// - SA sample blocks (packed u64 in LargeBinary) +/// - Metadata: c_table, huffman_codes, tree_topology, row_ids, doc_start_positions +async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> Result { + let schema = Arc::new(FMIndex::block_schema()); + + let mut writer = store.new_index_file(filename, schema.clone()).await?; + + // 1. Wavelet blocks + let wb = fm.build_wavelet_batch()?; + let nw = wb.num_rows(); + writer.write_record_batch(wb).await?; + + // 2. 
SA samples packed as binary blocks + let u64s_per_block = BLOCK_WORDS; // 4096 u64s per block = 32KB + let mut sa_nid = Vec::new(); + let mut sa_bid = Vec::new(); + let mut sa_words: Vec> = Vec::new(); + let mut sa_pr = Vec::new(); + let mut sa_bl = Vec::new(); + for (bi, chunk) in fm.sa_samples.chunks(u64s_per_block).enumerate() { + sa_nid.push(u32::MAX); + sa_bid.push(bi as u32); + sa_words.push(FMIndex::u64_to_bytes(chunk)); + sa_pr.push(0u64); + sa_bl.push(fm.sa_samples.len() as u64); + } + let num_sa_blocks = sa_nid.len(); + if num_sa_blocks > 0 { + let refs: Vec<&[u8]> = sa_words.iter().map(|v| v.as_slice()).collect(); + writer + .write_record_batch(RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow_array::UInt32Array::from(sa_nid)), + Arc::new(arrow_array::UInt32Array::from(sa_bid)), + Arc::new(arrow_array::LargeBinaryArray::from(refs)), + Arc::new(arrow_array::UInt64Array::from(sa_pr)), + Arc::new(arrow_array::UInt64Array::from(sa_bl)), + ], + )?) + .await?; + } + + // Metadata + let mut metadata = HashMap::new(); + metadata.insert("num_bwt_nodes".into(), fm.wavelet.nodes.len().to_string()); + metadata.insert("bwt_len".into(), fm.wavelet.len.to_string()); + metadata.insert("num_sa_blocks".into(), num_sa_blocks.to_string()); + metadata.insert("sa_samples_len".into(), fm.sa_samples.len().to_string()); + metadata.insert("total_wavelet_rows".into(), nw.to_string()); + metadata.insert("sa_sample_rate".into(), SA_SAMPLE_RATE.to_string()); + metadata.insert("alphabet_size".into(), fm.alphabet_size.to_string()); + metadata.insert("c_table".into(), hex_encode(&fm.serialize_c_table())); + metadata.insert( + "huffman_codes".into(), + hex_encode(&fm.serialize_huffman_codes()), + ); + metadata.insert( + "tree_topology".into(), + hex_encode(&fm.serialize_tree_topology()), + ); + // row_ids in metadata (10K × 8 = 80KB per partition — small) + let row_ids_bytes: Vec = fm.row_ids.iter().flat_map(|&v| v.to_le_bytes()).collect(); + 
metadata.insert("row_ids".into(), hex_encode(&row_ids_bytes)); + // doc_start_positions in metadata (10K × 8 = 80KB per partition — small) + let doc_starts_bytes: Vec = fm + .doc_start_positions + .iter() + .flat_map(|&v| v.to_le_bytes()) + .collect(); + metadata.insert("doc_start_positions".into(), hex_encode(&doc_starts_bytes)); + + writer.finish_with_metadata(metadata).await +} + +#[cfg(test)] +async fn write_partitioned_fmindex( + texts: &[(u64, Vec)], + store: &dyn IndexStore, +) -> Result> { + if texts.is_empty() { + return Ok(vec![write_empty_fmindex_partition(store).await?]); + } + let mut files = Vec::new(); + for (pid, chunk) in texts.chunks(PARTITION_SIZE).enumerate() { + files.push(write_fmindex_partition(chunk, store, pid as u64).await?); + } + Ok(files) +} + +async fn write_fmindex_partition( + texts: &[(u64, Vec)], + store: &dyn IndexStore, + partition_id: u64, +) -> Result { + let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); + let fm = FMIndex::build(&refs)?; + write_fmindex(&fm, store, &fmindex_partition_path(partition_id)).await +} + +async fn write_empty_fmindex_partition(store: &dyn IndexStore) -> Result { + let fm = FMIndex::build(&[])?; + write_fmindex(&fm, store, &fmindex_partition_path(0)).await +} + +// ── Plugin ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default)] +pub struct FMIndexPlugin; + +#[async_trait] +impl ScalarIndexPlugin for FMIndexPlugin { + fn name(&self) -> &str { + "Fm" + } + fn new_training_request( + &self, + _params: &str, + field: &Field, + ) -> Result> { + match field.data_type() { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {} + _ => { + return Err(Error::invalid_input(format!( + "FM-Index does not support {:?}", + field.data_type() + ))); + } + } + Ok(Box::new(DefaultTrainingRequest::new( + TrainingCriteria::new(TrainingOrdering::None).with_row_addr(), + ))) + } + async fn train_index( + &self, 
+ data: SendableRecordBatchStream, + store: &dyn IndexStore, + _req: Box, + _fids: Option>, + _progress: Arc, + ) -> Result { + let files = write_partitioned_fmindex_stream(data, store).await?; + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), + index_version: FMINDEX_INDEX_VERSION, + files, + }) + } + fn provides_exact_answer(&self) -> bool { + true + } + fn version(&self) -> u32 { + FMINDEX_INDEX_VERSION + } + fn new_query_parser( + &self, + index_name: String, + _details: &prost_types::Any, + ) -> Option> { + Some(Box::new(TextQueryParser::new( + index_name, + self.name().to_string(), + // needs_recheck: the FM-index returns exact substring matches. + false, + // supports_regex: regex acceleration is only implemented for ngram. + false, + ))) + } + async fn load_index( + &self, + store: Arc, + details: &prost_types::Any, + fri: Option>, + cache: &LanceCache, + ) -> Result> { + let _ = details + .to_msg::() + .unwrap_or_default(); + Ok(FMIndexScalarIndex::load(store, fri, cache).await? 
as Arc) + } + async fn load_statistics( + &self, + _: Arc, + _: &prost_types::Any, + ) -> Result> { + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, UInt64Array}; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use futures::stream; + use lance_core::{ROW_ADDR, cache::LanceCache}; + use lance_io::object_store::ObjectStore; + use object_store::path::Path; + use std::sync::Arc; + + use crate::scalar::lance_format::LanceIndexStore; + + #[test] + fn test_fmindex_build_and_search() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"hello world"), + (1, b"hello rust"), + (2, b"goodbye world"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let r = fm.search(b"hello"); + assert!(r.contains(0)); + assert!(r.contains(1)); + assert!(!r.contains(2)); + + let r = fm.search(b"world"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + let r = fm.search(b"goodbye"); + assert!(!r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + assert!(fm.search(b"xyz").is_empty()); + } + + #[test] + fn test_fmindex_empty() { + let fm = FMIndex::build(&[]).unwrap(); + assert!(fm.search(b"anything").is_empty()); + } + + #[test] + fn test_fmindex_single_char_search() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"abc"), (1, b"def")]; + let fm = FMIndex::build(&texts).unwrap(); + assert!(fm.search(b"a").contains(0)); + assert!(!fm.search(b"a").contains(1)); + assert!(!fm.search(b"d").contains(0)); + assert!(fm.search(b"d").contains(1)); + } + + #[test] + fn test_fmindex_repeated_pattern() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"ababab"), (1, b"cdcd")]; + let fm = FMIndex::build(&texts).unwrap(); + assert!(fm.search(b"ab").contains(0)); + assert!(!fm.search(b"ab").contains(1)); + assert!(!fm.search(b"cd").contains(0)); + assert!(fm.search(b"cd").contains(1)); + } + + #[test] + fn test_early_exit_all_docs_match() { + let 
texts: Vec<(u64, &[u8])> = vec![(0, b"the cat"), (1, b"the dog"), (2, b"the bird")]; + let fm = FMIndex::build(&texts).unwrap(); + assert_eq!(fm.search(b"the").len(), 3); + } + + #[test] + fn test_locate_correctness() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"the quick brown fox jumps over the lazy dog"), + (1, b"pack my box with five dozen liquor jugs"), + (2, b"how vexingly quick daft zebras jump"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let r = fm.search(b"quick"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + let r = fm.search(b"the"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(!r.contains(2)); + + let r = fm.search(b"jump"); + assert!(r.contains(0)); + assert!(r.contains(2)); + } + + #[test] + fn test_many_documents() { + let docs: Vec> = (0..100) + .map(|i| format!("document number {} with hello world data xyz", i).into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + assert_eq!(fm.search(b"hello world").len(), 100); + assert_eq!(fm.search(b"document number 42").len(), 1); + assert_eq!(fm.search(b"nonexistent").len(), 0); + } + + #[test] + fn test_index_size_ratio() { + let docs: Vec> = (0..200) + .map(|i| { + format!( + "document {} with enough text to test size ratio properly end", + i + ) + .into_bytes() + }) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + let text_size: usize = docs.iter().map(|d| d.len()).sum(); + let wavelet_size = fm.wavelet.deep_size(); + let sa_size = fm.sa_samples.len() * 8; + let total = wavelet_size + sa_size; + + let ratio = total as f64 / text_size as f64; + assert!( + ratio < 1.5, + "index should be much smaller than text, got ratio={ratio:.2}" + ); + } + + #[test] + fn 
test_wavelet_access_consistency() { + let docs: Vec> = (0..50) + .map(|i| format!("document {i} hello world test").into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + + let mut concat = Vec::new(); + for (_, text) in &texts { + concat.extend_from_slice(text); + concat.push(SENTINEL_BYTE); + } + concat.push(0x00); + let sa = build_suffix_array(&concat); + let n = concat.len(); + let bwt: Vec = sa + .iter() + .map(|&pos| { + if pos == 0 { + concat[n - 1] + } else { + concat[pos - 1] + } + }) + .collect(); + let wavelet = HuffmanWaveletTree::build(&bwt); + + for (i, &expected) in bwt.iter().enumerate().take(n.min(500)) { + assert_eq!(wavelet.access(i), expected, "access mismatch at {i}"); + } + } + + #[test] + fn test_serialization_roundtrip() { + let texts: Vec<(u64, &[u8])> = vec![ + (10, b"alpha beta gamma"), + (20, b"beta gamma delta"), + (30, b"gamma delta epsilon"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + // Test huffman codes roundtrip + let hc_bytes = fm.serialize_huffman_codes(); + let hc = FMIndex::deserialize_huffman_codes(&hc_bytes); + for (i, (loaded, original)) in hc.iter().zip(fm.wavelet.codes.iter()).enumerate() { + assert_eq!(loaded.bits, original.bits, "bits mismatch at {i}"); + assert_eq!(loaded.length, original.length, "length mismatch at {i}"); + assert_eq!(loaded.node_path, original.node_path, "path mismatch at {i}"); + } + + // Test tree topology roundtrip + let topo_bytes = fm.serialize_tree_topology(); + let topo = FMIndex::deserialize_tree_topology(&topo_bytes); + assert_eq!(topo.len(), fm.wavelet.children.len()); + + // Test c_table roundtrip + let ct_bytes = fm.serialize_c_table(); + let ct = FMIndex::deserialize_c_table(&ct_bytes); + assert_eq!(ct, fm.c_table); + } + + #[test] + fn test_hex_roundtrip() { + let data = vec![0u8, 1, 127, 255, 42]; + let encoded = hex_encode(&data); + let decoded = hex_decode(&encoded).unwrap(); + 
assert_eq!(data, decoded); + } + + #[test] + fn test_sentinel_sanitization() { + // Text containing \xFF should be sanitized to space during training. + let texts: Vec<(u64, &[u8])> = vec![(0, b"hello\xFFworld")]; + let fm = FMIndex::build(&texts).unwrap(); + // Build itself does not sanitize, but search should still work. + let r = fm.search(b"hello"); + assert!(r.contains(0)); + } + + #[test] + fn test_wavelet_rank_pair_consistency() { + let docs: Vec> = (0..30) + .map(|i| format!("doc {i} with repeated words hello world test data").into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + let n = fm.wavelet.len; + for b in [b'a', b'e', b' ', SENTINEL_BYTE] { + for &(lo, hi) in &[(0usize, 1usize), (0, n), (n / 4, n / 2)] { + if lo >= n || hi > n || lo >= hi { + continue; + } + let (pl, ph) = fm.wavelet.rank_pair(b, lo, hi); + let rl = fm.wavelet.rank(b, lo); + let rh = fm.wavelet.rank(b, hi); + assert_eq!(pl, rl, "rank_pair lo mismatch for b={b} [{lo},{hi})"); + assert_eq!(ph, rh, "rank_pair hi mismatch for b={b} [{lo},{hi})"); + } + } + } + + #[test] + fn test_large_sa_sampling() { + // Test with enough documents to have multiple SA sample points + let docs: Vec> = (0..50) + .map(|i| { + format!( + "document number {} with lots of text to ensure we have enough bytes for multiple SA samples across the suffix array positions", + i + ) + .into_bytes() + }) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + assert!(fm.sa_samples.len() > 1, "should have multiple SA samples"); + assert_eq!(fm.search(b"document number 25").len(), 1); + assert_eq!(fm.search(b"document number").len(), 50); + assert_eq!(fm.search(b"nonexistent pattern").len(), 0); + } + + #[tokio::test(flavor = "multi_thread")] + async fn 
test_write_and_load_roundtrip() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"hello world foo bar"), + (1, b"hello rust baz qux"), + (2, b"goodbye world quux"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + // Write + write_fmindex(&fm, store.as_ref(), &fmindex_partition_path(0)) + .await + .unwrap(); + + // Load + let part = + FMIndexScalarIndex::load_partition(store.as_ref(), &fmindex_partition_path(0), 0) + .await + .unwrap(); + + // Verify search results match + let r = part.fm.search(b"hello"); + assert!(r.contains(0)); + assert!(r.contains(1)); + assert!(!r.contains(2)); + + let r = part.fm.search(b"world"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + assert!(part.fm.search(b"xyz").is_empty()); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_partitioned_write_and_load() { + let docs: Vec> = (0..30) + .map(|i| format!("document {i} hello world test data").into_bytes()) + .collect(); + let texts: Vec<(u64, Vec)> = docs + .into_iter() + .enumerate() + .map(|(i, d)| (i as u64, d)) + .collect(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + write_partitioned_fmindex(&texts, store.as_ref()) + .await + .unwrap(); + + let index = FMIndexScalarIndex::load(store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + // Search across partitions + let r = index + .search( + &TextQuery::StringContains("hello world".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + 
assert_eq!(set.len(), Some(30)); + } + _ => panic!("expected exact result"), + } + + let r = index + .search( + &TextQuery::StringContains("document 15".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(1)); + } + _ => panic!("expected exact result"), + } + + let r = index + .search( + &TextQuery::StringContains("nonexistent".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(0)); + } + _ => panic!("expected exact result"), + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_plugin_train_and_load() { + let docs = vec!["hello world", "hello rust", "goodbye world"]; + let row_addrs: Vec = vec![0, 1, 2]; + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new( + crate::scalar::registry::VALUE_COLUMN_NAME, + DataType::Utf8, + false, + ), + arrow_schema::Field::new(ROW_ADDR, DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(docs)), + Arc::new(UInt64Array::from(row_addrs)), + ], + ) + .unwrap(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + let stream = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)])); + let req = FMIndexPlugin + .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false)) + .unwrap(); + let created = FMIndexPlugin + .train_index( + Box::pin(stream), + store.as_ref(), + req, + None, + Arc::new(crate::progress::NoopIndexBuildProgress), + ) + .await + .unwrap(); + + let index = FMIndexPlugin + .load_index(store, &created.index_details, None, &LanceCache::no_cache()) + .await + 
.unwrap(); + + let r = index + .search( + &TextQuery::StringContains("hello".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(2)); + } + _ => panic!("expected exact result"), + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_plugin_train_streams_multiple_partitions() { + fn training_batch( + schema: Arc, + start: usize, + len: usize, + ) -> RecordBatch { + let docs = vec!["x"; len]; + let row_addrs: Vec = (start..start + len).map(|i| i as u64).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(docs)), + Arc::new(UInt64Array::from(row_addrs)), + ], + ) + .unwrap() + } + + let total_rows = PARTITION_SIZE + 5; + let first_batch_rows = PARTITION_SIZE - 3; + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new( + crate::scalar::registry::VALUE_COLUMN_NAME, + DataType::Utf8, + false, + ), + arrow_schema::Field::new(ROW_ADDR, DataType::UInt64, false), + ])); + let batches = vec![ + Ok(training_batch(schema.clone(), 0, first_batch_rows)), + Ok(training_batch( + schema.clone(), + first_batch_rows, + total_rows - first_batch_rows, + )), + ]; + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + let stream = RecordBatchStreamAdapter::new(schema, stream::iter(batches)); + let req = FMIndexPlugin + .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false)) + .unwrap(); + let created = FMIndexPlugin + .train_index( + Box::pin(stream), + store.as_ref(), + req, + None, + Arc::new(crate::progress::NoopIndexBuildProgress), + ) + .await + .unwrap(); + + assert_eq!(created.files.len(), 2); + + let index = FMIndexPlugin + .load_index(store, &created.index_details, None, 
&LanceCache::no_cache()) + .await + .unwrap(); + let r = index + .search( + &TextQuery::StringContains("x".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(total_rows as u64)); + } + _ => panic!("expected exact result"), + } + } + + #[test] + fn test_build_wavelet_batch() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"hello world"), (1, b"test data")]; + let fm = FMIndex::build(&texts).unwrap(); + let batch = fm.build_wavelet_batch().unwrap(); + assert!(batch.num_rows() > 0); + assert_eq!(batch.num_columns(), 5); + } + + #[test] + fn test_extract_text_bytes_types() { + let utf8 = StringArray::from(vec!["hello"]); + assert_eq!( + extract_text_bytes(&utf8, 0).unwrap(), + Some(b"hello".to_vec()) + ); + + let large_utf8 = LargeStringArray::from(vec!["world"]); + assert_eq!( + extract_text_bytes(&large_utf8, 0).unwrap(), + Some(b"world".to_vec()) + ); + + let binary = BinaryArray::from(vec![b"bytes" as &[u8]]); + assert_eq!( + extract_text_bytes(&binary, 0).unwrap(), + Some(b"bytes".to_vec()) + ); + let binary_with_sentinels = BinaryArray::from(vec![b"a\xFFb\0c" as &[u8]]); + assert_eq!( + extract_sanitized_text_bytes(&binary_with_sentinels, 0).unwrap(), + Some(b"a b c".to_vec()) + ); + + let large_binary = LargeBinaryArray::from(vec![b"large" as &[u8]]); + assert_eq!( + extract_text_bytes(&large_binary, 0).unwrap(), + Some(b"large".to_vec()) + ); + + // Null handling + let nullable = StringArray::from(vec![None::<&str>]); + assert_eq!(extract_text_bytes(&nullable, 0).unwrap(), None); + } + + #[test] + fn test_fmindex_statistics() { + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + rt.block_on(async { + let docs: Vec> = (0..10).map(|i| format!("doc {i}").into_bytes()).collect(); + let texts: Vec<(u64, Vec)> = docs + .into_iter() + .enumerate() + .map(|(i, d)| (i as u64, d)) + .collect(); + + let tempdir = tempfile::tempdir().unwrap(); 
+ let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + write_partitioned_fmindex(&texts, store.as_ref()) + .await + .unwrap(); + let index = FMIndexScalarIndex::load(store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let stats = index.statistics().unwrap(); + assert_eq!(stats["type"], "Fm"); + assert_eq!(stats["total_docs"], 10); + assert!(stats["total_bwt_len"].as_u64().unwrap() > 0); + }); + } +} diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs index 6adf4457f05..d0bb0e40d3a 100644 --- a/rust/lance-index/src/scalar/inverted.rs +++ b/rust/lance-index/src/scalar/inverted.rs @@ -7,6 +7,7 @@ mod encoding; mod index; mod iter; pub mod json; +mod lazy_docset; pub mod parser; pub mod query; mod scorer; @@ -150,11 +151,11 @@ impl InvertedIndexPlugin { let mut inverted_index = InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask) .with_progress(progress); - inverted_index.update(data, index_store, None).await?; + let files = inverted_index.update(data, index_store, None).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: current_fts_format_version().index_version(), - files: Some(index_store.list_files_with_sizes().await?), + files, }) } diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index b8b6b52c6af..17cb18c5e96 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -8,7 +8,7 @@ use crate::scalar::inverted::json::JsonTextStream; use crate::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer; #[cfg(test)] use crate::scalar::lance_format::LanceIndexStore; -use crate::scalar::{IndexStore, OldIndexDataFilter}; +use crate::scalar::{IndexFile, IndexStore, 
OldIndexDataFilter}; use crate::vector::graph::OrderedFloat; use crate::{progress::IndexBuildProgress, progress::noop_progress}; use arrow::array::AsArray; @@ -18,12 +18,12 @@ use arrow_schema::{DataType, Field, Schema, SchemaRef}; use bitpacking::{BitPacker, BitPacker4x}; use bytes::Bytes; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream}; -use deepsize::DeepSizeOf; use fst::Streamer; use futures::{Stream, StreamExt, TryStreamExt}; use lance_arrow::json::JSON_EXT_NAME; use lance_arrow::{ARROW_EXT_NAME_KEY, iter_str_array}; use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::utils::tokio::{IO_CORE_RESERVATION, get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; @@ -288,7 +288,7 @@ impl InvertedIndexBuilder { new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, old_data_filter: Option, - ) -> Result<()> { + ) -> Result> { let schema = new_data.schema(); let doc_col = schema.field(0).name(); @@ -305,15 +305,15 @@ impl InvertedIndexBuilder { self.progress .stage_start("tokenize_docs", None, "rows") .await?; - self.update_index(new_data, dest_store).await?; + let mut files = self.update_index(new_data, dest_store).await?; if let Some(OldIndexDataFilter::Fragments { to_remove, .. 
}) = old_data_filter { self.deleted_fragments.extend(to_remove); } self.progress.stage_complete("tokenize_docs").await?; - self.write(dest_store).await?; - Ok(()) + files.extend(self.write(dest_store).await?); + Ok(files) } pub async fn update_from_segments( @@ -322,7 +322,7 @@ impl InvertedIndexBuilder { dest_store: &dyn IndexStore, old_segments: &[Arc], old_data_filter: Option, - ) -> Result<()> { + ) -> Result> { let schema = new_data.schema(); let doc_col = schema.field(0).name(); @@ -332,7 +332,8 @@ impl InvertedIndexBuilder { self.params.lance_tokenizer = Some(doc_type.as_ref().to_string()); } - self.merge_existing_segments(dest_store, old_segments, old_data_filter.as_ref()) + let mut files = self + .merge_existing_segments(dest_store, old_segments, old_data_filter.as_ref()) .await?; let new_data = document_input(new_data, doc_col)?; @@ -340,11 +341,11 @@ impl InvertedIndexBuilder { self.progress .stage_start("tokenize_docs", None, "rows") .await?; - self.update_index(new_data, dest_store).await?; + files.extend(self.update_index(new_data, dest_store).await?); self.progress.stage_complete("tokenize_docs").await?; - self.write(dest_store).await?; - Ok(()) + files.extend(self.write(dest_store).await?); + Ok(files) } async fn merge_existing_segments( @@ -352,10 +353,11 @@ impl InvertedIndexBuilder { dest_store: &dyn IndexStore, old_segments: &[Arc], old_data_filter: Option<&crate::scalar::OldIndexDataFilter>, - ) -> Result<()> { + ) -> Result> { let num_workers = resolve_num_workers(&self.params); let memory_limit_bytes = resolve_worker_memory_limit_bytes(&self.params, num_workers); let mut merged: Option = None; + let mut files = Vec::new(); for index in old_segments { if old_data_filter.is_none() { self.deleted_fragments @@ -382,7 +384,7 @@ impl InvertedIndexBuilder { > u32::MAX as usize; if would_exceed_memory || would_exceed_doc_ids { let builder = std::mem::replace(merged, partition_builder); - self.write_new_partition(dest_store, builder).await?; + 
files.extend(self.write_new_partition(dest_store, builder).await?); } else { merged.merge_from(partition_builder)?; } @@ -393,21 +395,31 @@ impl InvertedIndexBuilder { } if let Some(builder) = merged { - self.write_new_partition(dest_store, builder).await?; + files.extend(self.write_new_partition(dest_store, builder).await?); } - Ok(()) + Ok(files) } async fn write_new_partition( &mut self, dest_store: &dyn IndexStore, mut builder: InnerBuilder, - ) -> Result<()> { + ) -> Result> { let partition_id = self.next_partition_id() | self.fragment_mask.unwrap_or(0); builder.set_id(partition_id); - builder.write(dest_store).await?; + let files = builder + .write_to(dest_store, self.partition_write_target()) + .await?; self.new_partitions.push(partition_id); - Ok(()) + Ok(files) + } + + fn partition_write_target(&self) -> PartitionWriteTarget { + if self.fragment_mask.is_some() { + PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + } } fn next_partition_id(&self) -> u64 { @@ -424,7 +436,7 @@ impl InvertedIndexBuilder { &mut self, stream: SendableRecordBatchStream, dest_store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result> { let num_workers = resolve_num_workers(&self.params); let tokenizer = self.params.build()?; let with_position = self.params.with_position; @@ -507,9 +519,11 @@ impl InvertedIndexBuilder { // wait for the workers to finish let start = std::time::Instant::now(); let mut tail_partitions = Vec::new(); + let mut files = Vec::new(); for index_task in index_tasks { let output = index_task.await??; self.new_partitions.extend(output.partitions); + files.extend(output.files); if let Some(tail_partition) = output.tail_partition { tail_partitions.push(tail_partition); } @@ -519,10 +533,14 @@ impl InvertedIndexBuilder { if let Some(builder) = merged_tail_partitions { self.new_partitions.push(builder.id()); let mut builder = builder; - builder.write(dest_store.as_ref()).await?; + files.extend( + builder + .write_to(dest_store.as_ref(), 
self.partition_write_target()) + .await?, + ); } log::info!("wait workers indexing elapsed: {:?}", start.elapsed()); - Result::Ok(()) + Result::Ok(files) }; index_build.await @@ -533,7 +551,8 @@ impl InvertedIndexBuilder { mapping: &HashMap>, src_store: Arc, dest_store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result> { + let mut files = Vec::new(); for part in self.partitions.iter() { let part = InvertedPartition::load( src_store.clone(), @@ -545,20 +564,28 @@ impl InvertedIndexBuilder { .await?; let mut builder = part.into_builder().await?; builder.remap(mapping).await?; - builder.write(dest_store).await?; + files.extend( + builder + .write_to(dest_store, self.partition_write_target()) + .await?, + ); } if self.fragment_mask.is_none() { - self.write_metadata(dest_store, &self.partitions).await?; + files.push(self.write_metadata(dest_store, &self.partitions).await?); } else { - // in distributed mode, the part_temp_metadata is written by the worker + // in distributed mode, the staged partition metadata is written by the worker for &partition_id in &self.partitions { - self.write_part_metadata(dest_store, partition_id).await?; + files.push(self.write_part_metadata(dest_store, partition_id).await?); } } - Ok(()) + Ok(files) } - async fn write_metadata(&self, dest_store: &dyn IndexStore, partitions: &[u64]) -> Result<()> { + async fn write_metadata( + &self, + dest_store: &dyn IndexStore, + partitions: &[u64], + ) -> Result { let mut serialized_deleted_fragments = Vec::with_capacity(self.deleted_fragments.serialized_size()); self.deleted_fragments @@ -607,8 +634,7 @@ impl InvertedIndexBuilder { .new_index_file(METADATA_FILE, metadata_file_schema) .await?; writer.write_record_batch(record_batch).await?; - writer.finish_with_metadata(metadata).await?; - Ok(()) + writer.finish_with_metadata(metadata).await } /// Write partition metadata file for a single partition @@ -619,7 +645,7 @@ impl InvertedIndexBuilder { &self, dest_store: &dyn IndexStore, partition: u64, // 
Modify parameter type - ) -> Result<()> { + ) -> Result { let partitions = vec![partition]; let mut metadata = HashMap::from_iter(vec![ ("partitions".to_owned(), serde_json::to_string(&partitions)?), @@ -652,30 +678,30 @@ impl InvertedIndexBuilder { let mut writer = dest_store .new_index_file(&file_name, Arc::new(Schema::empty())) .await?; - writer.finish_with_metadata(metadata).await?; - Ok(()) + writer.finish_with_metadata(metadata).await } async fn write_metadata_with_progress( &self, dest_store: &dyn IndexStore, partitions: &[u64], - ) -> Result<()> { + ) -> Result> { let total = if self.fragment_mask.is_none() { Some(1) } else { Some(partitions.len() as u64) }; + let mut files = Vec::new(); self.progress .stage_start("write_metadata", total, "files") .await?; if self.fragment_mask.is_none() { - self.write_metadata(dest_store, partitions).await?; + files.push(self.write_metadata(dest_store, partitions).await?); self.progress.stage_progress("write_metadata", 1).await?; } else { let mut completed = 0; for &partition_id in partitions { - self.write_part_metadata(dest_store, partition_id).await?; + files.push(self.write_part_metadata(dest_store, partition_id).await?); completed += 1; self.progress .stage_progress("write_metadata", completed) @@ -683,10 +709,10 @@ impl InvertedIndexBuilder { } } self.progress.stage_complete("write_metadata").await?; - Ok(()) + Ok(files) } - async fn write(&self, dest_store: &dyn IndexStore) -> Result<()> { + async fn write(&self, dest_store: &dyn IndexStore) -> Result> { let mut partitions = Vec::with_capacity(self.partitions.len() + self.new_partitions.len()); partitions.extend_from_slice(&self.partitions); partitions.extend_from_slice(&self.new_partitions); @@ -700,22 +726,38 @@ impl InvertedIndexBuilder { ) .await?; let mut copied = 0; + let mut files = Vec::new(); + let target = self.partition_write_target(); for part in self.partitions.iter() { - self.src_store - .as_ref() - .expect("existing partitions require a source store") 
- .copy_index_file(&token_file_path(*part), dest_store) - .await?; - self.src_store - .as_ref() - .expect("existing partitions require a source store") - .copy_index_file(&posting_file_path(*part), dest_store) - .await?; - self.src_store - .as_ref() - .expect("existing partitions require a source store") - .copy_index_file(&doc_file_path(*part), dest_store) - .await?; + files.push( + self.src_store + .as_ref() + .expect("existing partitions require a source store") + .copy_index_file_to( + &token_file_path(*part), + &target.token_path(*part), + dest_store, + ) + .await?, + ); + files.push( + self.src_store + .as_ref() + .expect("existing partitions require a source store") + .copy_index_file_to( + &posting_file_path(*part), + &target.posting_path(*part), + dest_store, + ) + .await?, + ); + files.push( + self.src_store + .as_ref() + .expect("existing partitions require a source store") + .copy_index_file_to(&doc_file_path(*part), &target.doc_path(*part), dest_store) + .await?, + ); copied += 1; self.progress .stage_progress("copy_partitions", copied) @@ -729,9 +771,11 @@ impl InvertedIndexBuilder { } self.progress.stage_complete("copy_partitions").await?; - self.write_metadata_with_progress(dest_store, &partitions) - .await?; - Ok(()) + files.extend( + self.write_metadata_with_progress(dest_store, &partitions) + .await?, + ); + Ok(files) } } @@ -968,12 +1012,25 @@ impl InnerBuilder { + posting_lists_size } - pub async fn write(&mut self, store: &dyn IndexStore) -> Result<()> { + pub async fn write(&mut self, store: &dyn IndexStore) -> Result> { + self.write_to(store, PartitionWriteTarget::Final).await + } + + async fn write_to( + &mut self, + store: &dyn IndexStore, + target: PartitionWriteTarget, + ) -> Result> { let docs = Arc::new(std::mem::take(&mut self.docs)); - self.write_posting_lists(store, docs.clone()).await?; - self.write_tokens(store).await?; - self.write_docs(store, docs).await?; - Ok(()) + let files = vec![ + self.write_posting_lists(store, 
docs.clone(), &target.posting_path(self.id)) + .await?, + self.write_tokens(store, &target.token_path(self.id)) + .await?, + self.write_docs(store, docs, &target.doc_path(self.id)) + .await?, + ]; + Ok(files) } #[instrument(level = "debug", skip_all)] @@ -981,11 +1038,12 @@ impl InnerBuilder { &mut self, store: &dyn IndexStore, docs: Arc, - ) -> Result<()> { + path: &str, + ) -> Result { let id = self.id; let mut writer = store .new_index_file( - &posting_file_path(self.id), + path, inverted_list_schema_for_version(self.with_position, self.format_version), ) .await?; @@ -1067,33 +1125,58 @@ impl InnerBuilder { buffer_id.to_string(), ); } - writer.finish_with_metadata(extra_metadata).await?; - Ok(()) + writer.finish_with_metadata(extra_metadata).await } #[instrument(level = "debug", skip_all)] - async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result<()> { + async fn write_tokens(&mut self, store: &dyn IndexStore, path: &str) -> Result { log::info!("writing tokens of partition {}", self.id); let tokens = std::mem::take(&mut self.tokens); let batch = tokens.to_batch(self.token_set_format)?; - let mut writer = store - .new_index_file(&token_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; - writer.finish().await?; - Ok(()) + writer.finish().await } #[instrument(level = "debug", skip_all)] - async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc) -> Result<()> { + async fn write_docs( + &mut self, + store: &dyn IndexStore, + docs: Arc, + path: &str, + ) -> Result { log::info!("writing docs of partition {}", self.id); let batch = docs.to_batch()?; - let mut writer = store - .new_index_file(&doc_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; - writer.finish().await?; - Ok(()) + writer.finish().await + } +} + +#[derive(Debug, Clone, 
Copy, PartialEq, Eq)] +enum PartitionWriteTarget { + Final, + Staged, +} + +impl PartitionWriteTarget { + fn file_path(self, partition_id: u64, suffix: &str) -> String { + match self { + Self::Final => partition_file_path(partition_id, suffix), + Self::Staged => staged_partition_file_path(partition_id, suffix), + } + } + + fn token_path(self, partition_id: u64) -> String { + self.file_path(partition_id, TOKENS_FILE) + } + + fn posting_path(self, partition_id: u64) -> String { + self.file_path(partition_id, INVERT_LIST_FILE) + } + + fn doc_path(self, partition_id: u64) -> String { + self.file_path(partition_id, DOCS_FILE) } } @@ -1103,6 +1186,7 @@ struct IndexWorker { id_alloc: Arc, builder: InnerBuilder, partitions: Vec, + files: Vec, schema: SchemaRef, memory_size: u64, worker_memory_limit_bytes: u64, @@ -1119,6 +1203,7 @@ struct TailPartition { struct WorkerOutput { partitions: Vec, + files: Vec, tail_partition: Option, } @@ -1185,6 +1270,7 @@ impl IndexWorker { config.format_version, ), partitions: Vec::new(), + files: Vec::new(), id_alloc, schema, memory_size: 0, @@ -1233,9 +1319,8 @@ impl IndexWorker { let mut token_stream = self.tokenizer.token_stream_for_doc(doc); while token_stream.advance() { - let token = token_stream.token_mut(); - let token_text = std::mem::take(&mut token.text); - let token_id = builder.tokens.add(token_text); + let token = token_stream.token(); + let token_id = builder.tokens.get_or_add(&token.text); if token_id as usize == builder.posting_lists.len() { let old_posting_lists_overhead_size = (builder.posting_lists.capacity() * std::mem::size_of::()) @@ -1274,9 +1359,7 @@ impl IndexWorker { let mut token_stream = self.tokenizer.token_stream_for_doc(doc); while token_stream.advance() { - let token = token_stream.token_mut(); - let token_text = std::mem::take(&mut token.text); - let token_id = self.builder.tokens.add(token_text); + let token_id = self.builder.tokens.get_or_add(&token_stream.token().text); self.token_ids.push(token_id); 
token_num += 1; } @@ -1411,8 +1494,13 @@ impl IndexWorker { ); let written_partition_id = builder.id(); let mut builder = builder; - builder - .write(self.dest_store.as_ref()) + let target = if self.fragment_mask.is_some() { + PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + }; + let files = builder + .write_to(self.dest_store.as_ref(), target) .await .map_err(|err| { Error::execution(format!( @@ -1420,6 +1508,7 @@ impl IndexWorker { written_partition_id )) })?; + self.files.extend(files); self.partitions.push(written_partition_id); Ok(()) } @@ -1434,6 +1523,7 @@ impl IndexWorker { }; Ok(WorkerOutput { partitions: self.partitions, + files: self.files, tail_partition, }) } @@ -1761,14 +1851,23 @@ pub(crate) fn doc_file_path(partition_id: u64) -> String { } pub(crate) fn part_metadata_file_path(partition_id: u64) -> String { - format!("part_{}_{}", partition_id, METADATA_FILE) + staged_partition_file_path(partition_id, METADATA_FILE) } const PARTITION_FILE_SUFFIXES: [&str; 3] = [TOKENS_FILE, INVERT_LIST_FILE, DOCS_FILE]; -// Each remapped file is renamed twice: first to a temp path (phase 1), then to -// its final path (phase 2). Keep in sync with the two rename loops below in -// `merge_metadata_files`. 
-const PARTITION_FILE_RENAME_PHASES: u64 = 2; +const STAGED_PARTITION_DIR: &str = "staging"; + +fn partition_file_path(partition_id: u64, suffix: &str) -> String { + format!("part_{}_{}", partition_id, suffix) +} + +fn staged_partition_file_path(partition_id: u64, suffix: &str) -> String { + format!( + "{}/{}", + STAGED_PARTITION_DIR, + partition_file_path(partition_id, suffix) + ) +} pub async fn merge_index_files( object_store: &ObjectStore, @@ -1776,33 +1875,65 @@ pub async fn merge_index_files( store: Arc, progress: Arc, ) -> Result<()> { - // List all partition metadata files in the index directory - let part_metadata_files = list_metadata_files(object_store, index_dir).await?; + let metadata_path = index_dir.clone().join(METADATA_FILE); + if object_store.exists(&metadata_path).await? { + return Ok(()); + } + + // List all staged partition metadata files in the index directory + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); + if part_metadata_files.is_empty() { + return Err(Error::invalid_input_source( + format!( + "No partition metadata files found in index directory: {}", + index_dir + ) + .into(), + )); + } // Call merge_metadata_files function for inverted index merge_metadata_files(store, &part_metadata_files, progress).await } -/// List and filter metadata files from the index directory -/// Returns partition metadata files -async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { - // List all partition metadata files in the index directory - let mut part_metadata_files = Vec::new(); - let mut list_stream = object_store.list(Some(index_dir.clone())); +async fn list_index_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let mut index_files = Vec::new(); + let mut list_stream = object_store.read_dir_all(index_dir, None); while let Some(item) = list_stream.next().await { match item { Ok(meta) => { - let file_name = 
meta.location.filename().unwrap_or_default(); - // Filter files matching the pattern part_*_metadata.lance - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - part_metadata_files.push(file_name.to_string()); - } + let location = meta.location.as_ref().trim_start_matches('/'); + let index_dir = index_dir.as_ref().trim_start_matches('/'); + let relative_path = location + .strip_prefix(index_dir) + .map(|s| s.trim_start_matches('/').to_string()) + .unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string()); + index_files.push(relative_path); } Err(err) => return Err(err), } } + Ok(index_files) +} + +fn metadata_files(index_files: &[String]) -> Vec { + index_files + .iter() + .filter(|file_name| { + file_name.starts_with(&format!("{}/part_", STAGED_PARTITION_DIR)) + && file_name.ends_with("_metadata.lance") + }) + .cloned() + .collect() +} + +#[cfg(test)] +async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); if part_metadata_files.is_empty() { return Err(Error::invalid_input_source( format!( @@ -1893,89 +2024,35 @@ async fn merge_metadata_files( progress.stage_complete("read_partition_metadata").await?; // Create ID mapping: sorted original IDs -> 0,1,2... 
- let mut sorted_ids = all_partitions.clone(); + let mut sorted_ids = all_partitions; sorted_ids.sort(); sorted_ids.dedup(); - let id_mapping: HashMap = sorted_ids + let id_mapping: Vec<(u64, u64)> = sorted_ids .iter() .enumerate() .map(|(new_id, &old_id)| (old_id, new_id as u64)) .collect(); - // Safe rename partition files using temporary files to avoid overwrite - let timestamp = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_secs(); - - let changed_partition_count = id_mapping - .iter() - .filter(|(old_id, new_id)| old_id != new_id) - .count() as u64; - let total_renames = changed_partition_count - * PARTITION_FILE_SUFFIXES.len() as u64 - * PARTITION_FILE_RENAME_PHASES; + let total_copies = id_mapping.len() as u64 * PARTITION_FILE_SUFFIXES.len() as u64; progress - .stage_start("remap_partition_files", Some(total_renames), "files") + .stage_start("remap_partition_files", Some(total_copies), "files") .await?; - // Phase 1: Move files to temporary locations - let mut temp_files: Vec<(String, String, String)> = Vec::new(); // (temp_path, old_path, final_path) - let mut renamed_files = 0u64; - - for (&old_id, &new_id) in &id_mapping { - if old_id != new_id { - for suffix in PARTITION_FILE_SUFFIXES { - let old_path = format!("part_{}_{}", old_id, suffix); - let new_path = format!("part_{}_{}", new_id, suffix); - let temp_path = format!("temp_{}_{}", timestamp, old_path); - - // Move to temporary location first to avoid overwrite - if let Err(e) = store.rename_index_file(&old_path, &temp_path).await { - // Rollback phase 1: restore files from temp locations - for (temp_name, old_name, _) in temp_files.iter().rev() { - let _ = store.rename_index_file(temp_name, old_name).await; - } - return Err(Error::index(format!( - "Failed to move {} to temp {}: {}", - old_path, temp_path, e - ))); - } - temp_files.push((temp_path, old_path, new_path)); - renamed_files += 1; - progress - .stage_progress("remap_partition_files", renamed_files) 
- .await?; - } - } - } - - // Phase 2: Move from temporary to final locations - let mut completed_renames: Vec<(String, String)> = Vec::new(); // (final_path, temp_path) + let mut copied_files = 0u64; - for (temp_path, _old_path, final_path) in &temp_files { - if let Err(e) = store.rename_index_file(temp_path, final_path).await { - // Rollback phase 2: restore completed renames and remaining temps - for (final_name, temp_name) in completed_renames.iter().rev() { - let _ = store.rename_index_file(final_name, temp_name).await; - } - // Restore remaining temp files to original locations - for (temp_name, orig_name, _) in temp_files.iter() { - if !completed_renames.iter().any(|(_, t)| t == temp_name) { - let _ = store.rename_index_file(temp_name, orig_name).await; - } - } - return Err(Error::index(format!( - "Failed to rename {} to {}: {}", - temp_path, final_path, e - ))); + for &(old_id, new_id) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(old_id, suffix); + let final_path = partition_file_path(new_id, suffix); + store + .copy_index_file_to(&staged_path, &final_path, store.as_ref()) + .await?; + copied_files += 1; + progress + .stage_progress("remap_partition_files", copied_files) + .await?; } - completed_renames.push((final_path.clone(), temp_path.clone())); - renamed_files += 1; - progress - .stage_progress("remap_partition_files", renamed_files) - .await?; } progress.stage_complete("remap_partition_files").await?; @@ -2002,10 +2079,15 @@ async fn merge_metadata_files( progress.stage_progress("write_merged_metadata", 1).await?; progress.stage_complete("write_merged_metadata").await?; - // Cleanup partition metadata files + // Cleanup staged partition metadata files for file_name in part_metadata_files { - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - let _ = store.delete_index_file(file_name).await; + let _ = store.delete_index_file(file_name).await; + } + for &(old_id, 
_) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let _ = store + .delete_index_file(&staged_partition_file_path(old_id, suffix)) + .await; } } @@ -2220,13 +2302,242 @@ mod tests { } impl DeepSizeOf for CountingStore { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } + #[derive(Debug, Clone)] + struct NoRenameStore { + inner: Arc, + final_delete_count: Option>, + } + + impl NoRenameStore { + fn new(inner: Arc) -> Self { + Self { + inner, + final_delete_count: None, + } + } + + fn with_final_delete_tracking(inner: Arc) -> Self { + Self { + inner, + final_delete_count: Some(Arc::new(AtomicUsize::new(0))), + } + } + + fn final_delete_count(&self) -> usize { + self.final_delete_count + .as_ref() + .map(|count| count.load(Ordering::SeqCst)) + .unwrap_or_default() + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for NoRenameStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for NoRenameStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(self.clone()) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + self.inner.new_index_file(name, schema).await + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + 
name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + Err(Error::internal(format!( + "merge_index_files should not rename partition file {name} to {new_name}" + ))) + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + if name.starts_with("part_") + && let Some(count) = &self.final_delete_count + { + count.fetch_add(1, Ordering::SeqCst); + } + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + #[derive(Debug)] + struct FailMetadataStore { + inner: Arc, + } + + impl FailMetadataStore { + fn new(inner: Arc) -> Self { + Self { inner } + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for FailMetadataStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for FailMetadataStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(Self { + inner: self.inner.clone(), + }) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + let writer = self.inner.new_index_file(name, schema).await?; + if name == METADATA_FILE { + Ok(Box::new(FailFinishWriter { inner: writer })) + } else { + Ok(writer) + } + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + 
.copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + self.inner.rename_index_file(name, new_name).await + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + struct FailFinishWriter { + inner: Box, + } + + #[async_trait] + impl IndexWriter for FailFinishWriter { + async fn write_record_batch(&mut self, batch: RecordBatch) -> Result { + self.inner.write_record_batch(batch).await + } + + async fn add_global_buffer(&mut self, data: Bytes) -> Result { + self.inner.add_global_buffer(data).await + } + + async fn finish(&mut self) -> Result { + Err(Error::internal("injected metadata write failure")) + } + + async fn finish_with_metadata( + &mut self, + _metadata: HashMap, + ) -> Result { + Err(Error::internal("injected metadata write failure")) + } + } + #[derive(Debug)] struct CountingWriter { + path: String, write_count: Arc, } @@ -2242,12 +2553,21 @@ mod tests { Ok(1) } - async fn finish(&mut self) -> Result<()> { - Ok(()) + async fn finish(&mut self) -> Result { + Ok(IndexFile { + path: self.path.clone(), + size_bytes: 0, + }) } - async fn finish_with_metadata(&mut self, _metadata: HashMap) -> Result<()> { - Ok(()) + async fn finish_with_metadata( + &mut self, + _metadata: HashMap, + ) -> Result { + Ok(IndexFile { + path: self.path.clone(), + size_bytes: 0, + }) } } @@ -2267,10 +2587,11 @@ mod tests { async fn new_index_file( &self, - _name: &str, + name: &str, _schema: Arc, ) -> Result> { Ok(Box::new(CountingWriter { + path: name.to_string(), write_count: self.write_count.clone(), 
})) } @@ -2281,13 +2602,17 @@ mod tests { )) } - async fn copy_index_file(&self, _name: &str, _dest_store: &dyn IndexStore) -> Result<()> { + async fn copy_index_file( + &self, + _name: &str, + _dest_store: &dyn IndexStore, + ) -> Result { Err(Error::not_supported( "CountingStore does not support copying", )) } - async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result<()> { + async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result { Err(Error::not_supported( "CountingStore does not support renaming", )) @@ -2376,12 +2701,446 @@ mod tests { let store = CountingStore::new(); let docs = Arc::new(std::mem::take(&mut builder.docs)); - builder.write_posting_lists(&store, docs).await?; + builder + .write_posting_lists(&store, docs, &posting_file_path(0)) + .await?; assert_eq!(store.write_count(), 1); Ok(()) } + async fn write_partition_file_marker( + store: &dyn IndexStore, + path: &str, + partition_id: u64, + ) -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "partition_id", + DataType::UInt64, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![partition_id]))], + )?; + let mut writer = store.new_index_file(path, schema).await?; + writer.write_record_batch(batch).await?; + writer.finish().await?; + Ok(()) + } + + async fn write_partition_files( + store: &dyn IndexStore, + partition_id: u64, + target: PartitionWriteTarget, + ) -> Result<()> { + write_partition_file_marker(store, &target.token_path(partition_id), partition_id).await?; + write_partition_file_marker(store, &target.posting_path(partition_id), partition_id) + .await?; + write_partition_file_marker(store, &target.doc_path(partition_id), partition_id).await?; + Ok(()) + } + + async fn read_partition_file_marker(store: &dyn IndexStore, path: &str) -> Result { + let reader = store.open_index_file(path).await?; + let batch = reader.read_range(0..1, None).await?; + let partition_ids = 
batch.column(0).as_primitive::(); + Ok(partition_ids.value(0)) + } + + async fn assert_partition_file_markers( + store: &dyn IndexStore, + partition_id: u64, + expected_marker: u64, + ) -> Result<()> { + assert_eq!( + read_partition_file_marker(store, &token_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &posting_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &doc_file_path(partition_id)).await?, + expected_marker + ); + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_remaps_staged_partitions_without_rename() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::new(base_store.clone())); + let partitions = vec![5_u64, 1_u64, (17_u64 << 32) | 2]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + let metadata_reader = base_store.open_index_file(METADATA_FILE).await?; + let metadata = &metadata_reader.schema().metadata; + let written_partitions: Vec = serde_json::from_str( + metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + expected_partitions.dedup(); + let remapped_partitions = 
(0..expected_partitions.len() as u64).collect::>(); + assert_eq!(written_partitions, remapped_partitions); + + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(base_store.as_ref(), new_id as u64, *old_id).await?; + assert!( + base_store + .open_index_file(&part_metadata_file_path(*old_id)) + .await + .is_err(), + "partition metadata should be cleaned up after final metadata is written" + ); + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + base_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned up after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_rewrites_partial_final_files_from_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::with_final_delete_tracking( + base_store.clone(), + )); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + for suffix in PARTITION_FILE_SUFFIXES { + write_partition_file_marker(base_store.as_ref(), &partition_file_path(1, suffix), 999) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store.clone(), + noop_progress(), + ) + .await?; + + assert_partition_file_markers(base_store.as_ref(), 0, 1).await?; + 
assert_partition_file_markers(base_store.as_ref(), 1, 5).await?; + assert_eq!( + store.final_delete_count(), + 0, + "merge should overwrite final partition files without deleting them first" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_from_existing_copies_existing_partitions_to_staging_and_finalizes() + -> Result<()> { + let object_store = Arc::new(ObjectStore::local()); + let source_dir = TempDir::default(); + let dest_dir = TempDir::default(); + let source_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + source_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let dest_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + dest_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let merge_store = Arc::new(NoRenameStore::new(dest_store.clone())); + let fragment_mask = 7_u64 << 32; + let partitions = vec![fragment_mask | 5, fragment_mask | 1]; + + for partition_id in &partitions { + write_partition_files( + source_store.as_ref(), + *partition_id, + PartitionWriteTarget::Final, + ) + .await?; + } + + let builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + Some(source_store.clone()), + partitions.clone(), + TokenSetFormat::default(), + Some(fragment_mask), + RoaringBitmap::new(), + ); + builder.write(dest_store.as_ref()).await?; + + for partition_id in &partitions { + assert_partition_file_markers(source_store.as_ref(), *partition_id, *partition_id) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(dest_store.as_ref(), &staged_path).await?, + *partition_id + ); + assert!( + dest_store + .open_index_file(&partition_file_path(*partition_id, suffix)) + .await + .is_err(), + "distributed existing partition should be staged instead of copied to root" + ); + } + dest_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + } 
+ + merge_index_files( + object_store.as_ref(), + &dest_dir.obj_path(), + merge_store, + noop_progress(), + ) + .await?; + + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(dest_store.as_ref(), new_id as u64, *old_id).await?; + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + dest_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_keeps_staging_when_final_metadata_write_fails() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let failing_store = Arc::new(FailMetadataStore::new(base_store.clone())); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + let err = merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + failing_store, + noop_progress(), + ) + .await + .unwrap_err(); + assert!( + err.to_string().contains("metadata write failure"), + "expected injected metadata failure, got: {err}" + ); + + for partition_id in &partitions { + base_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = 
staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(base_store.as_ref(), &staged_path).await?, + *partition_id + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_build_writes_partition_data_to_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = ObjectStore::local(); + let store = Arc::new(LanceIndexStore::new( + object_store.into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + let fragment_mask = 7_u64 << 32; + let batch = make_doc_batch("hello world", fragment_mask); + let stream = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)])); + let stream = Box::pin(stream); + let mut builder = InvertedIndexBuilder::new_with_fragment_mask( + InvertedIndexParams::default(), + Some(fragment_mask), + ); + builder.update(stream, store.as_ref(), None).await?; + + let part_metadata_files = + list_metadata_files(&ObjectStore::local(), &index_dir.obj_path()).await?; + assert_eq!(part_metadata_files.len(), 1); + assert!( + part_metadata_files[0].starts_with("staging/part_"), + "partition metadata should be written to staging" + ); + let reader = store.open_index_file(&part_metadata_files[0]).await?; + let partition_ids: Vec = serde_json::from_str( + reader + .schema() + .metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + assert_eq!(partition_ids.len(), 1); + let partition_id = partition_ids[0]; + + store + .open_index_file(&staged_partition_file_path(partition_id, TOKENS_FILE)) + .await?; + assert!( + store + .open_index_file(&partition_file_path(partition_id, METADATA_FILE)) + .await + .is_err(), + "distributed build-only metadata should not be written to root partition metadata paths" + ); + assert!( + store + .open_index_file(&token_file_path(partition_id)) + .await + .is_err(), + "distributed build-only data should not be written to final partition paths" + ); + + Ok(()) + } + + #[tokio::test] + 
async fn test_merge_index_files_is_noop_when_metadata_exists() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + vec![42], + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + metadata_builder + .write_metadata(store.as_ref(), &[42]) + .await?; + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + Ok(()) + } + #[tokio::test] async fn test_build_only_path_writes_partitions_as_is() -> Result<()> { let src_dir = TempDir::default(); @@ -2820,7 +3579,6 @@ mod tests { } }) .collect::>(); - let read_start = tags .iter() .position(|e| e == "start:read_partition_metadata") @@ -2858,8 +3616,8 @@ mod tests { ); assert_eq!( remap_progress.last().copied().unwrap_or_default(), - 12, - "expected remap_partition_files progress to cover both rename phases" + 6, + "expected remap_partition_files progress to cover staged-to-final copies" ); assert!( tags.iter().any(|e| e == "progress:write_merged_metadata"), @@ -3088,6 +3846,118 @@ mod tests { Ok(()) } + #[test] + fn test_merge_from_after_remap_does_not_panic() { + // `first` is the merge accumulator. Give it three tokens, then remap away the + // middle one, mirroring filter_old_data dropping a token whose postings emptied. 
+ let mut first = InnerBuilder::new(0, false, TokenSetFormat::default()); + for token in ["a", "b", "c"] { + first.tokens.add(token.to_owned()); + } + first + .posting_lists + .resize_with(first.tokens.len(), || PostingListBuilder::new(false)); + let first_doc = first.docs.append(10, 1); + first.posting_lists[0].add(first_doc, PositionRecorder::Count(1)); // "a" + first.posting_lists[2].add(first_doc, PositionRecorder::Count(1)); // "c" + + // Remove token "b" (id 1) and compact its (empty) posting list to match. + first.tokens.remap(&[1]); + first.posting_lists.remove(1); + assert_eq!(first.tokens.len(), first.posting_lists.len()); + + // `second` contributes a brand-new token absent from `first`. Before the fix, + // get_or_add returned the stale next_id, indexing past posting_lists. + let mut second = InnerBuilder::new(1, false, TokenSetFormat::default()); + let zeta = second.tokens.add("zeta".to_owned()); + second + .posting_lists + .resize_with(second.tokens.len(), || PostingListBuilder::new(false)); + let second_doc = second.docs.append(20, 1); + second.posting_lists[zeta as usize].add(second_doc, PositionRecorder::Count(1)); + + first.merge_from(second).unwrap(); + + assert_eq!(first.tokens.len(), 3); + assert_eq!(first.posting_lists.len(), 3); + let zeta_id = first.tokens.get("zeta").expect("zeta should be merged in"); + assert!((zeta_id as usize) < first.posting_lists.len()); + } + + // FST token file with a stale next_id (above the token count), as a pre-#7115 writer left. 
+ async fn write_stale_next_id_token_file(store: &dyn IndexStore, partition_id: u64) { + let mut tokens = TokenSet::default(); + tokens.add("alpha".to_owned()); + tokens.add("gamma".to_owned()); + assert_eq!(tokens.len(), 2); + tokens.next_id = 9; + let batch = tokens.to_batch(TokenSetFormat::Fst).unwrap(); + let mut writer = store + .new_index_file(&token_file_path(partition_id), batch.schema()) + .await + .unwrap(); + writer.write_record_batch(batch).await.unwrap(); + writer.finish().await.unwrap(); + } + + // load_fst recomputes next_id from the token count rather than trusting the persisted value. + #[tokio::test] + async fn test_load_fst_recomputes_stale_next_id() { + let index_dir = TempDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + write_stale_next_id_token_file(store.as_ref(), 0).await; + let reader = store.open_index_file(&token_file_path(0)).await.unwrap(); + let tokens = TokenSet::load(reader, TokenSetFormat::Fst).await.unwrap(); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens.next_id(), 2); + } + + // A stale next_id loaded from disk must not leak an out-of-range token id into a merge. 
+ #[tokio::test] + async fn test_merge_with_stale_next_id_token_file_does_not_panic() { + let index_dir = TempDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + write_stale_next_id_token_file(store.as_ref(), 0).await; + let reader = store.open_index_file(&token_file_path(0)).await.unwrap(); + let tokens = TokenSet::load(reader, TokenSetFormat::Fst) + .await + .unwrap() + .into_mutable(); + + let mut first = InnerBuilder::new(0, false, TokenSetFormat::Fst); + first.set_tokens(tokens); + first + .posting_lists + .resize_with(first.tokens.len(), || PostingListBuilder::new(false)); + let doc = first.docs.append(10, 1); + first.posting_lists[0].add(doc, PositionRecorder::Count(1)); + first.posting_lists[1].add(doc, PositionRecorder::Count(1)); + + let mut second = InnerBuilder::new(1, false, TokenSetFormat::Fst); + let zeta = second.tokens.add("zeta".to_owned()); + second + .posting_lists + .resize_with(second.tokens.len(), || PostingListBuilder::new(false)); + let second_doc = second.docs.append(20, 1); + second.posting_lists[zeta as usize].add(second_doc, PositionRecorder::Count(1)); + + first.merge_from(second).unwrap(); + assert_eq!(first.tokens.len(), 3); + assert_eq!(first.posting_lists.len(), 3); + let zeta_id = first.tokens.get("zeta").expect("zeta should be merged in"); + assert!((zeta_id as usize) < first.posting_lists.len()); + } + #[tokio::test] async fn test_update_index_returns_worker_error_when_workers_exit_during_dispatch() { let num_batches = (*LANCE_FTS_NUM_SHARDS * 2 + 1) as u64; diff --git a/rust/lance-index/src/scalar/inverted/cache_codec.rs b/rust/lance-index/src/scalar/inverted/cache_codec.rs index 74cfc98ef7b..a676455d5c9 100644 --- a/rust/lance-index/src/scalar/inverted/cache_codec.rs +++ b/rust/lance-index/src/scalar/inverted/cache_codec.rs @@ -4,16 +4,24 @@ //! Cache codec impls for FTS index entries. //! //! 
Serializes [`PostingList`] and [`Positions`] cache values for persistent -//! cache backends. The format is a small variant tag plus a JSON header for -//! scalar metadata, with Arrow-backed payload sections written as zero-copy -//! Arrow IPC streams via [`lance_arrow::ipc`]. The raw byte buffer inside -//! [`SharedPositionStream`] is written via [`write_len_prefixed_bytes`] and -//! read back via [`read_len_prefixed_bytes_at`] -- both zero-copy slices into -//! the input `Bytes` allocation. +//! cache backends, behind the stabilized envelope written by +//! [`CacheCodec`](lance_core::cache::CacheCodec). //! -//! This is the FTS counterpart of `partition_serde.rs` for vector indices. +//! Every variant uses a protobuf header (see `protos-cache/cache.proto`, with the +//! tail/position codecs and position-storage kind as proto enums) followed by +//! 64-byte-aligned Arrow IPC sections and, where applicable, raw blobs: +//! +//! - the compressed posting list: an IPC section for `blocks`, then the +//! position sections (legacy IPC, or shared block-offsets IPC + a raw blob of +//! the [`SharedPositionStream`] byte buffer, which has its own portable +//! encoding); +//! - the plain posting list: an IPC section of `(row_ids, frequencies)`, then +//! an optional legacy position IPC section; +//! - the standalone [`Positions`] codec: the position sections alone. +//! +//! All sections read back zero-copy via [`lance_arrow::ipc`]. This is the FTS +//! counterpart of `partition_serde.rs` for vector indices. 
-use std::io::Write; use std::sync::Arc; use arrow_array::cast::AsArray; @@ -22,14 +30,14 @@ use arrow_array::{ Array, Float32Array, LargeBinaryArray, ListArray, RecordBatch, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; -use serde::{Deserialize, Serialize}; + +use crate::cache_pb::{ + CompressedPostingHeader, PlainPostingHeader, PositionStorage as PbPositionStorage, + PositionStreamCodec as PbPositionStreamCodec, PositionsHeader, PostingListGroupHeader, + PostingTailCodec as PbPostingTailCodec, +}; use super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -43,86 +51,43 @@ use super::index::{ const POSTING_VARIANT_PLAIN: u8 = 0; const POSTING_VARIANT_COMPRESSED: u8 = 1; -const POSITIONS_TAG_NONE: u8 = 0; -const POSITIONS_TAG_LEGACY: u8 = 1; -const POSITIONS_TAG_SHARED: u8 = 2; - -const POSTING_TAIL_CODEC_FIXED32: u8 = 0; -const POSTING_TAIL_CODEC_VARINT_DELTA: u8 = 1; - -const POSITION_STREAM_CODEC_VARINT_DOC_DELTA: u8 = 0; -const POSITION_STREAM_CODEC_PACKED_DELTA: u8 = 1; - // --------------------------------------------------------------------------- -// Codec enum byte mappings +// Codec enum mappings // --------------------------------------------------------------------------- -fn posting_tail_codec_to_u8(c: PostingTailCodec) -> u8 { - match c { - PostingTailCodec::Fixed32 => POSTING_TAIL_CODEC_FIXED32, - PostingTailCodec::VarintDelta => POSTING_TAIL_CODEC_VARINT_DELTA, - } -} +// Posting lists carry their discriminants as protobuf enums in the header; +// these map to/from the in-memory Rust enums. 
-fn u8_to_posting_tail_codec(v: u8) -> Result { - match v { - POSTING_TAIL_CODEC_FIXED32 => Ok(PostingTailCodec::Fixed32), - POSTING_TAIL_CODEC_VARINT_DELTA => Ok(PostingTailCodec::VarintDelta), - _ => Err(Error::io(format!("unknown posting tail codec: {v}"))), +fn posting_tail_codec_to_proto(c: PostingTailCodec) -> PbPostingTailCodec { + match c { + PostingTailCodec::Fixed32 => PbPostingTailCodec::Fixed32, + PostingTailCodec::VarintDelta => PbPostingTailCodec::VarintDelta, } } -fn position_stream_codec_to_u8(c: PositionStreamCodec) -> u8 { +fn proto_to_posting_tail_codec(c: PbPostingTailCodec) -> PostingTailCodec { match c { - PositionStreamCodec::VarintDocDelta => POSITION_STREAM_CODEC_VARINT_DOC_DELTA, - PositionStreamCodec::PackedDelta => POSITION_STREAM_CODEC_PACKED_DELTA, + PbPostingTailCodec::Fixed32 => PostingTailCodec::Fixed32, + PbPostingTailCodec::VarintDelta => PostingTailCodec::VarintDelta, } } -fn u8_to_position_stream_codec(v: u8) -> Result { - match v { - POSITION_STREAM_CODEC_VARINT_DOC_DELTA => Ok(PositionStreamCodec::VarintDocDelta), - POSITION_STREAM_CODEC_PACKED_DELTA => Ok(PositionStreamCodec::PackedDelta), - _ => Err(Error::io(format!("unknown position stream codec: {v}"))), +fn position_stream_codec_to_proto(c: PositionStreamCodec) -> PbPositionStreamCodec { + match c { + PositionStreamCodec::VarintDocDelta => PbPositionStreamCodec::VarintDocDelta, + PositionStreamCodec::PackedDelta => PbPositionStreamCodec::PackedDelta, } } -// --------------------------------------------------------------------------- -// Header / tag I/O helpers (mirrors partition_serde.rs) -// --------------------------------------------------------------------------- - -fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let bytes = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &bytes)?; - Ok(()) -} - -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, 
offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes) - .map_err(|e| Error::io(format!("failed to deserialize cache header: {e}"))) -} - -fn write_u8(writer: &mut dyn Write, value: u8) -> Result<()> { - writer - .write_all(&[value]) - .map_err(|e| Error::io(format!("failed to write tag byte: {e}"))) -} - -fn read_u8(data: &Bytes, offset: &mut usize) -> Result { - let bytes = data.as_ref(); - if *offset >= bytes.len() { - return Err(Error::io( - "truncated cache entry: missing tag byte".to_string(), - )); +fn proto_to_position_stream_codec(c: PbPositionStreamCodec) -> PositionStreamCodec { + match c { + PbPositionStreamCodec::VarintDocDelta => PositionStreamCodec::VarintDocDelta, + PbPositionStreamCodec::PackedDelta => PositionStreamCodec::PackedDelta, } - let v = bytes[*offset]; - *offset += 1; - Ok(v) } // --------------------------------------------------------------------------- -// Position storage serde (shared by PostingList variants and Positions) +// Position storage sections (shared by PostingList variants and Positions) // --------------------------------------------------------------------------- const POSITION_LIST_COLUMN: &str = "position_list"; @@ -131,33 +96,36 @@ const ROW_IDS_COLUMN: &str = "row_ids"; const FREQUENCIES_COLUMN: &str = "frequencies"; const BLOCKS_COLUMN: &str = "blocks"; -#[derive(Serialize, Deserialize)] -struct SharedPositionsHeader { - codec: u8, +fn legacy_positions_batch(list: &ListArray) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + POSITION_LIST_COLUMN, + list.data_type().clone(), + list.is_nullable(), + )])); + Ok(RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?) +} + +fn read_legacy_positions(r: &mut CacheEntryReader<'_>) -> Result { + let batch = r.read_ipc()?; + Ok(batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
+ .clone()) } -fn write_position_storage( - writer: &mut dyn Write, +/// Write the position sections (the bytes after the header) for `storage`. The +/// caller's header proto carries the storage kind and shared-stream codec. +fn write_position_sections( + w: &mut CacheEntryWriter<'_>, storage: &CompressedPositionStorage, ) -> Result<()> { match storage { CompressedPositionStorage::LegacyPerDoc(list) => { - write_u8(writer, POSITIONS_TAG_LEGACY)?; - let schema = Arc::new(Schema::new(vec![Field::new( - POSITION_LIST_COLUMN, - list.data_type().clone(), - list.is_nullable(), - )])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&legacy_positions_batch(list)?)?; } CompressedPositionStorage::SharedStream(stream) => { - write_u8(writer, POSITIONS_TAG_SHARED)?; - let header = SharedPositionsHeader { - codec: position_stream_codec_to_u8(stream.codec()), - }; - write_json_header(writer, &header)?; - let offsets = UInt32Array::from(stream.block_offsets().to_vec()); let schema = Arc::new(Schema::new(vec![Field::new( BLOCK_OFFSETS_COLUMN, @@ -165,55 +133,42 @@ fn write_position_storage( false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(offsets)])?; - write_ipc_stream(&batch, writer)?; - - write_len_prefixed_bytes(writer, stream.bytes())?; + w.write_ipc(&batch)?; + w.write_raw(stream.bytes())?; } } Ok(()) } -fn read_position_storage( - data: &Bytes, - offset: &mut usize, - tag: u8, -) -> Result { - match tag { - POSITIONS_TAG_LEGACY => { - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - let list = batch - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
- .clone(); - Ok(CompressedPositionStorage::LegacyPerDoc(list)) - } - POSITIONS_TAG_SHARED => { - let header: SharedPositionsHeader = read_json_header(data, offset)?; - let codec = u8_to_position_stream_codec(header.codec)?; - - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; +/// Read the position sections for the given `storage` kind and (for shared +/// streams) `stream_codec`. Returns `None` only when `storage` is +/// [`PbPositionStorage::None`]. +fn read_position_sections( + r: &mut CacheEntryReader<'_>, + storage: PbPositionStorage, + stream_codec: PositionStreamCodec, +) -> Result> { + match storage { + PbPositionStorage::None => Ok(None), + PbPositionStorage::Legacy => Ok(Some(CompressedPositionStorage::LegacyPerDoc( + read_legacy_positions(r)?, + ))), + PbPositionStorage::Shared => { + let batch = r.read_ipc()?; let block_offsets = batch .column(0) .as_primitive_opt::() .ok_or_else(|| Error::io("block_offsets column is not UInt32".to_string()))? .values() .to_vec(); - - // Zero copy: read_len_prefixed_bytes_at returns a Bytes slice - // backed by the same allocation as `data`, and SharedPositionStream - // now stores its byte buffer as Bytes -- no copy on read. - let bytes = - read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - - Ok(CompressedPositionStorage::SharedStream( - SharedPositionStream::new(codec, block_offsets, bytes), - )) + // Zero copy: read_raw returns a Bytes slice backed by the same + // allocation as the input, and SharedPositionStream stores its byte + // buffer as Bytes -- no copy on read. 
+ let bytes = r.read_raw()?; + Ok(Some(CompressedPositionStorage::SharedStream( + SharedPositionStream::new(stream_codec, block_offsets, bytes), + ))) } - other => Err(Error::io(format!("unknown positions tag: {other}"))), } } @@ -221,50 +176,45 @@ fn read_position_storage( // PostingList codec // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PlainPostingHeader { - max_score: Option, -} - -#[derive(Serialize, Deserialize)] -struct CompressedPostingHeader { - max_score: f32, - length: u32, - posting_tail_codec: u8, -} - impl CacheCodecImpl for PostingList { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingList"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { match self { Self::Plain(plain) => { - write_u8(writer, POSTING_VARIANT_PLAIN)?; - serialize_plain(writer, plain) + w.write_u8(POSTING_VARIANT_PLAIN)?; + serialize_plain(w, plain) } Self::Compressed(compressed) => { - write_u8(writer, POSTING_VARIANT_COMPRESSED)?; - serialize_compressed(writer, compressed) + w.write_u8(POSTING_VARIANT_COMPRESSED)?; + serialize_compressed(w, compressed) } } } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let variant = read_u8(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let variant = r.read_u8()?; match variant { - POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(data, &mut offset)?)), - POSTING_VARIANT_COMPRESSED => { - Ok(Self::Compressed(deserialize_compressed(data, &mut offset)?)) - } + POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(r)?)), + POSTING_VARIANT_COMPRESSED => Ok(Self::Compressed(deserialize_compressed(r)?)), other => Err(Error::io(format!("unknown PostingList variant: {other}"))), } } } -fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<()> { +fn serialize_plain(w: &mut 
CacheEntryWriter<'_>, plain: &PlainPostingList) -> Result<()> { + // Plain postings carry only per-doc legacy positions (or none). + let position_storage = if plain.positions.is_some() { + PbPositionStorage::Legacy + } else { + PbPositionStorage::None + }; let header = PlainPostingHeader { max_score: plain.max_score, + position_storage: position_storage as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let row_ids = UInt64Array::new(plain.row_ids.clone(), None); let frequencies = Float32Array::new(plain.frequencies.clone(), None); @@ -273,26 +223,18 @@ fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<( Field::new(FREQUENCIES_COLUMN, DataType::Float32, false), ])); let batch = RecordBatch::try_new(schema, vec![Arc::new(row_ids), Arc::new(frequencies)])?; - write_ipc_stream(&batch, writer)?; - - match &plain.positions { - Some(list) => { - // Plain postings can only carry per-doc legacy positions; reuse - // the shared encoder. - write_position_storage( - writer, - &CompressedPositionStorage::LegacyPerDoc(list.clone()), - )?; - } - None => write_u8(writer, POSITIONS_TAG_NONE)?, + w.write_ipc(&batch)?; + + if let Some(list) = &plain.positions { + w.write_ipc(&legacy_positions_batch(list)?)?; } Ok(()) } -fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result { - let header: PlainPostingHeader = read_json_header(data, offset)?; +fn deserialize_plain(r: &mut CacheEntryReader<'_>) -> Result { + let header: PlainPostingHeader = r.read_header()?; - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let row_ids = batch .column(0) .as_primitive_opt::() @@ -306,19 +248,13 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result None, - POSITIONS_TAG_LEGACY => match read_position_storage(data, offset, positions_tag)? 
{ - CompressedPositionStorage::LegacyPerDoc(list) => Some(list), - CompressedPositionStorage::SharedStream(_) => { - unreachable!("shared stream tag was read as legacy variant (this is a bug)") - } - }, - other => { - return Err(Error::io(format!( - "Plain posting list cannot have positions tag {other}" - ))); + let positions = match header.position_storage() { + PbPositionStorage::None => None, + PbPositionStorage::Legacy => Some(read_legacy_positions(r)?), + PbPositionStorage::Shared => { + return Err(Error::io( + "Plain posting list cannot have a shared position stream".to_string(), + )); } }; @@ -330,13 +266,33 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result Result<()> { +/// The compressed posting list is serialized with a protobuf header followed +/// by 64-byte-aligned Arrow IPC sections (for the `blocks`, and for shared +/// position block-offsets) and a raw blob (for the shared position byte +/// stream, which already has its own portable encoding). +fn serialize_compressed( + w: &mut CacheEntryWriter<'_>, + posting: &CompressedPostingList, +) -> Result<()> { + let (position_storage, position_stream_codec) = match &posting.positions { + None => (PbPositionStorage::None, PbPositionStreamCodec::default()), + Some(CompressedPositionStorage::LegacyPerDoc(_)) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + Some(CompressedPositionStorage::SharedStream(stream)) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = CompressedPostingHeader { max_score: posting.max_score, length: posting.length, - posting_tail_codec: posting_tail_codec_to_u8(posting.posting_tail_codec), + posting_tail_codec: posting_tail_codec_to_proto(posting.posting_tail_codec) as i32, + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let schema = 
Arc::new(Schema::new(vec![Field::new( BLOCKS_COLUMN, @@ -344,20 +300,19 @@ fn serialize_compressed(writer: &mut dyn Write, posting: &CompressedPostingList) false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(posting.blocks.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&batch)?; - match &posting.positions { - Some(storage) => write_position_storage(writer, storage)?, - None => write_u8(writer, POSITIONS_TAG_NONE)?, + if let Some(storage) = &posting.positions { + write_position_sections(w, storage)?; } Ok(()) } -fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result { - let header: CompressedPostingHeader = read_json_header(data, offset)?; - let posting_tail_codec = u8_to_posting_tail_codec(header.posting_tail_codec)?; +fn deserialize_compressed(r: &mut CacheEntryReader<'_>) -> Result { + let header: CompressedPostingHeader = r.read_header()?; + let posting_tail_codec = proto_to_posting_tail_codec(header.posting_tail_codec()); - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let blocks = batch .column(0) .as_any() @@ -365,12 +320,8 @@ fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result Result Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingListGroup"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let count = u32::try_from(self.posting_lists.len()) .map_err(|_| Error::io("posting list group too large to serialize".to_string()))?; - writer - .write_all(&count.to_le_bytes()) - .map_err(|e| Error::io(format!("failed to write group count: {e}")))?; + w.write_header(&PostingListGroupHeader { count })?; for posting in &self.posting_lists { - let mut buf = Vec::new(); - posting.serialize(&mut buf)?; - write_len_prefixed_bytes(writer, &buf)?; + posting.serialize(w)?; } Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - if data.len() < 4 { - 
return Err(Error::io( - "truncated posting list group: missing count".to_string(), - )); - } - let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize; - offset += 4; - let mut posting_lists = Vec::with_capacity(count); - for _ in 0..count { - let entry = read_len_prefixed_bytes_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; - posting_lists.push(PostingList::deserialize(&entry)?); + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PostingListGroupHeader = r.read_header()?; + let mut posting_lists = Vec::with_capacity(header.count as usize); + for _ in 0..header.count { + posting_lists.push(PostingList::deserialize(r)?); } Ok(Self::new(posting_lists)) } @@ -428,20 +371,35 @@ impl CacheCodecImpl for PostingListGroup { // --------------------------------------------------------------------------- impl CacheCodecImpl for Positions { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - write_position_storage(writer, &self.0) + const TYPE_ID: &'static str = "lance.fts.Positions"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let (position_storage, position_stream_codec) = match &self.0 { + CompressedPositionStorage::LegacyPerDoc(_) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + CompressedPositionStorage::SharedStream(stream) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = PositionsHeader { + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, + }; + w.write_header(&header)?; + write_position_sections(w, &self.0) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let tag = read_u8(data, &mut offset)?; - if tag == POSITIONS_TAG_NONE { - return Err(Error::io( - "Positions cache entry cannot encode the None variant".to_string(), - )); - } - let storage = read_position_storage(data, &mut offset, 
tag)?; - Ok(Self(storage)) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PositionsHeader = r.read_header()?; + let stream_codec = proto_to_position_stream_codec(header.position_stream_codec()); + read_position_sections(r, header.position_storage(), stream_codec)? + .map(Self) + .ok_or_else(|| { + Error::io("Positions cache entry cannot encode the None variant".to_string()) + }) } } @@ -455,7 +413,8 @@ mod tests { use arrow_array::LargeBinaryArray; use arrow_array::builder::{Int32Builder, ListBuilder}; use bytes::Bytes; - use lance_core::cache::CacheCodecImpl; + use lance_core::Result; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use super::super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -502,16 +461,26 @@ mod tests { } } - fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + /// Serialize a codec body (no envelope) into a standalone buffer. + fn body_bytes(entry: &T) -> Bytes { let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - PostingList::deserialize(&Bytes::from(buf)).unwrap() + let mut w = CacheEntryWriter::new(&mut buf); + entry.serialize(&mut w).unwrap(); + Bytes::from(buf) + } + + /// Deserialize a codec body (no envelope) at the current build's version. 
+ fn from_body(data: &Bytes) -> Result { + let mut r = CacheEntryReader::new(data, 0, T::CURRENT_VERSION); + T::deserialize(&mut r) + } + + fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + from_body::(&body_bytes(entry)).unwrap() } fn roundtrip_positions(entry: &Positions) -> Positions { - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - Positions::deserialize(&Bytes::from(buf)).unwrap() + from_body::(&body_bytes(entry)).unwrap() } fn assert_slice_points_into_bytes(slice: &[u8], bytes: &Bytes) { @@ -652,13 +621,9 @@ mod tests { expected_stream.clone(), )), ); - let mut buf = Vec::new(); - PostingList::Compressed(posting) - .serialize(&mut buf) - .unwrap(); - let serialized = Bytes::from(buf); + let serialized = body_bytes(&PostingList::Compressed(posting)); - let restored = PostingList::deserialize(&serialized).unwrap(); + let restored = from_body::(&serialized).unwrap(); let PostingList::Compressed(restored) = restored else { panic!("expected Compressed variant"); }; @@ -695,9 +660,7 @@ mod tests { vec![plain.clone(), compressed, plain], ] { let group = PostingListGroup::new(members.clone()); - let mut buf = Vec::new(); - group.serialize(&mut buf).unwrap(); - let restored = PostingListGroup::deserialize(&Bytes::from(buf)).unwrap(); + let restored = from_body::(&body_bytes(&group)).unwrap(); assert_eq!(restored.posting_lists.len(), members.len()); for (a, b) in members.iter().zip(restored.posting_lists.iter()) { match (a, b) { @@ -743,9 +706,241 @@ mod tests { None, ); let entry = PostingList::Plain(plain); - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); + let mut buf = body_bytes(&entry).to_vec(); buf.truncate(buf.len() / 2); - assert!(PostingList::deserialize(&Bytes::from(buf)).is_err()); + assert!(from_body::(&Bytes::from(buf)).is_err()); + } + + /// Tests covering the stabilized envelope + compressed proto format, + /// exercised through the full type-erased [`CacheCodec`] (envelope + body). 
+ mod stable_format { + use std::sync::Arc; + + use arrow_array::Array; + use lance_core::cache::CacheCodec; + use prost::Message; + + use super::*; + use crate::cache_pb::{CompressedPostingHeader, PostingTailCodec as PbPostingTailCodec}; + + type ArcAny = Arc; + + fn codec() -> CacheCodec { + CacheCodec::from_impl::() + } + + /// Serialize an entry through the full codec (envelope + body). + fn serialize_entry(entry: PostingList) -> Vec { + let any: ArcAny = Arc::new(entry); + let mut buf = Vec::new(); + codec().serialize(&any, &mut buf).unwrap(); + buf + } + + /// A `Bytes` whose base address is 64-byte aligned, modelling a backend + /// that reads cache entries into an aligned buffer. + fn aligned_bytes(payload: &[u8]) -> Bytes { + const ALIGN: usize = 64; + let mut v = vec![0u8; payload.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + fn compressed_with_shared_positions() -> PostingList { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[9u8; 48][..]), Some(&[1u8; 48])]); + let stream = SharedPositionStream::new( + PositionStreamCodec::PackedDelta, + vec![0u32, 4, 11], + Bytes::from((0u8..64).collect::>()), + ); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + Some(CompressedPositionStorage::SharedStream(stream)), + )) + } + + /// The compressed `blocks` (an aligned IPC section) and the shared + /// position blob (a raw section) must both be borrowed zero-copy from + /// the input even though the envelope pushes them to a non-zero, + /// non-aligned starting offset. 
+ #[test] + fn compressed_sections_are_zero_copy_through_envelope() { + let serialized = aligned_bytes(&serialize_entry(compressed_with_shared_positions())); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Compressed(restored) = restored.as_ref() else { + panic!("expected Compressed"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + // blocks IPC section decoded in place (no realigning memcpy). + for buf in restored.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "blocks buffer was realigned out of the input — misaligned IPC section", + ); + } + // shared position raw blob borrowed in place. + let Some(CompressedPositionStorage::SharedStream(stream)) = &restored.positions else { + panic!("expected shared stream"); + }; + assert!(points_in(stream.bytes().as_ptr() as usize)); + } + + /// Every member of a `PostingListGroup` must also decode zero-copy. The + /// group writes its members inline so each member's IPC sections stay + /// 64-byte aligned within the entry; embedding members in per-member + /// sub-buffers would land them at arbitrary offsets and force a + /// realigning memcpy on load. 
+ #[test] + fn group_member_sections_are_zero_copy_through_envelope() { + let make_member = |fill: u8| { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[fill; 48][..]), Some(&[fill; 48])]); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + None, + )) + }; + let group = PostingListGroup::new(vec![make_member(9), make_member(1)]); + + let group_codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + group_codec.serialize(&any, &mut buf).unwrap(); + let serialized = aligned_bytes(&buf); + + let restored = group_codec.deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + assert_eq!(restored.posting_lists.len(), 2); + for member in &restored.posting_lists { + let PostingList::Compressed(member) = member else { + panic!("expected Compressed member"); + }; + for buf in member.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "group member blocks buffer was realigned out of the input — \ + misaligned IPC section", + ); + } + } + } + + /// The plain posting's row-id/frequency IPC section must also decode + /// zero-copy through the envelope + proto header. 
+ #[test] + fn plain_sections_are_zero_copy_through_envelope() { + let plain = PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from((0u64..64).collect::>()), + ScalarBuffer::from(vec![1.0f32; 64]), + Some(2.0), + None, + )); + let serialized = aligned_bytes(&serialize_entry(plain)); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Plain(restored) = restored.as_ref() else { + panic!("expected Plain"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + // The row_ids ScalarBuffer must borrow from the input allocation. + let ptr = restored.row_ids.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "row_ids buffer was realigned out of the input — misaligned IPC section", + ); + } + + /// Additive proto fields (lever #1) must not break decoding: an unknown + /// field number appended to the header is ignored. + #[test] + fn header_proto_ignores_unknown_fields() { + let header = CompressedPostingHeader { + max_score: 1.5, + length: 9, + posting_tail_codec: PbPostingTailCodec::VarintDelta as i32, + ..Default::default() + }; + let mut bytes = header.encode_to_vec(); + // Append an unknown field #15, varint wire type (0), value 7. + bytes.push(15 << 3); + bytes.push(7); + let decoded = CompressedPostingHeader::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded.length, 9); + assert_eq!(decoded.max_score, 1.5); + } + + /// An entry written by a different codec (foreign TYPE_ID) misses. + #[test] + fn foreign_type_id_is_miss() { + // A PostingListGroup entry carries a different TYPE_ID in its + // envelope; reading it as a PostingList must miss, not misread it. 
+ let group = PostingListGroup::new(vec![]); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + CacheCodec::from_impl::() + .serialize(&any, &mut buf) + .unwrap(); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// An entry written by a newer build (higher type_version) misses. + #[test] + fn future_type_version_is_miss() { + let mut buf = serialize_entry(compressed_with_shared_positions()); + // Patch the envelope's type_version (magic[4] + ver[1] + len[2] + + // type_id[N]) to a value beyond what this build understands. + let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let version_off = 4 + 1 + 2 + type_id_len; + buf[version_off..version_off + 4].copy_from_slice(&u32::MAX.to_le_bytes()); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// A pre-stabilization blob (no magic) self-heals to a miss. + #[test] + fn pre_stabilization_blob_is_miss() { + // Old format led with a u64 LE length prefix, never our magic. + let mut blob = (30u64).to_le_bytes().to_vec(); + blob.extend_from_slice(&[0u8; 30]); + assert!(codec().deserialize(&Bytes::from(blob)).hit().is_none()); + } + + /// A structurally-valid envelope whose body leads with an out-of-range + /// variant tag self-heals to a `BodyError` miss rather than panicking or + /// misreading the remaining bytes. + #[test] + fn unknown_posting_variant_is_miss() { + use lance_core::cache::{CacheDecode, CacheMissReason}; + + let mut buf = serialize_entry(compressed_with_shared_positions()); + // The variant tag is the first body byte, right after the envelope + // (magic[4] + ver[1] + type_id_len[2] + type_id[N] + type_version[4]). 
+ let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let variant_off = 4 + 1 + 2 + type_id_len + 4; + buf[variant_off] = 2; // neither PLAIN (0) nor COMPRESSED (1) + match codec().deserialize(&Bytes::from(buf)) { + CacheDecode::Miss(reason) => assert_eq!(reason, CacheMissReason::BodyError), + CacheDecode::Hit(_) => panic!("expected a BodyError miss, got a hit"), + } + } } } diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 5b67ba1da9b..c23dc1c4e78 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -3,13 +3,13 @@ use std::fmt::{Debug, Display}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::{ cmp::{Reverse, min}, collections::BinaryHeap, }; use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, ops::Range, time::Instant, }; @@ -35,24 +35,26 @@ use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::metrics::Time; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use fst::{Automaton, IntoStreamer, Streamer}; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream}; -use itertools::Itertools; +use itertools::{Either, Itertools}; use lance_arrow::{RecordBatchExt, iter_str_array}; use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::{DataFusionResult, LanceOptionExt}; +use lance_core::utils::address::RowAddress; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS}; use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; use lance_select::{RowAddrMask, RowAddrTreeMap}; use roaring::RoaringBitmap; use 
std::sync::LazyLock; -use tokio::task::spawn_blocking; +use tokio::{sync::OnceCell, task::spawn_blocking}; use tracing::{info, instrument}; use super::encoding::{PositionBlockBuilder, decode_group_starts}; use super::iter::PostingListIterator; +use super::lazy_docset::LazyDocSet; use super::{InvertedIndexBuilder, InvertedIndexParams, wand::*}; use super::{ builder::{ @@ -205,6 +207,7 @@ impl FromStr for InvertedListFormatVersion { #[derive(Debug)] struct PartitionCandidates { tokens_by_position: Vec, + grouped_expansions: Vec, candidates: Vec, } @@ -212,11 +215,74 @@ impl PartitionCandidates { fn empty() -> Self { Self { tokens_by_position: Vec::new(), + grouped_expansions: Vec::new(), candidates: Vec::new(), } } } +#[derive(Debug)] +struct LoadedPostings { + postings: Vec, + grouped_expansions: Vec, +} + +impl LoadedPostings { + fn empty() -> Self { + Self { + postings: Vec::new(), + grouped_expansions: Vec::new(), + } + } +} + +#[derive(Debug)] +struct GroupedExpansionTerms { + position: u32, + terms: Vec, +} + +fn grouped_rescore_wand_limit( + limit: Option, + grouped_expansions: &[GroupedExpansionTerms], +) -> Option { + let limit = limit?; + // Grouped fuzzy AND rescoring needs a small candidate cushion because WAND + // ranks by the unioned group posting first and the exact expansion IDF later. 
+ let expansion_terms = grouped_expansions + .iter() + .map(|group| group.terms.len()) + .sum::() + .max(1); + Some(limit.saturating_mul(expansion_terms)) +} + +#[derive(Debug)] +struct ExpansionTermFreqs { + token: String, + freqs_by_posting_doc_id: Vec<(u64, u32)>, +} + +impl ExpansionTermFreqs { + fn new(token: String, posting: &PostingList) -> Self { + let freqs_by_posting_doc_id = posting + .iter() + .map(|(posting_doc_id, freq, _)| (posting_doc_id, freq)) + .collect(); + Self { + token, + freqs_by_posting_doc_id, + } + } + + fn frequency(&self, posting_doc_id: u64) -> Option { + self.freqs_by_posting_doc_id + .binary_search_by_key(&posting_doc_id, |(doc_id, _)| *doc_id) + .ok() + .map(|idx| self.freqs_by_posting_doc_id[idx].1) + } +} + #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)] pub enum TokenSetFormat { Arrow, @@ -250,7 +316,7 @@ impl FromStr for TokenSetFormat { } impl DeepSizeOf for TokenSetFormat { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -353,6 +419,7 @@ pub struct InvertedIndex { tokenizer: Box, token_set_format: TokenSetFormat, pub(crate) partitions: Vec>, + corpus_stats: Arc>, // Fragments which are contained in the index, but no longer in the dataset. // These should be pruned at search time since we don't prune them at update time. deleted_fragments: RoaringBitmap, @@ -370,11 +437,40 @@ impl Debug for InvertedIndex { } impl DeepSizeOf for InvertedIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.partitions.deep_size_of_children(context) } } +/// Resolve any `Pending` candidates that wand emitted via the +/// deferred-row_id path. After this returns, every entry in +/// `candidates` carries a real row_id. 
+async fn resolve_deferred_candidates( + docs: &LazyDocSet, + candidates: &mut [DocCandidate], +) -> Result<()> { + let pending: Vec = candidates + .iter() + .filter_map(|c| match c.addr { + CandidateAddr::Pending(d) => Some(d), + CandidateAddr::RowId(_) => None, + }) + .collect(); + if pending.is_empty() { + return Ok(()); + } + let mut iter = docs.resolve_row_ids(&pending).await?.into_iter(); + for c in candidates { + if matches!(c.addr, CandidateAddr::Pending(_)) { + let r = iter.next().ok_or_else(|| { + Error::internal("resolve_row_ids returned fewer items than requested") + })?; + c.addr = CandidateAddr::RowId(r); + } + } + Ok(()) +} + impl InvertedIndex { fn format_version(&self) -> InvertedListFormatVersion { self.partitions @@ -508,7 +604,7 @@ impl InvertedIndex { .with_token_set_format(first.token_set_format) .with_format_version(first.format_version()) .with_posting_tail_codec(first.posting_tail_codec()); - builder + let files = builder .update_from_segments(new_data, dest_store, segments, old_data_filter) .await?; @@ -517,7 +613,7 @@ impl InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: first.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -532,7 +628,7 @@ impl InvertedIndex { query_tokens: &Tokens, params: &FtsSearchParams, ) -> Result { - let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let (total_tokens, num_docs) = self.aggregate_corpus_stats().await?; let mut terms: Vec = Vec::new(); let mut seen = HashSet::new(); if matches!(params.fuzziness, Some(n) if n != 0) { @@ -555,18 +651,42 @@ impl InvertedIndex { let df = self.df_for_term(term).await?; token_docs.insert(term.clone(), df); } - Ok(MemBM25Scorer::new( - scorer.total_tokens(), - scorer.num_docs(), - token_docs, - )) + Ok(MemBM25Scorer::new(total_tokens, num_docs, token_docs)) } pub async fn bm25_stats_for_terms(&self, terms: &[String]) -> Result<(u64, 
usize, Vec)> { - let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let (total_tokens, num_docs) = self.aggregate_corpus_stats().await?; let token_docs = futures::future::try_join_all(terms.iter().map(|term| self.df_for_term(term))).await?; - Ok((scorer.total_tokens(), scorer.num_docs(), token_docs)) + Ok((total_tokens, num_docs, token_docs)) + } + + /// Aggregate per-partition `total_tokens` and `num_docs` across the + /// index. `len` is cheap (no IO); `total_tokens_num` reads only the + /// num_tokens column the first time per partition and caches it on + /// `LazyDocSet`. Avoids materializing the full DocSet just to get + /// these two scalars. + async fn aggregate_corpus_stats(&self) -> Result<(u64, usize)> { + self.corpus_stats + .get_or_try_init(|| async { + let io_parallelism = self.store.io_parallelism(); + let num_docs: usize = self.partitions.iter().map(|p| p.docs.len()).sum(); + let futures = self + .partitions + .iter() + .map(|p| { + let docs = p.docs.clone(); + async move { docs.total_tokens_num().await } + }) + .collect::>(); + let totals: Vec = stream::iter(futures) + .buffer_unordered(io_parallelism) + .try_collect() + .await?; + Ok((totals.into_iter().sum(), num_docs)) + }) + .await + .copied() } /// Sum the posting-list length for `term` across this index's partitions @@ -603,9 +723,10 @@ impl InvertedIndex { let expanded = partition.expand_fuzzy(tokens, params)?; for idx in 0..expanded.len() { let token = expanded.get_token(idx); - if seen.insert(token.to_string()) { + let position = expanded.position(idx); + if seen.insert((token.to_string(), position)) { expanded_tokens.push(token.to_string()); - expanded_positions.push(expanded.position(idx)); + expanded_positions.push(position); } } } @@ -651,6 +772,11 @@ impl InvertedIndex { let mask = prefilter.mask(); let mut candidates = BinaryHeap::new(); + // Shared top-k floor across this query's partitions. 
Seeded to -inf so + // the first real score wins; each partition publishes its local k-th + // and prunes against the running global k-th (a lower bound on the true + // global k-th — see `Wand::shared_threshold`). + let shared_threshold = Arc::new(AtomicU32::new(f32::NEG_INFINITY.to_bits())); let parts = self .partitions .iter() @@ -660,13 +786,27 @@ impl InvertedIndex { let params = params.clone(); let mask = mask.clone(); let metrics = metrics.clone(); + let shared_threshold = shared_threshold.clone(); async move { - let postings = part - .load_posting_lists(tokens.as_ref(), params.as_ref(), metrics.as_ref()) + let loaded_postings = part + .load_posting_lists( + tokens.as_ref(), + params.as_ref(), + operator, + metrics.as_ref(), + ) .await?; + let LoadedPostings { + postings, + grouped_expansions, + } = loaded_postings; if postings.is_empty() { + // No hits in this partition; its DocSet stays + // unloaded, so we never pay the per-doc + // row_id/num_tokens download for it. return Result::Ok(PartitionCandidates::empty()); } + let docs_for_wand = part.docs.docs_for_wand(mask.as_ref()).await?; let max_position = postings .iter() .map(|posting| posting.term_index() as usize) @@ -680,20 +820,42 @@ impl InvertedIndex { let params = params.clone(); let mask = mask.clone(); let metrics = metrics.clone(); - spawn_cpu(move || { - let candidates = part.bm25_search( - params.as_ref(), + let part_for_wand = part.clone(); + let has_grouped_expansions = !grouped_expansions.is_empty(); + let wand_params = if has_grouped_expansions { + let mut rescoring_params = params.as_ref().clone(); + rescoring_params.limit = + grouped_rescore_wand_limit(params.limit, &grouped_expansions); + Arc::new(rescoring_params) + } else { + params.clone() + }; + let partition_threshold = if has_grouped_expansions { + Arc::new(AtomicU32::new(f32::NEG_INFINITY.to_bits())) + } else { + shared_threshold + }; + let candidates = spawn_cpu(move || { + let candidates = part_for_wand.bm25_search( + 
docs_for_wand.as_ref(), + wand_params.as_ref(), operator, mask, postings, metrics.as_ref(), + partition_threshold, )?; - Ok(PartitionCandidates { - tokens_by_position, - candidates, - }) + std::result::Result::<_, Error>::Ok(candidates) }) - .await + .await?; + let mut partition_result = PartitionCandidates { + tokens_by_position, + grouped_expansions, + candidates, + }; + resolve_deferred_candidates(&part.docs, &mut partition_result.candidates) + .await?; + Result::Ok(partition_result) } }) .collect::>(); @@ -703,8 +865,17 @@ impl InvertedIndex { if res.candidates.is_empty() { continue; } - let mut idf_by_position = Vec::with_capacity(res.tokens_by_position.len()); - for token in &res.tokens_by_position { + let PartitionCandidates { + tokens_by_position, + grouped_expansions, + candidates: part_candidates, + } = res; + let grouped_positions = grouped_expansions + .iter() + .map(|group| group.position) + .collect::>(); + let mut idf_by_position = Vec::with_capacity(tokens_by_position.len()); + for token in &tokens_by_position { let idf_weight = match idf_cache.get(token) { Some(weight) => *weight, None => { @@ -716,17 +887,47 @@ impl InvertedIndex { idf_by_position.push(idf_weight); } for DocCandidate { - row_id, + addr, + posting_doc_id, freqs, doc_length, - } in res.candidates + } in part_candidates { + // resolve_deferred_candidates ran upstream, so every + // candidate carries a real row_id at this point. 
+ let row_id = match addr { + CandidateAddr::RowId(r) => r, + CandidateAddr::Pending(_) => { + return Err(Error::internal( + "bm25_search post-condition: deferred candidate left unresolved", + )); + } + }; let mut score = 0.0; for (term_index, freq) in freqs.into_iter() { + if grouped_positions.contains(&term_index) { + continue; + } debug_assert!((term_index as usize) < idf_by_position.len()); score += idf_by_position[term_index as usize] * scorer.doc_weight(freq, doc_length); } + for group in &grouped_expansions { + for term in &group.terms { + let Some(freq) = term.frequency(posting_doc_id) else { + continue; + }; + let idf_weight = match idf_cache.get(&term.token) { + Some(weight) => *weight, + None => { + let weight = scorer.query_weight(&term.token); + idf_cache.insert(term.token.clone(), weight); + weight + } + }; + score += idf_weight * scorer.doc_weight(freq, doc_length); + } + } if candidates.len() < limit { candidates.push(Reverse(ScoredDoc::new(row_id, score))); } else if candidates.peek().unwrap().0.score.0 < score { @@ -799,9 +1000,10 @@ impl InvertedIndex { store, tokens, inverted_list, - docs, + docs: Arc::new(LazyDocSet::from_loaded(docs)), token_set_format: TokenSetFormat::Arrow, })], + corpus_stats: Arc::new(OnceCell::new()), deleted_fragments: RoaringBitmap::new(), })) } @@ -889,6 +1091,7 @@ impl InvertedIndex { tokenizer, token_set_format, partitions, + corpus_stats: Arc::new(OnceCell::new()), deleted_fragments, })) } @@ -910,12 +1113,6 @@ impl Index for InvertedIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input( - "inverted index cannot be cast to vector index", - )) - } - fn statistics(&self) -> Result { let num_tokens = self .partitions @@ -948,6 +1145,52 @@ impl Index for InvertedIndex { } } +/// Target on-disk size of one prewarm chunk; a partition is streamed in chunks of +/// ~this size so its peak resident set is one chunk, not the whole `invert.lance`. 
+const PREWARM_CHUNK_TARGET_BYTES: u64 = 32 << 20; + +/// Cap on token rows per chunk, bounding the built `Vec` when posting lists are tiny. +const PREWARM_MAX_CHUNK_TOKENS: usize = 4096; + +/// Floor on token rows per chunk, so a partition always makes progress. +const PREWARM_MIN_CHUNK_TOKENS: usize = 1; + +/// Token rows per chunk: byte target / average bytes-per-token, clamped to `[MIN, MAX]`. +fn prewarm_chunk_tokens(token_count: usize, file_size_bytes: u64) -> usize { + if token_count == 0 { + return PREWARM_MIN_CHUNK_TOKENS; + } + let bytes_per_token = (file_size_bytes / token_count as u64).max(1); // >= 1: no div-by-zero + let by_bytes = (PREWARM_CHUNK_TARGET_BYTES / bytes_per_token) as usize; + by_bytes.clamp(PREWARM_MIN_CHUNK_TOKENS, PREWARM_MAX_CHUNK_TOKENS) +} + +/// Snap a chunk's exclusive token end back to a posting-group boundary so no group +/// straddles chunks. Returns the largest group boundary in `(tok_start, desired_end]`, +/// or the next boundary past an oversized group so it runs as one solo chunk. +fn group_aligned_chunk_end( + starts: &[u32], + token_count: usize, + tok_start: usize, + desired_end: usize, +) -> usize { + let fit = starts + .iter() + .map(|&s| s as usize) + .chain(std::iter::once(token_count)) + .filter(|&b| b > tok_start && b <= desired_end) + .max(); + if let Some(end) = fit { + return end; + } + // Oversized group: extend to its end so it runs as one chunk. + starts + .iter() + .map(|&s| s as usize) + .find(|&b| b > tok_start) + .unwrap_or(token_count) +} + impl InvertedIndex { pub async fn prewarm_with_options(&self, options: &FtsPrewarmOptions) -> Result<()> { let with_position = options.with_position; @@ -960,6 +1203,11 @@ impl InvertedIndex { part.inverted_list .prewarm_posting_lists(with_position) .await?; + // Materialize the deferred DocSet too: prewarm's contract is + // that subsequent queries do no IO, so the per-doc row_ids / + // num_tokens must be resident, not lazily faulted in at query + // time. 
`ensure_loaded` opens, reads, and drops the reader. + part.docs.ensure_loaded().await?; Result::Ok(()) }); stream::iter(prewarm_futures) @@ -1027,7 +1275,8 @@ impl ScalarIndex for InvertedIndex { mapping: &HashMap>, dest_store: &dyn IndexStore, ) -> Result { - self.to_builder() + let files = self + .to_builder() .remap(mapping, self.store.clone(), dest_store) .await?; @@ -1036,7 +1285,7 @@ impl ScalarIndex for InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: self.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -1046,7 +1295,8 @@ impl ScalarIndex for InvertedIndex { dest_store: &dyn IndexStore, old_data_filter: Option, ) -> Result { - self.to_builder() + let files = self + .to_builder() .update(new_data, dest_store, old_data_filter) .await?; @@ -1055,7 +1305,7 @@ impl ScalarIndex for InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: self.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -1091,7 +1341,10 @@ pub struct InvertedPartition { store: Arc, pub(crate) tokens: TokenSet, pub(crate) inverted_list: Arc, - pub(crate) docs: DocSet, + /// Per-doc row_id + num_tokens. Wrapped in `LazyDocSet` so partitions + /// that don't contribute hits to a query never pay the full-array + /// download. Scoring paths call `ensure_loaded` before walking wand. + pub(crate) docs: Arc, token_set_format: TokenSetFormat, } @@ -1133,8 +1386,21 @@ impl InvertedPartition { let tokens = TokenSet::load(token_file, token_set_format).await?; let invert_list_file = store.open_index_file(&posting_file_path(id)).await?; let inverted_list = PostingListReader::try_new(invert_list_file, index_cache).await?; - let docs_file = store.open_index_file(&doc_file_path(id)).await?; - let docs = DocSet::load(docs_file, false, frag_reuse_index).await?; + // Defer the per-doc row_id/num_tokens read. 
Construction reads only + // the doc count (one footer read) and then drops the reader; the bulk + // load happens on first scoring use, re-opening the docs file on + // demand, and partitions that never score skip it entirely. Storing + // the store + path instead of an open reader keeps a cached partition + // from pinning a docs-file handle for its whole lifetime. + let docs_path = doc_file_path(id); + let num_docs = store.open_index_file(&docs_path).await?.num_rows(); + let docs = Arc::new(LazyDocSet::new( + store.clone(), + docs_path, + num_docs, + false, + frag_reuse_index, + )); Ok(Self { id, @@ -1152,7 +1418,14 @@ impl InvertedPartition { pub fn expand_fuzzy(&self, tokens: &Tokens, params: &FtsSearchParams) -> Result { let mut new_tokens = Vec::with_capacity(min(tokens.len(), params.max_expansions)); - for token in tokens { + let mut new_positions = Vec::with_capacity(new_tokens.capacity()); + let mut seen = HashSet::new(); + for token_idx in 0..tokens.len() { + if new_tokens.len() >= params.max_expansions { + break; + } + let token = tokens.get_token(token_idx); + let position = tokens.position(token_idx); let fuzziness = match params.fuzziness { Some(fuzziness) => fuzziness, None => MatchQuery::auto_fuzziness(token), @@ -1162,39 +1435,150 @@ impl InvertedPartition { let base_len = tokens.token_type().prefix_len(token) as u32; if let TokenMap::Fst(ref map) = self.tokens.tokens { + let mut expanded = Vec::new(); + let remaining = params.max_expansions - new_tokens.len(); match base_len + params.prefix_length { - 0 => take_fst_keys(map.search(lev), &mut new_tokens, params.max_expansions), + 0 => take_fst_keys(map.search(lev), &mut expanded, remaining), prefix_length => { let prefix = &token[..min(prefix_length as usize, token.len())]; let prefix = fst::automaton::Str::new(prefix).starts_with(); take_fst_keys( map.search(lev.intersection(prefix)), - &mut new_tokens, - params.max_expansions, + &mut expanded, + remaining, ) } } + for token in expanded { + if 
seen.insert((token.clone(), position)) { + new_tokens.push(token); + new_positions.push(position); + if new_tokens.len() >= params.max_expansions { + break; + } + } + } } else { return Err(Error::index( "tokens is not fst, which is not expected".to_owned(), )); } } - Ok(Tokens::new(new_tokens, tokens.token_type().clone())) + Ok(Tokens::with_positions( + new_tokens, + new_positions, + tokens.token_type().clone(), + )) + } + + fn union_plain_posting_lists(postings: Vec) -> Result { + let mut freqs_by_row_id = BTreeMap::new(); + for posting in postings { + for (row_id, freq, _) in posting.iter() { + let entry = freqs_by_row_id.entry(row_id).or_insert(0u32); + *entry = entry.checked_add(freq).ok_or_else(|| { + Error::index(format!("posting frequency overflow for row id {}", row_id)) + })?; + } + } + let mut row_ids = Vec::with_capacity(freqs_by_row_id.len()); + let mut frequencies = Vec::with_capacity(freqs_by_row_id.len()); + for (row_id, freq) in freqs_by_row_id { + row_ids.push(row_id); + frequencies.push(freq as f32); + } + Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(row_ids), + ScalarBuffer::from(frequencies), + None, + None, + ))) + } + + fn union_compressed_posting_lists( + postings: Vec, + docs: &DocSet, + ) -> Result { + let mut freqs_by_doc_id = BTreeMap::new(); + for posting in postings { + for (doc_id, freq, _) in posting.iter() { + let doc_id = u32::try_from(doc_id).map_err(|_| { + Error::index(format!( + "compressed posting doc id {} exceeds u32::MAX", + doc_id + )) + })?; + let entry = freqs_by_doc_id.entry(doc_id).or_insert(0u32); + *entry = entry.checked_add(freq).ok_or_else(|| { + Error::index(format!("posting frequency overflow for doc id {}", doc_id)) + })?; + } + } + if freqs_by_doc_id.is_empty() { + return Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(Vec::::new()), + ScalarBuffer::from(Vec::::new()), + None, + None, + ))); + } + + let mut builder = PostingListBuilder::new(false); + let mut doc_ids = 
Vec::with_capacity(freqs_by_doc_id.len()); + let mut frequencies = Vec::with_capacity(freqs_by_doc_id.len()); + for (doc_id, freq) in freqs_by_doc_id { + builder.add(doc_id, PositionRecorder::Count(freq)); + doc_ids.push(doc_id); + frequencies.push(freq); + } + let block_max_scores = docs.calculate_block_max_scores(doc_ids.iter(), frequencies.iter()); + let batch = builder.to_batch(block_max_scores)?; + let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); + let length = batch[LENGTH_COL].as_primitive::().value(0); + PostingList::from_batch(&batch, Some(max_score), Some(length)) + } + + fn union_posting_lists(postings: Vec, docs: &DocSet) -> Result { + let has_plain = postings + .iter() + .any(|posting| matches!(posting, PostingList::Plain(_))); + let has_compressed = postings + .iter() + .any(|posting| matches!(posting, PostingList::Compressed(_))); + match (has_plain, has_compressed) { + (true, true) => Err(Error::index( + "cannot union mixed plain and compressed posting lists".to_owned(), + )), + (true, false) => Self::union_plain_posting_lists(postings), + (false, true) => Self::union_compressed_posting_lists(postings, docs), + (false, false) => Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(Vec::::new()), + ScalarBuffer::from(Vec::::new()), + None, + None, + ))), + } } // search the documents that contain the query // return the doc info and the doc length // ref: https://en.wikipedia.org/wiki/Okapi_BM25 #[instrument(level = "debug", skip_all)] - pub async fn load_posting_lists( + async fn load_posting_lists( &self, tokens: &Tokens, params: &FtsSearchParams, + operator: Operator, metrics: &dyn MetricsCollector, - ) -> Result> { + ) -> Result { let is_fuzzy = matches!(params.fuzziness, Some(n) if n != 0); let is_phrase_query = params.phrase_slop.is_some(); + let is_and_query = operator == Operator::And; + let required_positions = (is_and_query || is_phrase_query).then(|| { + (0..tokens.len()) + .map(|index| tokens.position(index)) + 
.collect::>() + }); let tokens = match is_fuzzy { true => self.expand_fuzzy(tokens, params)?, false => tokens.clone(), @@ -1203,65 +1587,174 @@ impl InvertedPartition { .map(|index| tokens.position(index)) .collect::>(); let mut token_ids = Vec::with_capacity(tokens.len()); + let mut matched_positions = HashSet::new(); for (index, token) in tokens.into_iter().enumerate() { let token_id = self.map(&token); if let Some(token_id) = token_id { - token_ids.push((token_id, token, token_positions[index])); - } else if is_phrase_query { - // if the token is not found, we can't do phrase query - return Ok(Vec::new()); + let position = token_positions[index]; + matched_positions.insert(position); + token_ids.push((token_id, token, position)); + } else if is_phrase_query || is_and_query { + // if the token is not found, we can't do phrase or AND query + return Ok(LoadedPostings::empty()); } } if token_ids.is_empty() { - return Ok(Vec::new()); + return Ok(LoadedPostings::empty()); + } + if let Some(required_positions) = required_positions + && !required_positions.is_subset(&matched_positions) + { + return Ok(LoadedPostings::empty()); } + + let is_fuzzy_and_query = is_fuzzy && is_and_query && !is_phrase_query; if !is_phrase_query { - token_ids.sort_unstable_by_key(|(token_id, _, _)| *token_id); - token_ids.dedup_by_key(|(token_id, _, _)| *token_id); + if is_fuzzy_and_query { + token_ids.sort_unstable_by_key(|(token_id, _, position)| (*position, *token_id)); + token_ids.dedup_by(|lhs, rhs| lhs.0 == rhs.0 && lhs.2 == rhs.2); + } else { + token_ids.sort_unstable_by_key(|(token_id, _, _)| *token_id); + token_ids.dedup_by_key(|(token_id, _, _)| *token_id); + } } let num_docs = self.docs.len(); - stream::iter(token_ids) + let loaded_postings = stream::iter(token_ids) .map(|(token_id, token, position)| async move { let posting = self .inverted_list .posting_list(token_id, is_phrase_query, metrics) .await?; - let query_weight = idf(posting.len(), num_docs); - - 
Result::Ok(PostingIterator::with_query_weight( - token, - token_id, - position, - query_weight, - posting, - num_docs, - )) + Result::Ok((token_id, token, position, posting)) }) .buffered(self.store.io_parallelism()) .try_collect::>() - .await + .await?; + + if (is_and_query || is_phrase_query) + && !is_fuzzy_and_query + && loaded_postings + .iter() + .any(|(_, _, _, posting)| posting.is_empty()) + { + return Ok(LoadedPostings::empty()); + } + + if !is_fuzzy_and_query { + return Ok(LoadedPostings { + postings: loaded_postings + .into_iter() + .map(|(token_id, token, position, posting)| { + let query_weight = idf(posting.len(), num_docs); + PostingIterator::with_query_weight( + token, + token_id, + position, + query_weight, + posting, + num_docs, + ) + }) + .collect(), + grouped_expansions: Vec::new(), + }); + } + + let needs_union = loaded_postings + .windows(2) + .any(|window| window[0].2 == window[1].2); + let docs_for_union = if needs_union { + Some(self.docs.ensure_num_tokens_loaded().await?) + } else { + None + }; + + // WAND's AND mode treats every iterator as required, so expansions from + // one original query position must be merged before scoring. 
+ let mut grouped_postings = Vec::new(); + let mut grouped_expansions = Vec::new(); + let mut iter = loaded_postings.into_iter().peekable(); + while let Some((token_id, token, position, posting)) = iter.next() { + let mut group = vec![(token_id, token, posting)]; + while matches!(iter.peek(), Some((_, _, next_position, _)) if *next_position == position) + { + let (token_id, token, _, posting) = iter.next().expect("peeked item must exist"); + group.push((token_id, token, posting)); + } + + let (token_id, token, posting) = if group.len() == 1 { + group.pop().expect("single-item group must exist") + } else { + let token_id = group[0].0; + let token = group[0].1.clone(); + grouped_expansions.push(GroupedExpansionTerms { + position, + terms: group + .iter() + .map(|(_, token, posting)| ExpansionTermFreqs::new(token.clone(), posting)) + .collect(), + }); + let postings = group + .into_iter() + .map(|(_, _, posting)| posting) + .collect::>(); + let posting = Self::union_posting_lists( + postings, + docs_for_union + .as_deref() + .expect("union docs must be loaded for grouped fuzzy AND"), + )?; + (token_id, token, posting) + }; + if posting.is_empty() { + return Ok(LoadedPostings::empty()); + } + + let query_weight = idf(posting.len(), num_docs); + grouped_postings.push(PostingIterator::with_query_weight( + token, + token_id, + position, + query_weight, + posting, + num_docs, + )); + } + + Ok(LoadedPostings { + postings: grouped_postings, + grouped_expansions, + }) } #[instrument(level = "debug", skip_all)] + // Deferred-DocSet adds the `docs` param (caller materializes it) on top of + // the cross-partition `shared_threshold`, tipping this hot-path search fn + // one over the limit. Bundling args isn't worth the churn here. 
+ #[allow(clippy::too_many_arguments)] pub fn bm25_search( &self, + docs: &DocSet, params: &FtsSearchParams, operator: Operator, mask: Arc, postings: Vec, metrics: &dyn MetricsCollector, + shared_threshold: Arc, ) -> Result> { if postings.is_empty() { return Ok(Vec::new()); } - // let local_metrics = LocalMetricsCollector::default(); + // Caller selects the DocSet shape via `LazyDocSet::docs_for_wand` + // and passes it in here; wand uses `docs.has_row_ids()` to + // handle the num_tokens-only case. let scorer = IndexBM25Scorer::new(std::iter::once(self)); - let mut wand = Wand::new(operator, postings.into_iter(), &self.docs, scorer); + let mut wand = Wand::new(operator, postings.into_iter(), docs, scorer) + .with_shared_threshold(shared_threshold); let hits = wand.search(params, mask, metrics)?; - // local_metrics.dump_into(metrics); Ok(hits) } @@ -1273,7 +1766,10 @@ impl InvertedPartition { self.inverted_list.posting_tail_codec(), ); builder.tokens = self.tokens.into_mutable(); - builder.docs = self.docs; + // into_builder rewrites every doc, so materialize the full + // DocSet now and clone it out of the Arc. 
+ let docs_arc = self.docs.ensure_loaded().await?; + builder.docs = (*docs_arc).clone(); builder .posting_lists @@ -1307,7 +1803,7 @@ impl Default for TokenMap { } impl DeepSizeOf for TokenMap { - fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, ctx: &mut lance_core::deepsize::Context) -> usize { match self { Self::HashMap(map) => map.deep_size_of_children(ctx), Self::Fst(map) => map.as_fst().size(), @@ -1511,15 +2007,13 @@ impl TokenSet { let map = fst::Map::new(bytes.to_vec()) .map_err(|e| Error::index(format!("failed to load fst tokens: {}", e)))?; - let next_id_col = batch[TOKEN_NEXT_ID_COL].as_primitive::(); let total_length_col = batch[TOKEN_TOTAL_LENGTH_COL].as_primitive::(); - let next_id = next_id_col - .values() - .first() - .copied() - .ok_or(Error::index("token next id column is empty".to_owned()))?; + // Token ids are dense `[0, len)`, so `next_id` must equal the token count. Recompute + // it instead of trusting the persisted value, which writers before #7115 could leave + // stale. Mirrors `load_arrow`. + let next_id = map.len() as u32; let total_length = total_length_col .values() @@ -1629,17 +2123,26 @@ impl TokenSet { } }; + let mut retained_length = 0; map.retain( - |_, token_id| match removed_token_ids.binary_search(token_id) { + |token, token_id| match removed_token_ids.binary_search(token_id) { Ok(_) => false, Err(index) => { *token_id -= index as u32; + retained_length += token.len(); true } }, ); self.tokens = TokenMap::HashMap(map); + + // The retain above compacts the surviving token ids into a dense `[0, len)` + // range, so `next_id` (handed to the next new token) must follow them down. + // `total_length` likewise must drop the removed tokens' bytes; it is persisted + // and feeds memory accounting, so a stale value drifts across remap/merge cycles. 
+ self.next_id = self.tokens.len() as u32; + self.total_length = retained_length; } pub fn next_id(&self) -> u32 { @@ -1698,7 +2201,7 @@ enum PostingMetadata { /// `ensure_metadata_loaded`, and the stats path can also fetch a single /// token via `posting_len_for_token` without forcing the bulk load. V2 { - metadata: tokio::sync::OnceCell, + metadata: OnceCell, }, } @@ -1737,7 +2240,7 @@ impl std::fmt::Debug for PostingListReader { } impl DeepSizeOf for PostingListReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let metadata_size = match &self.metadata { PostingMetadata::LegacyV1 { offsets, @@ -1777,7 +2280,7 @@ impl PostingListReader { } } else { PostingMetadata::V2 { - metadata: tokio::sync::OnceCell::new(), + metadata: OnceCell::new(), } }; @@ -1878,10 +2381,10 @@ impl PostingListReader { } /// Async access to a single token's posting list length. For v2 - /// indexes this reads a single row from `LENGTH_COL` if the bulk metadata - /// has not been loaded yet, and never triggers the bulk load itself. The - /// stats path uses this so a single-term `df` lookup costs O(1) bytes - /// rather than O(num_unique_tokens). + /// indexes this reads one row of posting metadata if the bulk metadata has + /// not been loaded yet, and never triggers the bulk load itself. The stats + /// path uses this so a single-term `df` lookup costs O(1) bytes rather + /// than O(num_unique_tokens). pub(crate) async fn posting_len_for_token(&self, token_id: u32) -> Result { match &self.metadata { PostingMetadata::LegacyV1 { .. 
} => Ok(self.posting_len(token_id)), @@ -1889,13 +2392,10 @@ impl PostingListReader { if let Some(metadata) = metadata.get() { return Ok(metadata.lengths[token_id as usize] as usize); } - let token_id = token_id as usize; - let batch = self - .reader - .read_range(token_id..token_id + 1, Some(&[LENGTH_COL])) - .await?; - let len = batch[LENGTH_COL].as_primitive::().value(0); - Ok(len as usize) + let (_, length) = self.posting_metadata_for_token(token_id).await?; + length + .map(|len| len as usize) + .ok_or_else(|| Error::index("posting length metadata missing".to_string())) } } } @@ -1921,17 +2421,20 @@ impl PostingListReader { Some(loaded.lengths[token_id as usize]), )); } - let token_id_usize = token_id as usize; - let batch = self - .reader - .read_range( - token_id_usize..token_id_usize + 1, - Some(&[MAX_SCORE_COL, LENGTH_COL]), - ) + let metadata = self + .index_cache + .get_or_insert_with_key(PostingMetadataKey { token_id }, || async move { + let token_id = token_id as usize; + let batch = self + .reader + .read_range(token_id..token_id + 1, Some(&[MAX_SCORE_COL, LENGTH_COL])) + .await?; + let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); + let length = batch[LENGTH_COL].as_primitive::().value(0); + Ok(PostingMetadataValue { max_score, length }) + }) .await?; - let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); - let length = batch[LENGTH_COL].as_primitive::().value(0); - Ok((Some(max_score), Some(length))) + Ok((Some(metadata.max_score), Some(metadata.length))) } } } @@ -2168,50 +2671,84 @@ impl PostingListReader { ) } - fn build_prewarm_posting_lists( - batch: RecordBatch, - offsets: Option>, - max_scores: Option>, - lengths: Option>, - posting_tail_codec: PostingTailCodec, - positions_layout: PositionsLayout, + /// Build posting lists for one chunk's token range from `chunk_batch`, rebasing + /// global offsets to chunk-local rows. 
Returns `(global token_id, PostingList)` + /// pairs identical to the whole-file path, only bounded to one chunk. + fn build_prewarm_posting_lists_chunk( + chunk_batch: RecordBatch, + chunk: PrewarmChunk<'_>, + ctx: &PrewarmBuildCtx<'_>, ) -> Result> { - let token_count = if let Some(offsets) = offsets.as_ref() { - offsets.len() - } else if let Some(lengths) = lengths.as_ref() { - lengths.len() - } else { - batch.num_rows() - }; - - let mut posting_lists = Vec::with_capacity(token_count); - for token_id in 0..token_count { - let batch = if let Some(offsets) = offsets.as_ref() { - let start = offsets[token_id]; - let end = if token_id + 1 < offsets.len() { - offsets[token_id + 1] + let mut posting_lists = Vec::with_capacity(chunk.token_count); + for local in 0..chunk.token_count { + let global = chunk.tok_start + local; + let row_batch = if let Some(chunk_offsets) = chunk.offsets { + // Legacy v1: rebase global offsets to chunk row 0; the last token + // ends at `chunk.end_row` (no trailing sentinel in chunk_offsets). + let base = chunk_offsets[0]; + let start = chunk_offsets[local] - base; + let end = if local + 1 < chunk_offsets.len() { + chunk_offsets[local + 1] - base } else { - batch.num_rows() + chunk.end_row - base }; - batch.slice(start, end - start) + chunk_batch.slice(start, end - start) } else { - batch.slice(token_id, 1) + // V2: one posting row per token; row `local` within the chunk. 
+ chunk_batch.slice(local, 1) }; - let batch = batch.shrink_to_fit()?; + let row_batch = row_batch.shrink_to_fit()?; let posting_list = Self::posting_list_from_batch_parts( - &batch, - max_scores.as_ref().map(|scores| scores[token_id]), - lengths.as_ref().map(|lengths| lengths[token_id]), - posting_tail_codec, - positions_layout, + &row_batch, + ctx.max_scores.map(|scores| scores[global]), + ctx.lengths.map(|lengths| lengths[global]), + ctx.posting_tail_codec, + ctx.positions_layout, )?; - posting_lists.push((token_id as u32, posting_list)); + posting_lists.push((global as u32, posting_list)); } Ok(posting_lists) } + /// Read the posting rows for token ids `[tok_start, tok_end)` into one RecordBatch. + /// For v2 the token range is the row range; for v1 it's derived from the offsets. + async fn read_chunk_batch( + &self, + tok_start: usize, + tok_end: usize, + with_position: bool, + ) -> Result { + let columns = self.posting_columns(with_position); + let row_range = match &self.metadata { + PostingMetadata::LegacyV1 { offsets, .. } => { + let start = offsets[tok_start]; + let end = offsets + .get(tok_end) + .copied() + .unwrap_or_else(|| self.reader.num_rows()); + start..end + } + PostingMetadata::V2 { .. } => tok_start..tok_end, + }; + let batch = self.reader.read_range(row_range, Some(&columns)).await?; + Ok(batch) + } + async fn prewarm_posting_lists(&self, with_position: bool) -> Result<()> { + self.prewarm_posting_lists_chunked(with_position, None) + .await?; + Ok(()) + } + + /// Stream the partition's posting lists into the cache in bounded token-row chunks + /// (read -> build -> insert -> drop), so peak resident set is ~one chunk. Returns + /// the chunk count (tests assert it split). `chunk_tokens_override` is test-only. 
+ async fn prewarm_posting_lists_chunked( + &self, + with_position: bool, + chunk_tokens_override: Option, + ) -> Result { if with_position && !self.has_positions() { return Err(Error::invalid_input( "cannot prewarm positions for an inverted index that was built without positions; recreate the index with with_position=true".to_owned(), @@ -2223,34 +2760,124 @@ impl PostingListReader { // OnceCells. self.ensure_metadata_loaded().await?; - let read_batch_start = Instant::now(); - let batch = self.read_batch(with_position).await?; - let read_batch_elapsed = read_batch_start.elapsed(); + let state = self.chunk_build_state(); + // With grouping the cache stores one entry per group, so a group's posting + // lists must all be resident at once: align chunk boundaries to whole + // groups. Without grouping, chunks are plain token ranges. + let group_starts = self.group_starts.clone(); + let token_count = self.len(); + let chunk_tokens = chunk_tokens_override + .unwrap_or_else(|| prewarm_chunk_tokens(token_count, self.posting_data_size_bytes())) + .max(1); + + let mut chunk_count = 0usize; + let read_build_start = Instant::now(); + let mut tok_start = 0usize; + while tok_start < token_count { + let mut tok_end = (tok_start + chunk_tokens).min(token_count); + // `tok_start` is always a group boundary; snap `tok_end` back to one too. 
+ if let Some(starts) = group_starts.as_ref() { + tok_end = group_aligned_chunk_end(starts, token_count, tok_start, tok_end); + } + chunk_count += 1; - let (legacy_layout, offsets, max_scores, lengths) = match &self.metadata { + let posting_lists = self + .build_chunk_postings(tok_start, tok_end, with_position, &state) + .await?; + self.publish_chunk_postings( + posting_lists, + group_starts.as_deref(), + tok_start, + tok_end, + token_count, + with_position, + ) + .await; + + tok_start = tok_end; + } + let read_build_elapsed = read_build_start.elapsed(); + + info!( + legacy_layout = self.is_legacy_layout(), + with_position, + token_count, + chunk_count, + chunk_tokens, + read_build_ms = read_build_elapsed.as_secs_f64() * 1000.0, + "posting list prewarm timing" + ); + + Ok(chunk_count) + } + + /// Loop-invariant inputs shared by every chunk build: the metadata vecs + /// (`Arc`d so chunks share them without re-cloning) plus codec/layout. + fn chunk_build_state(&self) -> ChunkBuildState { + let (offsets, max_scores, lengths) = match &self.metadata { PostingMetadata::LegacyV1 { offsets, max_scores, - } => (true, Some(offsets.clone()), max_scores.clone(), None), + } => (Some(offsets.clone()), max_scores.clone(), None), PostingMetadata::V2 { metadata } => ( - false, None, metadata.get().map(|loaded| loaded.max_scores.clone()), metadata.get().map(|loaded| loaded.lengths.clone()), ), }; - let posting_tail_codec = self.posting_tail_codec; - let positions_layout = self.positions_layout; - let populate_start = Instant::now(); + ChunkBuildState { + offsets: offsets.map(Arc::new), + max_scores: max_scores.map(Arc::new), + lengths: lengths.map(Arc::new), + posting_tail_codec: self.posting_tail_codec, + positions_layout: self.positions_layout, + } + } + + /// Read one token-row chunk and build its posting lists off the runtime thread. + /// The large batch is dropped inside the blocking task once built, bounding + /// resident memory to one chunk. 
+ async fn build_chunk_postings( + &self, + tok_start: usize, + tok_end: usize, + with_position: bool, + state: &ChunkBuildState, + ) -> Result> { + let chunk_token_count = tok_end - tok_start; + let chunk_batch = self + .read_chunk_batch(tok_start, tok_end, with_position) + .await?; + + let (chunk_offsets, chunk_end_row) = match state.offsets.as_ref() { + Some(offsets) => { + let end_row = offsets + .get(tok_end) + .copied() + .unwrap_or_else(|| self.reader.num_rows()); + (Some(offsets[tok_start..tok_end].to_vec()), end_row) + } + // V2 doesn't use chunk_end_row (one row per token); pass tok_end. + None => (None, tok_end), + }; + let max_scores = state.max_scores.clone(); + let lengths = state.lengths.clone(); + let posting_tail_codec = state.posting_tail_codec; + let positions_layout = state.positions_layout; let posting_lists = spawn_blocking(move || { - Self::build_prewarm_posting_lists( - batch, - offsets, - max_scores, - lengths, + let ctx = PrewarmBuildCtx { + max_scores: max_scores.as_deref().map(|v| v.as_slice()), + lengths: lengths.as_deref().map(|v| v.as_slice()), posting_tail_codec, positions_layout, - ) + }; + let chunk = PrewarmChunk { + tok_start, + token_count: chunk_token_count, + offsets: chunk_offsets.as_deref(), + end_row: chunk_end_row, + }; + Self::build_prewarm_posting_lists_chunk(chunk_batch, chunk, &ctx) }) .await .map_err(|err| { @@ -2258,60 +2885,95 @@ impl PostingListReader { "Failed to build prewarm posting lists in blocking task: {err}" )) })??; - // Strip positions into their own per-token cache entries first - // (unchanged); the posting cache holds positions-free lists. 
- let mut postings_by_token = Vec::with_capacity(posting_lists.len()); - for (token_id, mut posting_list) in posting_lists { - if with_position && let Some(positions) = posting_list.take_positions() { - self.index_cache - .insert_with_key(&PositionKey { token_id }, Arc::new(Positions(positions))) - .await; - } - debug_assert_eq!(token_id as usize, postings_by_token.len()); - postings_by_token.push(posting_list); - } - // Populate the same cache keys the read path uses: grouped entries when - // grouping is active (issue #7040), per-token entries otherwise. - match self.group_starts.as_ref() { + // The chunk yields its token range as contiguous ascending ids from + // `tok_start`; the group publish path relies on this to index the lists. + debug_assert_eq!(posting_lists.len(), chunk_token_count); + debug_assert!( + posting_lists + .iter() + .enumerate() + .all(|(i, (token_id, _))| *token_id as usize == tok_start + i) + ); + Ok(posting_lists) + } + + /// Strip positions into their own per-token cache entries (the posting cache + /// holds positions-free lists), then populate the same cache keys the read + /// path uses: grouped entries when grouping is active, per-token entries + /// otherwise. Called once per chunk; the chunk's lists drop on return. + async fn publish_chunk_postings( + &self, + posting_lists: Vec<(u32, PostingList)>, + group_starts: Option<&[u32]>, + tok_start: usize, + tok_end: usize, + token_count: usize, + with_position: bool, + ) { + match group_starts { Some(starts) => { - // The read path derives the last group's `end` from `self.len()`; - // match it here so both produce identical `PostingListGroupKey`s. 
- debug_assert_eq!(postings_by_token.len(), self.len()); + let mut chunk_postings = Vec::with_capacity(posting_lists.len()); + for (token_id, mut posting_list) in posting_lists { + self.cache_positions(&mut posting_list, token_id, with_position) + .await; + chunk_postings.push(posting_list); + } + // Chunk is group-aligned, so every group starting in it also ends + // in it; `chunk_postings[i]` is token `tok_start + i`. The last + // group's `end` derives from `token_count`, matching the read path + // so both produce identical `PostingListGroupKey`s. for (k, &start) in starts.iter().enumerate() { - let end = starts.get(k + 1).copied().unwrap_or(self.len() as u32); - let group = PostingListGroup::new( - postings_by_token[start as usize..end as usize].to_vec(), - ); + let start_usize = start as usize; + if start_usize < tok_start || start_usize >= tok_end { + continue; + } + let end = starts.get(k + 1).copied().unwrap_or(token_count as u32); + let lo = start_usize - tok_start; + let hi = end as usize - tok_start; + let group = PostingListGroup::new(chunk_postings[lo..hi].to_vec()); self.index_cache .insert_with_key(&PostingListGroupKey { start, end }, Arc::new(group)) .await; } } None => { - for (token_id, posting_list) in postings_by_token.into_iter().enumerate() { + for (token_id, mut posting_list) in posting_lists { + self.cache_positions(&mut posting_list, token_id, with_position) + .await; self.index_cache - .insert_with_key( - &PostingListKey { - token_id: token_id as u32, - }, - Arc::new(posting_list), - ) + .insert_with_key(&PostingListKey { token_id }, Arc::new(posting_list)) .await; } } } - let populate_elapsed = populate_start.elapsed(); + } - info!( - legacy_layout, - with_position, - token_count = self.len(), - read_batch_ms = read_batch_elapsed.as_secs_f64() * 1000.0, - post_read_loop_ms = populate_elapsed.as_secs_f64() * 1000.0, - "posting list prewarm timing" - ); + /// Move a posting list's positions (when present and requested) into the + /// 
dedicated per-token position cache, leaving the posting list positions-free. + async fn cache_positions( + &self, + posting_list: &mut PostingList, + token_id: u32, + with_position: bool, + ) { + if with_position && let Some(positions) = posting_list.take_positions() { + self.index_cache + .insert_with_key(&PositionKey { token_id }, Arc::new(Positions(positions))) + .await; + } + } - Ok(()) + /// Cheap `invert.lance` size estimate (file length from object metadata, no + /// data read), used only to size prewarm chunks. Falls back to a row-count + /// proxy when the reader can't surface the length (legacy v1). + pub(crate) fn posting_data_size_bytes(&self) -> u64 { + if let Some(size) = self.reader.file_size_bytes() { + return size; + } + // Fallback proxy for readers that don't cache their file length: just needs + // to be monotonic in partition size. + const ESTIMATED_BYTES_PER_ROW: u64 = 16; + (self.reader.num_rows() as u64).saturating_mul(ESTIMATED_BYTES_PER_ROW) } pub(crate) async fn read_batch(&self, with_position: bool) -> Result { @@ -2453,6 +3115,38 @@ impl PostingListReader { } } +/// Loop-invariant state for [`InvertedPartition::build_chunk_postings`]. The +/// metadata vecs are `Arc`d so each chunk's blocking build shares them cheaply. +struct ChunkBuildState { + offsets: Option>>, + max_scores: Option>>, + lengths: Option>>, + posting_tail_codec: PostingTailCodec, + positions_layout: PositionsLayout, +} + +/// Chunk-invariant inputs to [`InvertedPartition::build_prewarm_posting_lists_chunk`]: +/// the per-partition codec/layout and the (shared, whole-partition) metadata +/// slices indexed by global token id. These don't change across chunks. 
+struct PrewarmBuildCtx<'a> { + max_scores: Option<&'a [f32]>, + lengths: Option<&'a [u32]>, + posting_tail_codec: PostingTailCodec, + positions_layout: PositionsLayout, +} + +/// Per-chunk inputs to [`InvertedPartition::build_prewarm_posting_lists_chunk`]: +/// the token sub-range `[tok_start, tok_start + token_count)` and, for legacy +/// v1, the rebased offset slice plus the chunk's end row. +struct PrewarmChunk<'a> { + tok_start: usize, + token_count: usize, + /// Legacy v1 only: `offsets[tok_start..tok_start+token_count]` (no sentinel). + offsets: Option<&'a [usize]>, + /// Legacy v1 only: global row at which this chunk's posting rows end. + end_row: usize, +} + /// New type just to allow Positions implement DeepSizeOf so it can be put /// in the cache. #[derive(Clone)] @@ -2502,7 +3196,7 @@ fn sliced_cache_bytes(array: &dyn Array) -> usize { } impl DeepSizeOf for Positions { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.deep_size_of_children(context) } } @@ -2555,6 +3249,29 @@ impl CacheKey for PostingListGroupKey { } } +#[derive(Debug, Clone, DeepSizeOf)] +struct PostingMetadataValue { + max_score: f32, + length: u32, +} + +#[derive(Debug, Clone)] +struct PostingMetadataKey { + token_id: u32, +} + +impl CacheKey for PostingMetadataKey { + type ValueType = PostingMetadataValue; + + fn key(&self) -> std::borrow::Cow<'_, str> { + format!("posting-metadata-{}", self.token_id).into() + } + + fn type_name() -> &'static str { + "PostingMetadata" + } +} + #[derive(Debug, Clone)] pub struct PositionKey { pub token_id: u32, @@ -2583,7 +3300,7 @@ pub enum CompressedPositionStorage { } impl DeepSizeOf for CompressedPositionStorage { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { match self { Self::LegacyPerDoc(positions) 
=> sliced_cache_bytes(positions), Self::SharedStream(stream) => stream.size(), @@ -2594,10 +3311,9 @@ impl DeepSizeOf for CompressedPositionStorage { #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct SharedPositionStream { codec: PositionStreamCodec, - block_offsets: Vec, - // Stored as `Bytes` so that the cache deserialization path can hand - // ownership of an IPC-decoded slice in without copying. Cloning the - // stream is then an `Arc` bump rather than an O(N) buffer copy. + block_offsets: Arc<[u32]>, + // Stored with shared ownership so cache hits can clone position streams + // without copying either offsets or bytes. bytes: bytes::Bytes, } @@ -2605,7 +3321,7 @@ impl SharedPositionStream { pub fn new(codec: PositionStreamCodec, block_offsets: Vec, bytes: bytes::Bytes) -> Self { Self { codec, - block_offsets, + block_offsets: Arc::from(block_offsets.into_boxed_slice()), bytes, } } @@ -2638,11 +3354,11 @@ impl SharedPositionStream { } pub fn block_offsets(&self) -> &[u32] { - &self.block_offsets + self.block_offsets.as_ref() } pub fn size(&self) -> usize { - self.block_offsets.capacity() * std::mem::size_of::() + self.bytes.len() + self.block_offsets.len() * std::mem::size_of::() + self.bytes.len() } } @@ -2860,7 +3576,7 @@ pub struct PlainPostingList { } impl DeepSizeOf for PlainPostingList { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.row_ids.len() * std::mem::size_of::() + self.frequencies.len() * std::mem::size_of::() + self @@ -2963,7 +3679,7 @@ pub struct CompressedPostingList { } impl DeepSizeOf for CompressedPostingList { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { sliced_cache_bytes(&self.blocks) + self .positions @@ -4233,13 +4949,27 @@ pub struct DocSet { impl DocSet { #[inline] pub fn 
len(&self) -> usize { - self.row_ids.len() + // Use num_tokens instead of row_ids so the deferred-row_ids + // scoring path (which constructs a DocSet via + // [`Self::from_num_tokens_only`]) still reports the right doc + // count. + self.num_tokens.len() } pub fn is_empty(&self) -> bool { self.len() == 0 } + /// True iff the per-doc `row_id` array is populated. The + /// deferred-row_id scoring path constructs DocSets with the array + /// left empty so wand can skip the load; callers that need to do + /// row_id lookups in the inner loop must check this and fall back + /// to async resolution otherwise. + #[inline] + pub fn has_row_ids(&self) -> bool { + !self.row_ids.is_empty() + } + pub fn iter(&self) -> impl Iterator { self.row_ids.iter().zip(self.num_tokens.iter()) } @@ -4248,18 +4978,25 @@ impl DocSet { self.row_ids[doc_id as usize] } - pub fn doc_id(&self, row_id: u64) -> Option { + /// Resolve a `row_id` to every `doc_id` it owns. + /// + /// A scalar column maps each row to a single document, but a + /// `list` column indexes every element as its own document, so a + /// single `row_id` can own several `doc_id`s sharing that key in `inv`. + /// The prefilter path (`flat_search`) walks an allow-list of row_ids and + /// must evaluate *all* of a row's documents; resolving to one `doc_id` + /// silently drops matches at non-last list positions (lancedb#3352). 
+ pub fn doc_ids(&self, row_id: u64) -> impl Iterator + '_ { if self.inv.is_empty() { - // in legacy format, the row id is doc id - match self.row_ids.binary_search(&row_id) { - Ok(_) => Some(row_id), - Err(_) => None, - } + // in legacy format, the row id is doc id (one document per row) + let found = self.row_ids.binary_search(&row_id).is_ok(); + Either::Left(found.then_some(row_id).into_iter()) } else { - match self.inv.binary_search_by_key(&row_id, |x| x.0) { - Ok(idx) => Some(self.inv[idx].1 as u64), - Err(_) => None, - } + // `inv` is sorted by row_id, so the entries sharing this key form a + // contiguous run; yield the doc_id of each. + let lo = self.inv.partition_point(|entry| entry.0 < row_id); + let hi = self.inv.partition_point(|entry| entry.0 <= row_id); + Either::Right(self.inv[lo..hi].iter().map(|entry| entry.1 as u64)) } } pub fn total_tokens_num(&self) -> u64 { @@ -4329,7 +5066,35 @@ impl DocSet { let batch = reader.read_range(0..reader.num_rows(), None).await?; let row_id_col = batch[ROW_ID].as_primitive::(); let num_tokens_col = batch[NUM_TOKEN_COL].as_primitive::(); + Self::from_columns(row_id_col, num_tokens_col, is_legacy, frag_reuse_index) + } + + /// Build a `DocSet` carrying only the per-doc `num_tokens` array; + /// `row_ids` and `inv` are left empty. Used by the deferred-row_id + /// scoring path: wand checks `has_row_ids()` to skip `row_id` / + /// `num_tokens_by_row_id` calls, and the per-partition caller + /// resolves doc_id → row_id for the surviving top-K post-wand. + pub fn from_num_tokens_only(num_tokens_col: &arrow_array::UInt32Array) -> Self { + let num_tokens = num_tokens_col.values().to_vec(); + let total_tokens = num_tokens.iter().map(|&n| n as u64).sum(); + Self { + row_ids: Vec::new(), + num_tokens, + inv: Vec::new(), + total_tokens, + } + } + /// Build a `DocSet` from already-loaded `row_id` and `num_tokens` + /// arrow columns. Lets callers that have one column already in hand + /// (e.g. 
`LazyDocSet` after `total_tokens_num` pre-fetched + /// `num_tokens`) skip re-reading that column. + pub fn from_columns( + row_id_col: &UInt64Array, + num_tokens_col: &arrow_array::UInt32Array, + is_legacy: bool, + frag_reuse_index: Option>, + ) -> Result { // for legacy format, the row id is doc id; sorting keeps binary search viable if is_legacy { let (row_ids, num_tokens): (Vec<_>, Vec<_>) = row_id_col @@ -4355,23 +5120,36 @@ impl DocSet { }); } - // if frag reuse happened, we'll need to remap the row_ids. And after row_ids been - // remapped, we'll need resort to make sure binary_search works. + // If frag reuse happened, remap the row_ids through it. Crucially we + // must NOT drop the rows the reuse index deleted, because the posting + // lists reference doc_ids *positionally* (a doc_id is an index into + // these arrays, fixed at build time). Dropping deleted rows would + // renumber every later doc_id and desync the posting lists, so wand + // would index `num_tokens`/`row_ids` out of bounds or score the wrong + // doc. Instead we tombstone deleted rows in place: their slot survives + // (so doc_ids stay aligned with the posting lists) carrying + // `RowAddress::TOMBSTONE_ROW`, which wand skips, and they are left out + // of `inv` so a row_id lookup never resolves to a deleted doc. The + // heavyweight physical remap (`DocSet::remap`) is what actually + // renumbers and compacts; this load-time path only has to stay + // consistent until then. 
if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() { let mut row_ids = Vec::with_capacity(row_id_col.len()); - let mut num_tokens = Vec::with_capacity(num_tokens_col.len()); - for (row_id, num_token) in row_id_col.values().iter().zip(num_tokens_col.values()) { - if let Some(new_row_id) = frag_reuse_index_ref.remap_row_id(*row_id) { - row_ids.push(new_row_id); - num_tokens.push(*num_token); + let num_tokens = num_tokens_col.values().to_vec(); + let mut inv = Vec::with_capacity(row_id_col.len()); + for (doc_id, row_id) in row_id_col.values().iter().enumerate() { + match frag_reuse_index_ref.remap_row_id(*row_id) { + Some(new_row_id) => { + row_ids.push(new_row_id); + inv.push((new_row_id, doc_id as u32)); + } + None => { + // Deleted: keep the slot (doc_ids must not shift) but + // tombstone it and leave it out of `inv`. + row_ids.push(RowAddress::TOMBSTONE_ROW); + } } } - - let mut inv: Vec<(u64, u32)> = row_ids - .iter() - .enumerate() - .map(|(doc_id, row_id)| (*row_id, doc_id as u32)) - .collect(); inv.sort_unstable_by_key(|entry| entry.0); let total_tokens = num_tokens.iter().map(|&x| x as u64).sum(); @@ -4795,8 +5573,22 @@ pub async fn flat_bm25_search_stream_with_metrics( elapsed_compute: Option

/// Internal API @@ -58,11 +61,95 @@ pub trait DistCalculator { dists: &mut Vec, _u16_scratch: &mut Vec, _u8_scratch: &mut Vec, + _u32_scratch: &mut Vec, ) { *dists = self.distance_all(k_hint); } fn prefetch(&self, _id: u32) {} + + #[allow(clippy::too_many_arguments)] + fn accumulate_topk_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_id: impl Fn(u32) -> u64, + res: &mut BinaryHeap>, + dists: &mut Vec, + u16_scratch: &mut Vec, + u8_scratch: &mut Vec, + u32_scratch: &mut Vec, + ) { + if k == 0 { + return; + } + + self.distance_all_with_scratch(k, dists, u16_scratch, u8_scratch, u32_scratch); + let lower_bound = lower_bound.unwrap_or(f32::MIN).into(); + let upper_bound = upper_bound.unwrap_or(f32::MAX).into(); + let mut max_dist = res.peek().map(|node| node.dist); + + for (id, dist) in dists.iter().copied().enumerate() { + let dist = OrderedFloat(dist); + if dist < lower_bound || dist >= upper_bound { + continue; + } + if res.len() < k { + res.push(OrderedNode::new(row_id(id as u32), dist)); + if res.len() == k { + max_dist = res.peek().map(|node| node.dist); + } + } else if max_dist.is_some_and(|max_dist| max_dist > dist) { + res.pop(); + res.push(OrderedNode::new(row_id(id as u32), dist)); + max_dist = res.peek().map(|node| node.dist); + } + } + } + + #[allow(clippy::too_many_arguments)] + fn accumulate_filtered_topk_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_ids: impl Iterator, + accept_row: impl Fn(u64) -> bool, + res: &mut BinaryHeap>, + _dists: &mut Vec, + _u16_scratch: &mut Vec, + _u8_scratch: &mut Vec, + _u32_scratch: &mut Vec, + ) { + if k == 0 { + return; + } + + let lower_bound = lower_bound.unwrap_or(f32::MIN).into(); + let upper_bound = upper_bound.unwrap_or(f32::MAX).into(); + let mut max_dist = res.peek().map(|node| node.dist); + + for (id, row_id) in row_ids { + if !accept_row(row_id) { + continue; + } + let dist = OrderedFloat(self.distance(id)); + if dist < 
lower_bound || dist >= upper_bound { + continue; + } + if res.len() < k { + res.push(OrderedNode::new(row_id, dist)); + if res.len() == k { + max_dist = res.peek().map(|node| node.dist); + } + } else if max_dist.is_some_and(|max_dist| max_dist > dist) { + res.pop(); + res.push(OrderedNode::new(row_id, dist)); + max_dist = res.peek().map(|node| node.dist); + } + } + } } pub const STORAGE_METADATA_KEY: &str = "storage_metadata"; @@ -73,6 +160,7 @@ pub struct QueryScratch { pub query_f32: Vec, pub u16: Vec, pub u8: Vec, + pub u32: Vec, } impl QueryScratch { @@ -82,6 +170,7 @@ impl QueryScratch { query_f32: Vec::new(), u16: Vec::new(), u8: Vec::new(), + u32: Vec::new(), } } @@ -91,6 +180,7 @@ impl QueryScratch { query_f32: vec![0.0; capacity.query_f32], u16: vec![0; capacity.u16], u8: vec![0; capacity.u8], + u32: vec![0; capacity.u32], } } } @@ -102,11 +192,12 @@ impl Default for QueryScratch { } impl DeepSizeOf for QueryScratch { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.distances.capacity() * size_of::() + self.query_f32.capacity() * size_of::() + self.u16.capacity() * size_of::() + self.u8.capacity() * size_of::() + + self.u32.capacity() * size_of::() } } @@ -116,15 +207,27 @@ pub struct QueryScratchCapacity { pub query_f32: usize, pub u16: usize, pub u8: usize, + pub u32: usize, } impl QueryScratchCapacity { pub const fn new(distances: usize, query_f32: usize, u16: usize, u8: usize) -> Self { + Self::new_with_u32(distances, query_f32, u16, u8, 0) + } + + pub const fn new_with_u32( + distances: usize, + query_f32: usize, + u16: usize, + u8: usize, + u32: usize, + ) -> Self { Self { distances, query_f32, u16, u8, + u32, } } @@ -133,12 +236,35 @@ impl QueryScratchCapacity { + self.query_f32 * size_of::() + self.u16 * size_of::() + self.u8 * size_of::() + + self.u32 * size_of::() } } +#[derive(Clone, Copy, Debug, Default)] +pub struct 
DistanceCalculatorOptions { + pub approx_mode: ApproxMode, +} + +#[derive(Debug)] +pub struct RabitRawQueryContext { + pub code_dim: usize, + pub ex_bits: u8, + pub rotated_query: Vec, + pub dist_table: Vec, + /// The rotated query zero-padded to a 64-dim multiple for the ex-dot + /// kernels; empty when `code_dim` is already aligned (the kernels then + /// read `rotated_query` directly). + pub ex_query: Vec, + pub sum_q: f32, +} + #[derive(Clone, Copy)] pub enum QueryResidual<'a> { Centroid(&'a dyn arrow_array::Array), + RabitRawQuery { + rotated_centroid: Option<&'a [f32]>, + query: Option<&'a RabitRawQueryContext>, + }, } #[derive(Debug)] @@ -224,7 +350,7 @@ impl Drop for QueryScratchGuard<'_> { } impl DeepSizeOf for QueryScratchPool { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut total = self.scratches.capacity() * size_of::(); let mut scratches = Vec::new(); while let Some(scratch) = self.scratches.pop() { @@ -295,8 +421,9 @@ pub trait VectorStore: Send + Sync + Sized + Clone { &'a self, query: ArrayRef, dist_q_c: f32, - _residual: Option>, + _residual: Option>, _f32_scratch: &'a mut Vec, + _options: DistanceCalculatorOptions, ) -> Self::DistanceCalculator<'a> { self.dist_calculator(query, dist_q_c) } @@ -381,7 +508,7 @@ pub struct IvfQuantizationStorage { } impl DeepSizeOf for IvfQuantizationStorage { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.metadata.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) } } @@ -498,15 +625,29 @@ impl IvfQuantizationStorage { self.ivf.num_partitions() } - pub async fn load_partition(&self, part_id: usize) -> Result { + /// Load a partition's quantization storage, optionally measuring the exact + /// I/O it performs into `io_stats`. 
+ /// + /// When `io_stats` is `Some`, the partition is read through a reader whose + /// scheduler also records into the sink (a cheap clone that shares all + /// cached metadata, so no file is re-opened). When `None`, the normal + /// uninstrumented reader is used. + pub async fn load_partition( + &self, + part_id: usize, + io_stats: Option, + ) -> Result { let range = self.ivf.row_range(part_id); let batch = if range.is_empty() { let schema = self.reader.schema(); let arrow_schema = arrow_schema::Schema::from(schema.as_ref()); RecordBatch::new_empty(Arc::new(arrow_schema)) } else { - let batches = self - .reader + let reader = match &io_stats { + Some(io_stats) => Cow::Owned(self.reader.with_io_stats(io_stats.recorder())), + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(range), u32::MAX, @@ -531,7 +672,7 @@ impl IvfQuantizationStorage { #[cfg(test)] mod tests { use super::{QueryScratchCapacity, QueryScratchPool}; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; #[test] fn test_query_scratch_pool_reuses_buffers() { @@ -545,11 +686,14 @@ mod tests { scratch.u16.resize(4, 3); scratch.u8.clear(); scratch.u8.resize(2, 4); + scratch.u32.clear(); + scratch.u32.resize(3, 5); ( scratch.query_f32.as_ptr(), scratch.distances.as_ptr(), scratch.u16.as_ptr(), scratch.u8.as_ptr(), + scratch.u32.as_ptr(), ) }); @@ -562,11 +706,14 @@ mod tests { assert!(scratch.u16.iter().all(|value| *value == 3)); assert_eq!(scratch.u8.len(), 2); assert!(scratch.u8.iter().all(|value| *value == 4)); + assert_eq!(scratch.u32.len(), 3); + assert!(scratch.u32.iter().all(|value| *value == 5)); ( scratch.query_f32.as_ptr(), scratch.distances.as_ptr(), scratch.u16.as_ptr(), scratch.u8.as_ptr(), + scratch.u32.as_ptr(), ) }); @@ -592,7 +739,8 @@ mod tests { #[test] fn test_query_scratch_pool_uses_temporary_scratch_when_empty() { - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let pool = 
+ QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); let pooled = pool.scratch(); assert!(pooled.pooled); @@ -602,12 +750,14 @@ mod tests { assert_eq!(temporary.query_f32.len(), 16); assert_eq!(temporary.u16.len(), 4); assert_eq!(temporary.u8.len(), 2); + assert_eq!(temporary.u32.len(), 3); } #[test] fn test_query_scratch_pool_deep_size_includes_buffer_capacity() { let empty_size = QueryScratchPool::new(1).deep_size_of(); - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let pool = + QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); assert!(pool.deep_size_of() > empty_size); @@ -619,7 +769,8 @@ mod tests { #[test] fn test_query_scratch_pool_initializes_buffer_capacity() { - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let pool = + QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); pool.with_scratch(|scratch| { assert_eq!(scratch.distances.len(), 8); @@ -630,6 +781,8 @@ mod tests { assert_eq!(scratch.u16.capacity(), 4); assert_eq!(scratch.u8.len(), 2); assert_eq!(scratch.u8.capacity(), 2); + assert_eq!(scratch.u32.len(), 3); + assert_eq!(scratch.u32.capacity(), 3); }); } } diff --git a/rust/lance-index/src/vector/utils.rs b/rust/lance-index/src/vector/utils.rs index 1e56370613e..fb4f9004c57 100644 --- a/rust/lance-index/src/vector/utils.rs +++ b/rust/lance-index/src/vector/utils.rs @@ -302,15 +302,16 @@ mod tests { #[rstest] #[case::f16(Arc::new(Float16Array::from( (0..100).flat_map(|i| std::iter::repeat_n(f16::from_f32(i as f32), 16)).collect::>(), - )) as ArrayRef)] + )) as ArrayRef, 42.0f32)] #[case::f32(Arc::new(Float32Array::from( (0..100).flat_map(|i| std::iter::repeat_n(i as f32, 16)).collect::>(), - )) as ArrayRef)] - fn test_simple_index_nearest_centroid(#[case] centroids: ArrayRef) { + )) as ArrayRef, 42.0f32)] + fn test_simple_index_nearest_centroid(#[case] 
centroids: ArrayRef, #[case] query_val: f32) { let index = build_index(centroids, 16); - let query: ArrayRef = Arc::new(Float32Array::from(vec![42.1f32; 16])); - let (id, _) = index.search(query).unwrap(); + let query: ArrayRef = Arc::new(Float32Array::from(vec![query_val; 16])); + let (id, dist) = index.search(query).unwrap(); assert_eq!(id, 42); + assert_eq!(dist, 0.0); } #[test] diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 0b76517e1c2..4203d099d0b 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -5,8 +5,7 @@ //! the corresponding IVF partitions. use std::ops::Range; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use arrow::compute::concat_batches; use arrow::datatypes::UInt64Type; @@ -14,8 +13,7 @@ use arrow::{array::AsArray, compute::sort_to_indices}; use arrow_array::{RecordBatch, UInt32Array, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use futures::{future::try_join_all, prelude::*}; -use lance_arrow::stream::rechunk_stream_by_size; -use lance_arrow::{RecordBatchExt, SchemaExt}; +use lance_arrow::{RecordBatchExt, SchemaExt, interleave_batches}; use lance_core::{ Error, Result, cache::LanceCache, @@ -341,6 +339,11 @@ pub fn create_ivf_shuffler( const DEFAULT_SHUFFLE_BATCH_BYTES: usize = 128 * 1024 * 1024; +/// Number of rows per output batch when streaming sorted data via interleave. +/// Small enough to keep the output chunk's memory footprint modest relative to +/// the accumulated source data. +const SHUFFLE_WRITE_CHUNK_ROWS: usize = 8 * 1024; + /// Limit of how much transformed data we accumulate before spilling to disk. /// /// A larger value will use more RAM but require less random access during the @@ -407,6 +410,51 @@ impl TwoFileShuffler { } } +/// `(batch_idx, row_idx)` pairs produced by [`sort_to_interleave_indices`], paired with +/// per-partition row counts. 
+type InterleaveResult = (Vec<(usize, usize)>, Vec); + +/// Sorts rows from multiple batches by partition ID and returns interleave indices. +/// +/// Builds a sort key of `(part_id, batch_idx, row_idx)` for every row across all +/// batches, sorts by `part_id`, then emits `(batch_idx, row_idx)` pairs in that +/// order. This avoids concatenating the full data: only the `UInt32` partition-ID +/// columns are touched here. +/// +/// Also returns per-partition row counts (derived from the same sorted keys at no +/// extra cost). +/// +/// Returns an error if any partition ID is out of range `[0, num_partitions)`. +fn sort_to_interleave_indices( + part_id_columns: &[&UInt32Array], + num_partitions: usize, +) -> Result { + let total_rows: usize = part_id_columns.iter().map(|a| a.len()).sum(); + let mut keys: Vec<(u32, u32, u32)> = Vec::with_capacity(total_rows); + for (batch_idx, col) in part_id_columns.iter().enumerate() { + let batch_idx = batch_idx as u32; + for (row_idx, &part_id) in col.values().iter().enumerate() { + keys.push((part_id, batch_idx, row_idx as u32)); + } + } + keys.sort_unstable_by_key(|k| k.0); + + let mut partition_counts = vec![0u64; num_partitions]; + let mut interleave_indices = Vec::with_capacity(total_rows); + for (part_id, batch_idx, row_idx) in &keys { + let pid = *part_id as usize; + if pid >= num_partitions { + return Err(Error::invalid_input(format!( + "partition ID {} is out of range [0, {})", + pid, num_partitions + ))); + } + partition_counts[pid] += 1; + interleave_indices.push((*batch_idx as usize, *row_idx as usize)); + } + Ok((interleave_indices, partition_counts)) +} + #[async_trait::async_trait] impl Shuffler for TwoFileShuffler { async fn shuffle( @@ -414,8 +462,7 @@ impl Shuffler for TwoFileShuffler { data: Box, ) -> Result> { let num_partitions = self.num_partitions; - let full_schema = Arc::new(data.schema().as_ref().clone()); - // No need to write partition ids since we can infer this + // No need to write partition ids 
since we can infer this from offsets let schema = data.schema().without_column(PART_ID_COLUMN); let offsets_schema = Arc::new(Schema::new(vec![Field::new( "offset", @@ -424,28 +471,6 @@ impl Shuffler for TwoFileShuffler { )])); let batch_size_bytes = self.batch_size_bytes; - // Extract loss from batch metadata before rechunking (concat_batches drops metadata) - let total_loss = Arc::new(Mutex::new(0.0f64)); - let loss_ref = total_loss.clone(); - let loss_stream = data.map(move |result| { - result.inspect(|batch| { - let loss = batch - .metadata() - .get(LOSS_METADATA_KEY) - .and_then(|s| s.parse::().ok()) - .unwrap_or(0.0); - *loss_ref.lock().unwrap() += loss; - }) - }); - - // Rechunk to target batch size - let rechunked = rechunk_stream_by_size( - loss_stream, - full_schema, - batch_size_bytes, - batch_size_bytes * 2, - ); - // Create data file writer let data_path = self.output_dir.clone().join("shuffle_data.lance"); let spill_path = self.output_dir.clone().join("shuffle_data.spill"); @@ -468,72 +493,63 @@ impl Shuffler for TwoFileShuffler { )? 
.with_page_metadata_spill(self.object_store.clone(), spill_path); - let num_batches = Arc::new(AtomicU64::new(0)); - let num_batches_ref = num_batches.clone(); + let mut num_batches: u64 = 0; let mut partition_counts: Vec = vec![0; num_partitions]; let mut global_row_count: u64 = 0; let mut rows_processed: u64 = 0; + let mut total_loss = 0.0f64; + let mut accumulated: Vec = Vec::new(); + let mut acc_bytes: usize = 0; - let mut rechunked = std::pin::pin!(rechunked); - while let Some(batch) = rechunked.next().await { - num_batches_ref.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let mut data = std::pin::pin!(data); + while let Some(batch) = data.next().await { let batch = batch?; - let np = num_partitions; - let num_rows = batch.num_rows() as u64; - - // Sort by partition ID and compute offsets on CPU - let (sorted_batch, batch_offsets) = spawn_cpu(move || { - let part_ids: &UInt32Array = batch[PART_ID_COLUMN].as_primitive(); - let indices = sort_to_indices(part_ids, None, None)?; - let batch = batch.take(&indices)?; - - let part_ids: &UInt32Array = batch[PART_ID_COLUMN].as_primitive(); - let batch = batch.drop_column(PART_ID_COLUMN)?; - - // Count rows per partition by scanning sorted part IDs - let mut partition_counts = vec![0u64; np]; - for i in 0..part_ids.len() { - let pid = part_ids.value(i) as usize; - if pid < np { - partition_counts[pid] += 1; - } else { - log::warn!("Partition ID {} is out of range [0, {})", pid, np); - } - } - - // Build cumulative offsets (end positions) for this batch - let mut batch_offsets = Vec::with_capacity(np); - let mut running = 0u64; - for count in &partition_counts { - running += count; - batch_offsets.push(running); + total_loss += batch + .metadata() + .get(LOSS_METADATA_KEY) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0.0); + acc_bytes += batch.get_array_memory_size(); + accumulated.push(batch); + + if acc_bytes >= batch_size_bytes { + let (total_rows, counts) = flush_shuffle_batch( + std::mem::take(&mut 
accumulated), + &mut file_writer, + &mut offsets_writer, + offsets_schema.clone(), + num_partitions, + global_row_count, + ) + .await?; + acc_bytes = 0; + for (p, c) in counts.iter().enumerate() { + partition_counts[p] += c; } + global_row_count += total_rows; + rows_processed += total_rows; + num_batches += 1; + self.progress + .stage_progress("shuffle", rows_processed) + .await?; + } + } - Ok::<(RecordBatch, Vec), Error>((batch, batch_offsets)) - }) + if !accumulated.is_empty() { + let (total_rows, counts) = flush_shuffle_batch( + accumulated, + &mut file_writer, + &mut offsets_writer, + offsets_schema, + num_partitions, + global_row_count, + ) .await?; - - // Write sorted batch to data file - file_writer.write_batch(&sorted_batch).await?; - - // Record offsets adjusted by global row count - let mut adjusted_offsets = Vec::with_capacity(batch_offsets.len()); - let mut last_offset = 0; - for (idx, offset) in batch_offsets.iter().enumerate() { - adjusted_offsets.push(global_row_count + offset); - partition_counts[idx] += offset - last_offset; - last_offset = *offset; + for (p, c) in counts.iter().enumerate() { + partition_counts[p] += c; } - global_row_count += sorted_batch.num_rows() as u64; - - // Write offsets to offsets file - let offsets_batch = RecordBatch::try_new( - offsets_schema.clone(), - vec![Arc::new(UInt64Array::from(adjusted_offsets))], - )?; - offsets_writer.write_batch(&offsets_batch).await?; - - rows_processed += num_rows; + rows_processed += total_rows; + num_batches += 1; self.progress .stage_progress("shuffle", rows_processed) .await?; @@ -543,22 +559,76 @@ impl Shuffler for TwoFileShuffler { file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); - - let total_loss_val = *total_loss.lock().unwrap(); - TwoFileShuffleReader::try_new( self.object_store.clone(), self.output_dir.clone(), num_partitions, num_batches, partition_counts, - total_loss_val, + total_loss, ) 
.await } } +/// Sorts `accumulated` batches by partition ID and writes the result to the data +/// and offsets files. +/// +/// Returns `(total_rows_written, per_partition_row_counts)`. +async fn flush_shuffle_batch( + accumulated: Vec, + file_writer: &mut FileWriter, + offsets_writer: &mut FileWriter, + offsets_schema: Arc, + num_partitions: usize, + global_row_count: u64, +) -> Result<(u64, Vec)> { + let total_rows: u64 = accumulated.iter().map(|b| b.num_rows() as u64).sum(); + + // Clone part-id columns into the CPU task (cheap: Arc ref bump, not data copy). + let part_id_cols: Vec = accumulated + .iter() + .map(|b| { + let col: &UInt32Array = b[PART_ID_COLUMN].as_primitive(); + col.clone() + }) + .collect(); + + let np = num_partitions; + let (interleave_indices, batch_partition_counts) = + spawn_cpu(move || sort_to_interleave_indices(&part_id_cols.iter().collect::>(), np)) + .await?; + + // Drop part-id column from source batches before interleaving. + let source_batches: Vec = accumulated + .into_iter() + .map(|b| b.drop_column(PART_ID_COLUMN).map_err(Error::from)) + .collect::>()?; + + // Stream sorted output to the data file in fixed-size chunks so the peak + // memory for the interleave output stays small relative to the source data. + for chunk in interleave_indices.chunks(SHUFFLE_WRITE_CHUNK_ROWS) { + let out = interleave_batches(&source_batches, chunk)?; + file_writer.write_batch(&out).await?; + } + + // Compute cumulative end-row offsets (adjusted by global position) and write + // one offsets batch for this flush group. 
+ let mut adjusted_offsets = Vec::with_capacity(num_partitions); + let mut running = 0u64; + for count in &batch_partition_counts { + running += count; + adjusted_offsets.push(global_row_count + running); + } + let offsets_batch = RecordBatch::try_new( + offsets_schema, + vec![Arc::new(UInt64Array::from(adjusted_offsets))], + )?; + offsets_writer.write_batch(&offsets_batch).await?; + + Ok((total_rows, batch_partition_counts)) +} + pub struct TwoFileShuffleReader { _scheduler: Arc, file_reader: FileReader, @@ -934,4 +1004,65 @@ mod tests { assert!((reader.total_loss().unwrap() - 6.0).abs() < 1e-10); } + + #[tokio::test] + async fn test_two_file_shuffler_multi_batch_single_flush() { + // All three batches fit within the default batch_size_bytes, so they + // accumulate and are interleaved in a single flush group. This exercises + // the cross-batch interleave path. + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + let num_partitions = 3; + + let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], None); + let batch2 = make_batch(&[2, 0, 1], &[40, 50, 60], None); + let batch3 = make_batch(&[1, 2, 0], &[70, 80, 90], None); + + // Large batch_size_bytes so all three batches flush together. 
+ let shuffler = + TwoFileShuffler::new(output_dir, num_partitions).with_batch_size_bytes(1024 * 1024); + let stream = batches_to_stream(vec![batch1, batch2, batch3]); + let reader = shuffler.shuffle(stream).await.unwrap(); + + assert_eq!(reader.partition_size(0).unwrap(), 3); + assert_eq!(reader.partition_size(1).unwrap(), 3); + assert_eq!(reader.partition_size(2).unwrap(), 3); + + let p0 = collect_partition(reader.as_ref(), 0).await.unwrap(); + let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![10, 50, 90]); + + let p1 = collect_partition(reader.as_ref(), 1).await.unwrap(); + let vals: &Int32Array = p1.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![20, 60, 70]); + + let p2 = collect_partition(reader.as_ref(), 2).await.unwrap(); + let vals: &Int32Array = p2.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![30, 40, 80]); + } + + #[tokio::test] + async fn test_two_file_shuffler_out_of_range_partition_id() { + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + + // Row with partition ID 5 is out of range for num_partitions=3. 
+ let batch = make_batch(&[0, 5, 1], &[10, 20, 30], None); + + let shuffler = TwoFileShuffler::new(output_dir, 3); + let stream = batches_to_stream(vec![batch]); + let Err(err) = shuffler.shuffle(stream).await else { + panic!("expected an error for out-of-range partition ID"); + }; + assert!( + err.to_string().contains("partition ID 5 is out of range"), + "unexpected error: {err}" + ); + } } diff --git a/rust/lance-index/src/vector/v3/subindex.rs b/rust/lance-index/src/vector/v3/subindex.rs index 7c9667859f4..9a49bc95f1d 100644 --- a/rust/lance-index/src/vector/v3/subindex.rs +++ b/rust/lance-index/src/vector/v3/subindex.rs @@ -6,7 +6,7 @@ use std::fmt::Debug; use std::sync::Arc; use arrow_array::{ArrayRef, RecordBatch}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use crate::metrics::MetricsCollector; diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index d0c8a3c3c33..6cee04d5fd9 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -34,7 +34,6 @@ aws-credential-types = { workspace = true, optional = true } byteorder.workspace = true bytes.workspace = true chrono.workspace = true -deepsize.workspace = true futures.workspace = true http.workspace = true log.workspace = true @@ -59,6 +58,7 @@ test-log.workspace = true mockall.workspace = true rstest.workspace = true mock_instant.workspace = true +tokio = { workspace = true, features = ["test-util"] } tracing-mock = { workspace = true } [[bench]] @@ -68,12 +68,16 @@ harness = false [features] default = ["aws", "azure", "gcp"] gcs-test = [] +goosefs-test = [] gcp = ["object_store/gcp", "dep:opendal", "opendal/services-gcs", "dep:object_store_opendal"] aws = ["object_store/aws", "dep:aws-config", "dep:aws-credential-types", "dep:opendal", "opendal/services-s3", "dep:object_store_opendal"] azure = ["object_store/azure", "dep:opendal", "opendal/services-azblob", "opendal/services-azdls", "dep:object_store_opendal"] oss = 
["dep:opendal", "opendal/services-oss", "dep:object_store_opendal"] +goosefs = ["dep:opendal", "opendal/services-goosefs", "dep:object_store_opendal"] tencent = ["dep:opendal", "opendal/services-cos", "dep:object_store_opendal"] huggingface = ["dep:opendal", "opendal/services-huggingface", "dep:object_store_opendal"] +tos = ["dep:opendal", "opendal/services-tos", "dep:object_store_opendal"] +tos-test = ["tos"] test-util = [] [lints] diff --git a/rust/lance-io/src/local.rs b/rust/lance-io/src/local.rs index 12f846bcc52..2b8a339331a 100644 --- a/rust/lance-io/src/local.rs +++ b/rust/lance-io/src/local.rs @@ -16,8 +16,8 @@ use std::os::windows::fs::FileExt; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; use tokio::io::AsyncSeekExt; @@ -89,7 +89,7 @@ pub struct LocalObjectReader { } impl DeepSizeOf for LocalObjectReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skipping `file` as it should just be a file handle self.path.as_ref().deep_size_of_children(context) } diff --git a/rust/lance-io/src/object_reader.rs b/rust/lance-io/src/object_reader.rs index d6d5de98f0b..1c27800c90f 100644 --- a/rust/lance-io/src/object_reader.rs +++ b/rust/lance-io/src/object_reader.rs @@ -11,12 +11,12 @@ use crate::local::read_exact_at; use std::os::unix::fs::FileExt; use bytes::Bytes; -use deepsize::DeepSizeOf; use futures::{ FutureExt, future::{BoxFuture, Shared}, stream::{self, StreamExt}, }; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, error::CloneableError}; use object_store::ObjectStoreExt; use object_store::{GetOptions, GetResult, ObjectStore, Result as OSResult, path::Path}; @@ -74,7 +74,7 @@ pub struct CloudObjectReader { } impl DeepSizeOf for 
CloudObjectReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skipping object_store because there is no easy way to do that and it shouldn't be too big self.path.as_ref().deep_size_of_children(context) } @@ -449,7 +449,7 @@ pub(crate) fn stream_local_range( } impl DeepSizeOf for SmallReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut size = self.inner.path.as_ref().deep_size_of_children(context); if let Ok(guard) = self.inner.state.try_lock() diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 698378c5b9e..1761dc4b059 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -13,9 +13,9 @@ use std::time::Duration; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; -use deepsize::DeepSizeOf; use futures::{FutureExt, Stream}; use futures::{StreamExt, TryStreamExt, future, stream::BoxStream}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::utils::parse::str_is_truthy; use list_retry::ListRetryStream; @@ -36,7 +36,7 @@ use super::local::LocalObjectReader; use crate::uring::{UringCurrentThreadReader, UringReader}; #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] pub(crate) mod dynamic_credentials; -#[cfg(any(feature = "oss", feature = "huggingface"))] +#[cfg(any(feature = "oss", feature = "huggingface", feature = "tos"))] pub(crate) mod dynamic_opendal; mod list_retry; pub mod providers; @@ -61,7 +61,15 @@ pub const DEFAULT_LOCAL_IO_PARALLELISM: usize = 8; pub const DEFAULT_CLOUD_IO_PARALLELISM: usize = 64; const DEFAULT_LOCAL_BLOCK_SIZE: usize = 4 * 1024; // 4KB block size -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +#[cfg(any( + 
feature = "aws", + feature = "gcp", + feature = "azure", + feature = "oss", + feature = "tencent", + feature = "huggingface", + feature = "tos", +))] const DEFAULT_CLOUD_BLOCK_SIZE: usize = 64 * 1024; // 64KB block size pub static DEFAULT_MAX_IOP_SIZE: std::sync::LazyLock = std::sync::LazyLock::new(|| { @@ -145,7 +153,7 @@ pub struct ObjectStore { } impl DeepSizeOf for ObjectStore { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // We aren't counting `inner` here which is problematic but an ObjectStore // shouldn't be too big. The only exception might be the write cache but, if // the writer cache has data, it means we're using it somewhere else that isn't @@ -570,10 +578,17 @@ impl ObjectStore { self.max_iop_size } + /// The amount of parallelism to use for I/O operations. + /// + /// Honors the `LANCE_IO_THREADS` override when set, otherwise the store's configured value. + /// Always at least 1: callers feed this straight into `buffered` / `buffer_unordered`, and a + /// window of 0 makes those streams never poll their input — e.g. a metadata-only `count_rows` + /// would hang rather than return. pub fn io_parallelism(&self) -> usize { std::env::var("LANCE_IO_THREADS") .map(|val| val.parse::().unwrap()) .unwrap_or(self.io_parallelism) + .max(1) } /// Get the IO tracker for this object store @@ -830,7 +845,7 @@ impl ObjectStore { .common_prefixes .iter() .chain(output.objects.iter().map(|o| &o.location)) - .map(|s| s.filename().unwrap().to_string()) + .filter_map(|s| s.filename().map(|f| f.to_string())) .collect()) } @@ -1150,6 +1165,35 @@ mod tests { Ok(contents) } + #[test] + fn test_io_parallelism_clamped_to_nonzero() { + // `io_parallelism()` feeds `buffered`/`buffer_unordered` windows; a value of 0 makes those + // streams never poll, hanging callers (e.g. a metadata-only `count_rows`). It must clamp. 
+ let store = ObjectStore::local(); + + // SAFETY: process-global env var, set and restored within this test. `io_parallelism()` + // only reads it, and a concurrent reader observes a valid clamped value, never 0. + unsafe { std::env::set_var("LANCE_IO_THREADS", "0") }; + assert_eq!( + store.io_parallelism(), + 1, + "LANCE_IO_THREADS=0 must clamp to 1" + ); + + unsafe { std::env::set_var("LANCE_IO_THREADS", "8") }; + assert_eq!( + store.io_parallelism(), + 8, + "a positive override must pass through unchanged" + ); + + unsafe { std::env::remove_var("LANCE_IO_THREADS") }; + assert!( + store.io_parallelism() >= 1, + "the configured default parallelism must be at least 1" + ); + } + #[tokio::test] async fn test_absolute_paths() { let tmp_path = TempStrDir::default(); diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 20fa251a0c5..45ac30a757a 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -24,6 +24,8 @@ pub mod aws; pub mod azure; #[cfg(feature = "gcp")] pub mod gcp; +#[cfg(feature = "goosefs")] +pub mod goosefs; #[cfg(feature = "huggingface")] pub mod huggingface; pub mod local; @@ -33,6 +35,8 @@ pub mod oss; pub mod shared_memory; #[cfg(feature = "tencent")] pub mod tencent; +#[cfg(feature = "tos")] +pub mod tos; #[async_trait::async_trait] pub trait ObjectStoreProvider: std::fmt::Debug + Sync + Send { @@ -95,6 +99,7 @@ pub struct ObjectStoreRegistryStats { /// - `s3+ddb`: An S3 object store with DynamoDB for metadata. /// - `az`: An Azure Blob Storage object store. /// - `gs`: A Google Cloud Storage object store. +/// - `tos`: A Volcengine TOS object store. /// /// Use [`Self::empty()`] to create an empty registry, with no providers registered. 
/// @@ -324,12 +329,16 @@ impl Default for ObjectStoreRegistry { } #[cfg(feature = "gcp")] providers.insert("gs".into(), Arc::new(gcp::GcsStoreProvider)); + #[cfg(feature = "goosefs")] + providers.insert("goosefs".into(), Arc::new(goosefs::GooseFsStoreProvider)); #[cfg(feature = "oss")] providers.insert("oss".into(), Arc::new(oss::OssStoreProvider)); #[cfg(feature = "tencent")] providers.insert("cos".into(), Arc::new(tencent::TencentStoreProvider)); #[cfg(feature = "huggingface")] providers.insert("hf".into(), Arc::new(huggingface::HuggingfaceStoreProvider)); + #[cfg(feature = "tos")] + providers.insert("tos".into(), Arc::new(tos::TosStoreProvider)); Self { providers: RwLock::new(providers), active_stores: RwLock::new(HashMap::new()), diff --git a/rust/lance-io/src/object_store/providers/goosefs.rs b/rust/lance-io/src/object_store/providers/goosefs.rs new file mode 100644 index 00000000000..d6173571551 --- /dev/null +++ b/rust/lance-io/src/object_store/providers/goosefs.rs @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use object_store::path::Path; +use object_store_opendal::OpendalStore; +use opendal::{Operator, services::GooseFs}; +use url::Url; + +use crate::object_store::{ + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, ObjectStore, + ObjectStoreParams, ObjectStoreProvider, StorageOptions, +}; +use lance_core::error::{Error, Result}; + +/// Default GooseFS Master gRPC port. +const DEFAULT_GOOSEFS_PORT: u16 = 9200; + +/// GooseFS object store provider. +/// +/// Uses OpenDAL's GooseFs service to access GooseFS via gRPC. 
+/// URL format: `goosefs://host:port/path` +/// +/// Where: +/// - `host:port` is the GooseFS Master address (default port: 9200) +/// - `/path` is the filesystem path within GooseFS +/// +/// Configuration priority: storage_options > environment variables > URL authority > defaults +#[derive(Default, Debug)] +pub struct GooseFsStoreProvider; + +impl GooseFsStoreProvider { + /// Resolve the GooseFS Master address from storage_options, environment, or URL. + /// + /// Priority: + /// 1. `storage_options["goosefs_master_addr"]` (supports HA: "addr1:port,addr2:port") + /// 2. `GOOSEFS_MASTER_ADDR` environment variable + /// 3. URL authority (host:port from the URL) + fn resolve_master_addr(url: &Url, storage_options: &StorageOptions) -> Result { + // 1. storage_options + if let Some(addr) = storage_options + .0 + .get("goosefs_master_addr") + .filter(|v| !v.is_empty()) + { + return Ok(addr.clone()); + } + + // 2. Environment variable + if let Ok(addr) = std::env::var("GOOSEFS_MASTER_ADDR") + && !addr.is_empty() + { + return Ok(addr); + } + + // 3. URL authority + let host = url.host_str().ok_or_else(|| { + Error::invalid_input( + "GooseFS URL must contain a master address (host), e.g. goosefs://host:port/path", + ) + })?; + + let port = url.port().unwrap_or(DEFAULT_GOOSEFS_PORT); + Ok(format!("{}:{}", host, port)) + } + + /// Resolve a storage option from storage_options or environment variable. 
+ fn resolve_option( + storage_options: &StorageOptions, + option_key: &str, + env_key: &str, + ) -> Option { + storage_options + .0 + .get(option_key) + .cloned() + .or_else(|| std::env::var(env_key).ok()) + .filter(|v| !v.is_empty()) + } +} + +#[async_trait::async_trait] +impl ObjectStoreProvider for GooseFsStoreProvider { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { + let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); + + // Resolve master address + let master_addr = Self::resolve_master_addr(&base_path, &storage_options)?; + + // Extract root path from URL + let root = base_path.path().to_string(); + + // Build OpenDAL config map + let mut config_map: HashMap = HashMap::new(); + config_map.insert("master_addr".to_string(), master_addr); + + if !root.is_empty() && root != "/" { + config_map.insert("root".to_string(), root); + } + + // Optional: write_type + if let Some(wt) = + Self::resolve_option(&storage_options, "goosefs_write_type", "GOOSEFS_WRITE_TYPE") + { + config_map.insert("write_type".to_string(), wt); + } + + // Optional: block_size (for GooseFS, not Lance block_size) + if let Some(bs) = + Self::resolve_option(&storage_options, "goosefs_block_size", "GOOSEFS_BLOCK_SIZE") + { + config_map.insert("block_size".to_string(), bs); + } + + // Optional: chunk_size + if let Some(cs) = + Self::resolve_option(&storage_options, "goosefs_chunk_size", "GOOSEFS_CHUNK_SIZE") + { + config_map.insert("chunk_size".to_string(), cs); + } + + // Optional: auth_type (nosasl / simple) + if let Some(at) = + Self::resolve_option(&storage_options, "goosefs_auth_type", "GOOSEFS_AUTH_TYPE") + { + config_map.insert("auth_type".to_string(), at); + } + + // Optional: auth_username (used in SIMPLE auth mode) + if let Some(au) = Self::resolve_option( + &storage_options, + "goosefs_auth_username", + "GOOSEFS_AUTH_USERNAME", + ) { + 
config_map.insert("auth_username".to_string(), au); + } + + // Create OpenDAL Operator with GooseFS service + let operator = Operator::from_iter::(config_map) + .map_err(|e| { + Error::invalid_input(format!("Failed to create GooseFS operator: {:?}", e)) + })? + .finish(); + + // Wrap as object_store::ObjectStore via OpendalStore bridge + let opendal_store = Arc::new(OpendalStore::new(operator)); + + Ok(ObjectStore { + scheme: "goosefs".to_string(), + inner: opendal_store, + block_size, + max_iop_size: *DEFAULT_MAX_IOP_SIZE, + use_constant_size_upload_parts: params.use_constant_size_upload_parts, + list_is_lexically_ordered: params.list_is_lexically_ordered.unwrap_or(false), + io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, + download_retry_count: storage_options.download_retry_count(), + io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, + }) + } + + /// Extract the path relative to the root of the GooseFS filesystem. + /// + /// For GooseFS, the entire URL path is set as the OpenDAL `root` in `new_store`, + /// so the relative path returned here must be empty to avoid path duplication. + /// + /// `goosefs://host:port/data/file.lance` → root="/data/file.lance", extract_path="" + fn extract_path(&self, _url: &Url) -> Result { + Ok(Path::from("")) + } + + /// Calculate the object store prefix for caching. + /// + /// Format: `goosefs$host:port` + /// This ensures different GooseFS clusters get separate caches. 
+ fn calculate_object_store_prefix( + &self, + url: &Url, + _storage_options: Option<&HashMap>, + ) -> Result { + Ok(format!("{}${}", url.scheme(), url.authority())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_goosefs_store_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/data/embeddings.lance").unwrap(); + let path = provider.extract_path(&url).unwrap(); + // extract_path returns empty because the full path is used as OpenDAL root + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_goosefs_store_root_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/").unwrap(); + let path = provider.extract_path(&url).unwrap(); + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_goosefs_store_deep_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://master:9200/a/b/c/d.lance").unwrap(); + let path = provider.extract_path(&url).unwrap(); + // All path components are in the OpenDAL root, extract_path is empty + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_calculate_object_store_prefix() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let prefix = provider.calculate_object_store_prefix(&url, None).unwrap(); + assert_eq!(prefix, "goosefs$10.0.0.1:9200"); + } + + #[test] + fn test_calculate_object_store_prefix_with_hostname() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://myhost:9200/data").unwrap(); + let prefix = provider.calculate_object_store_prefix(&url, None).unwrap(); + assert_eq!(prefix, "goosefs$myhost:9200"); + } + + #[test] + fn test_resolve_master_addr_from_url() { + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let storage_options = StorageOptions(HashMap::new()); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, 
"10.0.0.1:9200"); + } + + #[test] + fn test_resolve_master_addr_default_port() { + let url = Url::parse("goosefs://10.0.0.1/data").unwrap(); + let storage_options = StorageOptions(HashMap::new()); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, "10.0.0.1:9200"); + } + + #[test] + fn test_resolve_master_addr_from_storage_options() { + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let storage_options = StorageOptions(HashMap::from([( + "goosefs_master_addr".to_string(), + "10.0.0.2:9200,10.0.0.3:9200".to_string(), + )])); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, "10.0.0.2:9200,10.0.0.3:9200"); + } +} diff --git a/rust/lance-io/src/object_store/providers/tos.rs b/rust/lance-io/src/object_store/providers/tos.rs new file mode 100644 index 00000000000..923186484c6 --- /dev/null +++ b/rust/lance-io/src/object_store/providers/tos.rs @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use object_store::ObjectStore as OSObjectStore; +use object_store_opendal::OpendalStore; +use opendal::{Operator, services::Tos}; +use url::Url; + +use crate::object_store::dynamic_opendal::DynamicOpenDalStore; +use crate::object_store::{ + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, ObjectStore, + ObjectStoreParams, ObjectStoreProvider, StorageOptions, +}; +use lance_core::error::{Error, Result}; + +#[derive(Default, Debug)] +pub struct TosStoreProvider; + +impl TosStoreProvider { + fn tos_env_options_from_iter(vars: I) -> HashMap + where + I: IntoIterator, + K: Into, + V: Into, + { + let vars = vars + .into_iter() + .map(|(key, value)| (key.into(), value.into())) + .collect::>(); + let mut config_map = HashMap::new(); + + for prefix in ["VOLCENGINE_", "TOS_"] { + for (key, value) in &vars { + if 
let Some(stripped_key) = key.strip_prefix(prefix) { + config_map.insert(stripped_key.to_ascii_lowercase(), value.clone()); + } + } + } + + config_map + } + + fn base_tos_options( + base_path: &Url, + storage_options: &StorageOptions, + ) -> Result> { + let bucket = base_path + .host_str() + .ok_or_else(|| Error::invalid_input("TOS URL must contain bucket name"))? + .to_string(); + + let prefix = base_path.path().trim_start_matches('/').to_string(); + + let mut config_map = Self::tos_env_options_from_iter(std::env::vars()); + + config_map.extend(storage_options.0.clone()); + + config_map.insert("bucket".to_string(), bucket); + if prefix.is_empty() { + config_map.remove("root"); + } else { + config_map.insert("root".to_string(), "/".to_string()); + } + + Ok(config_map) + } + + /// Normalize TOS storage options, resolving aliases for well-known keys + /// while passing through all other options so that OpenDAL can use them. + fn normalize_tos_config(options: &HashMap) -> Result> { + let mut config_map = options.clone(); + + let alias_groups: &[(&str, &[&str])] = &[ + ("endpoint", &["tos_endpoint"]), + ("region", &["tos_region"]), + ("access_key_id", &["tos_access_key_id"]), + ("secret_access_key", &["tos_secret_access_key"]), + ("security_token", &["tos_security_token"]), + ]; + + for (canonical, aliases) in alias_groups { + for alias in *aliases { + if let Some(value) = config_map.remove(*alias) { + config_map.insert(canonical.to_string(), value); + break; + } + } + } + + if !config_map.contains_key("endpoint") { + return Err(Error::invalid_input( + "TOS endpoint is required. Please provide 'tos_endpoint' in storage options or set TOS_ENDPOINT environment variable", + )); + } + + Ok(config_map) + } + + fn build_tos_store(config_map: HashMap) -> Result { + let operator = Operator::from_iter::(config_map) + .map_err(|e| Error::invalid_input(format!("Failed to create TOS operator: {:?}", e)))? 
+ .finish(); + + Ok(OpendalStore::new(operator)) + } +} + +#[async_trait::async_trait] +impl ObjectStoreProvider for TosStoreProvider { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { + let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); + + let base_options = Self::base_tos_options(&base_path, &storage_options)?; + let accessor = params.get_accessor(); + + let inner: Arc = + if let Some(accessor) = accessor.filter(|a| a.has_provider()) { + Arc::new( + DynamicOpenDalStore::new( + format!("tos:{}", base_path), + base_options, + accessor, + Self::normalize_tos_config, + Self::build_tos_store, + ) + .with_protected_keys(["bucket", "root"]), + ) + } else { + Arc::new(Self::build_tos_store(Self::normalize_tos_config( + &base_options, + )?)?) + }; + + let mut url = base_path; + if !url.path().ends_with('/') { + url.set_path(&format!("{}/", url.path())); + } + + Ok(ObjectStore { + scheme: "tos".to_string(), + inner, + block_size, + max_iop_size: *DEFAULT_MAX_IOP_SIZE, + use_constant_size_upload_parts: params.use_constant_size_upload_parts, + list_is_lexically_ordered: params.list_is_lexically_ordered.unwrap_or(true), + io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, + download_retry_count: storage_options.download_retry_count(), + io_tracker: Default::default(), + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use super::TosStoreProvider; + use crate::object_store::dynamic_opendal::DynamicOpenDalStore; + use crate::object_store::test_utils::StaticMockStorageOptionsProvider; + use crate::object_store::{ObjectStoreProvider, StorageOptionsAccessor}; + use url::Url; + + #[test] + fn test_tos_store_path() { + let provider = TosStoreProvider; + + let url = 
Url::parse("tos://bucket/path/to/file").unwrap(); + let path = provider.extract_path(&url).unwrap(); + let expected_path = object_store::path::Path::from("path/to/file"); + assert_eq!(path, expected_path); + } + + #[test] + fn test_tos_env_options_normalize_supported_prefixes() { + let config = TosStoreProvider::tos_env_options_from_iter([ + ("VOLCENGINE_ENDPOINT", "https://tos-cn-beijing.volces.com"), + ("TOS_ACCESS_KEY_ID", "tos-akid"), + ("TOS_SECRET_ACCESS_KEY", "tos-secret"), + ]); + + assert_eq!( + config.get("endpoint").unwrap(), + "https://tos-cn-beijing.volces.com" + ); + assert_eq!(config.get("access_key_id").unwrap(), "tos-akid"); + assert_eq!(config.get("secret_access_key").unwrap(), "tos-secret"); + } + + #[test] + fn test_tos_alias_options_override_canonical_env_options() { + let config = TosStoreProvider::normalize_tos_config(&HashMap::from([ + ( + "endpoint".to_string(), + "https://env.example.com".to_string(), + ), + ( + "tos_endpoint".to_string(), + "https://user.example.com".to_string(), + ), + ("region".to_string(), "env-region".to_string()), + ("tos_region".to_string(), "user-region".to_string()), + ("access_key_id".to_string(), "env-akid".to_string()), + ("tos_access_key_id".to_string(), "user-akid".to_string()), + ("secret_access_key".to_string(), "env-secret".to_string()), + ( + "tos_secret_access_key".to_string(), + "user-secret".to_string(), + ), + ("security_token".to_string(), "env-token".to_string()), + ("tos_security_token".to_string(), "user-token".to_string()), + ("bucket".to_string(), "bucket".to_string()), + ])) + .unwrap(); + + assert_eq!(config.get("endpoint").unwrap(), "https://user.example.com"); + assert_eq!(config.get("region").unwrap(), "user-region"); + assert_eq!(config.get("access_key_id").unwrap(), "user-akid"); + assert_eq!(config.get("secret_access_key").unwrap(), "user-secret"); + assert_eq!(config.get("security_token").unwrap(), "user-token"); + assert!(!config.contains_key("tos_endpoint")); + 
assert!(!config.contains_key("tos_secret_access_key")); + assert!(!config.contains_key("tos_security_token")); + } + + #[test] + fn test_tos_url_bucket_and_root_are_authoritative() { + let storage_options = crate::object_store::StorageOptions(HashMap::from([ + ( + "tos_endpoint".to_string(), + "https://tos-cn-beijing.volces.com".to_string(), + ), + ("bucket".to_string(), "storage-options-bucket".to_string()), + ("root".to_string(), "/storage-options-root".to_string()), + ])); + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket/path").unwrap(), + &storage_options, + ) + .unwrap(); + let config = TosStoreProvider::normalize_tos_config(&base_options).unwrap(); + + assert_eq!(config.get("bucket").unwrap(), "url-bucket"); + assert_eq!(config.get("root").unwrap(), "/"); + + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket").unwrap(), + &storage_options, + ) + .unwrap(); + let config = TosStoreProvider::normalize_tos_config(&base_options).unwrap(); + + assert_eq!(config.get("bucket").unwrap(), "url-bucket"); + assert!(!config.contains_key("root")); + } + + #[tokio::test] + async fn test_dynamic_opendal_tos_store_uses_provider_credentials() { + let accessor = Arc::new(StorageOptionsAccessor::with_provider(Arc::new( + StaticMockStorageOptionsProvider { + options: HashMap::from([ + ( + "tos_endpoint".to_string(), + "https://tos-cn-beijing.volces.com".to_string(), + ), + ("tos_region".to_string(), "cn-beijing".to_string()), + ("tos_access_key_id".to_string(), "akid".to_string()), + ("tos_secret_access_key".to_string(), "secret".to_string()), + ("tos_security_token".to_string(), "token".to_string()), + ]), + }, + ))); + + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket/path").unwrap(), + &crate::object_store::StorageOptions(HashMap::new()), + ) + .unwrap(); + + let store = DynamicOpenDalStore::new( + "tos", + base_options, + accessor, + 
TosStoreProvider::normalize_tos_config, + TosStoreProvider::build_tos_store, + ) + .with_protected_keys(["bucket", "root"]); + + let current_store = store + .current_store() + .await + .expect("dynamic OpenDAL TOS store should build"); + + assert!(current_store.to_string().contains("Opendal")); + } +} diff --git a/rust/lance-io/src/object_store/throttle.rs b/rust/lance-io/src/object_store/throttle.rs index dc66f69cf90..bac1d3a538e 100644 --- a/rust/lance-io/src/object_store/throttle.rs +++ b/rust/lance-io/src/object_store/throttle.rs @@ -318,7 +318,7 @@ impl AimdThrottleConfig { struct TokenBucketState { tokens: f64, - last_refill: std::time::Instant, + last_refill: tokio::time::Instant, rate: f64, } @@ -346,7 +346,7 @@ impl OperationThrottle { controller, bucket: Mutex::new(TokenBucketState { tokens: burst_capacity, - last_refill: std::time::Instant::now(), + last_refill: tokio::time::Instant::now(), rate: initial_rate, }), burst_capacity, @@ -364,7 +364,7 @@ impl OperationThrottle { async fn acquire_token(&self) { let sleep_duration = { let mut bucket = self.bucket.lock().await; - let now = std::time::Instant::now(); + let now = tokio::time::Instant::now(); let elapsed = now.duration_since(bucket.last_refill).as_secs_f64(); bucket.tokens = (bucket.tokens + elapsed * bucket.rate).min(self.burst_capacity); bucket.last_refill = now; @@ -1176,12 +1176,15 @@ mod tests { } fn list_start_throttle_config() -> AimdThrottleConfig { + // Use a low rate (10 tokens/s) so that the token-acquisition sleep is + // 1/10 = 100 ms — well above the 50 ms timeout used in assertions, + // avoiding flakiness from coarse OS timer resolution (e.g. Windows ~16 ms). 
AimdThrottleConfig::default() .with_burst_capacity(0) - .with_list_aimd(AimdConfig::default().with_initial_rate(50.0)) + .with_list_aimd(AimdConfig::default().with_initial_rate(10.0)) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_list_acquires_token_before_starting_underlying_stream() { let store = Arc::new(CountingListStartStore::default()); store @@ -1199,14 +1202,16 @@ mod tests { let mut stream = throttled.list(Some(&Path::from("prefix"))); assert_eq!(store.list_calls(), 0); + // With rate=10 tokens/s and burst_capacity=0, the token acquisition + // sleeps for 100 ms. A 50 ms timeout must expire before that. assert!( - tokio::time::timeout(std::time::Duration::from_millis(5), stream.next()) + tokio::time::timeout(std::time::Duration::from_millis(50), stream.next()) .await .is_err() ); assert_eq!(store.list_calls(), 0); - let item = tokio::time::timeout(std::time::Duration::from_millis(100), stream.next()) + let item = tokio::time::timeout(std::time::Duration::from_millis(300), stream.next()) .await .unwrap() .unwrap() @@ -1215,7 +1220,7 @@ mod tests { assert_eq!(store.list_calls(), 1); } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_list_with_offset_acquires_token_before_starting_underlying_stream() { let store = Arc::new(CountingListStartStore::default()); store @@ -1231,14 +1236,16 @@ mod tests { let mut stream = throttled.list_with_offset(Some(&Path::from("prefix")), &Path::from("prefix/a")); assert_eq!(store.offset_calls(), 0); + // With rate=10 tokens/s and burst_capacity=0, the token acquisition + // sleeps for 100 ms. A 50 ms timeout must expire before that. 
assert!( - tokio::time::timeout(std::time::Duration::from_millis(5), stream.next()) + tokio::time::timeout(std::time::Duration::from_millis(50), stream.next()) .await .is_err() ); assert_eq!(store.offset_calls(), 0); - let item = tokio::time::timeout(std::time::Duration::from_millis(100), stream.next()) + let item = tokio::time::timeout(std::time::Duration::from_millis(300), stream.next()) .await .unwrap() .unwrap() diff --git a/rust/lance-io/src/object_writer.rs b/rust/lance-io/src/object_writer.rs index 4b9bb901446..0fd0a30f9e7 100644 --- a/rust/lance-io/src/object_writer.rs +++ b/rust/lance-io/src/object_writer.rs @@ -47,8 +47,12 @@ fn max_conn_reset_retries() -> u16 { }) } -/// Maximum part size in GCS and S3: 5GB. -const MAX_UPLOAD_PART_SIZE: usize = 1024 * 1024 * 1024 * 5; +/// Maximum body size for a single S3 PUT: strictly less than 5 GiB. +/// AWS rejects single-PUT bodies of exactly 5 GiB (= 5 * 1024^3) with +/// `EntityTooLarge`, so we clamp `LANCE_INITIAL_UPLOAD_SIZE` one byte +/// below that threshold to keep the buffer-fills-to-clamp single-PUT +/// path safe. See lance#6750 for the related txn-file write fix. +const MAX_UPLOAD_PART_SIZE: usize = 1024 * 1024 * 1024 * 5 - 1; /// Clamps a requested upload part size to the valid [5MB, 5GB] range. /// Returns the clamped value and whether clamping was necessary. @@ -898,4 +902,17 @@ mod tests { (MAX_UPLOAD_PART_SIZE, true) ); } + + /// Regression for the foot-gun where `LANCE_INITIAL_UPLOAD_SIZE=5368709120` + /// (exactly 5 GiB, Pucheng's setting) caused a single-PUT of 5 GiB on + /// shutdown — which S3 rejects with `EntityTooLarge`. After tightening + /// `MAX_UPLOAD_PART_SIZE` to 5 GiB - 1, raw 5 GiB must clamp DOWN. 
+ #[test] + fn clamp_initial_upload_size_at_5gib_clamps_down() { + let exactly_5_gib: usize = 5 * 1024 * 1024 * 1024; + assert_eq!( + clamp_initial_upload_size(exactly_5_gib), + (MAX_UPLOAD_PART_SIZE, true) + ); + } } diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index 5993b161497..efe4b9b0c24 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -15,6 +15,7 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use tokio::sync::Notify; +use lance_core::utils::io_stats::IoStatsRecorder; use lance_core::utils::parse::str_is_truthy; use lance_core::{Error, Result}; @@ -475,8 +476,25 @@ impl StatsCollector { Ordering::Relaxed, ); } + + /// Add already-aggregated counts (e.g. a snapshot captured from another + /// scheduler) into these counters. + fn add(&self, iops: u64, requests: u64, bytes_read: u64) { + self.iops.fetch_add(iops, Ordering::Relaxed); + self.requests.fetch_add(requests, Ordering::Relaxed); + self.bytes_read.fetch_add(bytes_read, Ordering::Relaxed); + } } +impl IoStatsRecorder for StatsCollector { + fn record_request(&self, request: &[Range]) { + // Inherent methods take precedence in resolution, so this delegates to + // the inherent `record_request` above rather than recursing. + Self::record_request(self, request) + } +} + +#[derive(Debug, Clone, Copy, Default)] pub struct ScanStats { pub iops: u64, pub requests: u64, @@ -493,6 +511,57 @@ impl ScanStats { } } +/// A shareable, cloneable handle to a set of cumulative I/O counters. +/// +/// All clones share the same underlying counters. This serves two purposes: +/// +/// 1. It backs each [`ScanScheduler`]'s own running totals. +/// 2. It can be attached to an individual [`FileScheduler`] (via +/// [`FileScheduler::with_io_stats`]) as a *secondary* sink, so a caller can +/// measure the exact bytes/IOPS performed through that file handle for a +/// bounded scope (e.g. a single query) without disturbing the scheduler's +/// global totals. 
Read the result back with [`IoStats::snapshot`]. +#[derive(Debug, Clone)] +pub struct IoStats(Arc); + +impl IoStats { + pub fn new() -> Self { + Self(Arc::new(StatsCollector::new())) + } + + /// Record a single completed request. `request` holds the byte ranges as + /// actually submitted to storage (post coalescing/splitting), so the counts + /// reflect physical I/O. + pub fn record_request(&self, request: &[Range]) { + self.0.record_request(request); + } + + /// Take an immutable snapshot of the current cumulative counters. + pub fn snapshot(&self) -> ScanStats { + ScanStats::new(self.0.as_ref()) + } + + /// Return this handle as a type-erased [`IoStatsRecorder`], suitable for + /// attaching to a file reader (e.g. `FileReader::with_io_stats`). The + /// returned recorder shares the same underlying counters as `self`. + pub fn recorder(&self) -> Arc { + self.0.clone() + } + + /// Add a snapshot of already-aggregated statistics into this sink. Used to + /// fold in I/O measured on a separate scheduler (e.g. the one-time reads + /// performed while opening an index). 
+ pub fn add_scan_stats(&self, stats: &ScanStats) { + self.0.add(stats.iops, stats.requests, stats.bytes_read); + } +} + +impl Default for IoStats { + fn default() -> Self { + Self::new() + } +} + enum IoQueueType { Standard(Arc), Lite(Arc), @@ -509,7 +578,7 @@ enum IoQueueType { pub struct ScanScheduler { object_store: Arc, io_queue: IoQueueType, - stats: Arc, + stats: IoStats, } impl Debug for ScanScheduler { @@ -606,7 +675,7 @@ impl ScanScheduler { Arc::new(Self { object_store, io_queue, - stats: Arc::new(StatsCollector::new()), + stats: IoStats::new(), }) } @@ -646,6 +715,7 @@ impl ScanScheduler { base_priority, max_iop_size, bypass_backpressure: false, + extra_stats: None, }) } @@ -791,7 +861,7 @@ impl ScanScheduler { } pub fn stats(&self) -> ScanStats { - ScanStats::new(self.stats.as_ref()) + self.stats.snapshot() } #[cfg(test)] @@ -829,6 +899,10 @@ pub struct FileScheduler { base_priority: u64, max_iop_size: u64, bypass_backpressure: bool, + /// Optional secondary statistics sink. When set, every request submitted + /// through this handle is also recorded here, in addition to the + /// scheduler's global totals. Used to measure per-scope I/O. + extra_stats: Option>, } fn is_close_together(range1: &Range, range2: &Range, block_size: u64) -> bool { @@ -899,6 +973,9 @@ impl FileScheduler { } self.root.stats.record_request(&updated_requests); + if let Some(extra_stats) = &self.extra_stats { + extra_stats.record_request(&updated_requests); + } let bytes_vec_fut = self.root.submit_request( self.reader.clone(), @@ -964,6 +1041,23 @@ impl FileScheduler { max_iop_size: self.max_iop_size, base_priority: priority, bypass_backpressure: self.bypass_backpressure, + extra_stats: self.extra_stats.clone(), + } + } + + /// Returns a copy of this scheduler that additionally records the I/O it + /// performs into `stats`, on top of the scheduler's global statistics. + /// + /// This is the mechanism for measuring exact per-scope (e.g. 
per-query) I/O: + /// attach a recorder here (e.g. via [`IoStats::recorder`]), perform the reads + /// through the returned handle, then read the totals back with + /// [`IoStats::snapshot`]. The returned handle is cheap to create (a few + /// `Arc` clones) and reuses the same underlying reader, so it does not + /// re-open the file. + pub fn with_io_stats(&self, stats: Arc) -> Self { + Self { + extra_stats: Some(stats), + ..self.clone() } } @@ -1183,6 +1277,59 @@ mod tests { assert_eq!(11, scheduler.stats().iops); } + #[tokio::test] + async fn test_io_stats_sink() { + let tmp_file = TempObjFile::default(); + let obj_store = Arc::new(ObjectStore::local()); + + const DATA_SIZE: u64 = 1024 * 1024; + let mut some_data = vec![0; DATA_SIZE as usize]; + rand::rng().fill_bytes(&mut some_data); + obj_store.put(&tmp_file, &some_data).await.unwrap(); + + let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); + + // Attach a per-scope sink to one file handle. + let sink = IoStats::new(); + let file_scheduler = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap() + .with_io_stats(sink.recorder()); + + // Three reads within 4KiB coalesce into a single physical IOP. The sink + // and the scheduler's global totals must agree exactly, because both are + // recorded from the same post-coalescing request. + file_scheduler + .submit_request(vec![50_000..51_000, 52_000..53_000, 54_000..55_000], 0) + .await + .unwrap(); + + let global = scheduler.stats(); + let scoped = sink.snapshot(); + assert_eq!(1, scoped.iops); + assert_eq!(1, scoped.requests); + // Coalesced range 50_000..55_000 => 5000 physical bytes. + assert_eq!(5000, scoped.bytes_read); + assert_eq!(global.iops, scoped.iops); + assert_eq!(global.requests, scoped.requests); + assert_eq!(global.bytes_read, scoped.bytes_read); + + // A sibling handle without the sink: the global totals advance but the + // sink stays put, proving per-scope isolation. 
+ let other = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap(); + other.submit_request(vec![0..1000], 0).await.unwrap(); + + let global_after = scheduler.stats(); + let scoped_after = sink.snapshot(); + assert_eq!(global.bytes_read + 1000, global_after.bytes_read); + assert_eq!(scoped.bytes_read, scoped_after.bytes_read); + assert_eq!(scoped.iops, scoped_after.iops); + } + #[tokio::test] async fn test_priority() { let some_path = Path::parse("foo").unwrap(); @@ -1411,8 +1558,8 @@ mod tests { path: Path, } - impl deepsize::DeepSizeOf for TrackingReader { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + impl lance_core::deepsize::DeepSizeOf for TrackingReader { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } diff --git a/rust/lance-io/src/traits.rs b/rust/lance-io/src/traits.rs index eb83cf1a2ba..6a40171b6e0 100644 --- a/rust/lance-io/src/traits.rs +++ b/rust/lance-io/src/traits.rs @@ -5,8 +5,8 @@ use std::ops::Range; use async_trait::async_trait; use bytes::Bytes; -use deepsize::DeepSizeOf; use futures::{StreamExt, future::BoxFuture, stream::BoxStream}; +use lance_core::deepsize::DeepSizeOf; use object_store::path::Path; use prost::Message; use tokio::io::{AsyncWrite, AsyncWriteExt}; diff --git a/rust/lance-io/src/uring/current_thread.rs b/rust/lance-io/src/uring/current_thread.rs index bc09af058e6..abac772218b 100644 --- a/rust/lance-io/src/uring/current_thread.rs +++ b/rust/lance-io/src/uring/current_thread.rs @@ -14,10 +14,10 @@ use crate::traits::Reader; use crate::uring::DEFAULT_URING_QUEUE_DEPTH; use crate::utils::tracking_store::IOTracker; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; use io_uring::{IoUring, opcode, types}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; @@ -258,7 +258,7 @@ pub struct 
UringCurrentThreadReader { } impl DeepSizeOf for UringCurrentThreadReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skip file handle (just a system resource) // Only count the path's deep size self.handle.path.as_ref().deep_size_of_children(context) diff --git a/rust/lance-io/src/uring/reader.rs b/rust/lance-io/src/uring/reader.rs index 0e7b0101ba8..a948e6c63dc 100644 --- a/rust/lance-io/src/uring/reader.rs +++ b/rust/lance-io/src/uring/reader.rs @@ -12,9 +12,9 @@ use crate::traits::Reader; use crate::uring::requests::RequestState; use crate::utils::tracking_store::IOTracker; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; use std::fs::File; @@ -109,7 +109,7 @@ pub struct UringReader { } impl DeepSizeOf for UringReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skip file handle (just a system resource) // Only count the path's deep size self.handle.path.as_ref().deep_size_of_children(context) diff --git a/rust/lance-io/src/utils.rs b/rust/lance-io/src/utils.rs index 48d28526a5e..b36dff75133 100644 --- a/rust/lance-io/src/utils.rs +++ b/rust/lance-io/src/utils.rs @@ -10,8 +10,8 @@ use arrow_array::{ use arrow_schema::DataType; use byteorder::{ByteOrder, LittleEndian}; use bytes::Bytes; -use deepsize::DeepSizeOf; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use serde::{Deserialize, Serialize}; diff --git a/rust/lance-io/tests/goosefs_integration.rs b/rust/lance-io/tests/goosefs_integration.rs new file mode 100644 index 00000000000..fd9015bcda0 --- /dev/null +++ 
b/rust/lance-io/tests/goosefs_integration.rs @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! GooseFS integration tests via OpenDAL. +//! +//! Covers Stage 2 (OpenDAL direct), Stage 3 (Lance ObjectStore I/O), +//! and diagnostic tests (OpenDAL via lance-io ObjectStore). +//! +//! Run: +//! cargo test -p lance-io --features "goosefs goosefs-test" --test goosefs_integration -- --ignored --nocapture --test-threads=1 +#![cfg(feature = "goosefs-test")] +#![allow(clippy::print_stderr)] + +use std::sync::Arc; + +use futures::TryStreamExt; +use object_store::ObjectStoreExt; +use opendal::{Operator, services::GooseFs}; +use std::collections::HashMap; + +fn get_operator() -> Operator { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or("127.0.0.1:9200".into()); + let auth_type = std::env::var("GOOSEFS_AUTH_TYPE").unwrap_or("simple".into()); + let mut cfg = HashMap::new(); + cfg.insert("master_addr".to_string(), addr); + cfg.insert("root".to_string(), "/lance-test/opendal".to_string()); + cfg.insert("auth_type".to_string(), auth_type); + Operator::from_iter::(cfg).unwrap().finish() +} + +// ============================================================ +// Stage 2: OpenDAL GooseFs Service tests +// ============================================================ + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_write_read() { + let op = get_operator(); + // Cleanup any leftover from previous runs + let _ = op.delete("hello.txt").await; + op.write("hello.txt", "Hello from OpenDAL").await.unwrap(); + let data = op.read("hello.txt").await.unwrap(); + assert_eq!(data.to_vec(), b"Hello from OpenDAL"); + op.delete("hello.txt").await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_list() { + let op = get_operator(); + // Write files directly (GooseFS may have h2 issues with newly-created subdirs) + let _ = 
op.delete("list_a.txt").await; + let _ = op.delete("list_b.txt").await; + op.write("list_a.txt", "aaa").await.unwrap(); + op.write("list_b.txt", "bbb").await.unwrap(); + let entries: Vec<_> = op.list("/").await.unwrap(); + let names: Vec = entries.iter().map(|e| e.name().to_string()).collect(); + eprintln!("Listed entries: {:?}", names); + assert!( + entries.len() >= 2, + "Expected at least 2 entries, got {}", + entries.len() + ); + op.delete("list_a.txt").await.unwrap(); + op.delete("list_b.txt").await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_stat() { + let op = get_operator(); + // Cleanup leftover from previous runs + let _ = op.delete("stat_test.txt").await; + op.write("stat_test.txt", "12345").await.unwrap(); + let meta = op.stat("stat_test.txt").await.unwrap(); + assert_eq!(meta.content_length(), 5); + op.delete("stat_test.txt").await.unwrap(); +} + +// ============================================================ +// Stage 3: Lance ObjectStore I/O tests +// ============================================================ + +use lance_io::object_store::ObjectStore; + +async fn get_lance_store() -> Arc { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or("127.0.0.1:9200".into()); + let uri = format!("goosefs://{}/lance-test/lance-io", addr); + ObjectStore::from_uri(&uri).await.unwrap().0 +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_lance_objectstore_put_get() { + let store = get_lance_store().await; + let path = object_store::path::Path::from("test_put_get.bin"); + + // Cleanup + let _ = store.inner.delete(&path).await; + + // Write + store + .inner + .put(&path, (&b"lance-goosefs-test"[..]).into()) + .await + .unwrap(); + + // Read + let result = store.inner.get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + assert_eq!(&bytes[..], b"lance-goosefs-test"); + + // Cleanup + store.inner.delete(&path).await.unwrap(); +} + +#[ignore = "Requires GooseFS 
cluster"] +#[tokio::test] +async fn test_lance_objectstore_list() { + let store = get_lance_store().await; + + let file_a = object_store::path::Path::from("list_a.bin"); + let file_b = object_store::path::Path::from("list_b.bin"); + + // Cleanup leftovers + let _ = store.inner.delete(&file_a).await; + let _ = store.inner.delete(&file_b).await; + + store + .inner + .put(&file_a, (&b"aaa"[..]).into()) + .await + .unwrap(); + store + .inner + .put(&file_b, (&b"bbb"[..]).into()) + .await + .unwrap(); + + let entries: Vec<_> = store.inner.list(None).try_collect().await.unwrap(); + eprintln!("Listed {} entries", entries.len()); + assert!( + entries.len() >= 2, + "Expected at least 2 entries, got {}", + entries.len() + ); + + store.inner.delete(&file_a).await.unwrap(); + store.inner.delete(&file_b).await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_lance_objectstore_large_file() { + let store = get_lance_store().await; + let path = object_store::path::Path::from("large_file.bin"); + let _ = store.inner.delete(&path).await; + + // Write 5MB file + let data = vec![42u8; 5 * 1024 * 1024]; + store.inner.put(&path, data.clone().into()).await.unwrap(); + + let result = store.inner.get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes.len(), 5 * 1024 * 1024); + assert_eq!(&bytes[..10], &[42u8; 10]); + + store.inner.delete(&path).await.unwrap(); +} + +// ============================================================ +// Diagnostic: lance-io ObjectStore advanced write modes +// ============================================================ + +use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; + +#[tokio::test] +#[ignore = "Requires GooseFS cluster"] +async fn test_diag_lance_io_write_modes() { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or_else(|_| "127.0.0.1:9200".into()); + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + 
.as_millis(); + let root = format!("goosefs://{}/lance-test/lance_io_direct_{}", addr, ts); + + eprintln!("[DIAG] Creating ObjectStore at: {}", root); + + let params = ObjectStoreParams::default(); + let registry = Arc::new(ObjectStoreRegistry::default()); + let (object_store, _path) = ObjectStore::from_uri_and_params(registry, &root, ¶ms) + .await + .expect("Failed to create ObjectStore"); + + // Test 1: Basic put + get + let test_path = object_store::path::Path::parse("test_file.txt").unwrap(); + let test_data = bytes::Bytes::from("Hello from lance-io ObjectStore!"); + + eprintln!( + "[DIAG] Writing test_file.txt ({} bytes)...", + test_data.len() + ); + match object_store + .inner + .put(&test_path, test_data.clone().into()) + .await + { + Ok(_) => eprintln!("[DIAG] Write succeeded! ✅"), + Err(e) => { + eprintln!("[DIAG] Write FAILED: {:?}", e); + eprintln!("[DIAG] Error source: {:?}", std::error::Error::source(&e)); + return; + } + } + + eprintln!("[DIAG] Reading test_file.txt..."); + match object_store.inner.get(&test_path).await { + Ok(result) => { + let bytes = result.bytes().await.unwrap(); + let content = String::from_utf8_lossy(&bytes); + eprintln!("[DIAG] Read content: '{}' ({} bytes)", content, bytes.len()); + assert_eq!(bytes, test_data); + } + Err(e) => eprintln!("[DIAG] Read FAILED: {:?}", e), + } + + // Test 2: PutMode::Create (if_not_exists) + eprintln!("[DIAG] Writing with PutMode::Create (if_not_exists)..."); + match object_store + .inner + .put_opts( + &object_store::path::Path::parse("test_create.txt").unwrap(), + bytes::Bytes::from("conditional write!").into(), + object_store::PutOptions { + mode: object_store::PutMode::Create, + ..Default::default() + }, + ) + .await + { + Ok(_) => eprintln!("[DIAG] PutMode::Create succeeded! 
✅"), + Err(e) => { + eprintln!("[DIAG] PutMode::Create FAILED: {:?}", e); + } + } + + // Test 3: rename_if_not_exists + eprintln!("[DIAG] Testing rename_if_not_exists..."); + let tmp_path = object_store::path::Path::parse("_tmp_rename.txt").unwrap(); + let dest_path = object_store::path::Path::parse("renamed.txt").unwrap(); + match object_store + .inner + .put(&tmp_path, bytes::Bytes::from("rename me!").into()) + .await + { + Ok(_) => { + eprintln!("[DIAG] Tmp file written ✅"); + match object_store + .inner + .rename_if_not_exists(&tmp_path, &dest_path) + .await + { + Ok(_) => eprintln!("[DIAG] rename_if_not_exists succeeded! ✅"), + Err(e) => eprintln!("[DIAG] rename_if_not_exists FAILED: {:?}", e), + } + } + Err(e) => eprintln!("[DIAG] Tmp file write FAILED: {:?}", e), + } + + eprintln!("[DIAG] lance-io direct write test complete ✅"); +} diff --git a/rust/lance-io/tests/tos_integration.rs b/rust/lance-io/tests/tos_integration.rs new file mode 100644 index 00000000000..97eca30090e --- /dev/null +++ b/rust/lance-io/tests/tos_integration.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors +//! These integration tests can only be run against a real Volcengine TOS bucket. 
+ +#![cfg(feature = "tos-test")] + +use futures::TryStreamExt; +use lance_io::object_store::ObjectStore; +use object_store::ObjectStoreExt; +use object_store::path::Path; +use tokio::io::AsyncWriteExt; + +fn tos_bucket() -> String { + std::env::var("TOS_BUCKET").expect("TOS_BUCKET must be set") +} + +async fn delete_prefix(store: &ObjectStore, prefix: &str) { + let prefix_path = Path::from(prefix); + let locations = store + .inner + .list(Some(&prefix_path)) + .map_ok(|meta| meta.location) + .try_collect::>() + .await + .unwrap_or_default(); + + for location in locations { + let _ = store.inner.delete(&location).await; + } +} + +#[ignore = "Must be run manually on Volcengine TOS"] +#[tokio::test] +async fn test_tos_write_read_list_delete() { + let prefix = format!("lance-tos-{}-{}", std::process::id(), rand::random::()); + let bucket = tos_bucket(); + let (store, base_path) = ObjectStore::from_uri(&format!("tos://{bucket}/{prefix}")) + .await + .unwrap(); + assert_eq!(base_path, Path::from(prefix.as_str())); + + let path = Path::from(format!("{prefix}/small.txt")); + delete_prefix(&store, &prefix).await; + + let result: Result<(), Box> = async { + let mut writer = store.create(&path).await?; + writer.write_all(b"hello").await?; + writer.write_all(b" tos").await?; + writer.shutdown().await?; + + let meta = store.inner.head(&path).await?; + if meta.size != 9 { + return Err(format!("expected object size 9, got {}", meta.size).into()); + } + + let data = store.inner.get(&path).await?.bytes().await?; + if data.as_ref() != b"hello tos" { + return Err("downloaded TOS object content did not match".into()); + } + + let listed = store + .inner + .list(Some(&Path::from(prefix.as_str()))) + .try_collect::>() + .await?; + if !listed.iter().any(|meta| meta.location == path) { + return Err("uploaded TOS object was not returned by list".into()); + } + + store.inner.delete(&path).await?; + if store.exists(&path).await? 
{ + return Err("deleted TOS object still exists".into()); + } + + Ok(()) + } + .await; + + delete_prefix(&store, &prefix).await; + result.unwrap(); +} diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index 19d7ad4da47..6a188ec3c62 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -13,12 +13,12 @@ categories = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } -deepsize = { workspace = true } half = { workspace = true } lance-arrow = { workspace = true } lance-core = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } +rayon = { workspace = true } [dev-dependencies] approx = { workspace = true } @@ -51,10 +51,6 @@ harness = false name = "cosine" harness = false -[[bench]] -name = "hamming" -harness = false - [[bench]] name = "norm_l2" harness = false diff --git a/rust/lance-linalg/benches/cosine.rs b/rust/lance-linalg/benches/cosine.rs index 223299a934c..49e816b76df 100644 --- a/rust/lance-linalg/benches/cosine.rs +++ b/rust/lance-linalg/benches/cosine.rs @@ -40,7 +40,7 @@ where T::Native: Cosine, { const DIMENSION: usize = 1024; - const TOTAL: usize = 1024 * 1024; // 1M vectors + const TOTAL: usize = 512 * 1024; let type_name = std::any::type_name::(); let key = generate_random_array_with_seed::(DIMENSION, [0; 32]); diff --git a/rust/lance-linalg/benches/hamming.rs b/rust/lance-linalg/benches/hamming.rs deleted file mode 100644 index 9af3bf4614b..00000000000 --- a/rust/lance-linalg/benches/hamming.rs +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::iter::repeat_with; - -use std::hint::black_box; - -use criterion::{Criterion, criterion_group, criterion_main}; -use lance_linalg::distance::hamming::{hamming, hamming_scalar}; -use rand::Rng; - -const DIMENSION: usize = 1024; -const TOTAL: usize = 1024 * 1024; // 1M 
vectors - -fn bench_hamming(c: &mut Criterion) { - let mut rng = rand::rng(); - - let key = repeat_with(|| rng.random::()) - .take(DIMENSION) - .collect::>(); - let target = repeat_with(|| rng.random::()) - .take(TOTAL * DIMENSION) - .collect::>(); - - c.bench_function("hamming,scalar", |b| { - b.iter(|| { - black_box( - target - .chunks_exact(DIMENSION) - .map(|tgt| hamming_scalar(&key, tgt)) - .sum::(), - ); - }) - }); - - c.bench_function("hamming,auto_vec", |b| { - b.iter(|| { - black_box( - target - .chunks_exact(DIMENSION) - .map(|tgt| hamming(&key, tgt)) - .sum::(), - ); - }) - }); -} - -criterion_group!( - name=benches; - config = Criterion::default().significance_level(0.1).sample_size(10); - targets = bench_hamming); -criterion_main!(benches); diff --git a/rust/lance-linalg/build.rs b/rust/lance-linalg/build.rs index 06e1439c77a..407f2a589ea 100644 --- a/rust/lance-linalg/build.rs +++ b/rust/lance-linalg/build.rs @@ -16,7 +16,9 @@ fn main() -> Result<(), String> { } // Let clippy know about our custom cfg attribute - println!("cargo::rustc-check-cfg=cfg(kernel_support, values(\"avx512\"))"); + println!( + "cargo::rustc-check-cfg=cfg(kernel_support, values(\"avx512_f16\", \"avx512_bf16\", \"avx512_dist_table\"))" + ); println!("cargo:rerun-if-changed=src/simd/f16.c"); println!("cargo:rerun-if-changed=src/simd/bf16.c"); @@ -58,10 +60,10 @@ fn main() -> Result<(), String> { "cargo:warning=Skipping build of AVX-512 fp16 kernels. Error: {}", err ); - } else { + } else if cfg!(feature = "fp16kernels") { // We create a special cfg so that we can detect we have in fact // generated the AVX512 version of the f16 kernels. - println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + println!("cargo:rustc-cfg=kernel_support=\"avx512_f16\""); }; // Build AVX-512 bf16 kernels (sapphirerapids has native vdpbf16ps) if let Err(err) = @@ -71,16 +73,16 @@ fn main() -> Result<(), String> { "cargo:warning=Skipping build of AVX-512 bf16 kernels. 
Error: {}", err ); - } else { - println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + } else if cfg!(feature = "fp16kernels") { + println!("cargo:rustc-cfg=kernel_support=\"avx512_bf16\""); }; - if let Err(err) = build_dist_table_with_flags("avx512", &["-march=native"]) { + if let Err(err) = build_dist_table_with_flags("avx512", &["-march=sapphirerapids"]) { println!( "cargo:warning=Skipping build of AVX-512 dist_table. Error: {}", err ); } else { - println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + println!("cargo:rustc-cfg=kernel_support=\"avx512_dist_table\""); }; // Build a version with AVX // While GCC doesn't have support for _Float16 until GCC 12, clang diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs index 9dd57edb3f9..23d1cae2d63 100644 --- a/rust/lance-linalg/src/distance.rs +++ b/rust/lance-linalg/src/distance.rs @@ -26,10 +26,14 @@ pub mod l2_u8; pub mod norm_l2; pub use cosine::*; -use deepsize::DeepSizeOf; pub use dot::*; -use hamming::hamming_distance_arrow_batch; +pub use hamming::{ + Cluster, ClusteringResult, PairwiseResult, UnionFind, cluster_edges, cluster_pairwise_result, + extract_hashes_from_fixed_list, hamming_distance_arrow_batch, hamming_u64, + pairwise_hamming_distance, pairwise_hamming_distance_parallel, +}; pub use l2::*; +use lance_core::deepsize::DeepSizeOf; pub use norm_l2::*; use crate::Result; diff --git a/rust/lance-linalg/src/distance/cosine.rs b/rust/lance-linalg/src/distance/cosine.rs index be5bf436344..995191b77eb 100644 --- a/rust/lance-linalg/src/distance/cosine.rs +++ b/rust/lance-linalg/src/distance/cosine.rs @@ -82,7 +82,7 @@ mod bf16_kernel { #[cfg(target_arch = "aarch64")] pub fn cosine_bf16_neon(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn cosine_bf16_avx512( x: *const bf16, x_norm: f32, @@ -109,7 +109,7 @@ impl 
Cosine for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -141,7 +141,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn cosine_f16_neon(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn cosine_f16_avx512(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn cosine_f16_avx2(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; @@ -161,7 +161,7 @@ impl Cosine for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs index cf045b1996a..5903d24e0e5 100644 --- a/rust/lance-linalg/src/distance/dot.rs +++ b/rust/lance-linalg/src/distance/dot.rs @@ -122,7 +122,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn dot_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn dot_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn dot_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; @@ -143,7 +143,7 @@ impl Dot for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -175,7 +175,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn dot_f16_neon(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; - 
#[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn dot_f16_avx512(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn dot_f16_avx2(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; @@ -196,7 +196,7 @@ impl Dot for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/distance/hamming.rs b/rust/lance-linalg/src/distance/hamming.rs index d8fd60f4054..a6f4b038195 100644 --- a/rust/lance-linalg/src/distance/hamming.rs +++ b/rust/lance-linalg/src/distance/hamming.rs @@ -2,14 +2,24 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors //! Hamming distance. +//! +//! This module provides hamming distance computation for binary vectors, +//! including SIMD-accelerated pairwise hamming distance for 64-bit hashes. +use std::collections::HashMap; use std::sync::Arc; -use crate::{Error, Result}; +use arrow_array::builder::{ListBuilder, UInt64Builder}; use arrow_array::cast::AsArray; use arrow_array::types::UInt8Type; -use arrow_array::{Array, FixedSizeListArray, Float32Array}; -use arrow_schema::DataType; +use arrow_array::{ + Array, ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, + RecordBatchReader, UInt32Array, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use rayon::prelude::*; + +use crate::{Error, Result}; pub trait Hamming { /// Hamming distance between two vectors. @@ -86,6 +96,640 @@ pub fn hamming_distance_arrow_batch( ))) } +/// Compute hamming distance between two 64-bit values using POPCNT. +#[inline(always)] +pub fn hamming_u64(a: u64, b: u64) -> u32 { + (a ^ b).count_ones() +} + +/// Result of pairwise hamming distance computation. 
+#[derive(Debug, Clone)] +pub struct PairwiseResult { + pub row_id_a: Vec, + pub row_id_b: Vec, + pub distances: Vec, +} + +impl PairwiseResult { + pub fn new() -> Self { + Self { + row_id_a: Vec::new(), + row_id_b: Vec::new(), + distances: Vec::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + row_id_a: Vec::with_capacity(capacity), + row_id_b: Vec::with_capacity(capacity), + distances: Vec::with_capacity(capacity), + } + } + + pub fn push(&mut self, a: u64, b: u64, dist: u32) { + self.row_id_a.push(a); + self.row_id_b.push(b); + self.distances.push(dist); + } + + pub fn len(&self) -> usize { + self.row_id_a.len() + } + + pub fn is_empty(&self) -> bool { + self.row_id_a.is_empty() + } + + pub fn extend(&mut self, other: Self) { + self.row_id_a.extend(other.row_id_a); + self.row_id_b.extend(other.row_id_b); + self.distances.extend(other.distances); + } + + /// Convert to Arrow RecordBatch, consuming self. + pub fn into_record_batch(self) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("row_id_a", DataType::UInt64, false), + Field::new("row_id_b", DataType::UInt64, false), + Field::new("distance", DataType::UInt32, false), + ])); + + let row_id_a = Arc::new(UInt64Array::from(self.row_id_a)); + let row_id_b = Arc::new(UInt64Array::from(self.row_id_b)); + let distances = Arc::new(UInt32Array::from(self.distances)); + + RecordBatch::try_new(schema, vec![row_id_a, row_id_b, distances]) + .expect("Failed to create RecordBatch") + } +} + +impl Default for PairwiseResult { + fn default() -> Self { + Self::new() + } +} + +/// Compute hamming distances for a query against multiple targets. +/// Uses SIMD acceleration when available. +#[inline] +pub fn hamming_batch_u64(query: u64, targets: &[u64], results: &mut [u32]) { + debug_assert_eq!(targets.len(), results.len()); + hamming_batch_simd(query, targets, results); +} + +/// SIMD-accelerated batch hamming distance computation. 
+#[inline] +fn hamming_batch_simd(query: u64, targets: &[u64], results: &mut [u32]) { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") { + unsafe { + hamming_batch_avx512(query, targets, results); + } + return; + } + if is_x86_feature_detected!("avx2") { + unsafe { + hamming_batch_avx2(query, targets, results); + } + return; + } + } + + // Scalar fallback (LLVM auto-vectorizes well on Apple Silicon) + hamming_batch_scalar(query, targets, results); +} + +/// Scalar fallback using count_ones() which compiles to POPCNT. +#[inline] +fn hamming_batch_scalar(query: u64, targets: &[u64], results: &mut [u32]) { + // Unroll for better auto-vectorization + let n = targets.len(); + let chunks = n / 8; + let mut i = 0; + + for _ in 0..chunks { + results[i] = (query ^ targets[i]).count_ones(); + results[i + 1] = (query ^ targets[i + 1]).count_ones(); + results[i + 2] = (query ^ targets[i + 2]).count_ones(); + results[i + 3] = (query ^ targets[i + 3]).count_ones(); + results[i + 4] = (query ^ targets[i + 4]).count_ones(); + results[i + 5] = (query ^ targets[i + 5]).count_ones(); + results[i + 6] = (query ^ targets[i + 6]).count_ones(); + results[i + 7] = (query ^ targets[i + 7]).count_ones(); + i += 8; + } + + // Handle remainder + while i < n { + results[i] = (query ^ targets[i]).count_ones(); + i += 1; + } +} + +/// AVX-512 VPOPCNTDQ: Process 8 x 64-bit values at once. 
+#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")] +unsafe fn hamming_batch_avx512(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + let query_vec = _mm512_set1_epi64(query as i64); + + let chunks = n / 8; + let remainder = n % 8; + + for i in 0..chunks { + let offset = i * 8; + let targets_ptr = targets.as_ptr().add(offset) as *const __m512i; + let target_vec = _mm512_loadu_si512(targets_ptr); + + let xor_result = _mm512_xor_si512(query_vec, target_vec); + let popcount = _mm512_popcnt_epi64(xor_result); + let popcount_32 = _mm512_cvtepi64_epi32(popcount); + + _mm256_storeu_si256( + results.as_mut_ptr().add(offset) as *mut __m256i, + popcount_32, + ); + } + + if remainder > 0 { + let offset = chunks * 8; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// AVX2 popcount using lookup table (Harley-Seal / PSHUFB method). +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn hamming_batch_avx2(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + + let lookup = _mm256_setr_epi8( + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, + 3, 4, + ); + let low_mask = _mm256_set1_epi8(0x0f); + let query_vec = _mm256_set1_epi64x(query as i64); + + let chunks = n / 4; + let remainder = n % 4; + + for i in 0..chunks { + let offset = i * 4; + let targets_ptr = targets.as_ptr().add(offset) as *const __m256i; + let target_vec = _mm256_loadu_si256(targets_ptr); + + let xor_result = _mm256_xor_si256(query_vec, target_vec); + + // Popcount using nibble lookup + let lo = _mm256_and_si256(xor_result, low_mask); + let hi = _mm256_and_si256(_mm256_srli_epi16(xor_result, 4), low_mask); + let popcnt_lo = _mm256_shuffle_epi8(lookup, lo); + let popcnt_hi = _mm256_shuffle_epi8(lookup, hi); + let popcnt_bytes = 
_mm256_add_epi8(popcnt_lo, popcnt_hi); + let popcount = _mm256_sad_epu8(popcnt_bytes, _mm256_setzero_si256()); + + let results_ptr = results.as_mut_ptr().add(offset); + *results_ptr = _mm256_extract_epi32::<0>(popcount) as u32; + *results_ptr.add(1) = _mm256_extract_epi32::<2>(popcount) as u32; + *results_ptr.add(2) = _mm256_extract_epi32::<4>(popcount) as u32; + *results_ptr.add(3) = _mm256_extract_epi32::<6>(popcount) as u32; + } + + if remainder > 0 { + let offset = chunks * 4; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// Compute pairwise hamming distances for all pairs of hashes. +/// +/// Returns pairs where distance <= threshold (if provided). +/// +/// # Arguments +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `threshold` - Optional maximum distance to include in results +pub fn pairwise_hamming_distance( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let num_pairs = n * (n - 1) / 2; + let mut result = PairwiseResult::with_capacity(num_pairs.min(1_000_000)); + + for i in 0..n { + for j in (i + 1)..n { + let dist = hamming_u64(hashes[i], hashes[j]); + if dist <= threshold { + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Compute pairwise hamming distances in parallel using rayon + SIMD. +/// +/// Uses chunked parallelization for balanced workload distribution. 
+pub fn pairwise_hamming_distance_parallel( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let total_pairs = n * (n - 1) / 2; + + // For small datasets, use sequential to avoid thread overhead + if total_pairs < 10_000 { + return pairwise_hamming_distance(hashes, row_ids, Some(threshold)); + } + + let threads = rayon::current_num_threads(); + let pairs_per_chunk = total_pairs.div_ceil(threads); + let chunks = compute_balanced_chunks(n, pairs_per_chunk); + + let results: Vec = chunks + .into_par_iter() + .map(|(start_row, end_row)| { + process_row_range(hashes, row_ids, threshold, start_row, end_row) + }) + .collect(); + + let mut combined = PairwiseResult::new(); + for r in results { + combined.extend(r); + } + combined +} + +/// Compute balanced chunks for parallel processing. +fn compute_balanced_chunks(n: usize, target_pairs_per_chunk: usize) -> Vec<(usize, usize)> { + let mut chunks = Vec::new(); + let mut current_start = 0; + let mut current_pairs = 0; + + for i in 0..n { + let pairs_for_row = n - i - 1; + current_pairs += pairs_for_row; + + if current_pairs >= target_pairs_per_chunk || i == n - 1 { + chunks.push((current_start, i + 1)); + current_start = i + 1; + current_pairs = 0; + } + } + + chunks +} + +/// Process a range of rows for pairwise comparison using SIMD. 
+fn process_row_range( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: u32, + start_row: usize, + end_row: usize, +) -> PairwiseResult { + let n = hashes.len(); + let mut result = PairwiseResult::new(); + + for i in start_row..end_row { + let remaining = n - i - 1; + if remaining == 0 { + continue; + } + + let mut distances = vec![0u32; remaining]; + hamming_batch_u64(hashes[i], &hashes[i + 1..], &mut distances); + + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + for (j_offset, &dist) in distances.iter().enumerate() { + if dist <= threshold { + let j = i + 1 + j_offset; + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Extract u64 hashes from a FixedSizeList Arrow array. +pub fn extract_hashes_from_fixed_list(array: &FixedSizeListArray) -> Result> { + let list_size = array.value_length(); + if list_size != 8 { + return Err(Error::InvalidArgumentError(format!( + "Expected FixedSizeList with size 8, got size {}", + list_size + ))); + } + + let values = array + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::InvalidArgumentError("Expected UInt8Array values in FixedSizeList".to_string()) + })?; + + let n = array.len(); + let mut hashes = Vec::with_capacity(n); + + for i in 0..n { + let start = i * 8; + let bytes = &values.values()[start..start + 8]; + let mut arr = [0u8; 8]; + arr.copy_from_slice(bytes); + hashes.push(u64::from_le_bytes(arr)); + } + + Ok(hashes) +} + +/// Union-Find data structure with path compression for clustering. +pub struct UnionFind { + parent: HashMap, + rank: HashMap, +} + +impl UnionFind { + pub fn new() -> Self { + Self { + parent: HashMap::new(), + rank: HashMap::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + parent: HashMap::with_capacity(capacity), + rank: HashMap::with_capacity(capacity), + } + } + + /// Find the root of a node with path compression. 
+ pub fn find(&mut self, x: u64) -> u64 { + if let std::collections::hash_map::Entry::Vacant(e) = self.parent.entry(x) { + e.insert(x); + self.rank.insert(x, 0); + return x; + } + + let mut current = x; + let mut path = Vec::new(); + + while self.parent[¤t] != current { + path.push(current); + current = self.parent[¤t]; + } + let root = current; + + for node in path { + self.parent.insert(node, root); + } + + root + } + + /// Union two nodes, using union by rank. + pub fn union(&mut self, a: u64, b: u64) -> bool { + let root_a = self.find(a); + let root_b = self.find(b); + + if root_a == root_b { + return false; + } + + let rank_a = self.rank[&root_a]; + let rank_b = self.rank[&root_b]; + + if rank_a < rank_b { + self.parent.insert(root_a, root_b); + } else if rank_a > rank_b { + self.parent.insert(root_b, root_a); + } else if root_a < root_b { + self.parent.insert(root_b, root_a); + *self.rank.get_mut(&root_a).unwrap() += 1; + } else { + self.parent.insert(root_a, root_b); + *self.rank.get_mut(&root_b).unwrap() += 1; + } + + true + } + + pub fn nodes(&self) -> impl Iterator { + self.parent.keys() + } + + pub fn len(&self) -> usize { + self.parent.len() + } + + pub fn is_empty(&self) -> bool { + self.parent.is_empty() + } +} + +impl Default for UnionFind { + fn default() -> Self { + Self::new() + } +} + +/// A cluster with representative and duplicates. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Cluster { + /// The representative row ID (smallest in the cluster). + pub representative: u64, + /// List of duplicate row IDs (excludes the representative). + pub duplicates: Vec, +} + +impl Cluster { + pub fn size(&self) -> usize { + 1 + self.duplicates.len() + } +} + +/// Result of the clustering operation. +#[derive(Debug, Clone)] +pub struct ClusteringResult { + /// List of clusters, each with a representative and duplicates. 
+ pub clusters: Vec, +} + +impl ClusteringResult { + pub fn num_clusters(&self) -> usize { + self.clusters.len() + } + + pub fn num_duplicates(&self) -> usize { + self.clusters.iter().map(|c| c.duplicates.len()).sum() + } + + pub fn num_unique(&self) -> usize { + self.clusters.len() + } + + /// Get the schema for clustering result batches. + pub fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("representative", DataType::UInt64, false), + Field::new( + "duplicates", + DataType::List(Arc::new(Field::new("item", DataType::UInt64, true))), + false, + ), + ])) + } + + /// Convert to Arrow RecordBatch with columns: + /// - `representative`: `UInt64` + /// - `duplicates`: `List` + pub fn to_record_batch(&self) -> RecordBatch { + let schema = Self::schema(); + + let mut representatives = Vec::with_capacity(self.clusters.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in &self.clusters { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(schema, vec![representative_array, duplicates_array]) + .expect("Failed to create RecordBatch") + } + + /// Convert to a RecordBatchReader that yields batches of the specified size. 
+ /// + /// # Arguments + /// * `batch_size` - Number of clusters per batch (default: 10000) + pub fn into_reader(self, batch_size: Option) -> Box { + let batch_size = batch_size.unwrap_or(10_000); + let schema = Self::schema(); + + if self.clusters.is_empty() { + // Return empty reader + let batches: Vec> = vec![]; + return Box::new(RecordBatchIterator::new(batches, schema)); + } + + let batches: Vec> = self + .clusters + .chunks(batch_size) + .map(|chunk| { + let mut representatives = Vec::with_capacity(chunk.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in chunk { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(Self::schema(), vec![representative_array, duplicates_array]) + }) + .collect(); + + Box::new(RecordBatchIterator::new(batches, schema)) + } +} + +/// Cluster edges using union-find algorithm. +/// +/// Takes a list of edges (row_id_a, row_id_b) and groups connected nodes +/// into clusters. Each cluster has a representative (smallest row ID) +/// and a list of duplicates. 
+pub fn cluster_edges(edges: I) -> ClusteringResult +where + I: IntoIterator, +{ + let mut uf = UnionFind::new(); + + for (a, b) in edges { + uf.union(a, b); + } + + let mut clusters_map: HashMap> = HashMap::new(); + let nodes: Vec = uf.nodes().copied().collect(); + + for node in nodes { + let root = uf.find(node); + clusters_map.entry(root).or_default().push(node); + } + + let mut clusters = Vec::new(); + for (_root, mut members) in clusters_map { + members.sort_unstable(); + + if members.len() > 1 { + let representative = *members.iter().min().unwrap(); + let duplicates: Vec = members + .into_iter() + .filter(|&m| m != representative) + .collect(); + + clusters.push(Cluster { + representative, + duplicates, + }); + } + } + + clusters.sort_by_key(|c| c.representative); + + ClusteringResult { clusters } +} + +/// Cluster edges from PairwiseResult. +pub fn cluster_pairwise_result(result: &PairwiseResult) -> ClusteringResult { + let edges = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .map(|(&a, &b)| (a, b)); + + cluster_edges(edges) +} + #[cfg(test)] mod tests { use super::*; @@ -102,4 +746,677 @@ mod tests { let y = vec![0b1101_1010, 0b1010_1010, 0b1010_1001]; assert_eq!(hamming(&x, &y), 2.0); } + + #[test] + fn test_hamming_u64() { + assert_eq!(hamming_u64(0, 0), 0); + assert_eq!(hamming_u64(0, 1), 1); + assert_eq!(hamming_u64(0b1111, 0b0000), 4); + assert_eq!(hamming_u64(u64::MAX, 0), 64); + assert_eq!(hamming_u64(0xAAAAAAAAAAAAAAAA, 0x5555555555555555), 64); + } + + #[test] + fn test_hamming_batch_u64() { + let query = 0u64; + let targets: Vec = (0..128).collect(); + let mut results = vec![0u32; 128]; + + hamming_batch_u64(query, &targets, &mut results); + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[3], 2); // 0b11 has 2 bits set + assert_eq!(results[7], 3); // 0b111 has 3 bits set + } + + #[test] + fn test_pairwise_basic() { + let hashes = vec![0b0000u64, 0b0001, 0b0011, 0b0111]; + let result = 
pairwise_hamming_distance(&hashes, None, None); + + assert_eq!(result.len(), 6); // C(4,2) = 6 pairs + assert!(result.distances.iter().all(|&d| d <= 3)); + } + + #[test] + fn test_pairwise_with_threshold() { + let hashes = vec![0b0000u64, 0b0001, 0b1111]; + let result = pairwise_hamming_distance(&hashes, None, Some(1)); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 0); + assert_eq!(result.row_id_b[0], 1); + assert_eq!(result.distances[0], 1); + } + + #[test] + fn test_pairwise_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001]; + let row_ids = vec![100u64, 200u64]; + let result = pairwise_hamming_distance(&hashes, Some(&row_ids), None); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 100); + assert_eq!(result.row_id_b[0], 200); + } + + #[test] + fn test_pairwise_parallel() { + let hashes: Vec = (0..100).collect(); + let result_seq = pairwise_hamming_distance(&hashes, None, None); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, None); + + assert_eq!(result_seq.len(), result_par.len()); + } + + #[test] + fn test_union_find_basic() { + let mut uf = UnionFind::new(); + + assert_eq!(uf.find(1), 1); + assert_eq!(uf.find(2), 2); + assert_eq!(uf.find(3), 3); + + assert!(uf.union(1, 2)); + assert_eq!(uf.find(1), uf.find(2)); + + assert!(uf.union(2, 3)); + assert_eq!(uf.find(1), uf.find(3)); + + assert!(!uf.union(1, 3)); + } + + #[test] + fn test_cluster_edges_simple() { + let edges = vec![(1, 2), (2, 3), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 1) + .unwrap(); + assert_eq!(c1.duplicates.len(), 2); + assert!(c1.duplicates.contains(&2)); + assert!(c1.duplicates.contains(&3)); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 4) + .unwrap(); + assert_eq!(c2.duplicates.len(), 1); + assert!(c2.duplicates.contains(&5)); + } + + #[test] + fn test_cluster_pairwise_result() { + 
let hashes = vec![0b0000u64, 0b0001, 0b0011]; // distances: (0,1)=1, (0,2)=2, (1,2)=1 + let pairwise = pairwise_hamming_distance(&hashes, None, Some(1)); // threshold 1 + + // Only pairs with distance <= 1: (0,1) and (1,2) + assert_eq!(pairwise.len(), 2); + + let clustering = cluster_pairwise_result(&pairwise); + // All three should be in one cluster since 0-1-2 are connected + assert_eq!(clustering.num_clusters(), 1); + assert_eq!(clustering.clusters[0].representative, 0); + assert_eq!(clustering.clusters[0].duplicates.len(), 2); + } + + #[test] + fn test_into_record_batch() { + let hashes = vec![0b0000u64, 0b0001, 0b0011]; + let result = pairwise_hamming_distance(&hashes, None, None); + let batch = result.into_record_batch(); + + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.schema().field(0).name(), "row_id_a"); + assert_eq!(batch.schema().field(1).name(), "row_id_b"); + assert_eq!(batch.schema().field(2).name(), "distance"); + } + + // ========================================================================= + // Additional tests from pairwise-hamming reference implementation + // ========================================================================= + + /// Reference implementation for validation - simple O(n²) nested loop + fn reference_pairwise(hashes: &[u64], threshold: Option) -> Vec<(usize, usize, u32)> { + let threshold = threshold.unwrap_or(u32::MAX); + let mut results = Vec::new(); + for i in 0..hashes.len() { + for j in (i + 1)..hashes.len() { + let dist = (hashes[i] ^ hashes[j]).count_ones(); + if dist <= threshold { + results.push((i, j, dist)); + } + } + } + results + } + + /// Convert PairwiseResult to sorted vec for comparison + fn result_to_sorted_vec(result: &PairwiseResult) -> Vec<(u64, u64, u32)> { + let mut v: Vec<_> = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .zip(result.distances.iter()) + .map(|((&a, &b), &d)| (a, b, d)) + .collect(); + v.sort(); + v + } + + #[test] + fn 
test_pairwise_correctness_small() { + // Deterministic hashes with known distances + let hashes = vec![ + 0b0000_0000u64, // 0 + 0b0000_0001u64, // 1 bit from 0 + 0b0000_0011u64, // 2 bits from 0, 1 bit from 1 + 0b0000_0111u64, // 3 bits from 0, 2 bits from 1, 1 bit from 2 + 0b0000_1111u64, // 4 bits from 0, 3 bits from 1, 2 bits from 2, 1 bit from 3 + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + let reference = reference_pairwise(&hashes, None); + + assert_eq!(result.len(), reference.len()); + assert_eq!(result.len(), 10); // C(5,2) = 10 pairs + + // Verify specific distances + let result_vec = result_to_sorted_vec(&result); + for (i, j, expected_dist) in &reference { + let found = result_vec + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + assert!(found.is_some(), "Missing pair ({}, {})", i, j); + assert_eq!( + found.unwrap().2, + *expected_dist, + "Wrong distance for pair ({}, {})", + i, + j + ); + } + } + + #[test] + fn test_pairwise_correctness_1000_deterministic() { + // Generate deterministic hashes using simple linear pattern + let hashes: Vec = (0u64..1000) + .map(|i| i.wrapping_mul(0x123456789ABCDEF)) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(10)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(10)); + let reference = reference_pairwise(&hashes, Some(10)); + + // Both implementations should match reference + assert_eq!( + result_seq.len(), + reference.len(), + "Sequential result count mismatch" + ); + assert_eq!( + result_par.len(), + reference.len(), + "Parallel result count mismatch" + ); + + // Verify all pairs match + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + + for (i, j, dist) in &reference { + let seq_found = seq_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + let par_found = par_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as 
u64); + + assert!( + seq_found.is_some(), + "Sequential missing pair ({}, {})", + i, + j + ); + assert!(par_found.is_some(), "Parallel missing pair ({}, {})", i, j); + assert_eq!(seq_found.unwrap().2, *dist); + assert_eq!(par_found.unwrap().2, *dist); + } + } + + #[test] + fn test_pairwise_correctness_10000_deterministic() { + // Larger test with 10K hashes + let hashes: Vec = (0u64..10_000) + .map(|i| { + // Mix bits using a simple hash-like transformation + let x = i.wrapping_mul(0xDEADBEEFCAFEBABE); + x ^ (x >> 17) ^ (x << 13) + }) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(5)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(5)); + + // Both should find the same number of pairs + assert_eq!( + result_seq.len(), + result_par.len(), + "10K test: sequential found {} pairs, parallel found {} pairs", + result_seq.len(), + result_par.len() + ); + + // Verify they contain the same pairs (sorted comparison) + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + assert_eq!(seq_sorted, par_sorted, "10K test: pair contents differ"); + } + + #[test] + fn test_pairwise_total_pairs_count() { + // Without threshold, should return exactly n*(n-1)/2 pairs + for n in [10, 50, 100, 500] { + let hashes: Vec = (0..n).map(|i| i as u64).collect(); + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + let expected = n * (n - 1) / 2; + assert_eq!( + result.len(), + expected, + "n={}: expected {} pairs, got {}", + n, + expected, + result.len() + ); + } + } + + #[test] + fn test_pairwise_threshold_filtering() { + // All identical hashes should have distance 0 + let hashes = vec![0xABCDEF0123456789u64; 100]; + let result = pairwise_hamming_distance_parallel(&hashes, None, Some(0)); + + // All pairs should be included (distance 0) + assert_eq!(result.len(), 100 * 99 / 2); + assert!(result.distances.iter().all(|&d| d == 0)); + + // With threshold 0 and 
all different hashes, should find fewer pairs + let different_hashes: Vec = (0u64..100).collect(); + let result2 = pairwise_hamming_distance_parallel(&different_hashes, None, Some(0)); + // Only pairs with identical values should match (none in this case except 0^0) + assert!(result2.len() < 100 * 99 / 2); + } + + #[test] + fn test_pairwise_row_ids_preserved() { + let hashes: Vec = (0u64..100).collect(); + let row_ids: Vec = (1000u64..1100).collect(); // offset row IDs + + let result = pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(5)); + + // All row IDs should be in range [1000, 1100) + for &id in &result.row_id_a { + assert!((1000..1100).contains(&id), "row_id_a {} out of range", id); + } + for &id in &result.row_id_b { + assert!((1000..1100).contains(&id), "row_id_b {} out of range", id); + } + // row_id_a should always be less than row_id_b (upper triangular) + for (&a, &b) in result.row_id_a.iter().zip(result.row_id_b.iter()) { + assert!(a < b, "Expected row_id_a < row_id_b, got {} >= {}", a, b); + } + } + + #[test] + fn test_pairwise_distance_bounds() { + // All distances should be in [0, 64] for u64 hashes + let hashes: Vec = (0u64..1000).map(|i| i.wrapping_mul(0x123456789)).collect(); + + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + + for &d in &result.distances { + assert!(d <= 64, "Distance {} exceeds maximum 64", d); + } + } + + #[test] + fn test_pairwise_symmetry() { + // Hamming distance is symmetric: d(a,b) = d(b,a) + let hashes: Vec = vec![ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xAAAAAAAAAAAAAAAA, + 0x5555555555555555, + 0x123456789ABCDEF0, + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + + // For each pair (i,j), verify distance matches manual calculation + for idx in 0..result.len() { + let i = result.row_id_a[idx] as usize; + let j = result.row_id_b[idx] as usize; + let dist = result.distances[idx]; + + let expected = (hashes[i] ^ hashes[j]).count_ones(); + 
assert_eq!(dist, expected, "Distance mismatch for pair ({}, {})", i, j); + } + } + + #[test] + fn test_balanced_chunks() { + // Verify chunks are reasonably balanced + let n = 10000; + let total_pairs = n * (n - 1) / 2; + let target_per_chunk = total_pairs / 16; + + let chunks = compute_balanced_chunks(n, target_per_chunk); + + // Should have roughly 16 chunks + assert!( + chunks.len() >= 14 && chunks.len() <= 18, + "Expected ~16 chunks, got {}", + chunks.len() + ); + + // Each chunk should have roughly equal work + for (start, end) in &chunks { + let mut chunk_pairs = 0usize; + for i in *start..*end { + chunk_pairs += n - i - 1; + } + // Allow 20% deviation from target + let lower = target_per_chunk * 80 / 100; + // last chunk may be smaller + assert!( + chunk_pairs >= lower || *end == n, + "Chunk [{}, {}) has {} pairs, expected ~{}", + start, + end, + chunk_pairs, + target_per_chunk + ); + } + + // Chunks should cover all rows without gaps + assert_eq!(chunks[0].0, 0); + assert_eq!(chunks.last().unwrap().1, n); + for i in 1..chunks.len() { + assert_eq!(chunks[i].0, chunks[i - 1].1, "Gap between chunks"); + } + } + + // ========================================================================= + // SIMD-specific tests + // ========================================================================= + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_popcount() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); // 0 ^ 0 = 0 bits + assert_eq!(results[1], 1); // 0 ^ 1 = 1 bit + assert_eq!(results[2], 2); // 0 ^ 3 = 2 bits + assert_eq!(results[3], 3); // 0 ^ 7 = 3 bits + assert_eq!(results[4], 4); // 0 ^ 15 = 4 bits + assert_eq!(results[5], 5); // 0 ^ 31 = 5 bits + assert_eq!(results[6], 6); // 0 ^ 63 = 6 bits + assert_eq!(results[7], 7); // 0 ^ 127 
= 7 bits + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_max_distance() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![u64::MAX; 4]; + let mut results = vec![0u32; 4]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + for &r in &results { + assert_eq!(r, 64); + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx512_popcount() { + if !is_x86_feature_detected!("avx512vpopcntdq") || !is_x86_feature_detected!("avx512f") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx512(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[2], 2); + assert_eq!(results[3], 3); + assert_eq!(results[4], 4); + assert_eq!(results[5], 5); + assert_eq!(results[6], 6); + assert_eq!(results[7], 7); + } + + // ========================================================================= + // Additional clustering tests + // ========================================================================= + + #[test] + fn test_union_find_path_compression() { + let mut uf = UnionFind::new(); + + // Create a chain: 1 -> 2 -> 3 -> 4 -> 5 + uf.union(4, 5); + uf.union(3, 4); + uf.union(2, 3); + uf.union(1, 2); + + // All should have the same root + let root = uf.find(1); + assert_eq!(uf.find(2), root); + assert_eq!(uf.find(3), root); + assert_eq!(uf.find(4), root); + assert_eq!(uf.find(5), root); + } + + #[test] + fn test_cluster_edges_single_cluster() { + // All connected: 1-2-3-4-5 + let edges = vec![(1, 2), (2, 3), (3, 4), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + let cluster = &result.clusters[0]; + assert_eq!(cluster.representative, 1); + assert_eq!(cluster.duplicates.len(), 4); + assert_eq!(cluster.size(), 5); + } + + #[test] + fn test_cluster_edges_no_duplicates() { + // No edges 
means no clusters + let edges: Vec<(u64, u64)> = vec![]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 0); + assert_eq!(result.num_duplicates(), 0); + } + + #[test] + fn test_cluster_edges_self_loop() { + // Self-loop shouldn't create a cluster (size 1) + let edges = vec![(1, 1), (2, 3)]; + let result = cluster_edges(edges); + + // Only {2,3} should be a cluster + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 2); + } + + #[test] + fn test_cluster_edges_duplicate_edges() { + // Duplicate edges should be handled correctly + let edges = vec![(1, 2), (1, 2), (2, 3), (2, 3), (3, 1)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].size(), 3); + } + + #[test] + fn test_cluster_edges_large() { + // Create 100 clusters of size 10 each + let mut edges = Vec::new(); + for cluster_id in 0..100u64 { + let base = cluster_id * 10; + for i in 0..9 { + edges.push((base + i, base + i + 1)); + } + } + + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 100); + for cluster in &result.clusters { + assert_eq!(cluster.size(), 10); + assert_eq!(cluster.duplicates.len(), 9); + } + } + + #[test] + fn test_cluster_edges_random_order() { + // Same edges in different order should produce same result + let edges1 = vec![(1, 2), (2, 3), (4, 5), (3, 4)]; + let edges2 = vec![(4, 5), (1, 2), (3, 4), (2, 3)]; + let edges3 = vec![(3, 4), (4, 5), (2, 3), (1, 2)]; + + let r1 = cluster_edges(edges1); + let r2 = cluster_edges(edges2); + let r3 = cluster_edges(edges3); + + // All should produce the same single cluster + assert_eq!(r1.num_clusters(), 1); + assert_eq!(r2.num_clusters(), 1); + assert_eq!(r3.num_clusters(), 1); + + assert_eq!(r1.clusters[0].representative, 1); + assert_eq!(r2.clusters[0].representative, 1); + assert_eq!(r3.clusters[0].representative, 1); + + assert_eq!(r1.clusters[0].size(), 5); + assert_eq!(r2.clusters[0].size(), 5); + 
assert_eq!(r3.clusters[0].size(), 5); + } + + #[test] + fn test_cluster_edges_non_contiguous_ids() { + // Row IDs don't need to be contiguous + let edges = vec![(100, 200), (200, 500), (1000, 2000)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 100) + .unwrap(); + assert_eq!(c1.duplicates, vec![200, 500]); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 1000) + .unwrap(); + assert_eq!(c2.duplicates, vec![2000]); + } + + #[test] + fn test_cluster_representative_is_minimum() { + // Representative should always be the minimum row ID in cluster + let edges = vec![ + (5, 3), + (3, 7), + (7, 1), // 1 is minimum + (100, 50), + (50, 75), // 50 is minimum + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&7)) + .unwrap(); + assert_eq!(c1.representative, 1); + + let c2 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&100)) + .unwrap(); + assert_eq!(c2.representative, 50); + } + + #[test] + fn test_cluster_duplicates_sorted() { + // Duplicates should be sorted + let edges = vec![(1, 5), (1, 3), (1, 7), (1, 2)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 1); + assert_eq!(result.clusters[0].duplicates, vec![2, 3, 5, 7]); + } + + #[test] + fn test_clustering_result_stats() { + let edges = vec![ + (1, 2), + (2, 3), // cluster of 3 + (10, 20), + (20, 30), + (30, 40), // cluster of 4 + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + assert_eq!(result.num_duplicates(), 5); // 2 + 3 + assert_eq!(result.num_unique(), 2); + } } diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs index 9aa5de6b9c5..c47aedd749f 100644 --- a/rust/lance-linalg/src/distance/l2.rs +++ 
b/rust/lance-linalg/src/distance/l2.rs @@ -18,6 +18,7 @@ use arrow_schema::DataType; use half::{bf16, f16}; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray}; use lance_core::assume_eq; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::cpu::SIMD_SUPPORT; #[cfg(feature = "fp16kernels")] use lance_core::utils::cpu::SimdSupport; @@ -151,7 +152,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn l2_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn l2_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn l2_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; @@ -172,7 +173,7 @@ impl L2 for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -204,7 +205,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn l2_f16_neon(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn l2_f16_avx512(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn l2_f16_avx2(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; @@ -225,7 +226,7 @@ impl L2 for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -331,7 +332,7 @@ fn accumulate_l2_dimension(q: f32, row: &[f32], result: &mut [f32]) { /// sub-vector codebooks (e.g. 256 centroids × 16 dims = 16 KB). 
/// For large target sets the SoA layout causes L1 thrashing and /// [`l2_distance_batch`] with its AoS per-target locality is faster. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, DeepSizeOf)] pub struct L2Prepared { transposed: Vec, dimension: usize, diff --git a/rust/lance-linalg/src/distance/norm_l2.rs b/rust/lance-linalg/src/distance/norm_l2.rs index cd604ac8c05..b1daf85ab3b 100644 --- a/rust/lance-linalg/src/distance/norm_l2.rs +++ b/rust/lance-linalg/src/distance/norm_l2.rs @@ -29,7 +29,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn norm_l2_f16_neon(ptr: *const f16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn norm_l2_f16_avx512(ptr: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn norm_l2_f16_avx2(ptr: *const f16, len: u32) -> f32; @@ -57,7 +57,7 @@ impl Normalize for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -87,7 +87,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn norm_l2_bf16_neon(ptr: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn norm_l2_bf16_avx512(ptr: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn norm_l2_bf16_avx2(ptr: *const bf16, len: u32) -> f32; @@ -108,7 +108,7 @@ impl Normalize for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/simd/dist_table.c b/rust/lance-linalg/src/simd/dist_table.c index 9e7fc2b2205..e8be8e52068 100644 --- a/rust/lance-linalg/src/simd/dist_table.c +++ b/rust/lance-linalg/src/simd/dist_table.c 
@@ -23,8 +23,15 @@ void sum_4bit_dist_table_32bytes_batch_avx512(const uint8_t *codes, __m512i accu3 = _mm512_setzero_si512(); for (size_t i = 0; i < code_length; i += 64) { - c = _mm512_loadu_si512(&codes[i]); - lut = _mm512_loadu_si512(&dist_table[i]); + const size_t remaining = code_length - i; + if (remaining >= 64) { + c = _mm512_loadu_si512(&codes[i]); + lut = _mm512_loadu_si512(&dist_table[i]); + } else { + const __mmask64 load_mask = (UINT64_C(1) << remaining) - 1; + c = _mm512_maskz_loadu_epi8(load_mask, &codes[i]); + lut = _mm512_maskz_loadu_epi8(load_mask, &dist_table[i]); + } lo = _mm512_and_si512(c, lo_mask); hi = _mm512_and_si512(_mm512_srli_epi16(c, 4), lo_mask); @@ -50,4 +57,4 @@ void sum_4bit_dist_table_32bytes_batch_avx512(const uint8_t *codes, ret = _mm512_add_epi16(ret, _mm512_shuffle_i64x2(ret1, ret2, 0xDD)); _mm512_storeu_si512(dists, ret); -} \ No newline at end of file +} diff --git a/rust/lance-linalg/src/simd/dist_table.rs b/rust/lance-linalg/src/simd/dist_table.rs index 66c30c75427..626c1581b15 100644 --- a/rust/lance-linalg/src/simd/dist_table.rs +++ b/rust/lance-linalg/src/simd/dist_table.rs @@ -10,7 +10,7 @@ use std::arch::x86_64::*; use lance_core::utils::cpu::{SIMD_SUPPORT, SimdSupport}; pub const PERM0: [usize; 16] = [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]; -pub const PERM0_INVERSE: [usize; 16] = [0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15]; +pub const PERM0_INVERSE: [usize; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15]; pub const BATCH_SIZE: usize = 32; // This function is used to sum the distance table for 4-bit codes. 
@@ -37,18 +37,22 @@ pub fn sum_4bit_dist_table( debug_assert!(n.is_multiple_of(BATCH_SIZE)); match *SIMD_SUPPORT { - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] - SimdSupport::Avx512 | SimdSupport::Avx512FP16 => unsafe { + #[cfg(all(kernel_support = "avx512_dist_table", target_arch = "x86_64"))] + SimdSupport::Avx512 | SimdSupport::Avx512FP16 + if std::arch::is_x86_feature_detected!("avx512bw") => + { for i in (0..n).step_by(BATCH_SIZE) { let codes = &codes[i * code_len..(i + BATCH_SIZE) * code_len]; - sum_4bit_dist_table_32bytes_batch_avx512( - codes.as_ptr(), - codes.len(), - dist_table.as_ptr(), - dists[i..i + BATCH_SIZE].as_mut_ptr(), - ) + unsafe { + sum_4bit_dist_table_32bytes_batch_avx512( + codes.as_ptr(), + codes.len(), + dist_table.as_ptr(), + dists[i..i + BATCH_SIZE].as_mut_ptr(), + ) + } } - }, + } #[cfg(target_arch = "x86_64")] SimdSupport::Avx2 => unsafe { for i in (0..n).step_by(BATCH_SIZE) { @@ -109,6 +113,274 @@ pub fn sum_4bit_dist_table_scalar( } } +#[inline] +#[allow(unused)] +pub fn sum_4bit_dist_table_u16( + n: usize, + code_len: usize, + codes: &[u8], + dist_table: &[u16], + dists: &mut [u32], +) { + debug_assert!(n.is_multiple_of(BATCH_SIZE)); + debug_assert!(dists.len() >= n); + debug_assert!(codes.len() >= n * code_len); + sum_4bit_dist_table_u16_scalar( + code_len, + &codes[..n * code_len], + dist_table, + &mut dists[..n], + ); +} + +#[inline] +pub fn transfer_4bit_dist_table_u16(dist_table: &[u16], hacc_dist_table: &mut Vec) { + debug_assert!(dist_table.len().is_multiple_of(32)); + + let num_tables = dist_table.len() / 16; + hacc_dist_table.clear(); + hacc_dist_table.resize(dist_table.len() * 2, 0); + + for table_idx in 0..num_tables { + let table = &dist_table[table_idx * 16..(table_idx + 1) * 16]; + let low_offset = (table_idx / 2) * 64 + (table_idx % 2) * 16; + let high_offset = low_offset + 32; + for (code, value) in table.iter().enumerate() { + hacc_dist_table[low_offset + code] = *value as u8; + 
hacc_dist_table[high_offset + code] = (value >> 8) as u8; + } + } +} + +#[inline] +pub fn sum_4bit_hacc_dist_table( + n: usize, + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + debug_assert!(n.is_multiple_of(BATCH_SIZE)); + debug_assert!(dists.len() >= n); + debug_assert!(codes.len() >= n * code_len); + debug_assert!(hacc_dist_table.len() >= code_len * 64); + + match *SIMD_SUPPORT { + #[cfg(target_arch = "x86_64")] + SimdSupport::Avx512 | SimdSupport::Avx512FP16 | SimdSupport::Avx2 + if std::arch::is_x86_feature_detected!("avx2") => + { + sum_4bit_hacc_dist_table_avx2(n, code_len, codes, hacc_dist_table, dists); + } + _ => sum_4bit_hacc_dist_table_scalar(code_len, codes, hacc_dist_table, dists), + } +} + +#[inline] +#[allow(unused)] +pub fn sum_4bit_hacc_dist_table_scalar( + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + let num_full_vectors = codes.len() / (BATCH_SIZE * code_len) * BATCH_SIZE; + dists[..num_full_vectors].fill(0); + + for (vec_block_idx, blocks) in codes.chunks_exact(BATCH_SIZE * code_len).enumerate() { + for (sub_vec_idx, block) in blocks.chunks_exact(BATCH_SIZE).enumerate() { + let table_offset = sub_vec_idx * 64; + let current_low = &hacc_dist_table[table_offset..table_offset + 16]; + let next_low = &hacc_dist_table[table_offset + 16..table_offset + 32]; + let current_high = &hacc_dist_table[table_offset + 32..table_offset + 48]; + let next_high = &hacc_dist_table[table_offset + 48..table_offset + 64]; + + for j in 0..16 { + let low_current_code = (block[j] & 0x0F) as usize; + let high_current_code = (block[j] >> 4) as usize; + let low_next_code = (block[j + 16] & 0x0F) as usize; + let high_next_code = (block[j + 16] >> 4) as usize; + + let lower_id = vec_block_idx * BATCH_SIZE + PERM0[j]; + let higher_id = lower_id + 16; + dists[lower_id] += ((current_high[low_current_code] as u32) << 8) + + current_low[low_current_code] as u32 + + ((next_high[low_next_code] as u32) 
<< 8) + + next_low[low_next_code] as u32; + dists[higher_id] += ((current_high[high_current_code] as u32) << 8) + + current_low[high_current_code] as u32 + + ((next_high[high_next_code] as u32) << 8) + + next_low[high_next_code] as u32; + } + } + } +} + +#[inline] +#[allow(unused)] +pub fn sum_4bit_dist_table_u16_scalar( + code_len: usize, + codes: &[u8], + dist_table: &[u16], + dists: &mut [u32], +) { + let num_full_vectors = codes.len() / (BATCH_SIZE * code_len) * BATCH_SIZE; + dists[..num_full_vectors].fill(0); + + for (vec_block_idx, blocks) in codes.chunks_exact(BATCH_SIZE * code_len).enumerate() { + for (sub_vec_idx, block) in blocks.chunks_exact(BATCH_SIZE).enumerate() { + let current_dist_table = &dist_table[sub_vec_idx * 2 * 16..(sub_vec_idx * 2 + 1) * 16]; + let next_dist_table = + &dist_table[(sub_vec_idx * 2 + 1) * 16..(sub_vec_idx * 2 + 2) * 16]; + + for j in 0..16 { + let low_current_code = (block[j] & 0x0F) as usize; + let high_current_code = (block[j] >> 4) as usize; + let low_next_code = (block[j + 16] & 0x0F) as usize; + let high_next_code = (block[j + 16] >> 4) as usize; + + let lower_id = vec_block_idx * BATCH_SIZE + PERM0[j]; + let higher_id = lower_id + 16; + dists[lower_id] += current_dist_table[low_current_code] as u32 + + next_dist_table[low_next_code] as u32; + dists[higher_id] += current_dist_table[high_current_code] as u32 + + next_dist_table[high_next_code] as u32; + } + } + } +} + +#[cfg(target_arch = "x86_64")] +#[inline] +fn sum_4bit_hacc_dist_table_avx2( + n: usize, + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + const SAFE_CODE_LEN: usize = 128; + + for i in (0..n).step_by(BATCH_SIZE) { + let batch_codes = &codes[i * code_len..(i + BATCH_SIZE) * code_len]; + let batch_dists = &mut dists[i..i + BATCH_SIZE]; + batch_dists.fill(0); + + for code_start in (0..code_len).step_by(SAFE_CODE_LEN) { + let code_end = (code_start + SAFE_CODE_LEN).min(code_len); + let code_range = code_start * 
BATCH_SIZE..code_end * BATCH_SIZE; + let table_range = code_start * 64..code_end * 64; + if code_start == 0 && code_end == code_len { + unsafe { + sum_hacc_dist_table_32bytes_batch_avx2( + &batch_codes[code_range], + &hacc_dist_table[table_range], + batch_dists, + ); + } + } else { + let mut chunk_dists = [0u32; BATCH_SIZE]; + unsafe { + sum_hacc_dist_table_32bytes_batch_avx2( + &batch_codes[code_range], + &hacc_dist_table[table_range], + &mut chunk_dists, + ); + } + batch_dists + .iter_mut() + .zip(chunk_dists.iter()) + .for_each(|(dist, chunk_dist)| *dist += *chunk_dist); + } + } + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +#[inline] +#[allow(unused)] +unsafe fn sum_hacc_dist_table_32bytes_batch_avx2( + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + let low_mask = _mm256_set1_epi8(0x0f); + let mut low_accu0 = _mm256_setzero_si256(); + let mut low_accu1 = _mm256_setzero_si256(); + let mut low_accu2 = _mm256_setzero_si256(); + let mut low_accu3 = _mm256_setzero_si256(); + let mut high_accu0 = _mm256_setzero_si256(); + let mut high_accu1 = _mm256_setzero_si256(); + let mut high_accu2 = _mm256_setzero_si256(); + let mut high_accu3 = _mm256_setzero_si256(); + + for code_offset in (0..codes.len()).step_by(BATCH_SIZE) { + let table_offset = code_offset * 2; + let c = _mm256_loadu_si256(codes.as_ptr().add(code_offset) as *const __m256i); + let lo = _mm256_and_si256(c, low_mask); + let hi = _mm256_and_si256(_mm256_srli_epi16(c, 4), low_mask); + + let low_lut = + _mm256_loadu_si256(hacc_dist_table.as_ptr().add(table_offset) as *const __m256i); + let low_res_lo = _mm256_shuffle_epi8(low_lut, lo); + let low_res_hi = _mm256_shuffle_epi8(low_lut, hi); + low_accu0 = _mm256_add_epi16(low_accu0, low_res_lo); + low_accu1 = _mm256_add_epi16(low_accu1, _mm256_srli_epi16(low_res_lo, 8)); + low_accu2 = _mm256_add_epi16(low_accu2, low_res_hi); + low_accu3 = _mm256_add_epi16(low_accu3, _mm256_srli_epi16(low_res_hi, 8)); + + let 
high_lut = + _mm256_loadu_si256(hacc_dist_table.as_ptr().add(table_offset + 32) as *const __m256i); + let high_res_lo = _mm256_shuffle_epi8(high_lut, lo); + let high_res_hi = _mm256_shuffle_epi8(high_lut, hi); + high_accu0 = _mm256_add_epi16(high_accu0, high_res_lo); + high_accu1 = _mm256_add_epi16(high_accu1, _mm256_srli_epi16(high_res_lo, 8)); + high_accu2 = _mm256_add_epi16(high_accu2, high_res_hi); + high_accu3 = _mm256_add_epi16(high_accu3, _mm256_srli_epi16(high_res_hi, 8)); + } + + low_accu0 = _mm256_sub_epi16(low_accu0, _mm256_slli_epi16(low_accu1, 8)); + let low_dis0 = _mm256_add_epi16( + _mm256_permute2f128_si256(low_accu0, low_accu1, 0x21), + _mm256_blend_epi32(low_accu0, low_accu1, 0xF0), + ); + low_accu2 = _mm256_sub_epi16(low_accu2, _mm256_slli_epi16(low_accu3, 8)); + let low_dis1 = _mm256_add_epi16( + _mm256_permute2f128_si256(low_accu2, low_accu3, 0x21), + _mm256_blend_epi32(low_accu2, low_accu3, 0xF0), + ); + + high_accu0 = _mm256_sub_epi16(high_accu0, _mm256_slli_epi16(high_accu1, 8)); + let high_dis0 = _mm256_add_epi16( + _mm256_permute2f128_si256(high_accu0, high_accu1, 0x21), + _mm256_blend_epi32(high_accu0, high_accu1, 0xF0), + ); + high_accu2 = _mm256_sub_epi16(high_accu2, _mm256_slli_epi16(high_accu3, 8)); + let high_dis1 = _mm256_add_epi16( + _mm256_permute2f128_si256(high_accu2, high_accu3, 0x21), + _mm256_blend_epi32(high_accu2, high_accu3, 0xF0), + ); + + let low0 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(low_dis0)); + let low1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(low_dis0, 1)); + let high0 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(high_dis0)); + let high1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(high_dis0, 1)); + let res0 = _mm256_add_epi32(low0, _mm256_slli_epi32(high0, 8)); + let res1 = _mm256_add_epi32(low1, _mm256_slli_epi32(high1, 8)); + _mm256_storeu_si256(dists.as_mut_ptr() as *mut __m256i, res0); + _mm256_storeu_si256(dists.as_mut_ptr().add(8) as *mut __m256i, res1); + + let low2 = 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(low_dis1)); + let low3 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(low_dis1, 1)); + let high2 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(high_dis1)); + let high3 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(high_dis1, 1)); + let res2 = _mm256_add_epi32(low2, _mm256_slli_epi32(high2, 8)); + let res3 = _mm256_add_epi32(low3, _mm256_slli_epi32(high3, 8)); + _mm256_storeu_si256(dists.as_mut_ptr().add(16) as *mut __m256i, res2); + _mm256_storeu_si256(dists.as_mut_ptr().add(24) as *mut __m256i, res3); +} + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] #[inline] @@ -145,6 +417,10 @@ unsafe fn sum_dist_table_32bytes_batch_avx2(codes: &[u8], dist_table: &[u8], dis accu2 = _mm256_add_epi16(accu2, res_hi); accu3 = _mm256_add_epi16(accu3, _mm256_srli_epi16(res_hi, 8)); + if i + 32 >= codes.len() { + continue; + } + // load the left 32 bytes of codes and lut c = _mm256_loadu_si256(codes.as_ptr().add(i + 32) as *const __m256i); lut_vec = _mm256_loadu_si256(dist_table.as_ptr().add(i + 32) as *const __m256i); @@ -253,7 +529,7 @@ unsafe fn sum_dist_table_32bytes_batch_neon(codes: &[u8], dist_table: &[u8], dis // We implement the AVX512 version in C because AVX512 is not stable yet in Rust, // implement it in Rust once we upgrade rust to 1.89.0. 
unsafe extern "C" { - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_dist_table", target_arch = "x86_64"))] pub fn sum_4bit_dist_table_32bytes_batch_avx512( codes: *const u8, code_length: usize, @@ -266,6 +542,13 @@ unsafe extern "C" { mod tests { use super::*; + #[test] + fn test_perm0_inverse_matches_perm0() { + for (idx, &value) in PERM0.iter().enumerate() { + assert_eq!(PERM0_INVERSE[value], idx); + } + } + #[test] fn test_sum_4bit_dist_table_basic() { // we have 32 vectors @@ -323,6 +606,101 @@ mod tests { assert!(actual.iter().all(|dist| *dist != u16::MAX)); } + #[test] + fn test_sum_4bit_dist_table_u16_basic() { + let n = BATCH_SIZE; + let code_len = 2; + let codes = [ + 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x12, 0x34, 0x56, 0x78, + 0x9a, 0xbc, 0xde, 0xf0, + ]; + let codes = codes.repeat(n * code_len / codes.len()); + let dist_table: Vec = (0..16 * 4).map(|idx| (idx % 16 + 1) as u16).collect(); + + let mut dists = vec![0u32; n]; + sum_4bit_dist_table_u16(n, code_len, &codes, &dist_table, &mut dists); + + assert_eq!(dists[1], 38); + } + + #[test] + fn test_transfer_4bit_dist_table_u16_layout() { + let dist_table: Vec = (0..32).map(|idx| 0x1200 + idx as u16).collect(); + let mut hacc_dist_table = Vec::new(); + transfer_4bit_dist_table_u16(&dist_table, &mut hacc_dist_table); + + assert_eq!(hacc_dist_table.len(), 64); + for code in 0..16 { + assert_eq!(hacc_dist_table[code], dist_table[code] as u8); + assert_eq!(hacc_dist_table[16 + code], dist_table[16 + code] as u8); + assert_eq!(hacc_dist_table[32 + code], (dist_table[code] >> 8) as u8); + assert_eq!( + hacc_dist_table[48 + code], + (dist_table[16 + code] >> 8) as u8 + ); + } + } + + #[test] + fn test_sum_4bit_dist_table_u16_matches_reference_multi_batch() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(99); + + 
for code_len in [1, 3, 16, 191, 192, 1024] { + let n = BATCH_SIZE * 4; + let codes: Vec = (0..n * code_len).map(|_| rng.random::()).collect(); + let dist_table: Vec = (0..BATCH_SIZE * code_len) + .map(|_| rng.random::()) + .collect(); + + let mut expected = vec![0u32; n]; + sum_4bit_dist_table_u16_scalar(code_len, &codes, &dist_table, &mut expected); + + let mut actual = vec![u32::MAX; n]; + sum_4bit_dist_table_u16(n, code_len, &codes, &dist_table, &mut actual); + + assert_eq!( + actual, + expected, + "u16 dist-table mismatch for code_len={} (DIM={})", + code_len, + code_len * 8, + ); + } + } + + #[test] + fn test_sum_4bit_hacc_dist_table_matches_u16_reference_multi_batch() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(101); + + for code_len in [1, 3, 16, 191, 192, 1024] { + let n = BATCH_SIZE * 4; + let codes: Vec = (0..n * code_len).map(|_| rng.random::()).collect(); + let dist_table: Vec = (0..BATCH_SIZE * code_len) + .map(|_| rng.random::()) + .collect(); + + let mut hacc_dist_table = Vec::new(); + transfer_4bit_dist_table_u16(&dist_table, &mut hacc_dist_table); + + let mut expected = vec![0u32; n]; + sum_4bit_dist_table_u16_scalar(code_len, &codes, &dist_table, &mut expected); + + let mut actual = vec![u32::MAX; n]; + sum_4bit_hacc_dist_table(n, code_len, &codes, &hacc_dist_table, &mut actual); + + assert_eq!( + actual, + expected, + "hacc dist-table mismatch for code_len={} (DIM={})", + code_len, + code_len * 8, + ); + } + } + /// Test that the SIMD path (NEON on ARM, AVX2 on x86) produces identical /// results to the scalar reference across a range of dimensions, including /// very large ones (up to DIM=65536). @@ -341,7 +719,7 @@ mod tests { // directly since that's what the function sees. 
// code_len=16 → DIM=128, code_len=192 → DIM=1536, // code_len=512 → DIM=4096, code_len=8192 → DIM=65536 - for code_len in [2, 16, 96, 192, 512, 1024, 8192] { + for code_len in [1, 2, 3, 16, 95, 96, 192, 512, 1024, 8192] { let n = BATCH_SIZE; // 32 vectors per batch // Each code byte produces 2 lookups; cap values so @@ -375,7 +753,7 @@ mod tests { use rand::{Rng, SeedableRng}; let mut rng = rand::rngs::StdRng::seed_from_u64(123); - for code_len in [16, 192, 1024] { + for code_len in [1, 3, 16, 191, 192, 1024] { let n = BATCH_SIZE * 10; // 320 vectors = 10 batches let max_val = (u16::MAX as usize / (2 * code_len)).min(255) as u8; diff --git a/rust/lance-namespace-datafusion/tests/sql.rs b/rust/lance-namespace-datafusion/tests/sql.rs index e49cd7e58e3..5332e831cb6 100755 --- a/rust/lance-namespace-datafusion/tests/sql.rs +++ b/rust/lance-namespace-datafusion/tests/sql.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +#![recursion_limit = "256"] + use std::sync::Arc; use arrow_array::{Int32Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray}; diff --git a/rust/lance-namespace-impls/BENCHMARK.md b/rust/lance-namespace-impls/BENCHMARK.md new file mode 100644 index 00000000000..074ec303347 --- /dev/null +++ b/rust/lance-namespace-impls/BENCHMARK.md @@ -0,0 +1,73 @@ +# `__manifest` commit benchmark + +Measures how fast the copy-on-write directory catalog commits `__manifest` mutations as +the manifest scales, with the inline scalar indices on or off. + +The catalog commits every mutation by rewriting the whole `__manifest` (copy-on-write) +and atomically writing a new manifest version. This benchmark characterises: + +- **Continuous commit** — a single process commits `N` times into a manifest already + holding `rows` entries (per-commit latency + throughput). 
+- **Concurrent commit** — `C` processes commit continuously for a fixed duration against + a manifest of `rows` entries (steady, contended TPS). + +## Binary: `examples/manifest_bench.rs` + +``` +manifest_bench seed-large --root --count --inline-optimization \ + [--storage-option aws_region=us-east-1] +manifest_bench run --root --operation write-create-namespace \ + --concurrency 1 --operations 100 --initial-entries --inline-optimization # continuous +manifest_bench run --root --operation write-create-namespace \ + --concurrency 50 --duration-secs 30 --initial-entries --inline-optimization # concurrent +``` + +- `seed-large` bootstraps a manifest to `count` rows by writing the Lance dataset + directly (O(rows) once) and then triggering one CoW rewrite so the on-disk state + matches the steady catalog form (single fragment; inline indices when enabled). +- `run` spawns `--concurrency` worker subprocesses. With `--operations` it runs a fixed + commit budget (continuous); with `--duration-secs` each worker commits until the + deadline (steady TPS). It prints one JSON `BenchResult` per concurrency level with + throughput and p50/p90/p99 latency. +- The committed operation (`--operation`) defaults to `write-create-namespace`, the + cheapest pure-`__manifest` mutation (no table data). `write-create-table` / + `write-declare-table` are also available. + +S3 requires the default `dir-aws` feature (on by default) and AWS credentials in the +environment; pass `--storage-option aws_region=`. + +## Sweep panel: `benches/manifest_commit_sweep.sh` + +Runs the full panel — sizes × {inline index, no index} × {continuous, concurrent×C} — +with per-run S3-copy isolation (each run starts at exactly the bootstrapped size), +JSONL results, a `summary.csv`, and resume support. 
+ +```bash +cargo build --release --example manifest_bench -p lance-namespace-impls +S3_BASE=s3:///manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ + rust/lance-namespace-impls/benches/manifest_commit_sweep.sh +``` + +Default panel (override via env): `SIZES="1000 2000 5000 10000 20000 50000 100000 200000 +500000 1000000"`, `CONCURRENCY="10 20 50 100 120 150 200"`, `INLINE_VARIANTS="true false"`, +`CONT_OPS=100`, `CONC_DURATION_SECS=30`. Results land in `$OUT_DIR` (default +`~/manifest_cow_bench_`). + +## Representative results + +EC2 `c7i.48xlarge`, S3 `us-east-1`, op `write-create-namespace`. The catalog is a +single-writer-throughput system: per-commit cost scales ~O(rows) and throughput does **not** +scale with concurrency (every commit is a serialized `__manifest` version bump). + +Continuous (1 process, 100 commits), ops/s — inline index vs no index: + +| rows | inline | no index | +|---:|---:|---:| +| 1,000 | 2.0 | 3.5 | +| 100,000 | 1.1 | 2.1 | +| 1,000,000 | 0.34 | 0.53 | + +Concurrent steady TPS is flat across C=10..200 (e.g. inline @100k ≈ 1.4–1.5 ops/s at every C; +@1M ≈ 0.3 ops/s). Conflicts that exceed the retry budget surface as errors and grow with C +(≈0 at C≤20, climbing at C≥100) — the contention ceiling, not data loss. No-index commits run +~1.5–2× faster (no per-commit index build) at the cost of unindexed reads. 
diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 963edf5e8ca..27b9a4bc0e2 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -21,6 +21,7 @@ dir-aws = ["lance-io/aws", "lance/aws"] dir-azure = ["lance-io/azure", "lance/azure"] dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] +dir-goosefs = ["lance-io/goosefs", "lance/goosefs"] # Credential vending features credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:sha2", "dep:base64"] credential-vendor-gcp = ["dep:reqwest", "dep:serde", "dep:sha2", "dep:base64", "dep:ring", "dep:rustls-pki-types"] @@ -50,6 +51,8 @@ object_store = { workspace = true } arrow = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } +datafusion-common = { workspace = true } +datafusion-physical-plan = { workspace = true } # REST adapter implementation dependencies (optional, enabled by "rest-adapter" feature) axum = { workspace = true, optional = true } @@ -65,6 +68,8 @@ serde_json = { workspace = true } futures.workspace = true log.workspace = true rand.workspace = true +roaring.workspace = true +uuid.workspace = true # Shared credential vending dependencies sha2 = { version = "0.10", optional = true } @@ -74,6 +79,11 @@ base64 = { version = "0.22", optional = true } aws-sdk-sts = { version = "1.38.0", optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] } aws-config = { workspace = true, optional = true } +# Pin: time 0.3.48 conflicts with aws-smithy-types (E0119: conflicting `From` impls), which this +# crate pulls in via the AWS credential vendor. Capping time here forces the workspace resolver to +# 0.3.47 even for no-lock builds. Not used directly; remove once the upstream conflict is resolved. 
+time = "=0.3.47" + # GCP credential vending dependencies (optional, enabled by "credential-vendor-gcp" feature) ring = { version = "0.17", optional = true } rustls-pki-types = { version = "1", optional = true } @@ -84,13 +94,22 @@ hmac = { version = "0.12", optional = true } quick-xml = { version = "0.38", optional = true } [dev-dependencies] +opendal = { workspace = true, features = ["services-goosefs"] } tokio = { workspace = true, features = ["full"] } tempfile.workspace = true wiremock.workspace = true arrow = { workspace = true } +arrow-array = { workspace = true } arrow-ipc = { workspace = true } rstest.workspace = true lance-table.workspace = true +lance-arrow = { workspace = true } +lance = { workspace = true } +serde = { workspace = true, features = ["derive"] } + +[[example]] +name = "manifest_bench" +path = "examples/manifest_bench.rs" [lints] workspace = true diff --git a/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh new file mode 100755 index 00000000000..7384ced4152 --- /dev/null +++ b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Copy-on-write __manifest commit benchmark sweep panel. +# +# Drives `cargo run --release --example manifest_bench` across a panel of: +# - bootstrap manifest sizes (rows already in __manifest) +# - inline scalar indices on vs off +# - continuous commit (single process, N commits) and +# concurrent commit (C processes, steady TPS over a fixed duration) +# +# Each run is isolated: a "golden" manifest is bootstrapped once per (size, index) +# and server-side-copied to a fresh S3 prefix per run, so every run starts at exactly +# the bootstrapped size. Results are written as JSONL (one BenchResult per line) and +# summarised to CSV. The sweep is resumable: completed runs are skipped. 
+# +# Usage: +# S3_BASE=s3://jack-devland-build/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ +# ./manifest_commit_sweep.sh +# +# Env knobs (defaults match the requested panel): +# SIZES, CONCURRENCY, INLINE_VARIANTS, CONT_OPS, CONC_DURATION_SECS, +# AWS_REGION, OUT_DIR, BIN +# +# Resilient by design: a single failed run is logged and skipped rather than aborting +# the sweep, and re-running fills the gaps (completed runs are detected and skipped). +set -uo pipefail + +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" +S3_BASE="${S3_BASE:?set S3_BASE, e.g. s3://jack-devland-build/manifest-cow-bench/$RUN_ID}" +AWS_REGION="${AWS_REGION:-us-east-1}" +export AWS_REGION AWS_DEFAULT_REGION="$AWS_REGION" + +REPO_ROOT="${REPO_ROOT:-$HOME/oss/lance}" +BIN="${BIN:-$REPO_ROOT/target/release/examples/manifest_bench}" +OUT_DIR="${OUT_DIR:-$HOME/manifest_cow_bench_${RUN_ID}}" +RESULTS="$OUT_DIR/results.jsonl" +PROGRESS="$OUT_DIR/progress.log" +mkdir -p "$OUT_DIR" + +SIZES=(${SIZES:-1000 2000 5000 10000 20000 50000 100000 200000 500000 1000000}) +CONCURRENCY=(${CONCURRENCY:-10 20 50 100 120 150 200}) +INLINE_VARIANTS=(${INLINE_VARIANTS:-true false}) +CONT_OPS="${CONT_OPS:-100}" +CONC_DURATION_SECS="${CONC_DURATION_SECS:-30}" +STORAGE_OPT=(--storage-option "aws_region=${AWS_REGION}") + +log() { printf '%s %s\n' "$(date -u +%H:%M:%S)" "$*" | tee -a "$PROGRESS"; } + +# Skip a run if its tag already appears in results.jsonl (resume support). +done_already() { grep -q "\"bench_tag\":\"$1\"" "$RESULTS" 2>/dev/null; } + +# Append a result line, tagging it so reruns can resume and we can pivot later. 
+record() { + local tag="$1"; shift + # shellcheck disable=SC2016 + python3 -c 'import json,sys; d=json.load(sys.stdin); d["bench_tag"]=sys.argv[1]; print(json.dumps(d))' \ + "$tag" >> "$RESULTS" +} + +s3_copy() { aws s3 cp --recursive --quiet "$1" "$2" --region "$AWS_REGION"; } +s3_rm() { aws s3 rm --recursive --quiet "$1" --region "$AWS_REGION" || true; } + +# Backstops for unattended runs: cap any single run and clear leaked worker processes +# (a killed coordinator can orphan its worker children) before the next run. +RUN_TIMEOUT="${RUN_TIMEOUT:-1200}" +clear_stragglers() { pkill -f 'examples/manifest_bench worker' 2>/dev/null || true; sleep 1; } + +for inline in "${INLINE_VARIANTS[@]}"; do + for rows in "${SIZES[@]}"; do + golden="${S3_BASE}/golden/inline_${inline}_rows_${rows}" + boot_tag="boot_inline_${inline}_rows_${rows}" + + if ! done_already "$boot_tag"; then + log "BOOTSTRAP inline=$inline rows=$rows -> $golden" + s3_rm "$golden" + if "$BIN" seed-large --root "$golden" --count "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}"; then + echo "{\"bench_tag\":\"$boot_tag\"}" >> "$RESULTS" + else + log "BOOTSTRAP FAILED inline=$inline rows=$rows (skipping this size)" + continue + fi + else + log "skip bootstrap $boot_tag (done)" + fi + + # ---- Continuous: single process, CONT_OPS commits ---- + cont_tag="cont_inline_${inline}_rows_${rows}" + if ! 
done_already "$cont_tag"; then + run_prefix="${S3_BASE}/run/${cont_tag}" + log "CONTINUOUS inline=$inline rows=$rows ops=$CONT_OPS" + clear_stragglers + s3_copy "$golden" "$run_prefix" + timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \ + --concurrency 1 --operations "$CONT_OPS" --initial-entries "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}" \ + 2>>"$PROGRESS" | while read -r line; do record "$cont_tag" <<<"$line"; done + s3_rm "$run_prefix" + else + log "skip continuous $cont_tag (done)" + fi + + # ---- Concurrent: C processes, steady TPS over CONC_DURATION_SECS ---- + for c in "${CONCURRENCY[@]}"; do + conc_tag="conc_inline_${inline}_rows_${rows}_c_${c}" + if done_already "$conc_tag"; then log "skip concurrent $conc_tag (done)"; continue; fi + run_prefix="${S3_BASE}/run/${conc_tag}" + log "CONCURRENT inline=$inline rows=$rows c=$c dur=${CONC_DURATION_SECS}s" + clear_stragglers + s3_copy "$golden" "$run_prefix" + timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \ + --concurrency "$c" --duration-secs "$CONC_DURATION_SECS" --initial-entries "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}" \ + 2>>"$PROGRESS" | while read -r line; do record "$conc_tag" <<<"$line"; done + s3_rm "$run_prefix" + done + done +done + +# ---- Summarise to CSV ---- +CSV="$OUT_DIR/summary.csv" +python3 - "$RESULTS" "$CSV" <<'PY' +import json, sys, csv +rows = [] +with open(sys.argv[1]) as f: + for line in f: + d = json.loads(line) + if "throughput_ops_per_sec" not in d: + continue # bootstrap marker + mode = "continuous" if d["duration_secs"] == 0 else "concurrent" + rows.append({ + "mode": mode, "variant": d["variant"], "initial_entries": d["initial_entries"], + "concurrency": d["concurrency"], "duration_secs": d["duration_secs"], + "ops": d["total_operations"], "errors": d["errors"], + "tps": round(d["throughput_ops_per_sec"], 3), + "avg_ms": round(d["avg_latency_ms"], 2), 
"p50_ms": round(d["p50_latency_ms"], 2), + "p90_ms": round(d["p90_latency_ms"], 2), "p99_ms": round(d["p99_latency_ms"], 2), + }) +rows.sort(key=lambda r: (r["mode"], r["variant"], r["initial_entries"], r["concurrency"])) +with open(sys.argv[2], "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else []) + w.writeheader(); w.writerows(rows) +print(f"wrote {len(rows)} rows to {sys.argv[2]}") +PY + +log "SWEEP COMPLETE. Results: $RESULTS Summary: $CSV" +s3_rm "${S3_BASE}/golden" "${S3_BASE}/run" 2>/dev/null || true diff --git a/rust/lance-namespace-impls/examples/manifest_bench.rs b/rust/lance-namespace-impls/examples/manifest_bench.rs new file mode 100644 index 00000000000..4841f2471d7 --- /dev/null +++ b/rust/lance-namespace-impls/examples/manifest_bench.rs @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Copy-on-write `__manifest` directory-catalog commit benchmark (S3 capable). +//! +//! Measures how fast the directory catalog commits `__manifest` mutations as the +//! manifest scales, with the inline scalar indices on or off. +//! +//! Modes: +//! seed-large — bootstrap a `__manifest` with N rows (direct dataset write + one +//! CoW rewrite to build indices) +//! run — coordinator: spawn `--concurrency` worker processes committing for +//! either a fixed op count (continuous) or a fixed duration (steady TPS) +//! worker — (internal) a single committing process spawned by `run` +//! +//! Examples: +//! # Bootstrap 100k rows with inline indices +//! manifest_bench seed-large --root s3://bucket/bench/p --count 100000 \ +//! --inline-optimization true --storage-option aws_region=us-east-1 +//! +//! # Continuous: 100 commits, single process +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 1 --operations 100 --initial-entries 100000 --inline-optimization true +//! +//! 
# Concurrent steady TPS: 50 processes committing for 30s +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 50 --duration-secs 30 --initial-entries 100000 --inline-optimization true + +// A CLI benchmark tool: workers emit JSON latency records on stdout and progress on +// stderr, so stdout/stderr printing is intentional here. +#![allow(clippy::print_stdout, clippy::print_stderr)] + +use std::collections::HashMap; +use std::io::{BufRead, BufReader}; +use std::process::{Command, Stdio}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arrow::array::builder::{ListBuilder, StringBuilder}; +use arrow::array::{RecordBatch, RecordBatchIterator, StringArray}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use bytes::Bytes; +use lance::dataset::{InsertBuilder, WriteMode, WriteParams}; +use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; +use lance_namespace::LanceNamespace; +use lance_namespace::models::{ + CreateNamespaceRequest, CreateTableRequest, DeclareTableRequest, DescribeTableRequest, + ListNamespacesRequest, ListTablesRequest, +}; +use lance_namespace_impls::DirectoryNamespaceBuilder; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone)] +struct LatencyRecord { + operation: String, + latency_ms: f64, + error: bool, +} + +#[derive(Serialize)] +struct BenchResult { + variant: String, + operation: String, + concurrency: usize, + initial_entries: usize, + duration_secs: u64, + total_operations: usize, + total_duration_ms: f64, + throughput_ops_per_sec: f64, + avg_latency_ms: f64, + p50_latency_ms: f64, + p90_latency_ms: f64, + p99_latency_ms: f64, + min_latency_ms: f64, + max_latency_ms: f64, + errors: usize, +} + +fn percentile(sorted: &[f64], p: f64) -> f64 { + if sorted.is_empty() { + return 0.0; + } + let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize; + sorted[idx.min(sorted.len() - 1)] +} + 
+#[allow(clippy::too_many_arguments)] +fn compute_result( + variant: &str, + operation: &str, + concurrency: usize, + initial_entries: usize, + duration_secs: u64, + wall_duration: Duration, + mut latencies: Vec, + errors: usize, +) -> BenchResult { + latencies.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let total = latencies.len(); + let total_ms = wall_duration.as_secs_f64() * 1000.0; + let throughput = if total_ms > 0.0 { + total as f64 / (total_ms / 1000.0) + } else { + 0.0 + }; + BenchResult { + variant: variant.to_string(), + operation: operation.to_string(), + concurrency, + initial_entries, + duration_secs, + total_operations: total, + total_duration_ms: total_ms, + throughput_ops_per_sec: throughput, + avg_latency_ms: if total > 0 { + latencies.iter().sum::() / total as f64 + } else { + 0.0 + }, + p50_latency_ms: percentile(&latencies, 0.50), + p90_latency_ms: percentile(&latencies, 0.90), + p99_latency_ms: percentile(&latencies, 0.99), + min_latency_ms: latencies.first().copied().unwrap_or(0.0), + max_latency_ms: latencies.last().copied().unwrap_or(0.0), + errors, + } +} + +fn create_test_ipc_data() -> Vec { + use arrow::array::Int32Array; + use arrow_ipc::writer::StreamWriter; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + buffer +} + +/// The `__manifest` schema used by the copy-on-write directory catalog: +/// `object_id`, `object_type`, `location`, `metadata` (Utf8), `base_objects` (List). 
+fn manifest_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + Field::new("object_id", DataType::Utf8, false).with_metadata( + [( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_string(), + "0".to_string(), + )] + .into_iter() + .collect(), + ), + Field::new("object_type", DataType::Utf8, false), + Field::new("location", DataType::Utf8, true), + Field::new("metadata", DataType::Utf8, true), + Field::new( + "base_objects", + DataType::List(Arc::new(Field::new("object_id", DataType::Utf8, true))), + true, + ), + ])) +} + +async fn build_namespace( + root: &str, + inline_optimization: bool, + storage_options: &HashMap, +) -> Box { + let mut properties = HashMap::new(); + properties.insert("root".to_string(), root.to_string()); + properties.insert("dir_listing_enabled".to_string(), "false".to_string()); + properties.insert( + "inline_optimization_enabled".to_string(), + inline_optimization.to_string(), + ); + for (k, v) in storage_options { + properties.insert(format!("storage.{}", k), v.clone()); + } + let builder = DirectoryNamespaceBuilder::from_properties(properties, None) + .expect("Failed to create namespace builder from properties"); + Box::new(builder.build().await.expect("Failed to build namespace")) +} + +// ──────────────────── seed-large mode ──────────────────── +// Bootstrap a `__manifest` with N rows by writing the Lance dataset directly (fast, +// O(N) once), then trigger a single CoW rewrite via the namespace so the on-disk state +// matches what the catalog produces (single fragment + inline indices when enabled). 
+ +const SEED_LARGE_BATCH_SIZE: usize = 50_000; + +fn generate_manifest_batch(start_idx: usize, batch_size: usize, total_count: usize) -> RecordBatch { + let ns_count = total_count / 3; + let actual_size = batch_size.min(total_count - start_idx); + + let mut object_ids = Vec::with_capacity(actual_size); + let mut object_types = Vec::with_capacity(actual_size); + let mut locations: Vec> = Vec::with_capacity(actual_size); + let mut metadatas: Vec> = Vec::with_capacity(actual_size); + + for i in start_idx..start_idx + actual_size { + if i < ns_count { + object_ids.push(format!("ns_{}", i)); + object_types.push("namespace".to_string()); + locations.push(None); + metadatas.push(None); + } else { + let table_idx = i - ns_count; + object_ids.push(format!("table_{}", table_idx)); + object_types.push("table".to_string()); + locations.push(Some(format!("table_{}", table_idx))); + metadatas.push(Some(r#"{"bench":"true"}"#.to_string())); + } + } + + // base_objects is null for every bootstrapped row. 
+ let mut base_objects_builder = ListBuilder::new(StringBuilder::new()) + .with_field(Arc::new(Field::new("object_id", DataType::Utf8, true))); + for _ in 0..actual_size { + base_objects_builder.append_null(); + } + + RecordBatch::try_new( + manifest_schema(), + vec![ + Arc::new(StringArray::from(object_ids)), + Arc::new(StringArray::from(object_types)), + Arc::new(StringArray::from( + locations.iter().map(|l| l.as_deref()).collect::>(), + )), + Arc::new(StringArray::from( + metadatas.iter().map(|m| m.as_deref()).collect::>(), + )), + Arc::new(base_objects_builder.finish()), + ], + ) + .expect("Failed to create manifest batch") +} + +async fn seed_large( + root: &str, + count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let manifest_uri = format!("{}/{}", root, "__manifest"); + eprintln!("Seed-large: writing {} rows to {}", count, manifest_uri); + + let schema = manifest_schema(); + let mut batches = Vec::new(); + let mut offset = 0; + while offset < count { + let batch_size = SEED_LARGE_BATCH_SIZE.min(count - offset); + batches.push(generate_manifest_batch(offset, batch_size, count)); + offset += batch_size; + } + eprintln!(" generated {} batches", batches.len()); + + let mut write_params = WriteParams { + mode: WriteMode::Create, + ..WriteParams::default() + }; + if !storage_options.is_empty() { + let accessor = Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + ); + write_params.store_params = Some(lance_io::object_store::ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..Default::default() + }); + } + + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + InsertBuilder::new(manifest_uri.as_str()) + .with_params(&write_params) + .execute_stream(reader) + .await + .expect("Failed to write manifest dataset"); + eprintln!(" wrote Lance dataset"); + + // Trigger one CoW rewrite so the manifest is in steady catalog form (single 
+ // fragment; inline indices when enabled). For the no-index variant the first real + // commit performs this rewrite instead. + if inline_optimization { + eprintln!(" triggering initial CoW rewrite to build indices..."); + let start = Instant::now(); + let ns = build_namespace(root, true, storage_options).await; + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec!["__seed_trigger__".to_string()]); + ns.create_namespace(req) + .await + .expect("Failed to trigger CoW rewrite"); + eprintln!( + " CoW rewrite with index build took {:.1}s", + start.elapsed().as_secs_f64() + ); + } + + let ns_count = count / 3; + eprintln!( + "Seed-large complete: {} rows ({} namespaces, {} tables)", + count, + ns_count, + count - ns_count + ); +} + +// ──────────────────── worker mode ──────────────────── + +#[allow(clippy::too_many_arguments)] +async fn worker( + root: &str, + operation: &str, + operations: usize, + duration_secs: u64, + warmup: usize, + worker_id: usize, + table_count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let ns = build_namespace(root, inline_optimization, storage_options).await; + let ipc_data = Bytes::from(create_test_ipc_data()); + + if operation.starts_with("warm-read") { + for _ in 0..warmup { + let _ = + run_operation(ns.as_ref(), operation, worker_id, 0, table_count, &ipc_data).await; + } + } + + let emit = |op_idx: usize, start: Instant, err: bool| { + let record = LatencyRecord { + operation: operation.to_string(), + latency_ms: start.elapsed().as_secs_f64() * 1000.0, + error: err, + }; + let _ = op_idx; + println!("{}", serde_json::to_string(&record).unwrap()); + }; + + if duration_secs > 0 { + // Steady-TPS mode: commit continuously until the deadline. 
+ let deadline = Instant::now() + Duration::from_secs(duration_secs); + let mut op_idx = 0; + while Instant::now() < deadline { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + op_idx += 1; + } + } else { + for op_idx in 0..operations { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + } + } +} + +async fn run_operation( + ns: &dyn LanceNamespace, + operation: &str, + worker_id: usize, + op_idx: usize, + table_count: usize, + ipc_data: &Bytes, +) -> Result<(), Box> { + match operation { + "cold-read-list-namespaces" | "warm-read-list-namespaces" => { + let mut req = ListNamespacesRequest::new(); + req.id = Some(vec![]); + ns.list_namespaces(req).await?; + } + "cold-read-list-tables" | "warm-read-list-tables" => { + let mut req = ListTablesRequest::new(); + req.id = Some(vec![]); + ns.list_tables(req).await?; + } + "cold-read-describe-table" | "warm-read-describe-table" => { + let table_idx = (worker_id * 1_000_000 + op_idx) % table_count.max(1); + let req = DescribeTableRequest { + id: Some(vec![format!("table_{}", table_idx)]), + ..Default::default() + }; + ns.describe_table(req).await?; + } + "write-create-namespace" => { + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec![format!("bench_w{}_{}", worker_id, op_idx)]); + ns.create_namespace(req).await?; + } + "write-create-table" => { + let mut req = CreateTableRequest::new(); + req.id = Some(vec![format!("bench_t{}_{}", worker_id, op_idx)]); + ns.create_table(req, ipc_data.clone()).await?; + } + "write-declare-table" => { + let req = DeclareTableRequest { + id: Some(vec![format!("bench_d{}_{}", worker_id, op_idx)]), + ..Default::default() + }; + ns.declare_table(req).await?; + } + _ => { + return Err(format!("unknown 
operation: {}", operation).into()); + } + } + Ok(()) +} + +// ──────────────────── run mode (coordinator) ──────────────────── + +#[allow(clippy::too_many_arguments)] +fn run_workers( + self_exe: &str, + root: &str, + operation: &str, + concurrency: usize, + operations: usize, + duration_secs: u64, + warmup: usize, + table_count: usize, + initial_entries: usize, + inline_optimization: bool, + variant: &str, + storage_options: &HashMap, +) -> BenchResult { + // Continuous mode splits a fixed op budget across workers; steady-TPS mode lets each + // worker run for the full duration. + let ops_per_worker = if duration_secs > 0 { + 0 + } else { + operations / concurrency.max(1) + }; + if duration_secs == 0 && ops_per_worker == 0 { + return compute_result( + variant, + operation, + concurrency, + initial_entries, + duration_secs, + Duration::ZERO, + vec![], + 0, + ); + } + + let wall_start = Instant::now(); + let children: Vec<_> = (0..concurrency) + .map(|worker_id| { + let mut cmd = Command::new(self_exe); + cmd.arg("worker") + .arg("--root") + .arg(root) + .arg("--operation") + .arg(operation) + .arg("--operations") + .arg(ops_per_worker.to_string()) + .arg("--duration-secs") + .arg(duration_secs.to_string()) + .arg("--warmup") + .arg(warmup.to_string()) + .arg("--worker-id") + .arg(worker_id.to_string()) + .arg("--table-count") + .arg(table_count.to_string()) + .arg("--inline-optimization") + .arg(inline_optimization.to_string()); + for (k, v) in storage_options { + cmd.arg("--storage-option").arg(format!("{}={}", k, v)); + } + cmd.stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .expect("Failed to spawn worker") + }) + .collect(); + + let mut all_latencies = Vec::new(); + let mut total_errors = 0; + for mut child in children { + let stdout = child.stdout.take().unwrap(); + for line in BufReader::new(stdout).lines() { + let line = line.expect("failed to read worker output"); + if let Ok(record) = serde_json::from_str::(&line) { + if record.error { + 
total_errors += 1; + } else { + all_latencies.push(record.latency_ms); + } + } + } + let status = child.wait().expect("failed to wait for worker"); + if !status.success() { + eprintln!("Worker exited with status: {}", status); + } + } + + compute_result( + variant, + operation, + concurrency, + initial_entries, + duration_secs, + wall_start.elapsed(), + all_latencies, + total_errors, + ) +} + +fn parse_concurrency_list(s: &str) -> Vec { + s.split(',') + .filter_map(|v| v.trim().parse::().ok()) + .filter(|v| *v > 0) + .collect() +} + +#[tokio::main] +async fn main() { + let args: Vec = std::env::args().collect(); + if args.len() < 2 { + eprintln!("Usage: manifest_bench [options]"); + std::process::exit(1); + } + + let mode = args[1].as_str(); + let mut root = String::new(); + let mut operation = String::new(); + let mut operations: usize = 100; + let mut duration_secs: u64 = 0; + let mut warmup: usize = 0; + let mut concurrency_list = vec![1]; + let mut count: usize = 1000; + let mut worker_id: usize = 0; + let mut table_count: usize = 667; + let mut initial_entries: usize = 0; + let mut inline_optimization = true; + let mut variant = String::new(); + let mut storage_options: HashMap = HashMap::new(); + + let mut i = 2; + while i < args.len() { + match args[i].as_str() { + "--root" => { + root = args[i + 1].clone(); + i += 2; + } + "--operation" => { + operation = args[i + 1].clone(); + i += 2; + } + "--operations" => { + operations = args[i + 1].parse().unwrap(); + i += 2; + } + "--duration-secs" => { + duration_secs = args[i + 1].parse().unwrap(); + i += 2; + } + "--warmup" => { + warmup = args[i + 1].parse().unwrap(); + i += 2; + } + "--concurrency" => { + concurrency_list = parse_concurrency_list(&args[i + 1]); + i += 2; + } + "--count" => { + count = args[i + 1].parse().unwrap(); + i += 2; + } + "--worker-id" => { + worker_id = args[i + 1].parse().unwrap(); + i += 2; + } + "--table-count" => { + table_count = args[i + 1].parse().unwrap(); + i += 2; + } + 
"--initial-entries" => { + initial_entries = args[i + 1].parse().unwrap(); + i += 2; + } + "--inline-optimization" => { + inline_optimization = args[i + 1].parse().unwrap(); + i += 2; + } + "--variant" => { + variant = args[i + 1].clone(); + i += 2; + } + "--storage-option" => { + if let Some((k, v)) = args[i + 1].split_once('=') { + storage_options.insert(k.to_string(), v.to_string()); + } + i += 2; + } + other => { + eprintln!("Unknown argument: {}", other); + std::process::exit(1); + } + } + } + + if variant.is_empty() { + variant = if inline_optimization { + "inline_index".to_string() + } else { + "no_index".to_string() + }; + } + + match mode { + "seed-large" => { + seed_large(&root, count, inline_optimization, &storage_options).await; + } + "worker" => { + worker( + &root, + &operation, + operations, + duration_secs, + warmup, + worker_id, + table_count, + inline_optimization, + &storage_options, + ) + .await; + } + "run" => { + let self_exe = std::env::current_exe() + .expect("failed to get self exe path") + .to_string_lossy() + .to_string(); + let op = if operation.is_empty() { + "write-create-namespace" + } else { + operation.as_str() + }; + + eprintln!("=== Manifest commit benchmark ==="); + eprintln!( + "variant={} op={} root={} initial_entries={} concurrency={:?} operations={} duration_secs={}", + variant, op, root, initial_entries, concurrency_list, operations, duration_secs + ); + + for &concurrency in &concurrency_list { + let result = run_workers( + &self_exe, + &root, + op, + concurrency, + operations, + duration_secs, + warmup, + table_count, + initial_entries, + inline_optimization, + &variant, + &storage_options, + ); + eprintln!( + " c={} -> {:.2} ops/s ({} ops, {} errors, p50={:.0}ms p99={:.0}ms)", + concurrency, + result.throughput_ops_per_sec, + result.total_operations, + result.errors, + result.p50_latency_ms, + result.p99_latency_ms + ); + println!("{}", serde_json::to_string(&result).unwrap()); + } + eprintln!("=== complete ==="); + } + _ 
=> { + eprintln!("Unknown mode: {}. Use seed-large, run, or worker.", mode); + std::process::exit(1); + } + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index fb0b03ad239..681cfa430f2 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -7,6 +7,7 @@ //! that stores tables as Lance datasets in a filesystem directory structure. pub mod manifest; +pub mod manifest_feature_flags; use arrow::array::Float32Array; use arrow::record_batch::RecordBatchIterator; @@ -15,6 +16,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::{StreamExt, TryStreamExt}; use lance::dataset::builder::DatasetBuilder; +use lance::dataset::refs::check_valid_branch; use lance::dataset::scanner::Scanner; use lance::dataset::statistics::DatasetStatisticsExt; use lance::dataset::transaction::{Operation, Transaction}; @@ -44,31 +46,35 @@ use std::sync::{Arc, Mutex}; use crate::context::DynamicContextProvider; use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, - BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, + BatchDeleteTableVersionsResponse, BranchContents as ModelBranchContents, CountTableRowsRequest, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, + CreateTableRequest, CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, 
DeclareTableResponse, DeleteTableTagRequest, DeleteTableTagResponse, - DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, - DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, - DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, - DescribeTransactionResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, DropTableResponse, - ExplainTableQueryPlanRequest, FragmentStats, FragmentSummary, GetTableStatsRequest, - GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, Identity, - IndexContent, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTableIndicesRequest, ListTableIndicesResponse, - ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, - ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, - MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, - QueryTableRequestColumns, QueryTableRequestVector, RestoreTableRequest, RestoreTableResponse, - TableExistsRequest, TableVersion, TagContents as ModelTagContents, + DeclareTableRequest, DeclareTableResponse, DeleteTableBranchRequest, DeleteTableBranchResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DescribeNamespaceRequest, + DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, + DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, + DescribeTableVersionResponse, DescribeTransactionRequest, DescribeTransactionResponse, + DropNamespaceRequest, DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, + DropTableRequest, DropTableResponse, ExplainTableQueryPlanRequest, FragmentStats, + FragmentSummary, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, + GetTableTagVersionResponse, Identity, IndexContent, 
InsertIntoTableRequest, + InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, + MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, + QueryTableRequest, QueryTableRequestColumns, QueryTableRequestVector, RestoreTableRequest, + RestoreTableResponse, TableExistsRequest, TableVersion, TagContents as ModelTagContents, UpdateTableSchemaMetadataRequest, UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; -use lance_core::{Error, Result}; +use lance_core::{Error, Result, box_error}; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::schema::arrow_schema_to_json; @@ -107,6 +113,70 @@ impl OpsMetrics { } } +/// Build SQL expression list for the add_columns operation. +/// Returns an explicit error when the expression is missing, instead of silently using an empty string. +pub(crate) fn build_sql_expressions( + new_columns: &[lance_namespace::models::AddColumnsEntry], +) -> Result> { + new_columns + .iter() + .map(|col| { + // expression is Option>: outer Option means whether the + // field is present, inner Option means whether the value is JSON null. + let expression = col.expression.clone().and_then(|opt| opt).ok_or_else(|| { + Error::invalid_input(format!( + "Expression is required for new column '{}'", + col.name + )) + })?; + Ok((col.name.clone(), expression)) + }) + .collect() +} + +/// Build column alteration list for the alter_columns operation. +/// Returns an explicit error when data_type conversion fails, instead of silently ignoring it. 
+pub(crate) fn build_column_alterations( + alterations: &[lance_namespace::models::AlterColumnsEntry], +) -> Result> { + alterations + .iter() + .map(|entry| { + let mut alteration = lance::dataset::ColumnAlteration::new(entry.path.clone()); + // rename is Option>: flatten to get the actual rename value. + if let Some(Some(rename)) = &entry.rename { + alteration = alteration.rename(rename.clone()); + } + // nullable is Option>: flatten to get the actual nullable value. + if let Some(Some(nullable)) = entry.nullable { + alteration = alteration.set_nullable(nullable); + } + // data_type is Option: only process when present and not null. + if let Some(data_type) = &entry.data_type + && !data_type.is_null() + { + let type_str = data_type.as_str().ok_or_else(|| { + Error::invalid_input(format!( + "data_type for column '{}' must be a JSON string, got: {}", + entry.path, data_type + )) + })?; + let json_type = + lance_namespace::models::JsonArrowDataType::new(type_str.to_string()); + let dt = + lance_namespace::schema::convert_json_arrow_type(&json_type).map_err(|e| { + Error::invalid_input(format!( + "Failed to parse data_type '{}' for column '{}': {}", + type_str, entry.path, e + )) + })?; + alteration = alteration.cast_to(dt); + } + Ok(alteration) + }) + .collect() +} + /// Result of checking table status atomically. /// /// This struct captures the state of a table directory in a single snapshot, @@ -192,9 +262,6 @@ pub struct DirectoryNamespaceBuilder { dir_listing_enabled: bool, inline_optimization_enabled: bool, table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table instead of - /// relying on Lance's native version management. - table_version_storage_enabled: bool, /// When true, enables migration mode where the namespace checks the manifest first /// before falling back to directory listing for root-level tables. 
When false (default), /// root-level tables use directory listing directly without checking the manifest, @@ -230,10 +297,6 @@ impl std::fmt::Debug for DirectoryNamespaceBuilder { "table_version_tracking_enabled", &self.table_version_tracking_enabled, ) - .field( - "table_version_storage_enabled", - &self.table_version_storage_enabled, - ) .field( "dir_listing_to_manifest_migration_enabled", &self.dir_listing_to_manifest_migration_enabled, @@ -270,7 +333,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled: true, // Default to enabled for backwards compatibility inline_optimization_enabled: true, table_version_tracking_enabled: false, // Default to disabled - table_version_storage_enabled: false, // Default to disabled dir_listing_to_manifest_migration_enabled: false, // Default to disabled credential_vendor_properties: HashMap::new(), context_provider: None, @@ -310,11 +372,10 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable inline optimization of the __manifest table. + /// Enable or disable replacement index maintenance for the __manifest table. /// - /// When enabled (default), performs compaction and indexing on the __manifest table - /// after every write operation to maintain optimal performance. - /// When disabled, manual optimization must be performed separately. + /// When enabled (default), copy-on-write manifest rewrites build replacement indices + /// for fast reads. When disabled, rewrites only replace data files. pub fn inline_optimization_enabled(mut self, enabled: bool) -> Self { self.inline_optimization_enabled = enabled; self @@ -332,19 +393,6 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable table version management through the `__manifest` table. - /// - /// When enabled, table versions are tracked as `table_version` entries in the - /// `__manifest` Lance table. 
This enables: - /// - Centralized version tracking instead of per-table `_versions/` directories - /// - /// Requires `manifest_enabled` to be true. - /// When disabled (default), version storage uses per-table storage operations. - pub fn table_version_storage_enabled(mut self, enabled: bool) -> Self { - self.table_version_storage_enabled = enabled; - self - } - /// Create a DirectoryNamespaceBuilder from properties HashMap. /// /// This method parses a properties map into builder configuration. @@ -352,7 +400,7 @@ impl DirectoryNamespaceBuilder { /// - `root`: The root directory path (required) /// - `manifest_enabled`: Enable manifest-based table tracking (optional, default: true) /// - `dir_listing_enabled`: Enable directory listing for table discovery (optional, default: true) - /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true) + /// - `inline_optimization_enabled`: Enable replacement indices on __manifest rewrites (optional, default: true) /// - `storage.*`: Storage options (optional, prefix will be stripped) /// /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped): @@ -462,12 +510,6 @@ impl DirectoryNamespaceBuilder { .and_then(|v| v.parse::().ok()) .unwrap_or(false); - // Extract table_version_storage_enabled (default: false) - let table_version_storage_enabled = properties - .get("table_version_storage_enabled") - .and_then(|v| v.parse::().ok()) - .unwrap_or(false); - // Extract dir_listing_to_manifest_migration_enabled (default: false) let dir_listing_to_manifest_migration_enabled = properties .get("dir_listing_to_manifest_migration_enabled") @@ -514,7 +556,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled, inline_optimization_enabled, table_version_tracking_enabled, - table_version_storage_enabled, dir_listing_to_manifest_migration_enabled, credential_vendor_properties, context_provider: None, @@ -691,14 +732,6 @@ impl DirectoryNamespaceBuilder { /// 
- Connection to the storage backend fails /// - Storage options are invalid pub async fn build(self) -> Result { - // Validate: table_version_storage_enabled requires manifest_enabled - if self.table_version_storage_enabled && !self.manifest_enabled { - return Err(NamespaceError::InvalidInput { - message: "table_version_storage_enabled requires manifest_enabled=true".to_string(), - } - .into()); - } - let (object_store, base_path) = Self::initialize_object_store(&self.root, &self.storage_options, &self.session).await?; @@ -712,11 +745,16 @@ impl DirectoryNamespaceBuilder { self.dir_listing_enabled, self.inline_optimization_enabled, self.commit_retries, - self.table_version_storage_enabled, ) .await { Ok(ns) => Some(Arc::new(ns)), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // The manifest exists but was written with a feature flag this + // build does not understand. Refuse rather than silently + // degrading to a directory-listing view that ignores it. + return Err(e); + } Err(e) => { // Failed to initialize manifest namespace, fall back to directory listing only log::warn!( @@ -757,7 +795,6 @@ impl DirectoryNamespaceBuilder { dir_listing_to_manifest_migration_enabled: self .dir_listing_to_manifest_migration_enabled, table_version_tracking_enabled: self.table_version_tracking_enabled, - table_version_storage_enabled: self.table_version_storage_enabled, credential_vendor, context_provider: self.context_provider, vend_input_storage_options: self.vend_input_storage_options, @@ -840,8 +877,6 @@ pub struct DirectoryNamespace { /// When true, `describe_table` returns `managed_versioning: true` to indicate /// commits should go through namespace table version APIs. table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table. - table_version_storage_enabled: bool, /// Credential vendor created once during initialization. /// Used to vend temporary credentials for table access. 
credential_vendor: Option>, @@ -1062,6 +1097,44 @@ impl DirectoryNamespace { } } + /// Map lance-core ref errors from branch operations to namespace errors. + /// + /// `RefConflict` is intentionally not handled here: create-time duplicates are rejected by + /// the existence pre-check before `create_branch` runs, and delete maps its own `RefConflict` + /// (branch still has dependents) inline. + fn map_branch_error( + err: lance_core::Error, + branch: &str, + table_uri: &str, + ) -> lance_core::Error { + match err { + lance_core::Error::RefNotFound { .. } => NamespaceError::TableBranchNotFound { + message: format!("branch '{}' for table at '{}'", branch, table_uri), + } + .into(), + lance_core::Error::InvalidRef { message } => NamespaceError::InvalidInput { + message: format!("invalid branch '{}': {}", branch, message), + } + .into(), + lance_core::Error::VersionNotFound { message } => { + NamespaceError::TableVersionNotFound { + message: format!( + "source version for branch '{}' not found for table at '{}': {}", + branch, table_uri, message + ), + } + .into() + } + other => NamespaceError::Internal { + message: format!( + "branch operation failed for branch '{}' on table at '{}': {}", + branch, table_uri, other + ), + } + .into(), + } + } + async fn table_has_actual_manifests(&self, table_name: &str) -> Result { manifest::ManifestNamespace::path_has_actual_manifests( &self.object_store, @@ -1160,6 +1233,54 @@ impl DirectoryNamespace { ObjectStore::extract_path_from_uri(registry, uri) } + /// Normalize and validate a branch selector: `None`, empty, and `main` mean + /// the main branch; any other name is validated with lance's + /// `check_valid_branch` (lance skips this on the open path) so it cannot + /// escape the table root via `..`. 
+ fn normalized_branch(branch: Option<&str>) -> Result> { + match branch.filter(|b| !b.is_empty() && *b != "main") { + Some(branch) => { + check_valid_branch(branch).map_err(|e| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: format!("invalid branch name '{}': {}", branch, e), + }) + })?; + Ok(Some(branch)) + } + None => Ok(None), + } + } + + async fn open_validated_branch(&self, table_uri: &str, branch: &str) -> Result { + let dataset = self + .configured_builder(table_uri) + .with_branch(branch, None) + .load() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::TableNotFound { + message: format!( + "branch '{}' not found for table at '{}': {}", + branch, table_uri, e + ), + }) + })?; + dataset.branches().get(branch).await.map_err(|_| { + lance_core::Error::from(NamespaceError::TableNotFound { + message: format!("branch '{}' not found for table at '{}'", branch, table_uri), + }) + })?; + Ok(dataset) + } + + async fn resolve_branch_location(&self, table_uri: &str, branch: &str) -> Result { + Ok(self + .open_validated_branch(table_uri, branch) + .await? + .branch_location() + .uri) + } + fn validate_dir_only_properties( properties: Option<&HashMap>, operation: &str, @@ -1217,6 +1338,13 @@ impl DirectoryNamespace { Ok(dataset) } + /// Logical table version parsed from a manifest filename, or `None` for + /// non-manifest / detached entries. Delegates to lance's scheme detection so + /// version listing and deletion stay consistent with the on-disk format. 
+ fn manifest_version_from_filename(filename: &str) -> Option { + ManifestNamingScheme::detect_scheme(filename)?.parse_version(filename) + } + async fn list_table_versions_from_storage( &self, table_uri: &str, @@ -1247,17 +1375,7 @@ impl DirectoryNamespace { .into_iter() .filter_map(|meta| { let filename = meta.location.filename()?; - let version_str = filename.strip_suffix(".manifest")?; - if version_str.starts_with('d') { - return None; - } - let file_version: u64 = version_str.parse().ok()?; - - let actual_version = if file_version > u64::MAX / 2 { - u64::MAX - file_version - } else { - file_version - }; + let actual_version = Self::manifest_version_from_filename(filename)?; Some(TableVersion { version: actual_version as i64, @@ -1327,6 +1445,11 @@ impl DirectoryNamespace { } return Ok(response); } + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. + return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } @@ -1525,6 +1648,19 @@ impl DirectoryNamespace { } } + /// Build a `DatasetBuilder` for `table_uri` with this namespace's storage + /// options and session applied. Callers add version/branch scoping. 
+ fn configured_builder(&self, table_uri: &str) -> DatasetBuilder { + let mut builder = DatasetBuilder::from_uri(table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(sess) = &self.session { + builder = builder.with_session(sess.clone()); + } + builder + } + async fn load_dataset( &self, table_uri: &str, @@ -1543,13 +1679,7 @@ impl DirectoryNamespace { .into()); } - let mut builder = DatasetBuilder::from_uri(table_uri); - if let Some(opts) = &self.storage_options { - builder = builder.with_storage_options(opts.clone()); - } - if let Some(sess) = &self.session { - builder = builder.with_session(sess.clone()); - } + let builder = self.configured_builder(table_uri); let dataset = builder.load().await.map_err(|e| { lance_core::Error::from(NamespaceError::TableNotFound { @@ -2050,6 +2180,7 @@ impl DirectoryNamespace { /// to the manifest to enable manifest-only mode: /// /// ```no_run + /// #![recursion_limit = "256"] /// # use lance_namespace_impls::DirectoryNamespaceBuilder; /// # async fn example() -> Result<(), Box> { /// // Create namespace with dual mode (manifest + directory listing) @@ -2118,53 +2249,68 @@ impl DirectoryNamespace { Ok(migrated_count) } - /// Delete physical manifest files for the given table version ranges (best-effort). + /// Delete physical manifest files for the given table version ranges. /// - /// This helper is used by `batch_delete_table_versions` in both the manifest-enabled - /// and non-manifest paths. It resolves each table's storage location, computes the - /// version file paths, and attempts to delete them. Errors are logged (best-effort) - /// when `best_effort` is true, or returned immediately when false. + /// This helper backs `batch_delete_table_versions`. It resolves each table's storage + /// location, computes the version file paths, and deletes them, returning an error on + /// the first failure. 
/// /// Returns the number of files successfully deleted. async fn delete_physical_version_files( &self, table_entries: &[TableDeleteEntry], - best_effort: bool, + branch: Option<&str>, ) -> Result { let mut deleted_count = 0i64; for te in table_entries { let table_uri = self.resolve_table_location(&te.table_id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let table_path = self.object_store_path_from_uri(&table_uri)?; let versions_dir_path = table_path.clone().join(VERSIONS_DIR); - for (start, end) in &te.ranges { - for version in *start..=*end { - let version_path = versions_dir_path - .clone() - .join(format!("{}.manifest", version as u64)); - match self.object_store.inner.delete(&version_path).await { - Ok(_) => { - deleted_count += 1; - } - Err(object_store::Error::NotFound { .. }) => {} - Err(e) => { - if best_effort { - log::warn!( - "Failed to delete manifest file for version {} of table {:?}: {:?}", - version, - te.table_id, - e - ); - } else { - return Err(NamespaceError::Internal { - message: format!( - "Failed to delete version {} for table at '{}': {}", - version, table_uri, e - ), - } - .into()); - } + // Match listed files, not constructed names (`{version}.manifest` misses V2). 
+ let manifest_metas: Vec<_> = self + .object_store + .read_dir_all(&versions_dir_path, None) + .try_collect() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to list manifest files for table at '{}': {}", + table_uri, e + ), + }) + })?; + let location_by_version: HashMap = manifest_metas + .into_iter() + .filter_map(|meta| { + let version = Self::manifest_version_from_filename(meta.location.filename()?)?; + Some((version, meta.location)) + }) + .collect(); + + for (&v, version_path) in &location_by_version { + let vi = v as i64; + if !te.ranges.iter().any(|&(s, e)| vi >= s && (e < 0 || vi < e)) { + continue; + } + match self.object_store.inner.delete(version_path).await { + Ok(_) => { + deleted_count += 1; + } + Err(object_store::Error::NotFound { .. }) => {} + Err(e) => { + return Err(NamespaceError::Internal { + message: format!( + "Failed to delete version {} for table at '{}': {}", + v, table_uri, e + ), } + .into()); } } } @@ -2531,6 +2677,11 @@ impl LanceNamespace for DirectoryNamespace { { match manifest_ns.table_exists(request.clone()).await { Ok(()) => return Ok(()), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. 
+ return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } @@ -2802,24 +2953,174 @@ impl LanceNamespace for DirectoryNamespace { }) } + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_add_columns(request).await; + } + + // Non-manifest mode: open Dataset directly via table URI and perform the operation + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let sql_expressions = build_sql_expressions(&request.new_columns)?; + + dataset + .add_columns( + lance::dataset::NewColumnTransform::SqlExpressions(sql_expressions), + None, + None, + ) + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to add columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAddColumnsResponse::new(version)) + } + + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_alter_columns(request).await; + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = 
self.table_full_uri(&table_name); + + // Check table existence and deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let alterations = build_column_alterations(&request.alterations)?; + + dataset.alter_columns(&alterations).await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to alter columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAlterColumnsResponse::new(version)) + } + + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_drop_columns(request).await; + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let columns: Vec<&str> = 
request.columns.iter().map(|s| s.as_str()).collect(); + dataset.drop_columns(&columns).await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to drop columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableDropColumnsResponse::new(version)) + } + async fn list_table_versions( &self, request: ListTableVersionsRequest, ) -> Result { self.record_op("list_table_versions"); - // When table_version_storage_enabled, query from __manifest - if self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id = request.id.clone().unwrap_or_default(); - let want_descending = request.descending == Some(true); - return manifest_ns - .list_table_versions(&table_id, want_descending, request.limit) - .await; - } - - // Fallback when table_version_storage is not enabled: list from _versions/ directory + let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let want_descending = request.descending == Some(true); let table_versions = self .list_table_versions_from_storage(&table_uri, want_descending, request.limit) @@ -2836,7 +3137,12 @@ impl LanceNamespace for DirectoryNamespace { request: CreateTableVersionRequest, ) -> Result { self.record_op("create_table_version"); + let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let staging_manifest_path = &request.manifest_path; let version = request.version as u64; @@ -2956,41 +3262,6 @@ impl LanceNamespace for DirectoryNamespace { ); } - // If table_version_storage_enabled is enabled, also record in __manifest (best-effort) - 
if self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id_str = - manifest::ManifestNamespace::str_object_id(&request.id.clone().unwrap_or_default()); - let object_id = - manifest::ManifestNamespace::build_version_object_id(&table_id_str, version as i64); - let metadata_json = serde_json::json!({ - "manifest_path": final_path.to_string(), - "manifest_size": manifest_size, - "e_tag": final_meta.e_tag, - "naming_scheme": request.naming_scheme.as_deref().unwrap_or("V2"), - }) - .to_string(); - - if let Err(e) = manifest_ns - .insert_into_manifest_with_metadata( - vec![manifest::ManifestEntry { - object_id, - object_type: manifest::ObjectType::TableVersion, - location: None, - metadata: Some(metadata_json), - }], - None, - ) - .await - { - log::warn!( - "Failed to record table version in __manifest (best-effort): {:?}", - e - ); - } - } - Ok(CreateTableVersionResponse { transaction_id: None, version: Some(Box::new(TableVersion { @@ -3009,17 +3280,12 @@ impl LanceNamespace for DirectoryNamespace { request: DescribeTableVersionRequest, ) -> Result { self.record_op("describe_table_version"); - // When table_version_storage_enabled and a specific version is requested, - // query from __manifest to avoid opening the entire dataset - if self.table_version_storage_enabled - && let (Some(manifest_ns), Some(version)) = (&self.manifest_ns, request.version) - { - let table_id = request.id.clone().unwrap_or_default(); - return manifest_ns.describe_table_version(&table_id, version).await; - } - - // Fallback when table_version_storage is not enabled: inspect physical manifests directly. 
+ let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let versions = self .list_table_versions_from_storage(&table_uri, true, None) .await?; @@ -3057,76 +3323,46 @@ impl LanceNamespace for DirectoryNamespace { request: BatchDeleteTableVersionsRequest, ) -> Result { self.record_op("batch_delete_table_versions"); + let branch = Self::normalized_branch(request.branch.as_deref())?; // Single-table mode: use `id` (from path parameter) + `ranges` to delete // versions from one table. let ranges: Vec<(i64, i64)> = request .ranges .iter() - .map(|r| { - let start = r.start_version; - let end = if r.end_version > 0 { - r.end_version - } else { - start - }; - (start, end) - }) + .map(|r| (r.start_version, r.end_version)) .collect(); - let table_entries = vec![TableDeleteEntry { - table_id: request.id.clone(), - ranges, - }]; - - let mut total_deleted_count = 0i64; - if self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - // Phase 1 (atomic commit point): Delete version records from __manifest - // for ALL tables in a single atomic operation. This is the authoritative - // source of truth — once __manifest entries are removed, the versions - // are logically deleted across all tables atomically. 
- - // Collect all (table_id_str, ranges) for batch deletion - let mut all_object_ids: Vec = Vec::new(); - for te in &table_entries { - let table_id_str = manifest::ManifestNamespace::str_object_id( - &te.table_id.clone().unwrap_or_default(), - ); - for (start, end) in &te.ranges { - for version in *start..=*end { - let object_id = manifest::ManifestNamespace::build_version_object_id( - &table_id_str, - version, - ); - all_object_ids.push(object_id); - } - } - } - - if !all_object_ids.is_empty() { - total_deleted_count = manifest_ns - .batch_delete_table_versions_by_object_ids(&all_object_ids) - .await?; + // Reject pathological bounded ranges up front: an explicit huge bounded + // range like (0, i64::MAX) is almost certainly a mistake. A through-latest + // range (end < 0) is bounded by the manifests that actually exist on storage. + const MAX_VERSIONS_PER_REQUEST: i128 = 1_000_000; + let requested: i128 = ranges + .iter() + .map(|(s, e)| { + if *e < 0 { + 0 + } else { + (*e as i128 - *s as i128).max(0) + } + }) + .sum(); + if requested > MAX_VERSIONS_PER_REQUEST { + return Err(NamespaceError::InvalidInput { + message: format!( + "batch_delete requested {} versions; limit is {}", + requested, MAX_VERSIONS_PER_REQUEST + ), } - - // Phase 2: Delete physical manifest files (best-effort). - // Even if some file deletions fail, the versions are already removed from - // __manifest, so they won't be visible to readers. Leftover files are - // orphaned but harmless and can be cleaned up later. 
- let _ = self - .delete_physical_version_files(&table_entries, true) - .await; - - return Ok(BatchDeleteTableVersionsResponse { - deleted_count: Some(total_deleted_count), - transaction_id: None, - }); + .into()); } - // Fallback when table_version_storage is not enabled: delete physical files directly (no __manifest) - total_deleted_count = self - .delete_physical_version_files(&table_entries, false) + let table_entries = vec![TableDeleteEntry { + table_id: request.id.clone(), + ranges, + }]; + + let total_deleted_count = self + .delete_physical_version_files(&table_entries, branch) .await?; Ok(BatchDeleteTableVersionsResponse { @@ -3213,6 +3449,11 @@ impl LanceNamespace for DirectoryNamespace { let dataset = self .load_dataset(&table_uri, request.version, "list_table_indices") .await?; + let total_rows = dataset.count_rows(None).await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to count rows for table '{}': {:?}", table_uri, e), + }) + })? as u64; let mut indices = dataset .describe_indices(None) .await @@ -3255,12 +3496,35 @@ impl LanceNamespace for DirectoryNamespace { }) .collect::>>()?; - Ok(IndexContent { + let segments = description.segments(); + let created_at = segments + .iter() + .filter_map(|segment| segment.created_at) + .min() + .map(|ts| ts.to_rfc3339()); + + // `..Default::default()` keeps this tolerant of additive reqwest + // client model changes (see #7212). 
+ #[allow(clippy::needless_update)] + let content = IndexContent { index_name: description.name().to_string(), index_uuid: description.metadata()[0].uuid.to_string(), columns, status: "SUCCEEDED".to_string(), - }) + index_type: Some(description.index_type().to_string()), + type_url: Some(description.type_url().to_string()), + num_indexed_rows: Some(description.rows_indexed() as i64), + num_unindexed_rows: Some( + total_rows.saturating_sub(description.rows_indexed()) as i64, + ), + size_bytes: description.total_size_bytes().map(|size| size as i64), + num_segments: Some(segments.len() as i32), + created_at, + index_version: segments.first().map(|segment| segment.index_version), + index_details: description.details().ok(), + ..Default::default() + }; + Ok(content) }) .collect::>>()?; @@ -3462,8 +3726,12 @@ impl LanceNamespace for DirectoryNamespace { )); } + let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; - let mut dataset = self.load_dataset(&table_uri, None, "restore_table").await?; + let mut dataset = match branch { + Some(branch) => self.open_validated_branch(&table_uri, branch).await?, + None => self.load_dataset(&table_uri, None, "restore_table").await?, + }; dataset = dataset .checkout_version(version as u64) @@ -4162,14 +4430,15 @@ impl LanceNamespace for DirectoryNamespace { .load_dataset(&table_uri, None, "get_table_tag_version") .await?; - let version = dataset + let contents = dataset .tags() - .get_version(&request.tag) + .get(&request.tag) .await .map_err(|e| Self::map_tag_error(e, &request.tag, &table_uri))?; Ok(GetTableTagVersionResponse { - version: version as i64, + version: contents.version as i64, + branch: contents.branch, }) } @@ -4275,6 +4544,156 @@ impl LanceNamespace for DirectoryNamespace { }) } + async fn create_table_branch( + &self, + request: CreateTableBranchRequest, + ) -> Result { + self.record_op("create_table_branch"); + if request.name.is_empty() { + 
return Err(NamespaceError::InvalidInput { + message: "branch name must not be empty for create_table_branch".to_string(), + } + .into()); + } + let from_version = match request.from_version { + Some(v) if v <= 0 => { + return Err(NamespaceError::InvalidInput { + message: format!( + "from_version must be a positive integer, got {} for create_table_branch", + v + ), + } + .into()); + } + Some(v) => Some(v as u64), + None => None, + }; + + let table_uri = self.resolve_table_location(&request.id).await?; + let mut dataset = self + .load_dataset(&table_uri, None, "create_table_branch") + .await?; + + // Best-effort pre-check: a duplicate returns a clean TableBranchAlreadyExists conflict + // instead of the opaque Internal error create_branch raises on a pre-existing branch. A + // concurrent create can still race past this window. Remove once lance-core create_branch + // returns RefConflict up front. + if dataset.branches().get(&request.name).await.is_ok() { + return Err(NamespaceError::TableBranchAlreadyExists { + message: format!("branch '{}' for table at '{}'", request.name, table_uri), + } + .into()); + } + + dataset + .create_branch( + &request.name, + (request.from_branch.as_deref(), from_version), + None, + ) + .await + .map_err(|e| { + // After load_dataset + the dup pre-check, a DatasetNotFound from create_branch + // means the requested fork source (from_branch/from_version) doesn't exist. + if matches!(e, lance_core::Error::DatasetNotFound { .. 
}) { + NamespaceError::InvalidInput { + message: format!( + "from_branch/from_version for branch '{}' refers to a source that does not exist: {}", + request.name, e + ), + } + .into() + } else { + Self::map_branch_error(e, &request.name, &table_uri) + } + })?; + + Ok(CreateTableBranchResponse { + transaction_id: None, + }) + } + + async fn list_table_branches( + &self, + request: ListTableBranchesRequest, + ) -> Result { + self.record_op("list_table_branches"); + let table_uri = self.resolve_table_location(&request.id).await?; + let dataset = self + .load_dataset(&table_uri, None, "list_table_branches") + .await?; + + let raw_branches = dataset.list_branches().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to list branches for table at '{}': {}", + table_uri, e + ), + }) + })?; + + let branches = raw_branches + .into_iter() + .map(|(name, contents)| { + // The namespace `BranchContents` model has no `identifier` field, so the + // lance-core branch identifier is intentionally dropped here. 
+ let mut branch_model = ModelBranchContents::new( + contents.parent_version as i64, + contents.create_at as i64, + contents.manifest_size as i64, + ); + branch_model.parent_branch = contents.parent_branch; + branch_model.metadata = if contents.metadata.is_empty() { + None + } else { + Some(contents.metadata) + }; + (name, branch_model) + }) + .collect(); + + Ok(ListTableBranchesResponse { + branches, + page_token: None, + }) + } + + async fn delete_table_branch( + &self, + request: DeleteTableBranchRequest, + ) -> Result { + self.record_op("delete_table_branch"); + if request.name.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "branch name must not be empty for delete_table_branch".to_string(), + } + .into()); + } + + let table_uri = self.resolve_table_location(&request.id).await?; + let mut dataset = self + .load_dataset(&table_uri, None, "delete_table_branch") + .await?; + + dataset + .delete_branch(&request.name) + .await + .map_err(|e| match e { + lance_core::Error::RefConflict { message } => NamespaceError::InvalidInput { + message: format!( + "branch '{}' for table at '{}': {}", + request.name, table_uri, message + ), + } + .into(), + other => Self::map_branch_error(other, &request.name, &table_uri), + })?; + + Ok(DeleteTableBranchResponse { + transaction_id: None, + }) + } + fn namespace_id(&self) -> String { format!("DirectoryNamespace {{ root: {:?} }}", self.root) } @@ -4289,6 +4708,7 @@ mod tests { use lance_core::utils::tempfile::{TempStdDir, TempStrDir}; use lance_core::utils::testing::CountingObjectStore; use lance_io::object_store::{providers::local::FileStoreProvider, uri_to_url}; + use lance_namespace::error::ErrorCode; use lance_namespace::models::{ CreateTableRequest, JsonArrowDataType, JsonArrowField, JsonArrowSchema, ListTablesRequest, QueryTableRequestColumns, @@ -4325,6 +4745,7 @@ mod tests { } #[derive(Debug)] + #[allow(dead_code)] struct CountingFileStoreProvider { listing_count: Arc, } @@ -4360,6 +4781,7 @@ mod tests { 
} } + #[allow(dead_code)] fn file_object_store_uri(path: &str) -> String { let file_url = uri_to_url(path).unwrap(); let mut url = Url::parse("file-object-store:///").unwrap(); @@ -4367,6 +4789,7 @@ mod tests { url.to_string() } + #[allow(dead_code)] fn build_listing_counting_session(listing_count: Arc) -> Arc { let registry = Arc::new(ObjectStoreRegistry::default()); registry.insert( @@ -4567,108 +4990,1513 @@ mod tests { .transaction_id } - #[tokio::test] - async fn test_create_table() { - let (namespace, _temp_dir) = create_test_namespace().await; - - // Create test IPC data - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - - let mut request = CreateTableRequest::new(); - request.id = Some(vec!["test_table".to_string()]); - - let response = namespace - .create_table(request, bytes::Bytes::from(ipc_data)) + /// Fork `branch_name` from the table's current version and append + /// `extra_versions` commits to it (each a new version on the branch, written + /// with the default V2 naming). The main branch is left untouched. Returns + /// the branch's storage URI (`/tree/`). + async fn create_branch_with_commits( + namespace: &DirectoryNamespace, + table_name: &str, + branch_name: &str, + extra_versions: usize, + ) -> String { + let mut main = open_dataset(namespace, table_name).await; + let fork_version = main.version().version; + let branch = main + .create_branch(branch_name, fork_version, None) .await .unwrap(); + let branch_uri = branch.uri().to_string(); + for i in 0..extra_versions { + append_scalar_version(&branch_uri, (i as i32 + 1) * 100).await; + } + branch_uri + } - assert!(response.location.is_some()); - assert!(response.location.unwrap().ends_with("test_table.lance")); - assert_eq!(response.version, Some(1)); + /// Append one scalar-schema batch to the dataset at `uri`, creating a new + /// version (default V2 naming). Shared by branch and main chain setup. 
+ async fn append_scalar_version(uri: &str, seed: i32) { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + let batch = arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![seed, seed + 1])), + Arc::new(StringArray::from(vec![Some("x"), Some("y")])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write( + reader, + uri, + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + } + + /// List a table's versions on `branch` (None == main) via the namespace. + async fn list_versions( + namespace: &DirectoryNamespace, + table_name: &str, + branch: Option<&str>, + ) -> Result> { + let req = ListTableVersionsRequest { + id: Some(vec![table_name.to_string()]), + branch: branch.map(|b| b.to_string()), + ..Default::default() + }; + namespace.list_table_versions(req).await.map(|r| r.versions) } #[tokio::test] - async fn test_create_table_without_data() { + async fn test_list_table_versions_on_branch() { let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; - let mut request = CreateTableRequest::new(); - request.id = Some(vec!["test_table".to_string()]); + // The branch lists its own chain, and every version resolves to a + // manifest under the branch's tree path. 
+ let branch_versions = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + assert!(branch_versions.len() >= 2); + assert!( + branch_versions + .iter() + .all(|v| v.manifest_path.contains("tree/exp")), + "branch versions must resolve to branch manifests: {:?}", + branch_versions + ); - let result = namespace.create_table(request, bytes::Bytes::new()).await; - assert!(result.is_err()); + // Unset and "main" behave identically and never see the tree path. + let main_versions = list_versions(&namespace, "users", None).await.unwrap(); + let main_explicit = list_versions(&namespace, "users", Some("main")) + .await + .unwrap(); + assert_eq!(main_versions.len(), main_explicit.len()); assert!( - result - .unwrap_err() - .to_string() - .contains("Arrow IPC stream) is required") + main_versions + .iter() + .all(|v| !v.manifest_path.contains("tree/")) ); + + // A non-existent branch is a clean not-found, not an empty list. + let missing = list_versions(&namespace, "users", Some("does-not-exist")).await; + assert!(missing.is_err()); + assert!(missing.unwrap_err().to_string().contains("not found")); } #[tokio::test] - async fn test_create_table_with_invalid_id() { + async fn test_describe_table_version_on_branch() { let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; - // Create test IPC data - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); + let branch_versions = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let latest = branch_versions.iter().map(|v| v.version).max().unwrap(); - // Test with empty ID - let mut request = CreateTableRequest::new(); - request.id = Some(vec![]); + // Describe latest on the branch returns the branch's manifest_path. 
+ let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.describe_table_version(req).await.unwrap(); + assert_eq!(resp.version.version, latest); + assert!(resp.version.manifest_path.contains("tree/exp")); + + // A specific existing branch version resolves. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: Some(latest), + branch: Some("exp".to_string()), + ..Default::default() + }; + assert!(namespace.describe_table_version(req).await.is_ok()); - let result = namespace - .create_table(request, bytes::Bytes::from(ipc_data.clone())) - .await; - assert!(result.is_err()); + // A version absent on the branch is not found. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: Some(999_999), + branch: Some("exp".to_string()), + ..Default::default() + }; + assert!(namespace.describe_table_version(req).await.is_err()); - // Test with multi-level ID - should now work with manifest enabled - // First create the parent namespace - let mut create_ns_req = CreateNamespaceRequest::new(); - create_ns_req.id = Some(vec!["test_namespace".to_string()]); - namespace.create_namespace(create_ns_req).await.unwrap(); + // A non-existent branch is not found. 
+ let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("nope".to_string()), + ..Default::default() + }; + let err = namespace.describe_table_version(req).await; + assert!(err.is_err() && err.unwrap_err().to_string().contains("not found")); + } - // Now create table in the namespace - let mut request = CreateTableRequest::new(); - request.id = Some(vec!["test_namespace".to_string(), "table".to_string()]); + #[tokio::test] + async fn test_restore_table_on_branch() { + use lance_namespace::models::RestoreTableRequest; - let result = namespace - .create_table(request, bytes::Bytes::from(ipc_data)) - .await; - // Should succeed with manifest enabled + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let before = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let branch_latest = before.iter().map(|v| v.version).max().unwrap(); + let earliest = before.iter().map(|v| v.version).min().unwrap(); + let main_before = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + + // Restoring the branch to an earlier version commits a NEW version on + // the branch (restore is itself a commit), and must not touch main. 
+ let req = RestoreTableRequest { + id: Some(vec!["users".to_string()]), + version: earliest, + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.restore_table(req).await.unwrap(); + assert!(resp.transaction_id.is_some()); + + let after = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let new_latest = after.iter().map(|v| v.version).max().unwrap(); assert!( - result.is_ok(), - "Multi-level table IDs should work with manifest enabled" + new_latest > branch_latest, + "restore should add a branch version" ); + + let main_after = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + assert_eq!(main_after, main_before, "main must be unaffected"); } #[tokio::test] - async fn test_list_tables() { - let (namespace, _temp_dir) = create_test_namespace().await; - - // Initially, no tables - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0); + async fn test_batch_delete_table_versions_on_branch() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; - // Create test IPC data - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; - // Create a table - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["table1".to_string()]); - namespace - .create_table(create_request, bytes::Bytes::from(ipc_data.clone())) + let before = list_versions(&namespace, "users", Some("exp")) .await .unwrap(); + let main_before = list_versions(&namespace, "users", None).await.unwrap(); - // Create another table - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["table2".to_string()]); - namespace - 
.create_table(create_request, bytes::Bytes::from(ipc_data)) + // Delete the branch's whole history with a through-latest range (end = -1). + // The branch manifests use V2 naming (inverted, zero-padded), so a nonzero + // deleted_count proves the V2 fix: the old code constructed + // "{version}.manifest" and silently matched nothing. + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + branch: Some("exp".to_string()), + ranges: vec![VersionRange::new(0, -1)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some(before.len() as i64), + "every branch manifest should be physically deleted" + ); + + // The emptied branch now reads as not-found, and main is untouched. + assert!( + list_versions(&namespace, "users", Some("exp")) + .await + .is_err() + ); + let main_after = list_versions(&namespace, "users", None).await.unwrap(); + assert_eq!( + main_after.len(), + main_before.len(), + "main must be untouched" + ); + } + + #[tokio::test] + async fn test_create_table_version_on_branch() { + use futures::TryStreamExt; + use lance_namespace::models::CreateTableVersionRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + let branch_uri = create_branch_with_commits(&namespace, "users", "exp", 1).await; + + // Stage a manifest by copying one of the branch's existing manifests. 
+ let branch_ds = Dataset::open(&branch_uri).await.unwrap(); + let versions_dir = branch_ds.versions_dir(); + let store = branch_ds.object_store(None).await.unwrap(); + let existing = store + .inner + .list(Some(&versions_dir)) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a branch manifest"); + let bytes = store + .inner + .get(&existing.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let staging = versions_dir.join("staging_manifest"); + store.inner.put(&staging, bytes.into()).await.unwrap(); + + let main_before = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + let new_version = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap() + .iter() + .map(|v| v.version) + .max() + .unwrap() + + 1; + + let req = CreateTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: new_version, + manifest_path: staging.to_string(), + naming_scheme: Some("V2".to_string()), + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.create_table_version(req).await.unwrap(); + let info = resp.version.expect("version info"); + // The new manifest must land under the branch's tree path. + assert!( + info.manifest_path.contains("tree/exp"), + "got {}", + info.manifest_path + ); + + // It is visible on the branch, and main did not gain a version. 
+ let after = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + assert!(after.iter().any(|v| v.version == new_version)); + let main_after = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + assert_eq!(main_after, main_before, "main must be unaffected"); + } + + /// The namespace-managed commit store derives the branch a request targets + /// from the base path it is handed, so a single store serves every branch of + /// the table: a branch-qualified base resolves and commits against the + /// branch chain while the table root targets main. + #[tokio::test] + async fn test_external_manifest_store_resolves_branch_from_base_path() { + use futures::TryStreamExt; + use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; + use lance_table::io::commit::external_manifest::ExternalManifestStore; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // main: version 1 + let branch_uri = create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let namespace = Arc::new(namespace); + let table_id = vec!["users".to_string()]; + let branch_ds = Dataset::open(&branch_uri).await.unwrap(); + let branch_base = branch_ds.branch_location().path; + let root_base = branch_ds.branch_location().find_main().unwrap().path; + let store = LanceNamespaceExternalManifestStore::new( + namespace.clone(), + table_id.clone(), + root_base.clone(), + ); + + // The branch-qualified base resolves the branch chain, the root base + // resolves main: proof the base path reaches list_table_versions. 
+ let (branch_latest, branch_path) = store + .get_latest_version(branch_base.as_ref()) + .await + .unwrap() + .expect("branch has versions"); + let (_main_latest, main_path) = store + .get_latest_version(root_base.as_ref()) + .await + .unwrap() + .expect("main has versions"); + assert!( + branch_path.contains("tree/exp"), + "branch latest must resolve to the branch tree: {}", + branch_path + ); + assert!( + !main_path.contains("tree/exp"), + "main latest must not resolve to a branch tree: {}", + main_path + ); + + // describe (get) with the branch base also resolves to the branch tree. + let described = store + .get(branch_base.as_ref(), branch_latest) + .await + .unwrap(); + assert!( + described.contains("tree/exp"), + "describe on the branch must resolve to the branch tree: {}", + described + ); + + // A base that is neither the root nor a branch chain is rejected. + assert!(store.get_latest_version("somewhere/else").await.is_err()); + + // Commit (put) with the branch base: the new version must land on the + // branch chain. Stage a manifest by copying an existing branch manifest. 
+ let versions_dir = branch_ds.versions_dir(); + let obj = branch_ds.object_store(None).await.unwrap(); + let existing = obj + .inner + .list(Some(&versions_dir)) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a branch manifest"); + let bytes = obj + .inner + .get(&existing.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let size = bytes.len() as u64; + let staging = versions_dir.clone().join("staging_manifest"); + obj.inner.put(&staging, bytes.into()).await.unwrap(); + + let committed = store + .put( + &branch_base, + branch_latest + 1, + &staging, + size, + None, + obj.inner.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .unwrap(); + assert!( + committed.path.to_string().contains("tree/exp"), + "a commit through a branch-qualified base must land on the branch tree: {}", + committed.path + ); + } + + /// write_into_namespace_on_branch must append against the branch chain + /// THROUGH the managed commit handler: the version is registered with the + /// namespace (create_table_version), lands on the branch tree, and main's + /// catalog is untouched. The ops-metrics assertions exist because a + /// physical-only commit is invisible to DirectoryNamespace branch listing + /// (it lists storage), while a catalog-authoritative namespace would + /// silently lose the version. 
+ #[tokio::test] + async fn test_write_into_namespace_on_branch_appends_to_branch() { + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let namespace = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .ops_metrics_enabled(true) + .build() + .await + .unwrap(), + ); + let ns: Arc = namespace.clone(); + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + let main_before = main_chain_len(ns.clone(), table_id.clone()).await; + let commits_before = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + + let branch_ds = Dataset::write_into_namespace_on_branch( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + ns.clone(), + table_id.clone(), + "exp", + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(branch_ds.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&branch_ds).await, vec![1, 2, 3]); + + // The append must commit through the namespace, not just write a + // physical manifest under the branch tree. 
+ let commits_after = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + assert_eq!( + commits_after, + commits_before + 1, + "the branch append must register its version via create_table_version" + ); + let exp_versions = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + branch: Some("exp".to_string()), + ..Default::default() + }) + .await + .unwrap() + .versions; + assert!( + exp_versions + .iter() + .all(|v| v.manifest_path.contains("tree/exp")), + "branch versions must resolve to the branch tree: {:?}", + exp_versions + ); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before, + "main's catalog must be untouched by the branch append" + ); + + // A managed main append through the same entry point must register in + // the catalog too, so a fresh managed open resolves the new latest. + Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(single_int_batch(100))], single_int_schema()), + ns.clone(), + table_id.clone(), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before + 1, + "a managed main append must register its version in the catalog" + ); + let fresh = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!( + scan_id_column(&fresh).await, + vec![1, 2, 100], + "a fresh managed open must resolve the appended version, not a stale latest" + ); + } + + /// CREATE on a branch is rejected: a branch forks from an existing version. 
+ #[tokio::test] + async fn test_write_into_namespace_on_branch_rejects_create() { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + + let (namespace, _temp_dir) = create_test_namespace().await; + let namespace = Arc::new(namespace); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + let batch = arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec![Some("a")])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let result = Dataset::write_into_namespace_on_branch( + reader, + namespace.clone(), + vec!["new_table".to_string()], + "exp", + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await; + assert!(result.is_err(), "create on a branch must be rejected"); + assert!( + result.unwrap_err().to_string().contains("branch"), + "error should mention the branch restriction" + ); + } + + #[tokio::test] + async fn test_branch_name_validation_rejects_traversal() { + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // A traversal-style branch name is rejected as invalid input before any + // storage path is built from it. 
+ let err = list_versions(&namespace, "users", Some("../evil")).await; + assert!(err.is_err()); + assert!(err.unwrap_err().to_string().contains("invalid branch name")); + } + + #[tokio::test] + async fn test_branch_ops_reject_zombie_branch() { + use futures::TryStreamExt; + use lance_namespace::models::{ + BatchDeleteTableVersionsRequest, CreateTableVersionRequest, RestoreTableRequest, + VersionRange, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // Stage a real (loadable) manifest under tree/ghost/_versions/ without + // create_branch, so the path exists but has no BranchContents ref. + let dataset = open_dataset(&namespace, "users").await; + let store = dataset.object_store(None).await.unwrap(); + let manifest = store + .inner + .list(Some(&dataset.versions_dir())) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a manifest"); + let bytes = store + .inner + .get(&manifest.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let zombie = Path::from(format!( + "{}/tree/ghost/_versions/{}", + dataset.branch_location().path, + manifest.location.filename().unwrap() + )); + store.inner.put(&zombie, bytes.into()).await.unwrap(); + + // The directory is physically present, but the source of truth has no + // such branch -- this is what makes every op below reject it. 
+ assert!(dataset.branches().get("ghost").await.is_err()); + + fn rejected(label: &str, r: Result) { + match r { + Ok(v) => panic!("{label} must reject the zombie branch, got Ok({v:?})"), + Err(e) => assert!(e.to_string().contains("not found"), "{label}: {e}"), + } + } + + rejected( + "list", + list_versions(&namespace, "users", Some("ghost")).await, + ); + rejected( + "describe", + namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "create", + namespace + .create_table_version(CreateTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: 2, + manifest_path: zombie.to_string(), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "restore", + namespace + .restore_table(RestoreTableRequest { + id: Some(vec!["users".to_string()]), + version: 1, + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "batch_delete", + namespace + .batch_delete_table_versions(BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + branch: Some("ghost".to_string()), + ranges: vec![VersionRange::new(1, 1)], + ..Default::default() + }) + .await, + ); + } + + /// V2 is the default naming scheme, and the pre-rewrite delete path + /// constructed `{version}.manifest` (a V1 name) and silently matched nothing + /// on a V2 table, returning deleted_count 0. This pins the fix on the main + /// chain (branch=None), which previously had no batch_delete coverage at all. 
+ #[tokio::test] + async fn test_batch_delete_table_versions_main_v2() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // version 1 + let main_uri = open_dataset(&namespace, "users").await.uri().to_string(); + append_scalar_version(&main_uri, 100).await; // version 2 + append_scalar_version(&main_uri, 200).await; // version 3 + + let before = list_versions(&namespace, "users", None).await.unwrap(); + assert!(before.len() >= 3); + // Confirm these really are V2-named manifests (20-digit inverted version + // + ".manifest" == 29 chars), i.e. the case the old code skipped. + assert!( + before + .iter() + .all(|v| v.manifest_path.rsplit('/').next().unwrap().len() == 29), + "expected V2-named manifests: {:?}", + before + ); + let min_v = before.iter().map(|v| v.version).min().unwrap(); + let max_v = before.iter().map(|v| v.version).max().unwrap(); + + // Delete everything except the latest version. end is exclusive, so + // [min_v, max_v) keeps max_v. + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(min_v, max_v)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some((before.len() - 1) as i64), + "V2 manifests must actually be deleted (was 0 before the fix)" + ); + + let after = list_versions(&namespace, "users", None).await.unwrap(); + assert_eq!(after.len(), 1); + assert_eq!(after[0].version, max_v); + } + + /// Pins the exclusive end of VersionRange: [v, v+1) must match only v. 
+ #[tokio::test] + async fn test_batch_delete_end_is_exclusive() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // version 1 + let main_uri = open_dataset(&namespace, "users").await.uri().to_string(); + append_scalar_version(&main_uri, 100).await; // version 2 + append_scalar_version(&main_uri, 200).await; // version 3 + + let before = list_versions(&namespace, "users", None).await.unwrap(); + let min_v = before.iter().map(|v| v.version).min().unwrap(); + + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(min_v, min_v + 1)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some(1), + "only min_v is in [min_v, min_v+1)" + ); + + let after = list_versions(&namespace, "users", None).await.unwrap(); + assert!( + !after.iter().any(|v| v.version == min_v), + "min_v must be deleted" + ); + assert_eq!(after.len(), before.len() - 1, "exactly one version removed"); + } + + #[tokio::test] + async fn test_batch_delete_rejects_unbounded_range() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // An unbounded range must be rejected up front, not turned into ~10^19 + // iterations / an unbounded id list. + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(0, i64::MAX)], + ..Default::default() + }; + let err = namespace.batch_delete_table_versions(req).await; + assert!(err.is_err()); + assert!( + err.unwrap_err().to_string().contains("limit"), + "expected a range-too-large error" + ); + } + + /// Build a managed (manifest-tracked) namespace over `path`. 
+ async fn create_managed_namespace(path: &str) -> Arc { + Arc::new( + DirectoryNamespaceBuilder::new(path) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .build() + .await + .unwrap(), + ) + } + + fn single_int_schema() -> Arc { + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])) + } + + fn single_int_batch(seed: i32) -> arrow::record_batch::RecordBatch { + use arrow::array::Int32Array; + arrow::record_batch::RecordBatch::try_new( + single_int_schema(), + vec![Arc::new(Int32Array::from(vec![seed]))], + ) + .unwrap() + } + + /// Create a managed table with versions v1 (id=1) and v2 (id=2) on main and + /// return the main dataset handle. + async fn create_managed_table(ns: &Arc, table_id: &[String]) -> Dataset { + let mut ds = Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(single_int_batch(1))], single_int_schema()), + ns.clone(), + table_id.to_vec(), + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await + .unwrap(); + ds.append( + RecordBatchIterator::new(vec![Ok(single_int_batch(2))], single_int_schema()), + None, + ) + .await + .unwrap(); + ds + } + + /// Sorted values of the `id` column across a full scan. 
+ async fn scan_id_column(ds: &Dataset) -> Vec { + use arrow::array::Int32Array; + use futures::TryStreamExt; + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort(); + ids + } + + /// E2e for the managed branch path through the builder: create a branch via the + /// namespace op, open it with `from_namespace(managed).with_branch`, commit on + /// it, and confirm the dataset is rooted at the branch chain (manifest, base + /// path and data placement) while main's catalog is untouched. + #[tokio::test] + async fn test_managed_branch_open_and_commit() { + use futures::TryStreamExt; + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; + let main_before = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions + .len(); + + // Create a branch via the namespace op (the FS-handler path, which succeeds + // on a managed table). + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + // Open the managed table on the branch: the base path is qualified up + // front and the manifest store derives the branch from it. 
+ let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!( + branch_ds.manifest.branch.as_deref(), + Some("exp"), + "with_branch on a managed table must open the branch chain" + ); + let branch_base = branch_ds.branch_location().path; + assert!( + branch_base.as_ref().ends_with("tree/exp"), + "the branch dataset must be rooted at the branch chain: {}", + branch_base + ); + let branch_v_before = branch_ds.version().version; + + // Commit on the branch. + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + assert_eq!( + branch_ds.manifest.branch.as_deref(), + Some("exp"), + "the commit must stay on the branch" + ); + assert!( + branch_ds.version().version > branch_v_before, + "the branch version must advance after the commit" + ); + assert_eq!(scan_id_column(&branch_ds).await, vec![1, 2, 3]); + + // The committed data files live under the branch chain, not main's data + // dir, so unmanaged readers of the branch and main's cleanup see a + // consistent layout. + let store = branch_ds.object_store(None).await.unwrap(); + let branch_data = branch_base.clone().join("data"); + let branch_files = store + .inner + .list(Some(&branch_data)) + .try_collect::>() + .await + .unwrap(); + assert!( + !branch_files.is_empty(), + "the branch commit must place data files under the branch chain" + ); + + // The same branch is readable through the unmanaged (path-based) open. 
+ let table_uri = ns + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .location + .unwrap(); + let fs_branch_ds = DatasetBuilder::from_uri(&table_uri) + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!(fs_branch_ds.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&fs_branch_ds).await, vec![1, 2, 3]); + + // Main's catalog is untouched (branches are not tracked in __manifest), + // and main still reads its own data. + let main_after = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions + .len(); + assert_eq!( + main_after, main_before, + "committing on the branch must not change main's chain" + ); + let main_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!(main_ds.manifest.branch, None); + assert_eq!(scan_id_column(&main_ds).await, vec![1, 2]); + } + + /// Branch-pointing tags on a managed table: create them through the normal + /// API (from both the main and the branch handle), open the table at the + /// tag, and check the tag out from an already-open dataset. All of these + /// must resolve the branch chain, never main's chain. 
+ #[tokio::test] + async fn test_managed_branch_tags() { + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + let main_ds = create_managed_table(&ns, &table_id).await; + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + let branch_version = branch_ds.version().version; + + // A branch-pointing tag created from the main handle must validate + // against the branch chain (the version does not exist on main). + main_ds + .tags() + .create("exp-tag", ("exp", Some(branch_version))) + .await + .unwrap(); + let tag = main_ds.tags().get("exp-tag").await.unwrap(); + assert_eq!(tag.branch.as_deref(), Some("exp")); + assert_eq!(tag.version, branch_version); + + // A tag created from the branch handle resolves the branch implicitly. + branch_ds + .tags() + .create("exp-tag2", branch_version) + .await + .unwrap(); + let tag2 = branch_ds.tags().get("exp-tag2").await.unwrap(); + assert_eq!(tag2.branch.as_deref(), Some("exp")); + + // Opening the managed table at the branch-pointing tag checks out the + // branch chain. 
+ let tag_open = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("exp-tag") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest.branch.as_deref(), Some("exp")); + assert_eq!(tag_open.version().version, branch_version); + assert_eq!(scan_id_column(&tag_open).await, vec![1, 2, 3]); + + // So does checking the tag out from an already-open main dataset. + let tag_checkout = main_ds + .checkout_version(Ref::Tag("exp-tag".to_string())) + .await + .unwrap(); + assert_eq!(tag_checkout.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&tag_checkout).await, vec![1, 2, 3]); + + // A missing tag on a managed table errors at open. + let err = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("no-such-tag") + .load() + .await; + assert!(err.is_err(), "a missing tag must error"); + } + + /// Cross-branch checkout on a managed table, including version numbers that + /// exist on both chains (branch numbering continues from the fork point, so + /// overlap is the common case). Every checkout must land on the requested + /// chain and read that chain's data. + #[tokio::test] + async fn test_managed_cross_branch_checkout() { + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + let mut main_ds = create_managed_table(&ns, &table_id).await; + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + // exp gets id=3 at its tip; main gets id=100 at the same version number. 
+ let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + let overlap_version = branch_ds.version().version; + while main_ds.version().version < overlap_version { + main_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(100))], single_int_schema()), + None, + ) + .await + .unwrap(); + } + + // main -> branch at the overlapping version number: must read the + // branch's data, not main's same-numbered version. + let on_branch = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(overlap_version))) + .await + .unwrap(); + assert_eq!(on_branch.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&on_branch).await, vec![1, 2, 3]); + + // main -> branch latest. + let mut on_branch_latest = main_ds.checkout_branch("exp").await.unwrap(); + assert_eq!(on_branch_latest.manifest.branch.as_deref(), Some("exp")); + assert_eq!(on_branch_latest.version().version, overlap_version); + + // A commit through the checked-out handle (which shares main's commit + // handler) must land on the branch chain, not main's. 
+ let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + let main_before = main_chain_len(ns.clone(), table_id.clone()).await; + on_branch_latest + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(4))], single_int_schema()), + None, + ) + .await + .unwrap(); + assert_eq!(on_branch_latest.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&on_branch_latest).await, vec![1, 2, 3, 4]); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before, + "a commit on the checked-out branch must not advance main's chain" + ); + + // branch -> main at a specific version. + let on_main = branch_ds + .checkout_version(Ref::Version(None, Some(1))) + .await + .unwrap(); + assert_eq!(on_main.manifest.branch, None); + assert_eq!(scan_id_column(&on_main).await, vec![1]); + + // branch -> another branch. + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp2".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let on_branch2 = branch_ds.checkout_branch("exp2").await.unwrap(); + assert_eq!(on_branch2.manifest.branch.as_deref(), Some("exp2")); + + // A version missing from the branch chain errors loudly. + let err = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(999))) + .await; + assert!(err.is_err(), "a version missing from the branch must error"); + } + + /// CommitBuilder must honor an explicitly supplied commit handler for a + /// Dataset destination: a managed-versioning commit through a dataset that + /// was opened without the namespace handler (as the Java and Python commit + /// APIs allow) must still register with the catalog instead of silently + /// writing a physical manifest the catalog never sees. 
+ #[tokio::test] + async fn test_commit_builder_honors_explicit_handler_for_dataset_dest() { + use lance::dataset::write::{CommitBuilder, InsertBuilder}; + use lance::dataset::{WriteDestination, builder::DatasetBuilder}; + use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; + use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler; + + let temp = TempStdDir::default(); + let namespace = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .ops_metrics_enabled(true) + .build() + .await + .unwrap(), + ); + let ns: Arc = namespace.clone(); + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + + // Open WITHOUT the namespace handler, the way a binding caller can. + let table_uri = ns + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .location + .unwrap(); + let plain_ds = Arc::new(Dataset::open(&table_uri).await.unwrap()); + + let transaction = InsertBuilder::new(WriteDestination::Dataset(plain_ds.clone())) + .with_params(&WriteParams { + mode: WriteMode::Append, + ..Default::default() + }) + .execute_uncommitted(vec![single_int_batch(3)]) + .await + .unwrap(); + + let handler = Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new( + LanceNamespaceExternalManifestStore::for_table_uri( + ns.clone(), + table_id.clone(), + &table_uri, + ) + .unwrap(), + ), + }); + let commits_before = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + let committed = CommitBuilder::new(WriteDestination::Dataset(plain_ds)) + .with_commit_handler(handler) + .execute(transaction) + .await + .unwrap(); + assert_eq!(scan_id_column(&committed).await, vec![1, 2, 3]); + + let commits_after = namespace + .retrieve_ops_metrics() + .get("create_table_version") + 
.copied() + .unwrap_or(0); + assert_eq!( + commits_after, + commits_before + 1, + "the explicit handler must route the commit through create_table_version" + ); + let fresh = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!( + scan_id_column(&fresh).await, + vec![1, 2, 3], + "a fresh managed open must resolve the committed version" + ); + } + + /// A branch forked from a non-latest version opens on its own chain. + #[tokio::test] + async fn test_managed_branch_from_non_latest_fork() { + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "old".to_string(), + from_version: Some(1), + ..Default::default() + }) + .await + .unwrap(); + + let old_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("old", None) + .load() + .await + .unwrap(); + assert_eq!(old_ds.manifest.branch.as_deref(), Some("old")); + assert_eq!( + scan_id_column(&old_ds).await, + vec![1], + "the fork must contain only the fork-point data" + ); + } + + /// The shared parser must decode both naming schemes; this is the cheap + /// V1 no-regression guard (creating a real V1 table is not exposed here). + #[test] + fn test_manifest_version_from_filename() { + // V1: the plain version number. + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("5.manifest"), + Some(5) + ); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("0.manifest"), + Some(0) + ); + // V2: version stored as u64::MAX - version, zero-padded to 20 digits. 
+ let v2_five = format!("{:020}.manifest", u64::MAX - 5); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename(&v2_five), + Some(5) + ); + let v2_zero = format!("{:020}.manifest", u64::MAX); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename(&v2_zero), + Some(0) + ); + // Non-manifest and detached (`d`-prefixed) entries are ignored. + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("data.lance"), + None + ); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("d5.manifest"), + None + ); + } + + #[tokio::test] + async fn test_create_table() { + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create test IPC data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + + let mut request = CreateTableRequest::new(); + request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .create_table(request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + assert!(response.location.is_some()); + assert!(response.location.unwrap().ends_with("test_table.lance")); + assert_eq!(response.version, Some(1)); + } + + #[tokio::test] + async fn test_create_table_without_data() { + let (namespace, _temp_dir) = create_test_namespace().await; + + let mut request = CreateTableRequest::new(); + request.id = Some(vec!["test_table".to_string()]); + + let result = namespace.create_table(request, bytes::Bytes::new()).await; + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Arrow IPC stream) is required") + ); + } + + #[tokio::test] + async fn test_create_table_with_invalid_id() { + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create test IPC data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + + // Test with empty ID + let mut request = CreateTableRequest::new(); + request.id = Some(vec![]); + + let result = namespace + .create_table(request, 
bytes::Bytes::from(ipc_data.clone())) + .await; + assert!(result.is_err()); + + // Test with multi-level ID - should now work with manifest enabled + // First create the parent namespace + let mut create_ns_req = CreateNamespaceRequest::new(); + create_ns_req.id = Some(vec!["test_namespace".to_string()]); + namespace.create_namespace(create_ns_req).await.unwrap(); + + // Now create table in the namespace + let mut request = CreateTableRequest::new(); + request.id = Some(vec!["test_namespace".to_string(), "table".to_string()]); + + let result = namespace + .create_table(request, bytes::Bytes::from(ipc_data)) + .await; + // Should succeed with manifest enabled + assert!( + result.is_ok(), + "Multi-level table IDs should work with manifest enabled" + ); + } + + #[tokio::test] + async fn test_list_tables() { + let (namespace, _temp_dir) = create_test_namespace().await; + + // Initially, no tables + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0); + + // Create test IPC data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + + // Create a table + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["table1".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data.clone())) + .await + .unwrap(); + + // Create another table + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["table2".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) .await .unwrap(); @@ -4828,7 +6656,7 @@ mod tests { #[tokio::test] async fn test_list_table_indices() { - use lance_namespace::models::ListTableIndicesRequest; + use lance_namespace::models::{CreateTableIndexRequest, ListTableIndicesRequest}; let (namespace, _temp_dir) = create_test_namespace().await; create_scalar_table(&namespace, "users").await; @@ 
-4857,6 +6685,22 @@ mod tests { assert_eq!(users_id_idx.columns, vec!["id"]); assert_eq!(users_id_idx.status, "SUCCEEDED"); + // Enriched fields populated from the index metadata for a scalar index. + assert_eq!(users_id_idx.index_type.as_deref(), Some("BTree")); + assert!( + users_id_idx + .type_url + .as_deref() + .is_some_and(|s| !s.is_empty()) + ); + assert_eq!(users_id_idx.num_indexed_rows, Some(3)); + assert_eq!(users_id_idx.num_unindexed_rows, Some(0)); + assert_eq!(users_id_idx.num_segments, Some(1)); + assert!(users_id_idx.size_bytes.is_some_and(|size| size > 0)); + assert!(users_id_idx.created_at.is_some()); + assert!(users_id_idx.index_version.is_some()); + assert!(users_id_idx.index_details.is_some()); + let dataset = open_dataset(&namespace, "users").await; let expected_transaction_id = dataset .read_transaction() @@ -4900,6 +6744,44 @@ mod tests { assert_eq!(second_page.indexes.len(), 1); assert_eq!(second_page.indexes[0].index_name, "users_id_idx"); assert!(second_page.page_token.is_none()); + + // A vector index exercises a different type_url, index_type, and details payload. 
+ create_vector_table(&namespace, "vectors").await; + let mut create_index_request = + CreateTableIndexRequest::new("vector".to_string(), "IVF_FLAT".to_string()); + create_index_request.id = Some(vec!["vectors".to_string()]); + create_index_request.name = Some("vector_idx".to_string()); + create_index_request.distance_type = Some("l2".to_string()); + namespace + .create_table_index(create_index_request) + .await + .unwrap(); + + let vector_response = namespace + .list_table_indices(ListTableIndicesRequest { + id: Some(vec!["vectors".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + + assert_eq!(vector_response.indexes.len(), 1); + let vector_idx = &vector_response.indexes[0]; + assert_eq!(vector_idx.index_name, "vector_idx"); + assert_eq!(vector_idx.columns, vec!["vector"]); + assert_eq!(vector_idx.index_type.as_deref(), Some("IVF_FLAT")); + assert!( + vector_idx + .type_url + .as_deref() + .is_some_and(|s| !s.is_empty()) + ); + assert!(vector_idx.num_indexed_rows.is_some()); + assert!(vector_idx.num_unindexed_rows.is_some()); + assert_eq!(vector_idx.num_segments, Some(1)); + assert!(vector_idx.created_at.is_some()); + assert!(vector_idx.index_version.is_some()); + assert!(vector_idx.index_details.is_some()); } #[tokio::test] @@ -9277,155 +11159,6 @@ mod tests { } } - /// Tests for multi-table transaction support via table_version_storage_enabled. 
- mod multi_table_transactions { - use super::*; - use futures::TryStreamExt; - use lance::dataset::builder::DatasetBuilder; - use lance_namespace::models::CreateTableVersionRequest; - - /// Helper to create a namespace with table_version_storage_enabled enabled - async fn create_managed_namespace(temp_path: &str) -> Arc { - Arc::new( - DirectoryNamespaceBuilder::new(temp_path) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .manifest_enabled(true) - .build() - .await - .unwrap(), - ) - } - - /// Helper to create a table and get its staging manifest path - async fn create_table_and_get_staging( - namespace: Arc, - table_name: &str, - ) -> (Vec, object_store::path::Path) { - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - let mut create_req = CreateTableRequest::new(); - create_req.id = Some(vec![table_name.to_string()]); - namespace - .create_table(create_req, bytes::Bytes::from(ipc_data)) - .await - .unwrap(); - - let table_id = vec![table_name.to_string()]; - let dataset = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) - .await - .unwrap() - .load() - .await - .unwrap(); - - // Find existing manifest and create a staging copy - let versions_path = dataset.versions_dir(); - let manifest_metas: Vec<_> = dataset - .object_store(None) - .await - .unwrap() - .inner - .list(Some(&versions_path)) - .try_collect() - .await - .unwrap(); - - let manifest_meta = manifest_metas - .iter() - .find(|m| { - m.location - .filename() - .map(|f| f.ends_with(".manifest")) - .unwrap_or(false) - }) - .expect("No manifest file found"); - - let manifest_data = dataset - .object_store(None) - .await - .unwrap() - .inner - .get(&manifest_meta.location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - - let staging_path = dataset - .versions_dir() - .join(format!("staging_{}", table_name)); - dataset - .object_store(None) - .await - .unwrap() - .inner - .put(&staging_path, 
manifest_data.into()) - .await - .unwrap(); - - (table_id, staging_path) - } - - #[tokio::test] - async fn test_table_version_storage_enabled_requires_manifest() { - // table_version_storage_enabled=true requires manifest_enabled=true - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); - - let result = DirectoryNamespaceBuilder::new(temp_path) - .table_version_storage_enabled(true) - .manifest_enabled(false) - .build() - .await; - - assert!( - result.is_err(), - "Should fail when table_version_storage_enabled=true but manifest_enabled=false" - ); - } - - #[tokio::test] - async fn test_create_table_version_records_in_manifest() { - // When table_version_storage_enabled is enabled, single create_table_version - // should also record the version in __manifest - let temp_dir = TempStrDir::default(); - let temp_path: &str = &temp_dir; - - let namespace = create_managed_namespace(temp_path).await; - let ns: Arc = namespace.clone(); - - let (table_id, staging_path) = - create_table_and_get_staging(ns.clone(), "table_managed").await; - - // Create version 2 - let mut create_req = CreateTableVersionRequest::new(2, staging_path.to_string()); - create_req.id = Some(table_id.clone()); - create_req.naming_scheme = Some("V2".to_string()); - let response = namespace.create_table_version(create_req).await.unwrap(); - - assert!(response.version.is_some()); - let version = response.version.unwrap(); - assert_eq!(version.version, 2); - - // Verify the version is recorded in __manifest by querying it - let manifest_ns = namespace.manifest_ns.as_ref().unwrap(); - let table_id_str = manifest::ManifestNamespace::str_object_id(&table_id); - let versions = manifest_ns - .query_table_versions(&table_id_str, false, None) - .await - .unwrap(); - - assert!( - !versions.is_empty(), - "Version should be recorded in __manifest" - ); - let (ver, _path) = &versions[0]; - assert_eq!(*ver, 2, "Recorded version should be 2"); - } - } - #[tokio::test] async fn 
test_list_all_tables() { use lance_namespace::models::ListTablesRequest; @@ -9479,6 +11212,55 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_add_columns() { + use lance_namespace::models::{ + AddColumnsEntry, AlterTableAddColumnsRequest, DescribeTableRequest, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Add a new column + let mut new_col = AddColumnsEntry::new("doubled_id".to_string()); + new_col.expression = Some(Some("id * 2".to_string())); + let mut add_request = AlterTableAddColumnsRequest::new(vec![new_col]); + add_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_add_columns(add_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after adding columns" + ); + + // Verify via describe_table + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"doubled_id"), + "Column 'doubled_id' should exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_update_table_schema_metadata() { use lance_namespace::models::UpdateTableSchemaMetadataRequest; @@ -9506,6 +11288,72 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_add_columns_missing_id() { + use 
lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let new_col = AddColumnsEntry::new("col".to_string()); + let request = AlterTableAddColumnsRequest::new(vec![new_col]); + let result = namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_alter_columns_rename() { + use lance_namespace::models::{ + AlterColumnsEntry, AlterTableAlterColumnsRequest, DescribeTableRequest, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Rename "name" to "full_name" + let mut entry = AlterColumnsEntry::new("name".to_string()); + entry.rename = Some(Some("full_name".to_string())); + let mut alter_request = AlterTableAlterColumnsRequest::new(vec![entry]); + alter_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_alter_columns(alter_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after altering columns" + ); + + // Verify the rename + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"full_name"), + "Column should be renamed to 
'full_name', got: {:?}", + field_names + ); + assert!( + !field_names.contains(&"name"), + "Old column 'name' should not exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_get_table_stats() { use lance_namespace::models::GetTableStatsRequest; @@ -9557,6 +11405,68 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_alter_columns_missing_id() { + use lance_namespace::models::{AlterColumnsEntry, AlterTableAlterColumnsRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let entry = AlterColumnsEntry::new("name".to_string()); + let request = AlterTableAlterColumnsRequest::new(vec![entry]); + let result = namespace.alter_table_alter_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_drop_columns() { + use lance_namespace::models::{AlterTableDropColumnsRequest, DescribeTableRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Drop the "name" column + let mut drop_request = AlterTableDropColumnsRequest::new(vec!["name".to_string()]); + drop_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_drop_columns(drop_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after dropping columns" + ); + + // Verify column was dropped + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + 
assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + !field_names.contains(&"name"), + "Column 'name' should be dropped, got: {:?}", + field_names + ); + assert!( + field_names.contains(&"id"), + "Column 'id' should still exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_analyze_table_query_plan() { use lance_namespace::models::AnalyzeTableQueryPlanRequest; @@ -9766,17 +11676,51 @@ mod tests { // describe_table follows the same path when the table is not yet registered in __manifest. listing_count.store(0, Ordering::SeqCst); - let mut describe_req = DescribeTableRequest::new(); - describe_req.id = Some(vec!["test_table".to_string()]); - hybrid_ns.describe_table(describe_req).await.unwrap(); + let mut describe_req = DescribeTableRequest::new(); + describe_req.id = Some(vec!["test_table".to_string()]); + hybrid_ns.describe_table(describe_req).await.unwrap(); + + let count = listing_count.load(Ordering::SeqCst); + assert_eq!( + count, 1, + "Expected exactly 1 listing call for describe_table with migration mode \ + (table directory fallback; manifest reload uses the version hint), but got {}", + count + ); + } + + #[tokio::test] + async fn test_manifest_reload_observes_new_version_from_other_namespace() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace_a = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_a, "alpha").await; + + let namespace_b = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_b, "beta").await; + + let response = namespace_a + .list_tables(ListTablesRequest { + id: Some(vec![]), + 
..Default::default() + }) + .await + .unwrap(); - let count = listing_count.load(Ordering::SeqCst); - assert_eq!( - count, 1, - "Expected exactly 1 listing call for describe_table with migration mode \ - (table directory fallback; manifest reload uses the version hint), but got {}", - count - ); + let mut tables = response.tables; + tables.sort(); + assert_eq!(tables, vec!["alpha", "beta"]); } #[tokio::test] @@ -9910,6 +11854,400 @@ mod tests { (namespace, temp_dir, table_id) } + /// Downcast a lance-core error to its NamespaceError code for precise assertions. + fn namespace_code(err: &Error) -> Option { + match err { + Error::Namespace { source, .. } => { + source.downcast_ref::().map(|e| e.code()) + } + _ => None, + } + } + + #[tokio::test] + async fn test_create_and_list_branches() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "staging".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + resp.branches.len(), + 2, + "expected 2 branches, got: {:?}", + resp.branches + ); + assert!(resp.branches.contains_key("dev")); + assert!(resp.branches.contains_key("staging")); + assert!(resp.page_token.is_none()); + + // Deleting one branch is reflected in a subsequent list. 
+ namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!(resp.branches.len(), 1, "expected 1 branch after delete"); + assert!(!resp.branches.contains_key("dev")); + assert!(resp.branches.contains_key("staging")); + } + + #[tokio::test] + async fn test_create_branch_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; + + // Fork explicitly from version 1 of main. + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "from-v1".to_string(), + from_version: Some(1), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap(); + let branch = resp + .branches + .get("from-v1") + .expect("forked branch should be listed"); + assert_eq!( + branch.parent_version, 1, + "branch should fork from version 1" + ); + assert!( + branch.parent_branch.is_none(), + "a branch forked from main has no parent branch" + ); + } + + /// Forking from a NON-main source branch must clone that branch's chain. + /// Both chains are given a version 2 with diverged content, so a clone that + /// wrongly resolves the version under main succeeds silently with main's + /// data instead of erroring. 
+ #[tokio::test] + async fn test_create_branch_from_other_branch() { + use lance::dataset::builder::DatasetBuilder; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // main v1: ids [1, 2, 3] + // dev: forked at v1, one append (ids 100, 101) -> dev v2 + create_branch_with_commits(&namespace, "users", "dev", 1).await; + // Diverge main to the same version number with different content. + let main_ds = open_dataset(&namespace, "users").await; + append_scalar_version(main_ds.uri(), 500).await; // main v2: + ids [500, 501] + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(vec!["users".to_string()]), + name: "child".to_string(), + from_branch: Some("dev".to_string()), + from_version: Some(2), + ..Default::default() + }) + .await + .unwrap(); + + let child_ds = DatasetBuilder::from_uri(main_ds.uri()) + .with_branch("child", None) + .load() + .await + .unwrap(); + let ids = scan_id_column(&child_ds).await; + assert!( + ids.contains(&100) && ids.contains(&101), + "child must contain dev's appended rows, got: {:?}", + ids + ); + assert!( + !ids.contains(&500), + "child must not contain main's diverged rows, got: {:?}", + ids + ); + + // The recorded metadata and the cloned data must agree on the parent. 
+ let listed = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(vec!["users".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + listed + .branches + .get("child") + .unwrap() + .parent_branch + .as_deref(), + Some("dev") + ); + } + + #[tokio::test] + async fn test_create_existing_branch_conflict() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::TableBranchAlreadyExists), + "expected TableBranchAlreadyExists, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("already exists"), + "expected already-exists message, got: {}", + err + ); + } + + #[tokio::test] + async fn test_delete_unknown_branch() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: "does-not-exist".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::TableBranchNotFound), + "expected TableBranchNotFound, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("not found"), + "expected not-found message, got: {}", + err + ); + } + + #[tokio::test] + async fn test_delete_referenced_branch_conflict() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + // A child forked from `parent` (via from_branch) makes `parent` a referenced branch. 
+ namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "parent".to_string(), + ..Default::default() + }) + .await + .unwrap(); + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "child".to_string(), + from_branch: Some("parent".to_string()), + ..Default::default() + }) + .await + .unwrap(); + + // from_branch resolution: the child records its parent branch as its fork point. + let listed = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + let child = listed + .branches + .get("child") + .expect("child branch should be listed"); + assert_eq!( + child.parent_branch.as_deref(), + Some("parent"), + "child should record parent branch as its fork point" + ); + assert!( + child.parent_version >= 1, + "child should record the parent version it forked from, got {}", + child.parent_version + ); + + // Deleting a branch that still has dependents is refused. The delete spec has no 409, + // so it surfaces as a documented InvalidInput (400), not a conflict status. 
+ let err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: "parent".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "expected InvalidInput for deleting a referenced branch, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("referenced"), + "error should explain the branch is still referenced, got: {}", + err + ); + } + + #[tokio::test] + async fn test_branch_name_required() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let create_err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: String::new(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&create_err), + Some(ErrorCode::InvalidInput), + "empty name on create should be InvalidInput, got: {}", + create_err + ); + assert!( + create_err + .to_string() + .to_lowercase() + .contains("must not be empty") + ); + + let delete_err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: String::new(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&delete_err), + Some(ErrorCode::InvalidInput), + "empty name on delete should be InvalidInput, got: {}", + delete_err + ); + assert!( + delete_err + .to_string() + .to_lowercase() + .contains("must not be empty") + ); + } + + #[tokio::test] + async fn test_create_branch_rejects_negative_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + from_version: Some(-1), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "negative from_version should be InvalidInput, got: {}", + err + ); + 
assert!(err.to_string().to_lowercase().contains("from_version")); + } + + #[tokio::test] + async fn test_create_branch_nonexistent_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + // Version 999 does not exist (the table has 2 versions). create_branch's clone phase + // raises DatasetNotFound, which we map to a documented InvalidInput (400). + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + from_version: Some(999), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "non-existent from_version should map to InvalidInput, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("does not exist"), + "error should name the missing source, got: {}", + err + ); + } + #[tokio::test] async fn test_create_and_list_tags() { let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; @@ -9962,6 +12300,7 @@ mod tests { get_req.id = Some(table_id); let resp = namespace.get_table_tag_version(get_req).await.unwrap(); assert_eq!(resp.version, 2); + assert_eq!(resp.branch, None); } #[tokio::test] @@ -10087,4 +12426,26 @@ mod tests { err ); } + #[tokio::test] + async fn test_alter_table_drop_columns_missing_id() { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + let result = namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_drop_columns_nonexistent_table() { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let mut request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + request.id = 
Some(vec!["nonexistent".to_string()]); + let result = namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table does not exist"); + } } diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 0e22f1e8b69..c05db91ad56 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -6,52 +6,74 @@ //! This module provides a namespace implementation that uses a manifest table //! to track tables and nested namespaces. +use super::manifest_feature_flags::{ensure_readable, ensure_writable}; use arrow::array::builder::{ListBuilder, StringBuilder}; -use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray}; -use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use arrow::array::{Array, ListArray, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; use arrow_ipc::reader::StreamReader; use async_trait::async_trait; use bytes::Bytes; -use futures::{FutureExt, TryStreamExt, stream::StreamExt}; -use lance::dataset::optimize::{CompactionOptions, compact_files}; +use datafusion_common::DataFusionError; +use datafusion_physical_plan::{ + SendableRecordBatchStream, + stream::RecordBatchStreamAdapter as DatafusionRecordBatchStreamAdapter, +}; +use futures::{ + FutureExt, TryStreamExt, + stream::{self, StreamExt}, +}; +use lance::dataset::index::LanceIndexStoreExt; +use lance::dataset::transaction::{Operation, Transaction}; use lance::dataset::{ - DeleteBuilder, MergeInsertBuilder, ReadParams, WhenMatched, WhenNotMatched, WriteMode, - WriteParams, builder::DatasetBuilder, + InsertBuilder, ReadParams, WhenMatched, WriteMode, WriteParams, builder::DatasetBuilder, }; -use lance::index::DatasetIndexExt; use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; use lance_core::Error as LanceError; use 
lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; -use lance_core::{Error, Result}; -use lance_index::IndexType; -use lance_index::optimize::OptimizeOptions; -use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; +use lance_core::{Error, ROW_ID, Result, box_error}; +use lance_index::progress::noop_progress; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::registry::VALUE_COLUMN_NAME; +use lance_index::scalar::{BuiltinIndexType, CreatedIndex, ScalarIndexParams}; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; +use lance_io::stream::RecordBatchStream as LanceRecordBatchStream; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTableVersionResponse, DropNamespaceRequest, - DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, - NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - TableVersion, + DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, + DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, + ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, + TableExistsRequest, }; use lance_namespace::schema::arrow_schema_to_json; +use 
lance_table::feature_flags::apply_feature_flags; +use lance_table::format::{Fragment, IndexMetadata, Manifest}; +use lance_table::io::commit::{ + CommitError, CommitHandler, commit_handler_from_url, write_manifest_file_to_path, +}; use object_store::{Error as ObjectStoreError, path::Path}; +use roaring::RoaringBitmap; use std::io::Cursor; +use std::time::{SystemTime, UNIX_EPOCH}; use std::{ - collections::HashMap, + collections::{BTreeMap, HashMap, HashSet}, hash::{DefaultHasher, Hash, Hasher}, ops::{Deref, DerefMut}, - sync::Arc, + sync::{Arc, Mutex as StdMutex, MutexGuard as StdMutexGuard}, }; use tokio::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use uuid::Uuid; const MANIFEST_TABLE_NAME: &str = "__manifest"; +const LANCE_DATA_DIR: &str = "data"; +const LANCE_INDICES_DIR: &str = "_indices"; const DELIMITER: &str = "$"; /// Bounded concurrency for per-table `_versions/` probes when filtering declared tables. /// Higher values reduce latency but increase burst load against the object store. @@ -64,24 +86,23 @@ const OBJECT_ID_INDEX_NAME: &str = "object_id_btree"; const OBJECT_TYPE_INDEX_NAME: &str = "object_type_bitmap"; /// LabelList index on the base_objects column for view dependencies const BASE_OBJECTS_INDEX_NAME: &str = "base_objects_label_list"; -/// Inline maintenance on the manifest table is expensive relative to a single-row mutation. -/// Wait until enough fragments accumulate before compacting files or merging indices. -const MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD: usize = 8; +// Each retry reloads and rewrites the full manifest. Match the regular Lance +// commit retry budget so multi-process namespace writes can make progress. 
+const DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES: u32 = 20; +const MANIFEST_INDEX_BATCH_SIZE: usize = 8192; /// Object types that can be stored in the manifest #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ObjectType { Namespace, Table, - TableVersion, } impl ObjectType { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Namespace => "namespace", Self::Table => "table", - Self::TableVersion => "table_version", } } @@ -89,7 +110,6 @@ impl ObjectType { match s { "namespace" => Ok(Self::Namespace), "table" => Ok(Self::Table), - "table_version" => Ok(Self::TableVersion), _ => Err(NamespaceError::Internal { message: format!("Invalid object type: {}", s), } @@ -152,7 +172,7 @@ pub struct TableInfo { pub struct ManifestEntry { /// The unique object identifier (e.g., table name or version object_id) pub object_id: String, - /// The type of the object (Namespace, Table, or TableVersion) + /// The type of the object (Namespace or Table) pub object_type: ObjectType, /// The storage location (e.g., directory name for tables) pub location: Option, @@ -160,6 +180,401 @@ pub struct ManifestEntry { pub metadata: Option, } +struct CopyOnWriteMutation { + result: T, + has_changes: bool, +} + +impl CopyOnWriteMutation { + fn updated(result: T) -> Self { + Self { + result, + has_changes: true, + } + } + + fn unchanged(result: T) -> Self { + Self { + result, + has_changes: false, + } + } +} + +struct ManifestIndexBuildInput { + index_name: &'static str, + column_name: &'static str, + params: ScalarIndexParams, + field: Field, + stream: SendableRecordBatchStream, +} + +struct ManifestTrainedIndex { + index_name: &'static str, + column_name: &'static str, + uuid: Uuid, + created_index: CreatedIndex, +} + +struct ManifestRowValue { + object_id: String, + object_type: ObjectType, + location: Option, + metadata: Option, + base_objects: Option>, +} + +struct ManifestOutputRow<'a> { + object_id: &'a str, + object_type: ObjectType, + location: 
Option<&'a str>, + metadata: Option<&'a str>, + base_objects: Option<&'a [String]>, +} + +#[derive(Default)] +struct ManifestIndexAccumulator { + object_ids: BTreeMap, u64>, + object_types: BTreeMap<&'static str, RoaringBitmap>, + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + row_count: u64, +} + +impl ManifestIndexAccumulator { + fn next_row_id(&self) -> Result { + if self.row_count >= u64::from(u32::MAX) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest rewrite exceeded maximum single-fragment row count: {}", + self.row_count + ), + } + .into()); + } + Ok(self.row_count) + } + + fn push(&mut self, row: &ManifestOutputRow<'_>) -> Result { + let row_id = self.next_row_id()?; + if self + .object_ids + .insert(Arc::::from(row.object_id), row_id) + .is_some() + { + return Err(NamespaceError::Internal { + message: format!("Manifest contains duplicate object_id '{}'", row.object_id), + } + .into()); + } + self.object_types + .entry(row.object_type.as_str()) + .or_default() + .insert(row_id as u32); + self.base_objects_values + .push(row.base_objects.map(|objects| objects.to_vec())); + self.base_objects_row_ids.push(row_id); + self.row_count += 1; + Ok(row_id) + } +} + +struct ManifestBatchBuilder { + object_ids: Vec, + object_types: Vec<&'static str>, + locations: Vec>, + metadatas: Vec>, + base_objects: Vec>>, +} + +impl ManifestBatchBuilder { + fn new() -> Self { + Self { + object_ids: Vec::new(), + object_types: Vec::new(), + locations: Vec::new(), + metadatas: Vec::new(), + base_objects: Vec::new(), + } + } + + fn is_empty(&self) -> bool { + self.object_ids.is_empty() + } + + fn append( + &mut self, + index_data: &mut ManifestIndexAccumulator, + row: ManifestOutputRow<'_>, + ) -> Result<()> { + index_data.push(&row)?; + self.object_ids.push(row.object_id.to_string()); + self.object_types.push(row.object_type.as_str()); + self.locations.push(row.location.map(ToString::to_string)); + 
self.metadatas.push(row.metadata.map(ToString::to_string)); + self.base_objects + .push(row.base_objects.map(|objects| objects.to_vec())); + Ok(()) + } + + fn finish(self) -> Result { + let base_objects_array = ManifestNamespace::base_objects_array(&self.base_objects); + RecordBatch::try_new( + ManifestNamespace::manifest_schema(), + vec![ + Arc::new(StringArray::from(self.object_ids)), + Arc::new(StringArray::from(self.object_types)), + Arc::new(StringArray::from(self.locations)), + Arc::new(StringArray::from(self.metadatas)), + Arc::new(base_objects_array), + ], + ) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest snapshot batch: {:?}", e), + }) + }) + } +} + +/// How to resolve a storage commit conflict (or an ambiguous commit error that did +/// not land) against the latest catalog state, without re-staging the full rewrite. +enum ConflictResolution { + /// Re-read the latest manifest and re-apply the mutation (upserts, version-range + /// deletes). The staged data/index files are discarded and a new rewrite is attempted. + Retry, + /// Creating these object ids with fail-on-conflict semantics. If any of them now + /// exists in the latest manifest, the create lost the race and must fail with a + /// concurrent-modification error; otherwise retry the rewrite. + FailIfExists(Vec), + /// Deleting `object_id`. If it is already absent from the latest manifest the delete + /// has effectively happened, so return `output` as success; otherwise retry. 
+ SucceedIfAbsent { object_id: String, output: O }, +} + +trait ManifestStreamMutation: Send { + type Output: Clone + Send + 'static; + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn finish(&self) -> CopyOnWriteMutation; + + /// Declares how a storage commit conflict should be resolved against the latest + /// committed catalog state. Defaults to re-reading and re-applying. + fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::Retry + } +} + +struct ManifestRewriteShared { + mutation: M, + index_data: Option, + result: Option>, + error: Option, +} + +impl ManifestRewriteShared { + fn new(mutation: M) -> Self { + Self { + mutation, + index_data: Some(ManifestIndexAccumulator::default()), + result: None, + error: None, + } + } +} + +struct UpsertManifestMutation { + entries: Vec, + base_objects: Vec>>, + entry_positions: HashMap, + matched: Vec, + when_matched: WhenMatched, +} + +impl UpsertManifestMutation { + fn new( + entries: Vec, + base_objects: Option>, + when_matched: WhenMatched, + ) -> Self { + let entry_positions = entries + .iter() + .enumerate() + .map(|(index, entry)| (entry.object_id.clone(), index)) + .collect(); + let matched = vec![false; entries.len()]; + let mut entry_base_objects = vec![None; entries.len()]; + if !entry_base_objects.is_empty() { + entry_base_objects[0] = base_objects; + } + Self { + entries, + base_objects: entry_base_objects, + entry_positions, + matched, + when_matched, + } + } + + fn entry_row(&self, index: usize) -> ManifestOutputRow<'_> { + let entry = &self.entries[index]; + ManifestOutputRow { + object_id: &entry.object_id, + object_type: entry.object_type, + location: entry.location.as_deref(), + metadata: entry.metadata.as_deref(), + 
base_objects: self.base_objects[index].as_deref(), + } + } +} + +impl ManifestStreamMutation for UpsertManifestMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if let Some(index) = self.entry_positions.get(&row.object_id).copied() { + match self.when_matched { + WhenMatched::Fail => { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + row.object_id + ), + } + .into()); + } + WhenMatched::UpdateAll => { + self.matched[index] = true; + output.append(index_data, self.entry_row(index))?; + return Ok(()); + } + _ => { + return Err(NamespaceError::Internal { + message: format!( + "Unsupported manifest rewrite matched action: {:?}", + self.when_matched + ), + } + .into()); + } + } + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + for index in 0..self.entries.len() { + if !self.matched[index] { + output.append(index_data, self.entry_row(index))?; + } + } + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + CopyOnWriteMutation::updated(()) + } + + fn conflict_resolution(&self) -> ConflictResolution { + match self.when_matched { + // Fail-on-conflict create: a concurrent writer may have created one of these + // ids. Re-applying would still fail, so check directly instead of re-staging. + WhenMatched::Fail => ConflictResolution::FailIfExists( + self.entries.iter().map(|e| e.object_id.clone()).collect(), + ), + // Metadata upsert is last-writer-wins: re-read and re-apply. 
+ _ => ConflictResolution::Retry, + } + } +} + +struct DeleteObjectMutation { + object_id: String, + deleted: bool, +} + +impl ManifestStreamMutation for DeleteObjectMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if row.object_id == self.object_id { + self.deleted = true; + return Ok(()); + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + _output: &mut ManifestBatchBuilder, + _index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + if self.deleted { + CopyOnWriteMutation::updated(()) + } else { + CopyOnWriteMutation::unchanged(()) + } + } + + fn conflict_resolution(&self) -> ConflictResolution { + // If a concurrent writer already removed the object, the delete is satisfied. + ConflictResolution::SucceedIfAbsent { + object_id: self.object_id.clone(), + output: (), + } + } +} + /// Information about a namespace stored in the manifest #[derive(Debug, Clone)] pub struct NamespaceInfo { @@ -171,13 +586,23 @@ pub struct NamespaceInfo { /// A wrapper around a Dataset that provides concurrent access. /// /// This can be cloned cheaply. It supports concurrent reads or exclusive writes. -/// The manifest dataset is always kept strongly consistent by reloading on each read. +/// The manifest dataset uses contiguous attached versions and this module never +/// runs old-version cleanup on it, allowing reads to check only the immediate +/// successor manifest before deciding whether a reload is needed. 
#[derive(Debug, Clone)] pub struct DatasetConsistencyWrapper(Arc>); impl DatasetConsistencyWrapper { /// Create a new wrapper with the given dataset. pub fn new(dataset: Dataset) -> Self { + debug_assert!( + !dataset + .manifest() + .config + .keys() + .any(|key| key.starts_with("lance.auto_cleanup.")), + "the directory manifest dataset must not enable old-version cleanup" + ); Self(Arc::new(RwLock::new(dataset))) } @@ -185,18 +610,35 @@ impl DatasetConsistencyWrapper { /// Always reloads to ensure strong consistency. pub async fn get(&self) -> Result> { self.reload().await?; - Ok(DatasetReadGuard { + let guard = DatasetReadGuard { guard: self.0.read().await, - }) + }; + // Refuse manifests written with a reader feature flag this build does + // not understand instead of misreading them. + ensure_readable(guard.metadata())?; + Ok(guard) + } + + /// Reload the dataset and return a reference. + pub async fn get_refreshed(&self) -> Result> { + self.reload().await?; + let guard = DatasetReadGuard { + guard: self.0.read().await, + }; + ensure_readable(guard.metadata())?; + Ok(guard) } /// Get a mutable reference to the dataset. /// Always reloads to ensure strong consistency. pub async fn get_mut(&self) -> Result> { self.reload().await?; - Ok(DatasetWriteGuard { + let guard = DatasetWriteGuard { guard: self.0.write().await, - }) + }; + ensure_readable(guard.metadata())?; + ensure_writable(guard.metadata())?; + Ok(guard) } /// Provide a known latest version of the dataset. @@ -221,21 +663,25 @@ impl DatasetConsistencyWrapper { dataset_uri, current_version ); - let latest_version = read_guard.latest_version_id().await.map_err(|e| { + // The directory manifest table uses contiguous attached versions and + // does not run old-version cleanup, so the immediate successor probe is + // enough to detect changes without resolving or loading the latest + // manifest on every namespace read. 
+ let has_successor_version = read_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; log::debug!( - "Reload got latest_version={} for uri={}, current_version={}", - latest_version, + "Reload checked successor_version_exists={} for uri={}, current_version={}", + has_successor_version, dataset_uri, current_version ); drop(read_guard); // If already up-to-date, return early - if latest_version == current_version { + if !has_successor_version { log::debug!("Already up-to-date for uri={}", dataset_uri); return Ok(()); } @@ -244,13 +690,13 @@ impl DatasetConsistencyWrapper { let mut write_guard = self.0.write().await; // Double-check after acquiring write lock (someone else might have reloaded) - let latest_version = write_guard.latest_version_id().await.map_err(|e| { + let has_successor_version = write_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; - if latest_version != write_guard.version().version { + if has_successor_version { write_guard.checkout_latest().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!("Failed to checkout latest: {:?}", e), @@ -306,8 +752,8 @@ pub struct ManifestNamespace { /// If true, root namespace tables use {table_name}.lance naming /// If false, they use namespace-prefixed names dir_listing_enabled: bool, - /// Whether to perform inline optimization (compaction and indexing) on the __manifest table - /// after every write. Defaults to true. + /// Whether copy-on-write manifest rewrites should build replacement indices. + /// Defaults to true. 
inline_optimization_enabled: bool, /// Number of retries for commit operations on the manifest table. /// If None, defaults to [`lance_table::io::commit::CommitConfig`] default (20). @@ -401,15 +847,10 @@ impl ManifestNamespace { dir_listing_enabled: bool, inline_optimization_enabled: bool, commit_retries: Option, - table_version_storage_enabled: bool, ) -> Result { - let manifest_dataset = Self::ensure_manifest_table_up_to_date( - &root, - &storage_options, - session.clone(), - table_version_storage_enabled, - ) - .await?; + let manifest_dataset = + Self::ensure_manifest_table_up_to_date(&root, &storage_options, session.clone()) + .await?; Ok(Self { root, @@ -473,34 +914,6 @@ impl ManifestNamespace { format!("table id '{}'", Self::str_object_id(table_id)) } - /// Format a version number as a zero-padded lexicographically sortable string. - /// - /// Versions are stored as 20-digit zero-padded integers (e.g., `00000000000000000001` - /// for version 1) so that string-based range queries and sorting work correctly. - pub fn format_table_version(version: i64) -> String { - format!("{:020}", version) - } - - /// Build the object_id for a table version entry. - /// - /// Format: `{table_object_id}${zero_padded_version}` - pub fn build_version_object_id(table_object_id: &str, version: i64) -> String { - format!( - "{}{}{}", - table_object_id, - DELIMITER, - Self::format_table_version(version) - ) - } - - /// Parse a version number from the version suffix of a table version object_id. - /// - /// The object_id is formatted as `{table_id}${zero_padded_version}`. 
- pub fn parse_version_from_object_id(object_id: &str) -> Option { - let (_namespace, name) = Self::parse_object_id(object_id); - name.parse::().ok() - } - /// Generate a new directory name in format: `_` /// The hash is used to (1) optimize object store throughput, /// (2) have high enough entropy in a short period of time to prevent issues like @@ -556,168 +969,392 @@ impl ManifestNamespace { Ok(full_url.to_string()) } - /// Perform inline optimization on the __manifest table. - /// - /// This method: - /// 1. Creates three indexes on the manifest table: - /// - BTREE index on object_id for fast lookups - /// - Bitmap index on object_type for filtering by type - /// - LabelList index on base_objects for view dependencies - /// 2. Runs file compaction to merge small files - /// 3. Optimizes existing indices - /// - /// This is called automatically after writes when inline_optimization_enabled is true. - async fn run_inline_optimization(&self) -> Result<()> { - if !self.inline_optimization_enabled { - return Ok(()); - } - - // Get a mutable reference to the dataset to perform optimization - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - let dataset: &mut Dataset = &mut dataset_guard; - - // Step 1: Create indexes if they don't already exist - let indices = dataset.load_indices().await?; - - // Check which indexes already exist - let has_object_id_index = indices.iter().any(|idx| idx.name == OBJECT_ID_INDEX_NAME); - let has_object_type_index = indices.iter().any(|idx| idx.name == OBJECT_TYPE_INDEX_NAME); - let has_base_objects_index = indices - .iter() - .any(|idx| idx.name == BASE_OBJECTS_INDEX_NAME); - - // Create BTREE index on object_id - if !has_object_id_index { - log::debug!( - "Creating BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); - let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); - if let Err(e) = dataset - .create_index( - &["object_id"], - IndexType::BTree, - 
Some(OBJECT_ID_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create BTREE index on object_id for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); + fn string_list_array(values: &[Option>], child_name: &str) -> ListArray { + let string_builder = StringBuilder::new(); + let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( + child_name, + DataType::Utf8, + true, + ))); + for value in values { + match value { + Some(objects) => { + for object in objects { + list_builder.values().append_value(object); + } + list_builder.append(true); + } + None => list_builder.append_null(), } } + list_builder.finish() + } - // Create Bitmap index on object_type - if !has_object_type_index { - log::debug!( - "Creating Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["object_type"], - IndexType::Bitmap, - Some(OBJECT_TYPE_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create Bitmap index on object_type for __manifest table: {:?}. 
Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - } - } + fn base_objects_array(values: &[Option>]) -> ListArray { + Self::string_list_array(values, "object_id") + } - // Create LabelList index on base_objects - if !has_base_objects_index { - log::debug!( - "Creating LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["base_objects"], - IndexType::LabelList, - Some(BASE_OBJECTS_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create LabelList index on base_objects for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - } - } + fn value_row_id_schema(value_field: Field) -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + value_field, + Field::new(ROW_ID, DataType::UInt64, false), + ])) + } - let should_compact_and_optimize = - dataset.count_fragments() >= MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD; + fn string_row_id_batch( + schema: SchemaRef, + values: Vec, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(values)), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - if !should_compact_and_optimize { - return Ok(()); - } + fn list_row_id_batch( + schema: SchemaRef, + values: Vec>>, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(Self::string_list_array(&values, "item")), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - // Step 2: Run file compaction - log::debug!("Running file compaction on __manifest table"); - match compact_files(dataset, CompactionOptions::default(), None).await { - 
Ok(compaction_metrics) => { - if compaction_metrics.fragments_removed > 0 { - log::info!( - "Compacted __manifest table: removed {} fragments, added {} fragments", - compaction_metrics.fragments_removed, - compaction_metrics.fragments_added - ); + fn object_id_index_stream(object_ids: BTreeMap, u64>) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let stream = stream::unfold( + (object_ids.into_iter(), false, schema), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value.to_string()); + row_ids.push(row_id); } - } - Err(e) => { - log::warn!( - "Failed to compact files for __manifest table: {:?}. Continuing with optimization.", - e - ); - } - } - - // Step 3: Optimize indices - log::debug!("Optimizing indices on __manifest table"); - match dataset.optimize_indices(&OptimizeOptions::default()).await { - Ok(_) => { - log::info!("Successfully optimized indices on __manifest table"); - } - Err(e) => { - log::warn!( - "Failed to optimize indices on __manifest table: {:?}. 
Continuing anyway.", - e - ); - } - } - - Ok(()) + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) } - /// Get the manifest schema + fn object_type_index_stream( + object_types: BTreeMap<&'static str, RoaringBitmap>, + ) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let entries = object_types + .into_iter() + .map(|(value, bitmap)| { + ( + value, + Box::new(bitmap.into_iter()) as Box + Send>, + ) + }) + .collect::>() + .into_iter(); + let stream = stream::unfold( + (entries, None, false, schema), + |(mut entries, mut current, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + while values.len() < MANIFEST_INDEX_BATCH_SIZE { + if current.is_none() { + current = entries.next(); + } + let Some((value, iter)) = current.as_mut() else { + break; + }; + if let Some(row_id) = iter.next() { + values.push((*value).to_string()); + row_ids.push(u64::from(row_id)); + } else { + current = None; + } + } + + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| 
DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } + + fn base_objects_index_stream( + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + ) -> SendableRecordBatchStream { + let schema = Self::value_row_id_schema(Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + )); + let stream_schema = schema.clone(); + let stream = stream::unfold( + ( + base_objects_values.into_iter().zip(base_objects_row_ids), + false, + schema, + ), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value); + row_ids.push(row_id); + } + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } + + async fn train_manifest_index( + dataset: &Dataset, + registry: Arc, + input: ManifestIndexBuildInput, + index_uuid: Uuid, + ) -> Result { + let index_store = LanceIndexStore::from_dataset_for_new(dataset, &index_uuid)?; + let plugin = registry.get_plugin_by_name(&input.params.index_type)?; + let training_request = plugin + .new_training_request(input.params.params.as_deref().unwrap_or("{}"), &input.field)?; + let created_index = plugin + .train_index( + input.stream, + &index_store, + 
training_request, + None, + noop_progress(), + ) + .await?; + Ok(ManifestTrainedIndex { + index_name: input.index_name, + column_name: input.column_name, + uuid: index_uuid, + created_index, + }) + } + + fn manifest_index_metadata( + lance_schema: &lance_core::datatypes::Schema, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + trained_index: ManifestTrainedIndex, + ) -> Result { + Ok(IndexMetadata { + uuid: trained_index.uuid, + fields: vec![lance_schema.field_id(trained_index.column_name)?], + name: trained_index.index_name.to_string(), + dataset_version, + fragment_bitmap: Some(fragment_bitmap.clone()), + index_details: Some(Arc::new(trained_index.created_index.index_details)), + index_version: trained_index.created_index.index_version as i32, + created_at: None, + base_id: None, + files: Some(trained_index.created_index.files), + }) + } + + fn manifest_fragment_bitmap(manifest: &Manifest) -> Result { + let mut bitmap = RoaringBitmap::new(); + for fragment in manifest.fragments.iter() { + let fragment_id = u32::try_from(fragment.id).map_err(|_| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Manifest fragment id {} exceeds u32", fragment.id), + }) + })?; + bitmap.insert(fragment_id); + } + Ok(bitmap) + } + + fn manifest_from_overwrite_transaction( + previous: &Manifest, + schema: lance_core::datatypes::Schema, + fragments: &[Fragment], + ) -> Manifest { + let mut next_fragment_id = 0; + let mut fragments = fragments + .iter() + .cloned() + .map(|mut fragment| { + if fragment.id == 0 { + fragment.id = next_fragment_id; + next_fragment_id += 1; + } + fragment + }) + .collect::>(); + fragments.sort_by_key(|fragment| fragment.id); + Manifest::new_from_previous(previous, schema, Arc::new(fragments)) + } + + async fn build_manifest_indices( + dataset: &Dataset, + manifest: &Manifest, + index_data: ManifestIndexAccumulator, + index_uuids: [Uuid; 3], + ) -> Result> { + let fragment_bitmap = Self::manifest_fragment_bitmap(manifest)?; 
+ let schema = &manifest.schema; + let ManifestIndexAccumulator { + object_ids, + object_types, + base_objects_values, + base_objects_row_ids, + .. + } = index_data; + let [object_id_uuid, object_type_uuid, base_objects_uuid] = index_uuids; + let registry = IndexPluginRegistry::with_default_plugins(); + + let dataset_version = manifest.version; + let object_id_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_ID_INDEX_NAME, + column_name: "object_id", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::BTree), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_id_index_stream(object_ids), + }, + &fragment_bitmap, + dataset_version, + object_id_uuid, + ); + let object_type_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_TYPE_INDEX_NAME, + column_name: "object_type", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_type_index_stream(object_types), + }, + &fragment_bitmap, + dataset_version, + object_type_uuid, + ); + let base_objects_index_fut = Self::build_manifest_index( + dataset, + registry, + schema, + ManifestIndexBuildInput { + index_name: BASE_OBJECTS_INDEX_NAME, + column_name: "base_objects", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::LabelList), + field: Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + stream: Self::base_objects_index_stream(base_objects_values, base_objects_row_ids), + }, + &fragment_bitmap, + dataset_version, + base_objects_uuid, + ); + + let (object_id_index, object_type_index, base_objects_index) = futures::join!( + object_id_index_fut, + object_type_index_fut, + base_objects_index_fut + ); + + Ok(vec![ + object_id_index?, + object_type_index?, + 
base_objects_index?, + ]) + } + + async fn build_manifest_index( + dataset: &Dataset, + registry: Arc, + lance_schema: &lance_core::datatypes::Schema, + input: ManifestIndexBuildInput, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + index_uuid: Uuid, + ) -> Result { + let trained_index = + Self::train_manifest_index(dataset, registry, input, index_uuid).await?; + Self::manifest_index_metadata( + lance_schema, + fragment_bitmap, + dataset_version, + trained_index, + ) + } + + /// Get the manifest schema fn manifest_schema() -> Arc { Arc::new(ArrowSchema::new(vec![ // Set unenforced primary key on object_id for bloom filter conflict detection @@ -783,6 +1420,627 @@ impl ManifestNamespace { }) } + fn required_string_value<'a>( + array: &'a StringArray, + row: usize, + column_name: &str, + ) -> Result<&'a str> { + if array.is_null(row) { + return Err(NamespaceError::Internal { + message: format!("Manifest column '{}' has null at row {}", column_name, row), + } + .into()); + } + Ok(array.value(row)) + } + + fn optional_string_value(array: &StringArray, row: usize) -> Option { + (!array.is_null(row)).then(|| array.value(row).to_string()) + } + + fn base_objects_column_values(batch: &RecordBatch) -> Result>>> { + let Some(column) = batch.column_by_name("base_objects") else { + return Ok(vec![None; batch.num_rows()]); + }; + let array = column.as_any().downcast_ref::().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Column 'base_objects' is not a list array: {:?}", + column.data_type() + ), + }) + })?; + + let mut values = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + if array.is_null(row) { + values.push(None); + continue; + } + let row_values = array.value(row); + let row_values = row_values + .as_any() + .downcast_ref::() + .ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Column 'base_objects' values are not strings".to_string(), + }) + })?; + let mut 
objects = Vec::with_capacity(row_values.len()); + for value_index in 0..row_values.len() { + if row_values.is_null(value_index) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest column 'base_objects' has null item at row {} item {}", + row, value_index + ), + } + .into()); + } + objects.push(row_values.value(value_index).to_string()); + } + values.push(Some(objects)); + } + Ok(values) + } + + async fn manifest_projected_stream(dataset: &Dataset) -> Result { + let mut scanner = dataset.scan(); + scanner + .project(&[ + "object_id", + "object_type", + "location", + "metadata", + "base_objects", + ]) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project manifest columns: {:?}", e), + }) + })?; + let stream = scanner.try_into_stream().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest stream: {:?}", e), + }) + })?; + let schema = stream.schema(); + let stream = stream.map_err(|err| DataFusionError::External(Box::new(err))); + Ok(Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + ))) + } + + fn manifest_rewrite_commit_retries(&self) -> u32 { + self.commit_retries + .unwrap_or(DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES) + } + + fn lock_manifest_rewrite_shared( + shared: &Arc>>, + ) -> Result>> { + shared.lock().map_err(|_| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite state mutex was poisoned".to_string(), + }) + }) + } + + fn set_manifest_rewrite_error( + shared: &Arc>>, + err: LanceError, + ) { + match shared.lock() { + Ok(mut guard) => { + guard.error = Some(err); + } + Err(poisoned) => { + let mut guard = poisoned.into_inner(); + guard.error = Some(err); + } + } + } + + fn take_manifest_rewrite_error( + shared: &Arc>>, + ) -> Result> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + Ok(guard.error.take()) + } + + fn 
process_manifest_rewrite_batch( + batch: RecordBatch, + shared: &Arc>>, + ) -> Result> { + let object_ids = Self::get_string_column(&batch, "object_id")?; + let object_types = Self::get_string_column(&batch, "object_type")?; + let locations = Self::get_string_column(&batch, "location")?; + let metadatas = Self::get_string_column(&batch, "metadata")?; + let base_objects = Self::base_objects_column_values(&batch)?; + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + for (row, base_objects) in base_objects.into_iter().enumerate().take(batch.num_rows()) { + let row_value = ManifestRowValue { + object_id: Self::required_string_value(object_ids, row, "object_id")?.to_string(), + object_type: ObjectType::parse(Self::required_string_value( + object_types, + row, + "object_type", + )?)?, + location: Self::optional_string_value(locations, row), + metadata: Self::optional_string_value(metadatas, row), + base_objects, + }; + guard + .mutation + .process_existing_row(row_value, &mut output, &mut index_data)?; + } + guard.index_data = Some(index_data); + if output.is_empty() { + return Ok(None); + } + Ok(Some(output.finish()?)) + } + + fn finish_manifest_rewrite_stream( + shared: &Arc>>, + ) -> Result> { + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + guard.mutation.append_rows(&mut output, &mut index_data)?; + let result = guard.mutation.finish(); + let force_empty_batch = index_data.row_count == 0; + guard.result = Some(result); + guard.index_data = 
Some(index_data); + if output.is_empty() && !force_empty_batch { + Ok(None) + } else { + Ok(Some(output.finish()?)) + } + } + + fn manifest_rewrite_output_stream( + source: SendableRecordBatchStream, + shared: Arc>>, + ) -> SendableRecordBatchStream { + enum Phase { + Source, + Finish, + Done, + } + + let schema = Self::manifest_schema(); + let stream = stream::unfold( + (source, shared, Phase::Source), + |(mut source, shared, mut phase)| async move { + loop { + match phase { + Phase::Source => match source.next().await { + Some(Ok(batch)) => { + match Self::process_manifest_rewrite_batch(batch, &shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } + } + Some(Err(err)) => { + return Some((Err(err), (source, shared, Phase::Done))); + } + None => phase = Phase::Finish, + }, + Phase::Finish => { + phase = Phase::Done; + match Self::finish_manifest_rewrite_stream(&shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } + } + Phase::Done => return None, + } + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + )) + } + + fn take_manifest_rewrite_result( + shared: &Arc>>, + ) -> Result<(CopyOnWriteMutation, ManifestIndexAccumulator)> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let result = guard.result.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite 
stream did not finish".to_string(), + }) + })?; + let index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + Ok((result, index_data)) + } + + /// Delete the staged (uncommitted) data files and index directories for a rewrite. + /// Only call this once the rewrite is known *not* to have landed (a put-if-not-exists + /// conflict, or an ambiguous error whose target version does not reference our data + /// file) — otherwise it would orphan files a committed manifest still references. + async fn cleanup_staged_manifest_files( + &self, + object_store: &ObjectStore, + data_files: &HashSet, + index_uuids: &[Uuid], + ) { + let data_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + for path in data_files { + let data_path = data_dir.clone().join(path.as_str()); + if let Err(err) = object_store.delete(&data_path).await { + log::warn!( + "Failed to clean up uncommitted manifest rewrite data file '{}': {}", + data_path, + err + ); + } + } + self.cleanup_uncommitted_manifest_index_dirs(object_store, index_uuids.iter().copied()) + .await; + } + + async fn cleanup_uncommitted_manifest_index_dirs( + &self, + object_store: &ObjectStore, + index_uuids: impl IntoIterator, + ) { + for index_uuid in index_uuids { + let index_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR) + .join(index_uuid.to_string()); + if let Err(err) = object_store.remove_dir_all(index_dir.clone()).await + && !matches!(err, LanceError::NotFound { .. }) + { + log::warn!( + "Failed to clean up uncommitted manifest rewrite index directory '{}': {}", + index_dir, + err + ); + } + } + } + + /// Resolve the commit handler for the `__manifest` dataset's storage backend. 
+ async fn manifest_commit_handler(&self) -> Result> { + commit_handler_from_url(&self.root, &None) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to resolve manifest commit handler: {:?}", e), + }) + }) + } + + /// Directly write the rewritten `__manifest` as a new version using the storage + /// backend's atomic put-if-not-exists. The overwrite transaction is embedded inline + /// (no separate transaction file) and the commit handler writes the version hint. + async fn commit_manifest_overwrite( + &self, + dataset: &Dataset, + commit_handler: &dyn CommitHandler, + manifest: &mut Manifest, + indices: Option>, + transaction: Transaction, + ) -> std::result::Result<(), CommitError> { + apply_feature_flags(manifest, false, false).map_err(CommitError::from)?; + let timestamp_nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + manifest.set_timestamp(timestamp_nanos); + manifest.update_max_fragment_id(); + + // Commit through the dataset's own object store, not `self.object_store`: for + // stores like `memory://` the namespace and the dataset can hold different + // instances, and a commit written to the wrong one is invisible to reads. + let object_store = dataset + .object_store(None) + .await + .map_err(CommitError::from)?; + let base_path = self.base_path.clone().join(MANIFEST_TABLE_NAME); + let naming_scheme = dataset.manifest_location().naming_scheme; + commit_handler + .commit( + manifest, + indices, + &base_path, + &object_store, + write_manifest_file_to_path, + naming_scheme, + Some((&transaction).into()), + ) + .await + .map(|_location| ()) + } + + /// After an ambiguous commit error, determine whether our overwrite actually landed at + /// `target_version`. 
A network failure can leave the manifest committed even though the + /// client observed an error; in that case the committed version references one of our + /// staged data files, and deleting them would corrupt the catalog. + async fn manifest_commit_landed( + &self, + dataset: &Dataset, + target_version: u64, + data_files: &HashSet, + ) -> bool { + let Ok(committed) = dataset.checkout_version(target_version).await else { + return false; + }; + committed.manifest().fragments.iter().any(|fragment| { + fragment + .files + .iter() + .any(|file| data_files.contains(file.path.as_str())) + }) + } + + /// Resolve a storage commit conflict against the latest committed catalog state. + /// Returns `Some(output)` when the mutation's intent is already satisfied (no retry + /// needed), `Ok(None)` to retry the rewrite, or an error for a terminal conflict. + async fn resolve_manifest_conflict( + &self, + resolution: &ConflictResolution, + ) -> Result> { + match resolution { + ConflictResolution::Retry => Ok(None), + ConflictResolution::FailIfExists(object_ids) => { + for object_id in object_ids { + if self.manifest_contains_object(object_id).await? { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + object_id + ), + } + .into()); + } + } + Ok(None) + } + ConflictResolution::SucceedIfAbsent { object_id, output } => { + if self.manifest_contains_object(object_id).await? { + Ok(None) + } else { + Ok(Some(output.clone())) + } + } + } + } + + /// Validate that this build can write the current `__manifest` before a + /// mutating operation performs any side effect (e.g. writing table data), so + /// a refused write leaves nothing orphaned behind. The eventual + /// `rewrite_manifest` commit re-checks `ensure_writable` on each retry, so a + /// concurrent upgrade in between is still caught. 
+ async fn ensure_manifest_writable(&self) -> Result<()> { + let dataset_guard = self.manifest_dataset.get().await?; + ensure_writable(dataset_guard.metadata()) + } + + async fn rewrite_manifest( + &self, + operation: &str, + mut make_mutation: F, + ) -> Result + where + M: ManifestStreamMutation + 'static, + F: FnMut() -> M, + { + let _mutation_guard = self.manifest_mutation_lock.lock().await; + let max_retries = self.manifest_rewrite_commit_retries(); + let mut retries = 0; + let build_indices = self.inline_optimization_enabled; + let commit_handler = self.manifest_commit_handler().await?; + + loop { + let dataset_guard = self.manifest_dataset.get_refreshed().await?; + let dataset = Arc::new(dataset_guard.clone()); + drop(dataset_guard); + // Refuse to mutate a manifest written with a writer feature flag this + // build does not understand. + ensure_writable(dataset.metadata())?; + // Staged files, indices, the commit, and cleanup must all use the dataset's + // own object store (see `commit_manifest_overwrite`). + let object_store = dataset.object_store(None).await?; + + let source = Self::manifest_projected_stream(&dataset).await?; + let resolution = make_mutation().conflict_resolution(); + let shared = Arc::new(StdMutex::new(ManifestRewriteShared::new(make_mutation()))); + let output_stream = Self::manifest_rewrite_output_stream(source, shared.clone()); + // Pin both limits so the overwrite never splits into multiple fragments: the + // replacement indices map each row to address `(0 << 32) | offset`, valid only + // for a single fragment with id 0. The row count is bounded below u32::MAX by + // `ManifestIndexAccumulator::next_row_id`. 
+ let write_params = WriteParams { + mode: WriteMode::Overwrite, + session: self.session.clone(), + max_rows_per_file: u32::MAX as usize, + max_bytes_per_file: usize::MAX, + skip_auto_cleanup: true, + ..WriteParams::default() + }; + + let transaction = match InsertBuilder::new(dataset.clone()) + .with_params(&write_params) + .execute_uncommitted_stream(output_stream) + .await + { + Ok(transaction) => transaction, + Err(err) => { + if let Some(stream_err) = Self::take_manifest_rewrite_error(&shared)? { + return Err(stream_err); + } + return Err(convert_lance_commit_error(&err, operation, None)); + } + }; + + let (mutation, index_data) = Self::take_manifest_rewrite_result(&shared)?; + + let Operation::Overwrite { + fragments, schema, .. + } = &transaction.operation + else { + return Err(NamespaceError::Internal { + message: "Manifest rewrite transaction is not an overwrite".to_string(), + } + .into()); + }; + // Unique data files this attempt staged. Used to clean up orphans and to + // attribute an ambiguous commit error back to us. 
+ let staged_data_files = fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .filter(|file| file.base_id.is_none()) + .map(|file| file.path.clone()) + .collect::>(); + + if !mutation.has_changes { + self.cleanup_staged_manifest_files(&object_store, &staged_data_files, &[]) + .await; + return Ok(mutation.result); + } + + let mut manifest = Self::manifest_from_overwrite_transaction( + dataset.manifest(), + schema.clone(), + fragments, + ); + let target_version = manifest.version; + + let index_uuids = [Uuid::new_v4(), Uuid::new_v4(), Uuid::new_v4()]; + let indices = if build_indices { + match Self::build_manifest_indices(&dataset, &manifest, index_data, index_uuids) + .await + { + Ok(indices) => Some(indices), + Err(err) => { + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + &index_uuids, + ) + .await; + return Err(err); + } + } + } else { + None + }; + let staged_index_uuids: &[Uuid] = if build_indices { &index_uuids } else { &[] }; + + let commit_result = self + .commit_manifest_overwrite( + &dataset, + commit_handler.as_ref(), + &mut manifest, + indices, + transaction, + ) + .await; + + match commit_result { + Ok(()) => { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + Err(err) => { + // The put may have landed even though the client saw an error (lost + // ack). Verify before deleting anything so we never orphan files that a + // committed manifest still references. + if self + .manifest_commit_landed(&dataset, target_version, &staged_data_files) + .await + { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + staged_index_uuids, + ) + .await; + match err { + CommitError::CommitConflict => { + if let Some(output) = + self.resolve_manifest_conflict(&resolution).await? 
+ { + return Ok(output); + } + if retries >= max_retries { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "{}: still conflicting after {} retries", + operation, max_retries + ), + } + .into()); + } + retries += 1; + tokio::time::sleep(std::time::Duration::from_millis( + 10 * u64::from(retries), + )) + .await; + } + CommitError::OtherError(err) => { + return Err(convert_lance_commit_error(&err, operation, None)); + } + } + } + } + } + } + /// Check if the manifest contains an object with the given ID async fn manifest_contains_object(&self, object_id: &str) -> Result { let escaped_id = object_id.replace('\'', "''"); @@ -999,7 +2257,6 @@ impl ManifestNamespace { /// Insert one or more entries into the manifest table with metadata and base_objects. /// /// This is the unified entry point for both single and batch inserts. - /// Uses a single MergeInsert operation to insert all entries at once. /// If any entry already exists (matching object_id), the entire batch fails. 
pub async fn insert_into_manifest_with_metadata( &self, @@ -1029,181 +2286,55 @@ impl ManifestNamespace { return Ok(()); } - let schema = Self::manifest_schema(); - - let mut object_ids = Vec::with_capacity(entries.len()); - let mut object_types = Vec::with_capacity(entries.len()); - let mut locations: Vec> = Vec::with_capacity(entries.len()); - let mut metadatas: Vec> = Vec::with_capacity(entries.len()); + self.rewrite_manifest("Failed to overwrite manifest", || { + UpsertManifestMutation::new(entries.clone(), base_objects.clone(), when_matched.clone()) + }) + .await + } - let string_builder = StringBuilder::new(); - let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( - "object_id", - DataType::Utf8, - true, - ))); + /// Delete an entry from the manifest table + pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { + let object_id = object_id.to_string(); + self.rewrite_manifest("Failed to delete from manifest", || DeleteObjectMutation { + object_id: object_id.clone(), + deleted: false, + }) + .await + } - for (i, entry) in entries.iter().enumerate() { - object_ids.push(entry.object_id.as_str()); - object_types.push(entry.object_type.as_str()); - locations.push(entry.location.clone()); - metadatas.push(entry.metadata.clone()); - - // Only the first entry gets the base_objects (for single-entry inserts - // with base_objects like view creation); batch entries use null. - if i == 0 { - match &base_objects { - Some(objects) => { - for obj in objects { - list_builder.values().append_value(obj); - } - list_builder.append(true); - } - None => { - list_builder.append_null(); - } - } - } else { - list_builder.append_null(); + /// Register a table in the manifest without creating the physical table (internal helper for migration) + pub async fn register_table(&self, name: &str, location: String) -> Result<()> { + let object_id = Self::build_object_id(&[], name); + if self.manifest_contains_object(&object_id).await? 
{ + return Err(NamespaceError::Internal { + message: format!("Table '{}' already exists", name), } + .into()); } - let base_objects_array = list_builder.finish(); - - let location_array: Arc = Arc::new(StringArray::from( - locations.iter().map(|l| l.as_deref()).collect::>(), - )); - - let metadata_array: Arc = Arc::new(StringArray::from( - metadatas.iter().map(|m| m.as_deref()).collect::>(), - )); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(object_ids)), - Arc::new(StringArray::from(object_types.to_vec())), - location_array, - metadata_array, - Arc::new(base_objects_array), - ], - ) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create manifest entries: {:?}", e), - }) - })?; - - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - - // Use MergeInsert so callers can choose fail-on-existing inserts or metadata upserts. - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset_arc = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before merge insert - - let mut merge_builder = - MergeInsertBuilder::try_new(dataset_arc, vec!["object_id".to_string()]).map_err( - |e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create merge builder: {:?}", e), - }) - }, - )?; - merge_builder.when_matched(when_matched); - merge_builder.when_not_matched(WhenNotMatched::InsertAll); - // Use conflict_retries to handle cross-process races on manifest mutations. - merge_builder.conflict_retries(5); - // TODO: after BTREE index creation on object_id, has_scalar_index=true causes - // MergeInsert to use V1 path which lacks bloom filters for conflict detection. This - // results in (Some, None) filter mismatch when rebasing against V2 operations. - // Setting use_index=false ensures all operations consistently use V2 path. 
- merge_builder.use_index(false); - if let Some(retries) = self.commit_retries { - merge_builder.commit_retries(retries); - } - - let (new_dataset_arc, _merge_stats) = merge_builder - .try_build() - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to build merge: {:?}", e), - }) - })? - .execute_reader(Box::new(reader)) + self.insert_into_manifest(object_id, ObjectType::Table, Some(location)) .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to execute merge insert into manifest", None) - })?; - - let new_dataset = Arc::try_unwrap(new_dataset_arc).unwrap_or_else(|arc| (*arc).clone()); - self.manifest_dataset.set_latest(new_dataset).await; - - // Run inline optimization after write - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(()) } - /// Delete an entry from the manifest table - pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { - let predicate = format!("object_id = '{}'", object_id); - - // Get dataset and use DeleteBuilder with configured retries - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before delete - - let new_dataset = DeleteBuilder::new(dataset, &predicate) - .execute() - .await - .map_err(|e| convert_lance_commit_error(&e, "Failed to delete", None))?; - - // Update the wrapper with the new dataset - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; - - // Run inline optimization after delete - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + /// Validate that all levels of a namespace path exist + async fn 
validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { + for i in 1..=namespace_path.len() { + let partial_path = &namespace_path[..i]; + let object_id = partial_path.join(DELIMITER); + if !self.manifest_contains_object(&object_id).await? { + return Err(NamespaceError::NamespaceNotFound { + message: format!("parent namespace '{}'", object_id), + } + .into()); + } } - Ok(()) } - /// Query the manifest for all versions of a table, sorted by version. - /// - /// Returns a list of (version, metadata_json_string) tuples where metadata_json_string - /// contains the full metadata JSON stored in the manifest (manifest_path, manifest_size, - /// e_tag, naming_scheme). - /// - /// **Known limitation**: All matching rows are loaded into memory, sorted in Rust, - /// and then truncated. For tables with a very large number of versions this may be - /// expensive. Pushing sort/limit into the scan is not yet supported by Lance. - pub async fn query_table_versions( - &self, - object_id: &str, - descending: bool, - limit: Option, - ) -> Result> { + /// Query the manifest for a namespace with the given object ID + async fn query_manifest_for_namespace(&self, object_id: &str) -> Result> { let escaped_id = object_id.replace('\'', "''"); - // table_version object_ids are formatted as "{object_id}${zero_padded_version}" - let filter = format!( - "object_type = 'table_version' AND starts_with(object_id, '{}{}')", - escaped_id, DELIMITER - ); + let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id); let mut scanner = self.manifest_scanner().await?; scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { @@ -1217,200 +2348,285 @@ impl ManifestNamespace { })?; let batches = Self::execute_scanner(scanner).await?; - let mut versions: Vec<(i64, String)> = Vec::new(); + let mut found_result: Option = None; + let mut total_rows = 0; + for batch in batches { if batch.num_rows() == 0 { continue; } - let 
object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - for i in 0..batch.num_rows() { - let oid = object_id_array.value(i); - // Parse version from object_id - if let Some(version) = Self::parse_version_from_object_id(oid) { - let metadata_str = metadata_array.value(i).to_string(); - versions.push((version, metadata_str)); + + total_rows += batch.num_rows(); + if total_rows > 1 { + return Err(NamespaceError::Internal { + message: format!( + "Expected exactly 1 namespace with id '{}', found {}", + object_id, total_rows + ), } + .into()); } - } - if descending { - versions.sort_by(|a, b| b.0.cmp(&a.0)); - } else { - versions.sort_by(|a, b| a.0.cmp(&b.0)); - } + let object_id_array = Self::get_string_column(&batch, "object_id")?; + let metadata_array = Self::get_string_column(&batch, "metadata")?; + + let object_id_str = object_id_array.value(0); + let metadata = if !metadata_array.is_null(0) { + let metadata_str = metadata_array.value(0); + match serde_json::from_str::>(metadata_str) { + Ok(map) => Some(map), + Err(e) => { + return Err(NamespaceError::Internal { + message: format!( + "Failed to deserialize metadata for namespace '{}': {}", + object_id, e + ), + } + .into()); + } + } + } else { + None + }; - if let Some(limit) = limit { - versions.truncate(limit as usize); + let (namespace, name) = Self::parse_object_id(object_id_str); + found_result = Some(NamespaceInfo { + namespace, + name, + metadata, + }); } - Ok(versions) + Ok(found_result) } - /// Query the manifest for a specific version of a table. - /// - /// Returns the full metadata JSON string if found, which contains - /// manifest_path, manifest_size, e_tag, and naming_scheme. + /// Create or load the manifest dataset, ensuring it has the latest schema setup. 
/// - pub async fn query_table_version( - &self, - object_id: &str, - version: i64, - ) -> Result> { - let version_object_id = Self::build_version_object_id(object_id, version); - self.query_table_version_by_object_id(&version_object_id) - .await - } + /// This function will: + /// 1. Try to load an existing manifest table + /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata) + /// 3. If it doesn't exist, create a new manifest table with the current schema + async fn ensure_manifest_table_up_to_date( + root: &str, + storage_options: &Option>, + session: Option>, + ) -> Result { + let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME); + log::debug!("Attempting to load manifest from {}", manifest_path); + let store_options = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let read_params = ReadParams { + session: session.clone(), + store_options: Some(store_options.clone()), + ..Default::default() + }; + let dataset_result = DatasetBuilder::from_uri(&manifest_path) + .with_read_params(read_params) + .load() + .await; + if let Ok(mut dataset) = dataset_result { + // Reject a manifest written with a reader feature flag this build + // does not understand before touching it. + ensure_readable(dataset.metadata())?; - /// Query a specific table version by its exact object_id. 
- async fn query_table_version_by_object_id( - &self, - version_object_id: &str, - ) -> Result> { - let escaped_id = version_object_id.replace('\'', "''"); - let filter = format!( - "object_id = '{}' AND object_type = 'table_version'", - escaped_id - ); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; + // Check if the object_id field has primary key metadata, migrate if not + let needs_pk_migration = dataset + .schema() + .field("object_id") + .map(|f| { + !f.metadata + .contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + }) + .unwrap_or(false); - for batch in batches { - if batch.num_rows() == 0 { - continue; + if needs_pk_migration { + // This legacy migration writes to the manifest, so confirm this + // build is allowed to write the current format first. + ensure_writable(dataset.metadata())?; + log::info!("Migrating __manifest table to add primary key metadata on object_id"); + dataset + .update_field_metadata() + .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")]) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to find object_id field for migration: {:?}", + e + ), + }) + })? + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to migrate primary key metadata: {:?}", e), + }) + })?; } - let metadata_array = Self::get_string_column(&batch, "metadata")?; - return Ok(Some(metadata_array.value(0).to_string())); - } - - Ok(None) - } - - /// Delete table version entries from the manifest for a given table and version ranges. 
- /// - /// Each range is (start_version, end_version) inclusive. Deletes all matching - /// `object_type = 'table_version'` entries whose object_id matches - /// `{object_id}${zero_padded_version}`. - /// - /// Builds a single filter expression covering all version ranges and executes - /// one bulk delete operation instead of deleting versions one at a time. - pub async fn delete_table_versions( - &self, - object_id: &str, - ranges: &[(i64, i64)], - ) -> Result { - if ranges.is_empty() { - return Ok(0); - } - // Collect all object_ids to delete (both new zero-padded and legacy formats) - let mut object_id_conditions: Vec = Vec::new(); - for (start, end) in ranges { - for version in *start..=*end { - let oid = Self::build_version_object_id(object_id, version); - let escaped = oid.replace('\'', "''"); - object_id_conditions.push(format!("'{}'", escaped)); - } - } + Ok(DatasetConsistencyWrapper::new(dataset)) + } else { + log::info!("Creating new manifest table at {}", manifest_path); + let schema = Self::manifest_schema(); + let empty_batch = RecordBatch::new_empty(schema.clone()); + let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); - if object_id_conditions.is_empty() { - return Ok(0); - } + let store_params = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let write_params = WriteParams { + session: session.clone(), + store_params: Some(store_params), + ..Default::default() + }; - // First, count how many entries exist so we can report the deleted count - let in_list = object_id_conditions.join(", "); - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); + let dataset = + Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await; - let mut scanner = self.manifest_scanner().await?; - 
scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "location"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { - return Ok(0); + // Handle race condition where another process created the manifest concurrently + match dataset { + Ok(dataset) => { + log::info!( + "Successfully created manifest table at {}, version={}, uri={}", + manifest_path, + dataset.version().version, + dataset.uri() + ); + Ok(DatasetConsistencyWrapper::new(dataset)) + } + Err(ref e) + if matches!( + e, + LanceError::DatasetAlreadyExists { .. } + | LanceError::CommitConflict { .. } + | LanceError::IncompatibleTransaction { .. } + | LanceError::RetryableCommitConflict { .. 
} + ) => + { + // Another process created the manifest concurrently, try to load it + log::info!( + "Manifest table was created by another process, loading it: {}", + manifest_path + ); + let recovery_store_options = ObjectStoreParams { + storage_options_accessor: storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let recovery_read_params = ReadParams { + session, + store_options: Some(recovery_store_options), + ..Default::default() + }; + let dataset = DatasetBuilder::from_uri(&manifest_path) + .with_read_params(recovery_read_params) + .load() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to load manifest dataset after creation conflict: {}", + e + ), + }) + })?; + Ok(DatasetConsistencyWrapper::new(dataset)) + } + Err(e) => Err(lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest dataset: {:?}", e), + })), + } } + } - // Execute a single bulk delete with the combined filter - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to batch delete table versions", None) - })?; + /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit. + /// + /// Returns the next page token (last item in this page) if more results exist beyond the limit, + /// or `None` if this is the last page. 
+ fn apply_pagination( + names: &mut Vec, + page_token: Option, + limit: Option, + ) -> Option { + names.sort(); - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; + if let Some(start_after) = page_token { + if let Some(index) = names + .iter() + .position(|name| name.as_str() > start_after.as_str()) + { + names.drain(0..index); + } else { + names.clear(); + } + } - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + if let Some(limit) = limit + && limit >= 0 + { + let limit = limit as usize; + if names.len() > limit { + let next_page_token = if limit > 0 { + Some(names[limit - 1].clone()) + } else { + None + }; + names.truncate(limit); + return next_page_token; + } } - Ok(deleted_count) + None } +} - /// Atomically delete table version entries from the manifest by their object_ids. - /// - /// This method supports multi-table transactional deletion: all specified - /// object_ids (which may span multiple tables) are deleted in a single atomic - /// `DeleteBuilder` operation. Either all entries are removed or none are. - /// - /// Object IDs are formatted as `{table_id}${version}`. 
- pub async fn batch_delete_table_versions_by_object_ids( - &self, - object_ids: &[String], - ) -> Result { - if object_ids.is_empty() { - return Ok(0); - } +#[async_trait] +impl LanceNamespace for ManifestNamespace { + fn namespace_id(&self) -> String { + self.root.clone() + } - let in_list: String = object_ids - .iter() - .map(|oid| { - let escaped = oid.replace('\'', "''"); - format!("'{}'", escaped) + async fn list_tables(&self, request: ListTablesRequest) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), }) - .collect::>() - .join(", "); + })?; - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); + // Build filter to find tables in this namespace + let filter = if namespace_id.is_empty() { + // Root namespace: find tables without a namespace prefix + "object_type = 'table' AND NOT contains(object_id, '$')".to_string() + } else { + // Namespaced: find tables that start with namespace$ but have no additional $ + let prefix = namespace_id.join(DELIMITER); + format!( + "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", + prefix, + DELIMITER, + prefix.len() + 2 + ) + }; - // Count how many entries exist so we can report the deleted count let mut scanner = self.manifest_scanner().await?; scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { @@ -1422,576 +2638,703 @@ impl ManifestNamespace { message: format!("Failed to project: {:?}", e), }) })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { - return Ok(0); - } - - // Execute a single atomic bulk delete covering all tables - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - 
let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error( - &e, - "Failed to batch delete table versions across multiple tables", - None, - ) - })?; + let batches = Self::execute_scanner(scanner).await?; - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; + let mut table_entries = Vec::new(); + for batch in batches { + if batch.num_rows() == 0 { + continue; + } - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); + let object_id_array = Self::get_string_column(&batch, "object_id")?; + let location_array = Self::get_string_column(&batch, "location")?; + for i in 0..batch.num_rows() { + let object_id = object_id_array.value(i); + let location = location_array.value(i); + let (_namespace, name) = Self::parse_object_id(object_id); + table_entries.push((name, location.to_string())); + } } - Ok(deleted_count) - } + let mut tables: Vec = if request.include_declared.unwrap_or(true) { + table_entries.into_iter().map(|(name, _)| name).collect() + } else { + let mut stream = futures::stream::iter(table_entries.into_iter().map( + |(name, location)| async move { + // `include_declared=false` is an explicit opt-in. We still pay one + // `_versions/` probe per table so declared-state is derived from actual + // manifests. This is linear in the total number of listed tables, and we do + // the probes with bounded concurrency before pagination. + if self.location_has_actual_manifests(&location).await? { + Ok::, Error>(Some(name)) + } else { + Ok::, Error>(None) + } + }, + )) + .buffered(DECLARED_FILTER_CONCURRENCY); - /// Set a property flag in the __manifest table's metadata key-value map. 
- /// - /// This uses `dataset.update_metadata()` to persist the flag in the - /// __manifest dataset's table metadata, rather than inserting a row. - /// If the property already exists with the same value, this is a no-op. - pub async fn set_property(&self, name: &str, value: &str) -> Result<()> { - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - if dataset_guard.metadata().get(name) == Some(&value.to_string()) { - return Ok(()); - } - drop(dataset_guard); + let mut filtered = Vec::new(); + while let Some(result) = stream.next().await { + if let Some(name) = result? { + filtered.push(name); + } + } + filtered + }; - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - dataset_guard - .update_metadata([(name, value)]) - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to set property '{}' in __manifest metadata: {}", - name, e - ), - }) - })?; - Ok(()) + let next_page_token = + Self::apply_pagination(&mut tables, request.page_token, request.limit); + let mut response = ListTablesResponse::new(tables); + response.page_token = next_page_token; + Ok(response) } - /// Check if a property flag exists in the __manifest table's metadata key-value map. - pub async fn has_property(&self, name: &str) -> Result { - let dataset_guard = self.manifest_dataset.get().await?; - Ok(dataset_guard.metadata().contains_key(name)) - } + async fn describe_table(&self, request: DescribeTableRequest) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), + }) + })?; - /// Parse metadata JSON into a `TableVersion`. - /// - /// Returns `None` if metadata is invalid or missing required fields. 
- fn parse_table_version(version: i64, metadata_str: &str) -> Option { - let meta: serde_json::Value = match serde_json::from_str(metadata_str) { - Ok(v) => v, - Err(e) => { - log::warn!( - "Skipping version {} due to invalid metadata JSON: {}", - version, - e - ); - return None; - } - }; - let manifest_path = match meta.get("manifest_path").and_then(|v| v.as_str()) { - Some(p) => p.to_string(), - None => { - log::warn!( - "Skipping version {} due to missing 'manifest_path' in metadata — \ - this may indicate data corruption", - version - ); - return None; + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } - }; - let manifest_size = meta.get("manifest_size").and_then(|v| v.as_i64()); - let e_tag = meta - .get("e_tag") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); - Some(TableVersion { - version, - manifest_path, - manifest_size, - e_tag, - timestamp_millis: None, - metadata: None, - }) - } + .into()); + } - /// List table versions from the __manifest table. - /// - /// Queries the manifest for all versions of the given table and returns - /// them as a `ListTableVersionsResponse`. 
- pub async fn list_table_versions( - &self, - table_id: &[String], - descending: bool, - limit: Option, - ) -> Result { let object_id = Self::str_object_id(table_id); - let manifest_versions = self - .query_table_versions(&object_id, descending, limit) - .await?; - - let table_versions: Vec = manifest_versions - .into_iter() - .filter_map(|(version, metadata_str)| Self::parse_table_version(version, &metadata_str)) - .collect(); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - Ok(ListTableVersionsResponse { - versions: table_versions, - page_token: None, - }) - } + // Extract table name and namespace from table_id + let table_name = table_id.last().cloned().unwrap_or_default(); + let namespace_id: Vec = if table_id.len() > 1 { + table_id[..table_id.len() - 1].to_vec() + } else { + vec![] + }; - /// Describe a specific table version from the __manifest table. - /// - /// Queries the manifest for a specific version and returns it as a - /// `DescribeTableVersionResponse`. Returns an error if the version is not found. - pub async fn describe_table_version( - &self, - table_id: &[String], - version: i64, - ) -> Result { - let object_id = Self::str_object_id(table_id); - if let Some(metadata_str) = self.query_table_version(&object_id, version).await? 
- && let Some(tv) = Self::parse_table_version(version, &metadata_str) - { - return Ok(DescribeTableVersionResponse { - version: Box::new(tv), - }); - } - Err(NamespaceError::TableVersionNotFound { - message: format!("version {} for table {:?}", version, table_id), + let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); + let should_check_declared = + load_detailed_metadata || request.check_declared.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + + match table_info { + Some(info) => { + // Construct full URI from relative location + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + let is_only_declared = if should_check_declared { + Some(!self.location_has_actual_manifests(&info.location).await?) + } else { + None + }; + + if !load_detailed_metadata { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + properties: info.metadata, + is_only_declared, + ..Default::default() + }); + } + + if is_only_declared == Some(true) { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + properties: info.metadata, + is_only_declared, + ..Default::default() + }); + } + + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + + match builder.load().await { + Ok(mut dataset) => { + // If a specific version is requested, checkout that version + if let 
Some(requested_version) = request.version { + dataset = dataset.checkout_version(requested_version as u64).await?; + } + + let version = dataset.version().version; + let lance_schema = dataset.schema(); + let arrow_schema: arrow_schema::Schema = lance_schema.into(); + let json_schema = arrow_schema_to_json(&arrow_schema)?; + + Ok(DescribeTableResponse { + table: Some(table_name.clone()), + namespace: Some(namespace_id.clone()), + version: Some(version as i64), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + schema: Some(Box::new(json_schema)), + storage_options, + properties: info.metadata.clone(), + is_only_declared, + ..Default::default() + }) + } + Err(err) => Err(NamespaceError::Internal { + message: format!( + "Table exists in manifest but failed to load dataset '{}': {}", + object_id, err + ), + } + .into()), + } + } + None => Err(NamespaceError::TableNotFound { + message: Self::format_table_id(table_id), + } + .into()), } - .into()) } - /// Register a table in the manifest without creating the physical table (internal helper for migration) - pub async fn register_table(&self, name: &str, location: String) -> Result<()> { - let object_id = Self::build_object_id(&[], name); - if self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::Internal { - message: format!("Table '{}' already exists", name), + async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), + }) + })?; + + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } .into()); } - self.insert_into_manifest(object_id, ObjectType::Table, Some(location)) - .await - } - - /// Validate that all levels of a namespace path exist - async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { - for i in 1..=namespace_path.len() { - let partial_path = &namespace_path[..i]; - let object_id = partial_path.join(DELIMITER); - if !self.manifest_contains_object(&object_id).await? { - return Err(NamespaceError::NamespaceNotFound { - message: format!("parent namespace '{}'", object_id), - } - .into()); + let object_id = Self::str_object_id(table_id); + let exists = self.manifest_contains_object(&object_id).await?; + if exists { + Ok(()) + } else { + Err(NamespaceError::TableNotFound { + message: Self::format_table_id(table_id), } + .into()) } - Ok(()) } - /// Query the manifest for a namespace with the given object ID - async fn query_manifest_for_namespace(&self, object_id: &str) -> Result> { - let escaped_id = object_id.replace('\'', "''"); - let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), + async fn create_table( + &self, + request: CreateTableRequest, + 
data: Bytes, + ) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), }) })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option = None; - let mut total_rows = 0; - for batch in batches { - if batch.num_rows() == 0 { - continue; + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } + .into()); + } - total_rows += batch.num_rows(); - if total_rows > 1 { - return Err(NamespaceError::Internal { - message: format!( - "Expected exactly 1 namespace with id '{}', found {}", - object_id, total_rows - ), - } - .into()); + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Refuse before writing any table data if this build cannot write the + // manifest, so a refused create leaves no orphaned dataset behind. + self.ensure_manifest_writable().await?; + + let existing_table = self.query_manifest_for_table(&object_id).await?; + let existing_has_manifests = if let Some(existing_table) = &existing_table { + Some( + self.location_has_actual_manifests(&existing_table.location) + .await?, + ) + } else { + None + }; + + if existing_has_manifests == Some(false) + && request + .properties + .as_ref() + .is_some_and(|properties| !properties.is_empty()) + { + return Err(NamespaceError::InvalidInput { + message: format!( + "create_table cannot set properties for already declared table '{}'", + object_id + ), } + .into()); + } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; + let create_mode = if existing_has_manifests == Some(false) { + CreateTableMode::Create + } else { + CreateTableMode::parse(request.mode.as_deref())? 
+ }; + let dir_name = if let Some(existing_table) = &existing_table { + existing_table.location.clone() + } else if namespace.is_empty() && self.dir_listing_enabled { + format!("{}.lance", table_name) + } else { + Self::generate_dir_name(&object_id) + }; + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + let overwriting_existing_table = + existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite; - let object_id_str = object_id_array.value(0); - let metadata = if !metadata_array.is_null(0) { - let metadata_str = metadata_array.value(0); - match serde_json::from_str::>(metadata_str) { - Ok(map) => Some(map), - Err(e) => { - return Err(NamespaceError::Internal { - message: format!( - "Failed to deserialize metadata for namespace '{}': {}", - object_id, e - ), - } - .into()); + if existing_has_manifests == Some(true) { + match create_mode { + CreateTableMode::Create => { + return Err(NamespaceError::TableAlreadyExists { + message: table_name.clone(), } + .into()); } - } else { - None - }; + CreateTableMode::ExistOk => { + let properties = existing_table + .as_ref() + .and_then(|table| table.metadata.clone()); + return Ok(CreateTableResponse { + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties, + ..Default::default() + }); + } + CreateTableMode::Overwrite => {} + } + } - let (namespace, name) = Self::parse_object_id(object_id_str); - found_result = Some(NamespaceInfo { - namespace, - name, - metadata, - }); + // Validate that request_data is provided + if data.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Request data (Arrow IPC stream) is required for create_table".to_string(), + } + .into()); } - Ok(found_result) - } + // Write the data using Lance Dataset + let cursor = Cursor::new(data.to_vec()); + let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to read IPC 
stream: {:?}", e), + }) + })?; - /// Create or load the manifest dataset, ensuring it has the latest schema setup. - /// - /// This function will: - /// 1. Try to load an existing manifest table - /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata) - /// 3. If it doesn't exist, create a new manifest table with the current schema - /// 4. Persist feature flags (e.g., table_version_storage_enabled) if requested - async fn ensure_manifest_table_up_to_date( - root: &str, - storage_options: &Option>, - session: Option>, - table_version_storage_enabled: bool, - ) -> Result { - let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME); - log::debug!("Attempting to load manifest from {}", manifest_path); - let store_options = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { + let batches: Vec = stream_reader + .collect::, _>>() + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to collect batches: {:?}", e), + }) + })?; + + if batches.is_empty() { + return Err(NamespaceError::Internal { + message: "No data provided for table creation".to_string(), + } + .into()); + } + + let schema = batches[0].schema(); + let batch_results: Vec> = + batches.into_iter().map(Ok).collect(); + let reader = RecordBatchIterator::new(batch_results, schema); + + let mut write_storage_options = self.storage_options.clone().unwrap_or_default(); + if let Some(request_storage_options) = request.storage_options.as_ref() { + write_storage_options.extend(request_storage_options.clone()); + } + + let store_params = ObjectStoreParams { + storage_options_accessor: (!write_storage_options.is_empty()).then(|| { Arc::new( lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), + write_storage_options, ), ) }), ..Default::default() }; - let read_params = ReadParams { - session: session.clone(), - store_options: Some(store_options.clone()), + let 
write_params = WriteParams { + mode: create_mode.write_mode(), + session: self.session.clone(), + store_params: Some(store_params), ..Default::default() }; - let dataset_result = DatasetBuilder::from_uri(&manifest_path) - .with_read_params(read_params) - .load() - .await; - if let Ok(mut dataset) = dataset_result { - // Check if the object_id field has primary key metadata, migrate if not - let needs_pk_migration = dataset - .schema() - .field("object_id") - .map(|f| { - !f.metadata - .contains_key(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to write dataset: {:?}", e), }) - .unwrap_or(false); + })?; + let version = dataset.version().version as i64; - if needs_pk_migration { - log::info!("Migrating __manifest table to add primary key metadata on object_id"); - dataset - .update_field_metadata() - .update("object_id", [(LANCE_UNENFORCED_PRIMARY_KEY_POSITION, "0")]) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to find object_id field for migration: {:?}", - e - ), - }) - })? 
+ if overwriting_existing_table { + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + self.upsert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name), + metadata, + }], + None, + ) + .await?; + + Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: request.properties, + ..Default::default() + }) + } else { + match existing_table { + Some(existing_table) => Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: existing_table.metadata, + ..Default::default() + }), + None => { + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + // Register in manifest (store dir_name, not full URI) + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name.clone()), + metadata, + }], + None, + ) + .await?; + + Ok(CreateTableResponse { + version: Some(version), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: request.properties, + ..Default::default() + }) + } + } + } + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), + }) + })?; + + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), + } + .into()); + } + + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Query manifest for table location + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; + + match table_info { + 
Some(info) => { + // Delete from manifest first + self.delete_from_manifest(&object_id).boxed().await?; + + // Delete physical data directory using the dir_name from manifest + let table_path = self.base_path.clone().join(info.location.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + + // Remove the table directory + self.object_store + .remove_dir_all(table_path) + .boxed() .await .map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to migrate primary key metadata: {:?}", e), + message: format!("Failed to delete table directory: {:?}", e), }) })?; + + Ok(DropTableResponse { + id: request.id.clone(), + location: Some(table_uri), + ..Default::default() + }) } + None => Err(NamespaceError::TableNotFound { + message: table_name.to_string(), + } + .into()), + } + } + + async fn list_namespaces( + &self, + request: ListNamespacesRequest, + ) -> Result { + let parent_namespace = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; + + // Build filter to find direct child namespaces + let filter = if parent_namespace.is_empty() { + // Root namespace: find all namespaces without a parent + "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string() + } else { + // Non-root: find namespaces that start with parent$ but have no additional $ + let prefix = parent_namespace.join(DELIMITER); + format!( + "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", + prefix, + DELIMITER, + prefix.len() + 2 + ) + }; - // Persist table_version_storage_enabled flag in __manifest so that once - // enabled, it becomes a permanent property of this namespace. 
- if table_version_storage_enabled { - let needs_flag = dataset - .metadata() - .get("table_version_storage_enabled") - .map(|v| v != "true") - .unwrap_or(true); + let mut scanner = self.manifest_scanner().await?; + scanner.filter(&filter).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to filter: {:?}", e), + }) + })?; + scanner.project(&["object_id"]).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project: {:?}", e), + }) + })?; - if needs_flag - && let Err(e) = dataset - .update_metadata([("table_version_storage_enabled", "true")]) - .await - { - log::warn!( - "Failed to persist table_version_storage_enabled flag in __manifest: {:?}", - e - ); - } + let batches = Self::execute_scanner(scanner).await?; + let mut namespaces = Vec::new(); + + for batch in batches { + if batch.num_rows() == 0 { + continue; } - Ok(DatasetConsistencyWrapper::new(dataset)) - } else { - log::info!("Creating new manifest table at {}", manifest_path); - let schema = Self::manifest_schema(); - let empty_batch = RecordBatch::new_empty(schema.clone()); - let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); + let object_id_array = Self::get_string_column(&batch, "object_id")?; + for i in 0..batch.num_rows() { + let object_id = object_id_array.value(i); + let (_namespace, name) = Self::parse_object_id(object_id); + namespaces.push(name); + } + } - let store_params = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let write_params = WriteParams { - session: session.clone(), - store_params: Some(store_params), + let next_page_token = + Self::apply_pagination(&mut namespaces, request.page_token, request.limit); + let mut response = ListNamespacesResponse::new(namespaces); + response.page_token = 
next_page_token; + Ok(response) + } + + async fn describe_namespace( + &self, + request: DescribeNamespaceRequest, + ) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; + + // Root namespace always exists + if namespace_id.is_empty() { + #[allow(clippy::needless_update)] + return Ok(DescribeNamespaceResponse { + properties: Some(HashMap::new()), ..Default::default() - }; + }); + } - let dataset = - Dataset::write(Box::new(reader), &manifest_path, Some(write_params)).await; + // Check if namespace exists in manifest + let object_id = namespace_id.join(DELIMITER); + let namespace_info = self.query_manifest_for_namespace(&object_id).await?; - // Handle race condition where another process created the manifest concurrently - match dataset { - Ok(dataset) => { - log::info!( - "Successfully created manifest table at {}, version={}, uri={}", - manifest_path, - dataset.version().version, - dataset.uri() - ); - Ok(DatasetConsistencyWrapper::new(dataset)) - } - Err(ref e) - if matches!( - e, - LanceError::DatasetAlreadyExists { .. } - | LanceError::CommitConflict { .. } - | LanceError::IncompatibleTransaction { .. } - | LanceError::RetryableCommitConflict { .. 
} - ) => - { - // Another process created the manifest concurrently, try to load it - log::info!( - "Manifest table was created by another process, loading it: {}", - manifest_path - ); - let recovery_store_options = ObjectStoreParams { - storage_options_accessor: storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let recovery_read_params = ReadParams { - session, - store_options: Some(recovery_store_options), - ..Default::default() - }; - let dataset = DatasetBuilder::from_uri(&manifest_path) - .with_read_params(recovery_read_params) - .load() - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to load manifest dataset after creation conflict: {}", - e - ), - }) - })?; - Ok(DatasetConsistencyWrapper::new(dataset)) - } - Err(e) => Err(lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create manifest dataset: {:?}", e), - })), + match namespace_info { + #[allow(clippy::needless_update)] + Some(info) => Ok(DescribeNamespaceResponse { + properties: info.metadata, + ..Default::default() + }), + None => Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), } + .into()), } } - /// Sorts names alphabetically and applies pagination using page_token (start_after) and limit. - /// - /// Returns the next page token (last item in this page) if more results exist beyond the limit, - /// or `None` if this is the last page. 
- fn apply_pagination( - names: &mut Vec, - page_token: Option, - limit: Option, - ) -> Option { - names.sort(); + async fn create_namespace( + &self, + request: CreateNamespaceRequest, + ) -> Result { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace ID is required".to_string(), + }) + })?; - if let Some(start_after) = page_token { - if let Some(index) = names - .iter() - .position(|name| name.as_str() > start_after.as_str()) - { - names.drain(0..index); - } else { - names.clear(); + // Root namespace always exists and cannot be created + if namespace_id.is_empty() { + return Err(NamespaceError::NamespaceAlreadyExists { + message: "root namespace".to_string(), } + .into()); } - if let Some(limit) = limit - && limit >= 0 - { - let limit = limit as usize; - if names.len() > limit { - let next_page_token = if limit > 0 { - Some(names[limit - 1].clone()) - } else { - None - }; - names.truncate(limit); - return next_page_token; + // Validate parent namespaces exist (but not the namespace being created) + if namespace_id.len() > 1 { + self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) + .await?; + } + + let object_id = namespace_id.join(DELIMITER); + if self.manifest_contains_object(&object_id).await? 
{ + return Err(NamespaceError::NamespaceAlreadyExists { + message: object_id.to_string(), } + .into()); } - None - } -} + let metadata = + Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?; -#[async_trait] -impl LanceNamespace for ManifestNamespace { - fn namespace_id(&self) -> String { - self.root.clone() + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Namespace, + location: None, + metadata, + }], + None, + ) + .await?; + + Ok(CreateNamespaceResponse { + properties: request.properties, + ..Default::default() + }) } - async fn list_tables(&self, request: ListTablesRequest) -> Result { + async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { let namespace_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { message: "Namespace ID is required".to_string(), }) })?; - // Build filter to find tables in this namespace - let filter = if namespace_id.is_empty() { - // Root namespace: find tables without a namespace prefix - "object_type = 'table' AND NOT contains(object_id, '$')".to_string() - } else { - // Namespaced: find tables that start with namespace$ but have no additional $ - let prefix = namespace_id.join(DELIMITER); - format!( - "object_type = 'table' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", - prefix, - DELIMITER, - prefix.len() + 2 - ) - }; + // Root namespace always exists and cannot be dropped + if namespace_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Root namespace cannot be dropped".to_string(), + } + .into()); + } + + let object_id = namespace_id.join(DELIMITER); + + // Check if namespace exists + if !self.manifest_contains_object(&object_id).boxed().await? 
{ + return Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), + } + .into()); + } - let mut scanner = self.manifest_scanner().await?; + // Check for child namespaces + let escaped_id = object_id.replace('\'', "''"); + let prefix = format!("{}{}", escaped_id, DELIMITER); + let filter = format!("starts_with(object_id, '{}')", prefix); + let mut scanner = self.manifest_scanner().boxed().await?; scanner.filter(&filter).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!("Failed to filter: {:?}", e), }) })?; - scanner.project(&["object_id", "location"]).map_err(|e| { + scanner.project::<&str>(&[]).map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!("Failed to project: {:?}", e), }) })?; + scanner.with_row_id(); + let count = scanner.count_rows().boxed().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to count rows: {:?}", e), + }) + })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut table_entries = Vec::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; + if count > 0 { + return Err(NamespaceError::NamespaceNotEmpty { + message: format!("'{}' (contains {} child objects)", object_id, count), } + .into()); + } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let location_array = Self::get_string_column(&batch, "location")?; - for i in 0..batch.num_rows() { - let object_id = object_id_array.value(i); - let location = location_array.value(i); - let (_namespace, name) = Self::parse_object_id(object_id); - table_entries.push((name, location.to_string())); - } + self.delete_from_manifest(&object_id).boxed().await?; + + Ok(DropNamespaceResponse::default()) + } + + async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { + let namespace_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Namespace 
ID is required".to_string(), + }) + })?; + + // Root namespace always exists + if namespace_id.is_empty() { + return Ok(()); } - let mut tables: Vec = if request.include_declared.unwrap_or(true) { - table_entries.into_iter().map(|(name, _)| name).collect() + let object_id = namespace_id.join(DELIMITER); + if self.manifest_contains_object(&object_id).await? { + Ok(()) } else { - let mut stream = futures::stream::iter(table_entries.into_iter().map( - |(name, location)| async move { - // `include_declared=false` is an explicit opt-in. We still pay one - // `_versions/` probe per table so declared-state is derived from actual - // manifests. This is linear in the total number of listed tables, and we do - // the probes with bounded concurrency before pagination. - if self.location_has_actual_manifests(&location).await? { - Ok::, Error>(Some(name)) - } else { - Ok::, Error>(None) - } - }, - )) - .buffered(DECLARED_FILTER_CONCURRENCY); - - let mut filtered = Vec::new(); - while let Some(result) = stream.next().await { - if let Some(name) = result? 
{ - filtered.push(name); - } + Err(NamespaceError::NamespaceNotFound { + message: object_id.to_string(), } - filtered - }; - - let next_page_token = - Self::apply_pagination(&mut tables, request.page_token, request.limit); - let mut response = ListTablesResponse::new(tables); - response.page_token = next_page_token; - Ok(response) + .into()) + } } - async fn describe_table(&self, request: DescribeTableRequest) -> Result { + async fn declare_table(&self, request: DeclareTableRequest) -> Result { let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { message: "Table ID is required".to_string(), @@ -2005,115 +3348,107 @@ impl LanceNamespace for ManifestNamespace { .into()); } - let object_id = Self::str_object_id(table_id); - let table_info = self.query_manifest_for_table(&object_id).boxed().await?; + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - // Extract table name and namespace from table_id - let table_name = table_id.last().cloned().unwrap_or_default(); - let namespace_id: Vec = if table_id.len() > 1 { - table_id[..table_id.len() - 1].to_vec() + // Check if table already exists in manifest + let existing = self.query_manifest_for_table(&object_id).await?; + if existing.is_some() { + return Err(NamespaceError::TableAlreadyExists { + message: table_name.to_string(), + } + .into()); + } + + // Create table location path with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) } else { - vec![] + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) }; + let table_path = 
self.base_path.clone().join(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); - let should_check_declared = - load_detailed_metadata || request.check_declared.unwrap_or(false); - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); - - match table_info { - Some(info) => { - // Construct full URI from relative location - let table_uri = Self::construct_full_uri(&self.root, &info.location)?; - - let storage_options = if vend_credentials { - self.storage_options.clone() - } else { - None - }; - let is_only_declared = if should_check_declared { - Some(!self.location_has_actual_manifests(&info.location).await?) - } else { - None - }; - - if !load_detailed_metadata { - return Ok(DescribeTableResponse { - table: Some(table_name), - namespace: Some(namespace_id), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - storage_options, - properties: info.metadata, - is_only_declared, - ..Default::default() - }); + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(NamespaceError::InvalidInput { + message: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ), } + .into()); + } + } - if is_only_declared == Some(true) { - return Ok(DescribeTableResponse { - table: Some(table_name), - namespace: Some(namespace_id), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - storage_options, - properties: info.metadata, - is_only_declared, - ..Default::default() - }); - } + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.clone().join(".lance-reserved"); - let mut builder = 
DatasetBuilder::from_uri(&table_uri); - if let Some(opts) = &self.storage_options { - builder = builder.with_storage_options(opts.clone()); - } - if let Some(session) = &self.session { - builder = builder.with_session(session.clone()); - } + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ), + }) + })? + .shutdown() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ), + }) + })?; - match builder.load().await { - Ok(mut dataset) => { - // If a specific version is requested, checkout that version - if let Some(requested_version) = request.version { - dataset = dataset.checkout_version(requested_version as u64).await?; - } + let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; - let version = dataset.version().version; - let lance_schema = dataset.schema(); - let arrow_schema: arrow_schema::Schema = lance_schema.into(); - let json_schema = arrow_schema_to_json(&arrow_schema)?; + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some(dir_name), + metadata, + }], + None, + ) + .await?; - Ok(DescribeTableResponse { - table: Some(table_name.clone()), - namespace: Some(namespace_id.clone()), - version: Some(version as i64), - location: Some(table_uri.clone()), - table_uri: Some(table_uri), - schema: Some(Box::new(json_schema)), - storage_options, - properties: info.metadata.clone(), - is_only_declared, - ..Default::default() - }) - } - Err(err) => Err(NamespaceError::Internal { - message: format!( - "Table exists in manifest but failed to load dataset '{}': {}", - 
object_id, err - ), - } - .into()), - } - } - None => Err(NamespaceError::TableNotFound { - message: Self::format_table_id(table_id), - } - .into()), - } + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + properties: request.properties, + ..Default::default() + }) } - async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { + async fn register_table(&self, request: RegisterTableRequest) -> Result { let table_id = request.id.as_ref().ok_or_else(|| { lance_core::Error::from(NamespaceError::InvalidInput { message: "Table ID is required".to_string(), @@ -2127,783 +3462,1103 @@ impl LanceNamespace for ManifestNamespace { .into()); } - let object_id = Self::str_object_id(table_id); - let exists = self.manifest_contains_object(&object_id).await?; - if exists { - Ok(()) - } else { - Err(NamespaceError::TableNotFound { - message: Self::format_table_id(table_id), + let location = request.location.clone(); + + // Validate that location is a relative path within the root directory + // We don't allow absolute URIs or paths that escape the root + if location.contains("://") { + return Err(NamespaceError::InvalidInput { + message: format!( + "Absolute URIs are not allowed for register_table. 
Location must be a relative path within the root directory: {}", + location + ), } - .into()) + .into()); } - } - - async fn create_table( - &self, - request: CreateTableRequest, - data: Bytes, - ) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), - }) - })?; - if table_id.is_empty() { + if location.starts_with('/') { return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), + message: format!( + "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}", + location + ), } .into()); } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - let existing_table = self.query_manifest_for_table(&object_id).await?; - let existing_has_manifests = if let Some(existing_table) = &existing_table { - Some( - self.location_has_actual_manifests(&existing_table.location) - .await?, - ) - } else { - None - }; - - if existing_has_manifests == Some(false) - && request - .properties - .as_ref() - .is_some_and(|properties| !properties.is_empty()) - { + // Check for path traversal attempts + if location.contains("..") { return Err(NamespaceError::InvalidInput { message: format!( - "create_table cannot set properties for already declared table '{}'", - object_id + "Path traversal is not allowed. Location must be a relative path within the root directory: {}", + location ), } .into()); } - let create_mode = if existing_has_manifests == Some(false) { - CreateTableMode::Create - } else { - CreateTableMode::parse(request.mode.as_deref())? 
- }; - let dir_name = if let Some(existing_table) = &existing_table { - existing_table.location.clone() - } else if namespace.is_empty() && self.dir_listing_enabled { - format!("{}.lance", table_name) - } else { - Self::generate_dir_name(&object_id) - }; - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - let overwriting_existing_table = - existing_has_manifests == Some(true) && create_mode == CreateTableMode::Overwrite; + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - if existing_has_manifests == Some(true) { - match create_mode { - CreateTableMode::Create => { - return Err(NamespaceError::TableAlreadyExists { - message: table_name.clone(), - } - .into()); - } - CreateTableMode::ExistOk => { - let properties = existing_table - .as_ref() - .and_then(|table| table.metadata.clone()); - return Ok(CreateTableResponse { - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties, - ..Default::default() - }); - } - CreateTableMode::Overwrite => {} - } + // Validate that parent namespaces exist (if not root) + if !namespace.is_empty() { + self.validate_namespace_levels_exist(&namespace).await?; } - // Validate that request_data is provided - if data.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Request data (Arrow IPC stream) is required for create_table".to_string(), + // Check if table already exists + if self.manifest_contains_object(&object_id).await? 
{ + return Err(NamespaceError::TableAlreadyExists { + message: object_id.to_string(), } .into()); } - // Write the data using Lance Dataset - let cursor = Cursor::new(data.to_vec()); - let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to read IPC stream: {:?}", e), - }) - })?; + // Register the table with its location in the manifest + self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) + .await?; - let batches: Vec = stream_reader - .collect::, _>>() - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to collect batches: {:?}", e), + Ok(RegisterTableResponse { + location: Some(location), + ..Default::default() + }) + } + + async fn deregister_table( + &self, + request: DeregisterTableRequest, + ) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: "Table ID is required".to_string(), }) })?; - if batches.is_empty() { - return Err(NamespaceError::Internal { - message: "No data provided for table creation".to_string(), + if table_id.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "Table ID cannot be empty".to_string(), } .into()); } - let schema = batches[0].schema(); - let batch_results: Vec> = - batches.into_iter().map(Ok).collect(); - let reader = RecordBatchIterator::new(batch_results, schema); - - let mut write_storage_options = self.storage_options.clone().unwrap_or_default(); - if let Some(request_storage_options) = request.storage_options.as_ref() { - write_storage_options.extend(request_storage_options.clone()); - } - - let store_params = ObjectStoreParams { - storage_options_accessor: (!write_storage_options.is_empty()).then(|| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - write_storage_options, - ), - ) - }), - ..Default::default() - }; - let 
write_params = WriteParams { - mode: create_mode.write_mode(), - session: self.session.clone(), - store_params: Some(store_params), - ..Default::default() - }; - let dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to write dataset: {:?}", e), - }) - })?; - let version = dataset.version().version as i64; - - if overwriting_existing_table { - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; - self.upsert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name), - metadata, - }], - None, - ) - .await?; + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: request.properties, - ..Default::default() - }) - } else { - match existing_table { - Some(existing_table) => Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: existing_table.metadata, - ..Default::default() - }), - None => { - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; - // Register in manifest (store dir_name, not full URI) - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name.clone()), - metadata, - }], - None, - ) - .await?; + // Get table info before deleting + let table_info = self.query_manifest_for_table(&object_id).await?; - Ok(CreateTableResponse { - version: Some(version), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - properties: request.properties, - ..Default::default() - }) + let 
table_uri = match table_info { + Some(info) => { + // Delete from manifest only (leave physical data intact) + self.delete_from_manifest(&object_id).boxed().await?; + Self::construct_full_uri(&self.root, &info.location)? + } + None => { + return Err(NamespaceError::TableNotFound { + message: object_id.to_string(), } + .into()); } - } + }; + + Ok(DeregisterTableResponse { + id: request.id.clone(), + location: Some(table_uri), + ..Default::default() + }) } - async fn drop_table(&self, request: DropTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), - }) - })?; + /// Add columns to a table. + /// + /// Converts the API `AddColumnsEntry` (SQL expressions) into Lance's + /// `NewColumnTransform::SqlExpressions` and delegates to `Dataset::add_columns`. + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), - } - .into()); + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - // Query manifest for table location + let object_id = Self::str_object_id(table_id); let table_info = self.query_manifest_for_table(&object_id).boxed().await?; match table_info { Some(info) => { - // Delete from manifest first - self.delete_from_manifest(&object_id).boxed().await?; - - // Delete physical data directory using the dir_name from manifest - let table_path = self.base_path.clone().join(info.location.as_str()); let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + // Use DatasetBuilder with 
storage options to align with describe_table + // and to support custom storage backends (e.g. S3 with custom endpoints). + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; - // Remove the table directory - self.object_store - .remove_dir_all(table_path) - .boxed() + // Use shared helper to build SQL expressions, ensuring a clear error when expression is missing + let sql_expressions = super::build_sql_expressions(&request.new_columns)?; + + dataset + .add_columns( + lance::dataset::NewColumnTransform::SqlExpressions(sql_expressions), + None, + None, + ) .await .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to delete table directory: {:?}", e), - }) + // Surface specific commit/conflict errors (CommitConflict, + // RetryableCommitConflict, IncompatibleTransaction, ...) rather than + // collapsing every failure into a generic IO error. + convert_lance_commit_error(&e, "add_columns", Some(&object_id)) })?; - Ok(DropTableResponse { - id: request.id.clone(), - location: Some(table_uri), - ..Default::default() - }) - } - None => Err(NamespaceError::TableNotFound { - message: table_name.to_string(), + let version = dataset.version().version as i64; + Ok(AlterTableAddColumnsResponse::new(version)) } - .into()), + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), } } - async fn list_namespaces( + /// Alter columns in a table (rename, change type, change nullability). + /// + /// Converts the API `AlterColumnsEntry` into Lance's `ColumnAlteration` + /// and delegates to `Dataset::alter_columns`. 
+ async fn alter_table_alter_columns( &self, - request: ListNamespacesRequest, - ) -> Result { - let parent_namespace = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + request: AlterTableAlterColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; - // Build filter to find direct child namespaces - let filter = if parent_namespace.is_empty() { - // Root namespace: find all namespaces without a parent - "object_type = 'namespace' AND NOT contains(object_id, '$')".to_string() - } else { - // Non-root: find namespaces that start with parent$ but have no additional $ - let prefix = parent_namespace.join(DELIMITER); - format!( - "object_type = 'namespace' AND starts_with(object_id, '{}{}') AND NOT contains(substring(object_id, {}), '$')", - prefix, - DELIMITER, - prefix.len() + 2 - ) - }; + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; + let object_id = Self::str_object_id(table_id); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - let batches = Self::execute_scanner(scanner).await?; - let mut namespaces = Vec::new(); + match table_info { + Some(info) => { + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let 
Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; - for batch in batches { - if batch.num_rows() == 0 { - continue; - } + // Use shared helper to build column alterations, ensuring a clear error when data_type conversion fails + let alterations = super::build_column_alterations(&request.alterations)?; - let object_id_array = Self::get_string_column(&batch, "object_id")?; - for i in 0..batch.num_rows() { - let object_id = object_id_array.value(i); - let (_namespace, name) = Self::parse_object_id(object_id); - namespaces.push(name); + dataset.alter_columns(&alterations).await.map_err(|e| { + convert_lance_commit_error(&e, "alter_columns", Some(&object_id)) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAlterColumnsResponse::new(version)) } + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), } - - let next_page_token = - Self::apply_pagination(&mut namespaces, request.page_token, request.limit); - let mut response = ListNamespacesResponse::new(namespaces); - response.page_token = next_page_token; - Ok(response) } - async fn describe_namespace( + /// Drop columns from a table. + /// + /// Delegates to `Dataset::drop_columns` with the column names from the request. 
+ async fn alter_table_drop_columns( &self, - request: DescribeNamespaceRequest, - ) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + request: AlterTableDropColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; - // Root namespace always exists - if namespace_id.is_empty() { - #[allow(clippy::needless_update)] - return Ok(DescribeNamespaceResponse { - properties: Some(HashMap::new()), - ..Default::default() - }); + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); } - // Check if namespace exists in manifest - let object_id = namespace_id.join(DELIMITER); - let namespace_info = self.query_manifest_for_namespace(&object_id).await?; + let object_id = Self::str_object_id(table_id); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; - match namespace_info { - #[allow(clippy::needless_update)] - Some(info) => Ok(DescribeNamespaceResponse { - properties: info.metadata, - ..Default::default() - }), - None => Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), - } - .into()), - } - } + match table_info { + Some(info) => { + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; - async fn create_namespace( - &self, - request: CreateNamespaceRequest, - ) -> Result { - let namespace_id = 
request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + let columns: Vec<&str> = request.columns.iter().map(|s| s.as_str()).collect(); + dataset.drop_columns(&columns).await.map_err(|e| { + convert_lance_commit_error(&e, "drop_columns", Some(&object_id)) + })?; - // Root namespace always exists and cannot be created - if namespace_id.is_empty() { - return Err(NamespaceError::NamespaceAlreadyExists { - message: "root namespace".to_string(), + let version = dataset.version().version as i64; + Ok(AlterTableDropColumnsResponse::new(version)) } - .into()); - } - - // Validate parent namespaces exist (but not the namespace being created) - if namespace_id.len() > 1 { - self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) - .await?; + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), } + } +} - let object_id = namespace_id.join(DELIMITER); - if self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::NamespaceAlreadyExists { - message: object_id.to_string(), - } - .into()); - } +#[cfg(test)] +mod tests { + use super::{ + BASE_OBJECTS_INDEX_NAME, ConflictResolution, CopyOnWriteMutation, DeleteObjectMutation, + LANCE_DATA_DIR, LANCE_INDICES_DIR, MANIFEST_TABLE_NAME, ManifestBatchBuilder, + ManifestEntry, ManifestIndexAccumulator, ManifestNamespace, ManifestOutputRow, + ManifestRowValue, ManifestStreamMutation, OBJECT_ID_INDEX_NAME, OBJECT_TYPE_INDEX_NAME, + ObjectType, + }; + use crate::DirectoryNamespaceBuilder; + use arrow::datatypes::DataType; + use bytes::Bytes; + use futures::StreamExt; + use lance::index::DatasetIndexExt; + use lance_core::utils::tempfile::TempStdDir; + use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; + use lance_namespace::LanceNamespace; + use lance_namespace::models::{ + CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, + ListTablesRequest, TableExistsRequest, + }; + use lance_table::format::Fragment; + use rstest::rstest; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; - let metadata = - Self::serialize_metadata(request.properties.as_ref(), "namespace", &object_id)?; + async fn create_manifest_namespace( + root: &str, + inline_optimization_enabled: bool, + ) -> ManifestNamespace { + create_manifest_namespace_with_retries(root, inline_optimization_enabled, None).await + } - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Namespace, - location: None, - metadata, - }], + async fn create_manifest_namespace_with_retries( + root: &str, + inline_optimization_enabled: bool, + commit_retries: Option, + ) -> ManifestNamespace { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + root, + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + ManifestNamespace::from_directory( + root.to_string(), None, + None, + 
object_store, + base_path, + true, + inline_optimization_enabled, + commit_retries, ) - .await?; - - Ok(CreateNamespaceResponse { - properties: request.properties, - ..Default::default() - }) - } - - async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + .await + .unwrap() + } - // Root namespace always exists and cannot be dropped - if namespace_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Root namespace cannot be dropped".to_string(), - } - .into()); - } + struct CommitConflictAfterRewriteMutation { + root: String, + conflict_object_id: String, + } - let object_id = namespace_id.join(DELIMITER); + impl ManifestStreamMutation for CommitConflictAfterRewriteMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } - // Check if namespace exists - if !self.manifest_contains_object(&object_id).boxed().await? 
{ - return Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), - } - .into()); + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: "attempted_table", + object_type: ObjectType::Table, + location: Some("attempted_table.lance"), + metadata: None, + base_objects: None, + }, + ) } - // Check for child namespaces - let escaped_id = object_id.replace('\'', "''"); - let prefix = format!("{}{}", escaped_id, DELIMITER); - let filter = format!("starts_with(object_id, '{}')", prefix); - let mut scanner = self.manifest_scanner().boxed().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project::<&str>(&[]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - scanner.with_row_id(); - let count = scanner.count_rows().boxed().await.map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to count rows: {:?}", e), + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let object_id = self.conflict_object_id.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some("conflicting_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + }); }) - })?; - - if count > 0 { - return Err(NamespaceError::NamespaceNotEmpty { - message: format!("'{}' (contains {} child objects)", object_id, count), - } - .into()); + .join() + .unwrap(); + 
CopyOnWriteMutation::updated(()) } + } - self.delete_from_manifest(&object_id).boxed().await?; - - Ok(DropNamespaceResponse::default()) + /// A delete mutation that, during staging, has a concurrent writer delete the same + /// object and commit first, so our own commit hits a conflict while the object is + /// already gone — exercising `ConflictResolution::SucceedIfAbsent`. + struct ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation, + root: String, + target: String, } - async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { - let namespace_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Namespace ID is required".to_string(), - }) - })?; + impl ManifestStreamMutation for ConcurrentDeleteBeforeCommitMutation { + type Output = (); - // Root namespace always exists - if namespace_id.is_empty() { - return Ok(()); + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.process_existing_row(row, output, index_data) } - let object_id = namespace_id.join(DELIMITER); - if self.manifest_contains_object(&object_id).await? 
{ - Ok(()) - } else { - Err(NamespaceError::NamespaceNotFound { - message: object_id.to_string(), - } - .into()) + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.append_rows(output, index_data) } - } - async fn declare_table(&self, request: DeclareTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let target = self.target.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer.delete_from_manifest(&target).await.unwrap(); + }); }) - })?; + .join() + .unwrap(); + self.inner.finish() + } - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), + fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::SucceedIfAbsent { + object_id: self.target.clone(), + output: (), } - .into()); } + } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - // Check if table already exists in manifest - let existing = self.query_manifest_for_table(&object_id).await?; - if existing.is_some() { - return Err(NamespaceError::TableAlreadyExists { - message: table_name.to_string(), + async fn manifest_base_objects( + manifest_ns: &ManifestNamespace, + ) -> HashMap>> { + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id", "base_objects"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let mut rows = HashMap::new(); + for batch in batches { + let object_ids = 
ManifestNamespace::get_string_column(&batch, "object_id").unwrap(); + let base_objects = ManifestNamespace::base_objects_column_values(&batch).unwrap(); + for (row, value) in base_objects.into_iter().enumerate() { + rows.insert(object_ids.value(row).to_string(), value); } - .into()); } + rows + } - // Create table location path with hash-based naming - // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance - // Otherwise, use hash-based naming: {hash}_{object_id} - let dir_name = if namespace.is_empty() && self.dir_listing_enabled { - // Root table with directory listing enabled: use {table_name}.lance - format!("{}.lance", table_name) - } else { - // Child namespace table or dir listing disabled: use hash-based naming - Self::generate_dir_name(&object_id) - }; - let table_path = self.base_path.clone().join(dir_name.as_str()); - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + async fn manifest_data_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let data_dir = manifest_ns + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + let mut stream = manifest_ns.object_store.read_dir_all(&data_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); + } + paths + } - // Validate location if provided - if let Some(req_location) = &request.location { - let req_location = req_location.trim_end_matches('/'); - if req_location != table_uri { - return Err(NamespaceError::InvalidInput { - message: format!( - "Cannot declare table {} at location {}, must be at location {}", - table_name, req_location, table_uri - ), - } - .into()); - } + async fn manifest_index_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let index_dir = manifest_ns + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR); + let mut stream = 
manifest_ns.object_store.read_dir_all(&index_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); } + paths + } - // Create the .lance-reserved file to mark the table as existing - let reserved_file_path = table_path.clone().join(".lance-reserved"); + fn create_test_ipc_data() -> Vec { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::writer::StreamWriter; + use arrow::record_batch::RecordBatch; + use std::sync::Arc; - self.object_store - .create(&reserved_file_path) + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + buffer + } + + /// Open the `__manifest` dataset directly and set a table-metadata key, + /// simulating a future Lance client that persisted a feature flag. + async fn set_manifest_table_metadata(temp_path: &str, key: &str, value: &str) { + use lance::dataset::builder::DatasetBuilder; + let mut ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ), - }) - })? 
- .shutdown() + .unwrap(); + ds.update_metadata([(key, value)]).await.unwrap(); + } + + async fn create_namespace_with_one_table(temp_path: &str) { + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ), - }) - })?; + .unwrap(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t1".to_string()]); + ns.create_table(create_request, Bytes::from(create_test_ipc_data())) + .await + .unwrap(); + } - let metadata = Self::serialize_metadata(request.properties.as_ref(), "table", &object_id)?; + /// This is a forward-compatibility checker only: it must not set any feature + /// flag, so existing clients keep treating the manifest as compatible. + #[tokio::test] + async fn test_manifest_has_no_feature_flags_by_default() { + use lance::dataset::builder::DatasetBuilder; + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; - // Add entry to manifest marking this as a declared table (store dir_name, not full path) - self.insert_into_manifest_with_metadata( - vec![ManifestEntry { - object_id, - object_type: ObjectType::Table, - location: Some(dir_name), - metadata, - }], - None, + let ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() + .await + .unwrap(); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY) + ); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY) + ); + } + + /// An unknown reader feature flag must block opening the catalog with a clear + /// "please upgrade" error rather than silently degrading to directory listing. 
+ #[tokio::test] + async fn test_unknown_reader_flag_blocks_access() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY, + "1", ) - .await?; + .await; + + let err = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect_err("opening a manifest with an unknown reader flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + } + + /// An unknown writer feature flag must still allow reads but block writes. + #[tokio::test] + async fn test_unknown_writer_flag_blocks_writes_but_allows_reads() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY, + "1", + ) + .await; + + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect("reads should still be allowed with only a writer flag set"); + let mut list_request = ListTablesRequest::new(); + list_request.id = Some(vec![]); + assert_eq!(ns.list_tables(list_request).await.unwrap().tables.len(), 1); + + // A refused write must not leave an orphaned table dataset behind. 
+ let entries_before = dir_entry_names(temp_path); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t2".to_string()]); + let err = ns + .create_table(create_request, Bytes::from(create_test_ipc_data())) + .await + .expect_err("writing through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + assert_eq!( + entries_before, + dir_entry_names(temp_path), + "a refused create_table must not create an orphaned table directory" + ); + + // Mutations that go straight through rewrite_manifest (no early + // create_table check) must also be refused: an insert (create_namespace) + // and a delete (drop_table). This proves the writer check is enforced at + // the single copy-on-write chokepoint, not just on the create_table path. + let mut create_ns = CreateNamespaceRequest::new(); + create_ns.id = Some(vec!["ns1".to_string()]); + let err = ns + .create_namespace(create_ns) + .await + .expect_err("create_namespace through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(vec!["t1".to_string()]); + let err = ns + .drop_table(drop_request) + .await + .expect_err("drop_table through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + } + + fn dir_entry_names(path: &str) -> std::collections::BTreeSet { + std::fs::read_dir(path) + .unwrap() + .map(|e| e.unwrap().file_name().to_string_lossy().into_owned()) + .collect() + } + + #[tokio::test] + async fn test_manifest_rewrite_preserves_utf8_metadata_and_base_objects() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + 
manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "view".to_string(), + object_type: ObjectType::Table, + location: Some("view.lance".to_string()), + metadata: Some(r#"{"kind":"view"}"#.to_string()), + }], + Some(vec!["base_a".to_string(), "base_b".to_string()]), + ) + .await + .unwrap(); + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "other".to_string(), + object_type: ObjectType::Namespace, + location: None, + metadata: Some(r#"{"kind":"namespace"}"#.to_string()), + }], + None, + ) + .await + .unwrap(); - log::info!( - "Declared table '{}' in manifest at {}", - table_name, - table_uri + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let metadata_field = dataset_guard.schema().field("metadata").unwrap(); + assert_eq!(metadata_field.data_type(), DataType::Utf8); + drop(dataset_guard); + + let base_objects = manifest_base_objects(&manifest_ns).await; + assert_eq!( + base_objects.get("view").cloned().unwrap(), + Some(vec!["base_a".to_string(), "base_b".to_string()]) ); + assert_eq!(base_objects.get("other").cloned().unwrap(), None); + } - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); - let storage_options = if vend_credentials { - self.storage_options.clone() - } else { - None - }; + #[tokio::test] + async fn test_manifest_rewrite_replacement_indices_are_versioned() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; - Ok(DeclareTableResponse { - location: Some(table_uri), - storage_options, - properties: request.properties, - ..Default::default() - }) + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: 
None, + }], + Some(vec!["base".to_string()]), + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); + assert!(!index.fragment_bitmap.as_ref().unwrap().is_empty()); + } } - async fn register_table(&self, request: RegisterTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), - }) - })?; + #[tokio::test] + async fn test_manifest_rewrite_empty_manifest_keeps_replacement_indices_valid() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), - } - .into()); + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + manifest_ns.delete_from_manifest("table").await.unwrap(); + + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id"]).unwrap(); + let rows = ManifestNamespace::execute_scanner(scanner) + .await + .unwrap() + .into_iter() + .map(|batch| batch.num_rows()) + .sum::(); + assert_eq!(rows, 0); + + let dataset_guard = 
manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); } + } - let location = request.location.clone(); + #[tokio::test] + async fn test_manifest_rewrite_fragment_bitmap_uses_overwrite_fragment_ids() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let fragments = vec![Fragment::new(0), Fragment::new(0), Fragment::new(7)]; + + let manifest = ManifestNamespace::manifest_from_overwrite_transaction( + dataset_guard.manifest(), + dataset_guard.manifest().schema.clone(), + &fragments, + ); - // Validate that location is a relative path within the root directory - // We don't allow absolute URIs or paths that escape the root - if location.contains("://") { - return Err(NamespaceError::InvalidInput { - message: format!( - "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + let fragment_ids = manifest + .fragments + .iter() + .map(|fragment| fragment.id) + .collect::>(); + assert_eq!(fragment_ids, vec![0, 1, 7]); + assert_eq!( + ManifestNamespace::manifest_fragment_bitmap(&manifest) + .unwrap() + .into_iter() + .collect::>(), + vec![0, 1, 7] + ); + } - if location.starts_with('/') { - return Err(NamespaceError::InvalidInput { - message: format!( - "Absolute paths are not allowed for register_table. 
Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + #[tokio::test] + async fn test_manifest_noop_delete_uses_latest_snapshot() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let stale_ns = create_manifest_namespace(temp_path, false).await; + let writer_ns = create_manifest_namespace(temp_path, false).await; - // Check for path traversal attempts - if location.contains("..") { - return Err(NamespaceError::InvalidInput { - message: format!( - "Path traversal is not allowed. Location must be a relative path within the root directory: {}", - location - ), - } - .into()); - } + writer_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "late_table".to_string(), + object_type: ObjectType::Table, + location: Some("late_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); + stale_ns.delete_from_manifest("late_table").await.unwrap(); - // Validate that parent namespaces exist (if not root) - if !namespace.is_empty() { - self.validate_namespace_levels_exist(&namespace).await?; - } + let check_ns = create_manifest_namespace(temp_path, false).await; + assert!( + !check_ns + .manifest_contains_object("late_table") + .await + .unwrap() + ); + } - // Check if table already exists - if self.manifest_contains_object(&object_id).await? 
{ - return Err(NamespaceError::TableAlreadyExists { - message: object_id.to_string(), - } - .into()); - } + #[tokio::test] + async fn test_manifest_noop_delete_cleans_uncommitted_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; - // Register the table with its location in the manifest - self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) - .await?; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - Ok(RegisterTableResponse { - location: Some(location), - ..Default::default() - }) + let before = manifest_data_paths(&manifest_ns).await; + assert!(!before.is_empty()); + + manifest_ns + .delete_from_manifest("missing_table") + .await + .unwrap(); + + let after = manifest_data_paths(&manifest_ns).await; + assert_eq!(after, before); } - async fn deregister_table( - &self, - request: DeregisterTableRequest, - ) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| { - lance_core::Error::from(NamespaceError::InvalidInput { - message: "Table ID is required".to_string(), + #[tokio::test] + async fn test_manifest_final_commit_failure_cleans_uncommitted_rewrite_files() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, true, Some(0)).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let before_data_paths = manifest_data_paths(&manifest_ns).await; + let before_index_paths = manifest_index_paths(&manifest_ns).await; + 
+ let result = manifest_ns + .rewrite_manifest("Failed to test manifest cleanup", || { + CommitConflictAfterRewriteMutation { + root: temp_path.to_string(), + conflict_object_id: "conflicting_table".to_string(), + } }) - })?; + .await; + assert!(result.is_err()); - if table_id.is_empty() { - return Err(NamespaceError::InvalidInput { - message: "Table ID cannot be empty".to_string(), - } - .into()); - } + let after_data_paths = manifest_data_paths(&manifest_ns).await; + assert!(before_data_paths.is_subset(&after_data_paths)); + assert_eq!(after_data_paths.len(), before_data_paths.len() + 1); + assert_eq!(manifest_index_paths(&manifest_ns).await, before_index_paths); + assert!( + manifest_ns + .manifest_contains_object("conflicting_table") + .await + .unwrap() + ); + assert!( + !manifest_ns + .manifest_contains_object("attempted_table") + .await + .unwrap() + ); + } - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); + #[tokio::test] + async fn test_manifest_commit_visible_on_memory_store() { + // Regression: the commit must use the same object store the manifest dataset reads + // from. On `memory://` the namespace store and the dataset store can be different + // in-memory instances, so a commit written to the wrong one is invisible to reads + // (manifests as stale version -> endless conflict / "not found"). + let manifest_ns = create_manifest_namespace("memory://test_commit_visible", false).await; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + // A second sequential commit must not falsely conflict. 
+ manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table2".to_string(), + object_type: ObjectType::Table, + location: Some("table2.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!( + manifest_ns + .manifest_contains_object("table2") + .await + .unwrap() + ); + } - // Get table info before deleting - let table_info = self.query_manifest_for_table(&object_id).await?; + #[tokio::test] + async fn test_manifest_commit_uses_inline_transaction() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; - let table_uri = match table_info { - Some(info) => { - // Delete from manifest only (leave physical data intact) - self.delete_from_manifest(&object_id).boxed().await?; - Self::construct_full_uri(&self.root, &info.location)? - } - None => { - return Err(NamespaceError::TableNotFound { - message: object_id.to_string(), - } - .into()); - } - }; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); - Ok(DeregisterTableResponse { - id: request.id.clone(), - location: Some(table_uri), - ..Default::default() - }) + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let manifest = dataset_guard.manifest(); + // The overwrite transaction is embedded inline in the manifest, never written as a + // separate _transactions/*.txn file. 
+ assert!(manifest.transaction_section.is_some()); + assert!(manifest.transaction_file.is_none()); } -} -#[cfg(test)] -mod tests { - use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; - use bytes::Bytes; - use lance_core::utils::tempfile::TempStdDir; - use lance_namespace::LanceNamespace; - use lance_namespace::models::{ - CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, - ListTablesRequest, TableExistsRequest, - }; - use rstest::rstest; + #[tokio::test] + async fn test_manifest_commit_landed_attributes_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let dataset = Arc::new(manifest_ns.manifest_dataset.get().await.unwrap().clone()); + let version = dataset.manifest().version; + let our_files = dataset + .manifest() + .fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .map(|file| file.path.clone()) + .collect::>(); + assert!(!our_files.is_empty()); - fn create_test_ipc_data() -> Vec { - use arrow::array::{Int32Array, StringArray}; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::ipc::writer::StreamWriter; - use arrow::record_batch::RecordBatch; - use std::sync::Arc; + // The committed version references our data file => attributed to us (a lost-ack + // commit must be treated as success, not cleaned up). + assert!( + manifest_ns + .manifest_commit_landed(&dataset, version, &our_files) + .await + ); + // A different file set is not attributed to us. 
+ let other = HashSet::from(["missing.lance".to_string()]); + assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version, &other) + .await + ); + // A version that does not exist did not land. + assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version + 100, &our_files) + .await + ); + } - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); + #[tokio::test] + async fn test_manifest_delete_conflict_with_concurrent_delete_succeeds() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, false, Some(0)).await; - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(StringArray::from(vec!["a", "b", "c"])), - ], - ) - .unwrap(); + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + + // A concurrent writer deletes "table" and commits first, so our own delete commit + // conflicts while "table" is already gone. Native resolution treats the goal as + // achieved and succeeds instead of erroring or retrying forever. 
+ let result = manifest_ns + .rewrite_manifest("Failed to delete from manifest", || { + ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation { + object_id: "table".to_string(), + deleted: false, + }, + root: temp_path.to_string(), + target: "table".to_string(), + } + }) + .await; - let mut buffer = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - } - buffer + assert!(result.is_ok(), "delete should succeed: {result:?}"); + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); } #[rstest] @@ -3939,9 +5594,9 @@ mod tests { /// Test that concurrent create_table calls for the same table name don't /// create duplicate entries in the manifest. Uses two independent /// ManifestNamespace instances pointing at the same directory to simulate - /// two separate OS processes racing on table creation. The conflict_retries - /// setting on the MergeInsert ensures the second operation properly detects - /// the duplicate via WhenMatched::Fail after retrying against the latest data. + /// two separate OS processes racing on table creation. Copy-on-write rewrite + /// retries ensure the second operation detects the duplicate after retrying + /// against the latest data. 
#[tokio::test] async fn test_concurrent_create_table_no_duplicates() { let temp_dir = TempStdDir::default(); @@ -4069,4 +5724,297 @@ mod tests { assert_eq!(n, names(&["c", "d"])); assert_eq!(next, Some("d".to_string())); } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns(#[case] inline_optimization: bool) { + use lance_namespace::models::{ + AddColumnsEntry, AlterTableAddColumnsRequest, DescribeTableRequest, + }; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table with id and name columns + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Add a new column using SQL expression + let mut new_col = AddColumnsEntry::new("doubled_id".to_string()); + new_col.expression = Some(Some("id * 2".to_string())); + let mut add_request = AlterTableAddColumnsRequest::new(vec![new_col]); + add_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_add_columns(add_request) + .await + .unwrap(); + // Version should have incremented + assert!(response.version > 1); + + // Verify the column was added by describing the table with detailed metadata + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = 
schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"doubled_id"), + "Column 'doubled_id' should exist after add_columns, got: {:?}", + field_names + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Request without ID should fail + let new_col = AddColumnsEntry::new("col".to_string()); + let request = AlterTableAddColumnsRequest::new(vec![new_col]); + let result = dir_namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns_nonexistent_table(#[case] inline_optimization: bool) { + use lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Request with non-existent table should fail + let new_col = AddColumnsEntry::new("col".to_string()); + let mut request = AlterTableAddColumnsRequest::new(vec![new_col]); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table does not exist"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + 
#[tokio::test] + async fn test_alter_table_alter_columns_rename(#[case] inline_optimization: bool) { + use lance_namespace::models::{ + AlterColumnsEntry, AlterTableAlterColumnsRequest, DescribeTableRequest, + }; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Rename the "name" column to "full_name" + let mut entry = AlterColumnsEntry::new("name".to_string()); + entry.rename = Some(Some("full_name".to_string())); + let mut alter_request = AlterTableAlterColumnsRequest::new(vec![entry]); + alter_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_alter_columns(alter_request) + .await + .unwrap(); + assert!(response.version > 1); + + // Verify the column was renamed + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"full_name"), + "Column should be renamed to 'full_name', got: {:?}", + field_names + ); + assert!( + !field_names.contains(&"name"), + "Old column name 'name' should no longer exist, got: {:?}", + field_names + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + 
#[tokio::test] + async fn test_alter_table_alter_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::{AlterColumnsEntry, AlterTableAlterColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let entry = AlterColumnsEntry::new("name".to_string()); + let request = AlterTableAlterColumnsRequest::new(vec![entry]); + let result = dir_namespace.alter_table_alter_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns(#[case] inline_optimization: bool) { + use lance_namespace::models::{AlterTableDropColumnsRequest, DescribeTableRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table with id and name columns + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Drop the "name" column + let mut drop_request = AlterTableDropColumnsRequest::new(vec!["name".to_string()]); + drop_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_drop_columns(drop_request) + .await + .unwrap(); + assert!(response.version > 1); + + // Verify the column was dropped + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = 
Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + !field_names.contains(&"name"), + "Column 'name' should have been dropped, got: {:?}", + field_names + ); + assert!( + field_names.contains(&"id"), + "Column 'id' should still exist, got: {:?}", + field_names + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + let result = dir_namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns_nonexistent_table(#[case] inline_optimization: bool) { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let mut request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table 
does not exist"); + } } diff --git a/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs new file mode 100644 index 00000000000..d0849ceda4f --- /dev/null +++ b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Reader/writer feature flags for the directory-catalog `__manifest` dataset. +//! +//! Forward-compatibility infrastructure for the `__manifest` Lance dataset, +//! analogous to the Lance table format's `reader_feature_flags` / +//! `writer_feature_flags` but describing the *catalog manifest* format (schema +//! and semantics) rather than the underlying Lance file format. The flags are +//! persisted in the `__manifest` dataset's `table_metadata` map. +//! +//! Each manifest feature owns one bit in a `u64` bitmask. A build may read a +//! `__manifest` only if it understands every set reader-flag bit, and may write +//! it only if it understands every set writer-flag bit; otherwise it fails fast +//! with a clear "please upgrade" error instead of silently misreading data. The +//! set of bits a build understands is `READER_KNOWN_FLAGS` / `WRITER_KNOWN_FLAGS`. +//! +//! This is the mechanism only: no manifest feature is defined yet, so the known +//! masks are `0` and nothing is ever set — every current manifest reads and +//! writes unchanged. The first format change that needs forward-compatibility +//! protection adds its bit to the known masks and stamps it on write; from then +//! on, builds without that bit refuse the new format rather than misreading it. +//! Manifests written before this mechanism carry no flag keys, which parse as +//! `0` and stay compatible with every build. 
+ +use std::collections::HashMap; + +use lance_core::{Error, Result}; +use lance_namespace::error::NamespaceError; + +/// `table_metadata` key holding the reader feature-flag bitmask (decimal `u64`). +pub const READER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.reader_feature_flags"; +/// `table_metadata` key holding the writer feature-flag bitmask (decimal `u64`). +pub const WRITER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.writer_feature_flags"; + +/// Reader feature-flag bits this build understands. No manifest feature is +/// defined yet, so this build understands none and refuses any non-zero reader +/// flag. A future format change adds its bit here. +const READER_KNOWN_FLAGS: u64 = 0; +/// Writer feature-flag bits this build understands. +const WRITER_KNOWN_FLAGS: u64 = 0; + +/// Whether this build can read a `__manifest` whose persisted reader feature +/// flags are `reader_flags` — i.e. it understands every set bit. +pub fn can_read_manifest(reader_flags: u64) -> bool { + (reader_flags & !READER_KNOWN_FLAGS) == 0 +} + +/// Whether this build can write a `__manifest` whose persisted writer feature +/// flags are `writer_flags` — i.e. it understands every set bit. +pub fn can_write_manifest(writer_flags: u64) -> bool { + (writer_flags & !WRITER_KNOWN_FLAGS) == 0 +} + +fn parse_flags(table_metadata: &HashMap, key: &str) -> Result { + match table_metadata.get(key) { + None => Ok(0), + Some(raw) => raw.parse::().map_err(|e| { + Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset has an unparsable feature-flag value '{raw}' for \ + '{key}': {e}. This likely means it was written by a newer, incompatible \ + version of Lance; please upgrade Lance to use this catalog." + ), + }) + }), + } +} + +/// Reader feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). 
+pub fn reader_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, READER_FEATURE_FLAGS_KEY) +} + +/// Writer feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). +pub fn writer_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, WRITER_FEATURE_FLAGS_KEY) +} + +/// Validate that this build can READ the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. +pub fn ensure_readable(table_metadata: &HashMap) -> Result<()> { + let flags = reader_flags(table_metadata)?; + if !can_read_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with reader feature flags {flags}, which this \ + version of Lance does not understand (known reader flags: {READER_KNOWN_FLAGS}). \ + Please upgrade Lance to read this catalog." + ), + })); + } + Ok(()) +} + +/// Validate that this build can WRITE the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. +pub fn ensure_writable(table_metadata: &HashMap) -> Result<()> { + let flags = writer_flags(table_metadata)?; + if !can_write_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with writer feature flags {flags}, which this \ + version of Lance does not understand (known writer flags: {WRITER_KNOWN_FLAGS}). \ + Please upgrade Lance to modify this catalog." + ), + })); + } + Ok(()) +} + +/// Whether `err` indicates the `__manifest` is in a format this build cannot +/// handle — i.e. it carries an unknown reader/writer feature flag, surfaced by +/// [`ensure_readable`] / [`ensure_writable`] as a [`NamespaceError::Unsupported`]. +/// +/// Catalog initialization uses this to refuse opening such a manifest rather +/// than silently degrading to a directory-listing view that ignores it. 
The +/// `__manifest` open path raises no other `Unsupported` error, so matching the +/// code is sufficient and avoids brittle message matching. +pub fn is_incompatible_manifest_error(err: &Error) -> bool { + matches!( + err, + Error::Namespace { source, .. } + if source + .downcast_ref::() + .is_some_and(|e| matches!(e, NamespaceError::Unsupported { .. })) + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn meta(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + #[test] + fn unflagged_is_compatible() { + assert!(can_read_manifest(0)); + assert!(can_write_manifest(0)); + let empty = HashMap::new(); + assert!(ensure_readable(&empty).is_ok()); + assert!(ensure_writable(&empty).is_ok()); + assert_eq!(reader_flags(&empty).unwrap(), 0); + assert_eq!(writer_flags(&empty).unwrap(), 0); + // Explicit zeroes are also compatible. + let zeroed = meta(&[ + (READER_FEATURE_FLAGS_KEY, "0"), + (WRITER_FEATURE_FLAGS_KEY, "0"), + ]); + assert!(ensure_readable(&zeroed).is_ok()); + assert!(ensure_writable(&zeroed).is_ok()); + } + + #[test] + fn any_unknown_flag_is_refused() { + // This build understands no feature flags, so any non-zero bit is refused. + assert!(!can_read_manifest(1)); + assert!(!can_write_manifest(1)); + assert!(!can_read_manifest(1 << 30)); + assert!(!can_write_manifest(1 << 63)); + + let reader = meta(&[(READER_FEATURE_FLAGS_KEY, "1")]); + let err = ensure_readable(&reader).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + // A reader flag does not block writers that the writer mask allows. 
+ assert!(ensure_writable(&reader).is_ok()); + + let writer = meta(&[(WRITER_FEATURE_FLAGS_KEY, "2")]); + let err = ensure_writable(&writer).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + } + + #[test] + fn unparsable_value_is_refused() { + let m = meta(&[(READER_FEATURE_FLAGS_KEY, "not-a-number")]); + assert!(reader_flags(&m).is_err()); + assert!(ensure_readable(&m).is_err()); + } + + #[test] + fn unrelated_error_is_not_an_incompatibility() { + let other = Error::from(NamespaceError::TableNotFound { + message: "x".to_string(), + }); + assert!(!is_incompatible_manifest_error(&other)); + } +} diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 27a563d2807..c245a1e6dc1 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -23,11 +23,13 @@ use lance_namespace::models::{ AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateMaterializedViewRequest, CreateMaterializedViewResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, - CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, - DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + CreateNamespaceResponse, CreateTableBranchRequest, CreateTableBranchResponse, + CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + CreateTableVersionRequest, CreateTableVersionResponse, DeclareTableRequest, + 
DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableBranchRequest, DeleteTableBranchResponse, DeleteTableTagRequest, + DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, @@ -36,7 +38,8 @@ use lance_namespace::models::{ ErrorResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, RefreshMaterializedViewRequest, RefreshMaterializedViewResponse, @@ -1294,6 +1297,13 @@ impl LanceNamespace for RestNamespace { descending_str = descending.to_string(); query.push(("descending", descending_str.as_str())); } + // Forward branch as a query param (this op sends no body). + // describe_table_version differs: branch rides its body, already serialized. 
+ let branch_str; + if let Some(ref branch) = request.branch { + branch_str = branch.clone(); + query.push(("branch", branch_str.as_str())); + } self.post_json(&path, &query, &(), "list_table_versions", &id) .await } @@ -1553,6 +1563,55 @@ impl LanceNamespace for RestNamespace { .await } + async fn create_table_branch( + &self, + request: CreateTableBranchRequest, + ) -> Result { + self.record_op("create_table_branch"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_branch", &id) + .await + } + + async fn list_table_branches( + &self, + request: ListTableBranchesRequest, + ) -> Result { + self.record_op("list_table_branches"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.post_json(&path, &query, &request, "list_table_branches", &id) + .await + } + + async fn delete_table_branch( + &self, + request: DeleteTableBranchRequest, + ) -> Result { + self.record_op("delete_table_branch"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/delete", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_table_branch", &id) + .await + } + fn namespace_id(&self) -> String { format!( "RestNamespace {{ endpoint: {:?}, delimiter: {:?} }}", diff 
--git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index f7de5dc2240..44ebd866810 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -141,6 +141,10 @@ impl RestAdapter { .route("/v1/table/:id/tags/create", post(create_table_tag)) .route("/v1/table/:id/tags/delete", post(delete_table_tag)) .route("/v1/table/:id/tags/update", post(update_table_tag)) + // Branch operations + .route("/v1/table/:id/branches/create", post(create_table_branch)) + .route("/v1/table/:id/branches/list", post(list_table_branches)) + .route("/v1/table/:id/branches/delete", post(delete_table_branch)) // Query plan operations .route("/v1/table/:id/explain_plan", post(explain_table_query_plan)) .route("/v1/table/:id/analyze_plan", post(analyze_table_query_plan)) @@ -302,6 +306,7 @@ struct PaginationQuery { limit: Option, include_declared: Option, descending: Option, + branch: Option, } #[derive(Debug, Deserialize)] @@ -325,11 +330,13 @@ fn error_code_to_status(code: u32) -> StatusCode { | Some(lance_namespace::error::ErrorCode::TableTagNotFound) | Some(lance_namespace::error::ErrorCode::TransactionNotFound) | Some(lance_namespace::error::ErrorCode::TableVersionNotFound) - | Some(lance_namespace::error::ErrorCode::TableColumnNotFound) => StatusCode::NOT_FOUND, + | Some(lance_namespace::error::ErrorCode::TableColumnNotFound) + | Some(lance_namespace::error::ErrorCode::TableBranchNotFound) => StatusCode::NOT_FOUND, Some(lance_namespace::error::ErrorCode::NamespaceAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableIndexAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableTagAlreadyExists) + | Some(lance_namespace::error::ErrorCode::TableBranchAlreadyExists) | Some(lance_namespace::error::ErrorCode::ConcurrentModification) => StatusCode::CONFLICT, Some(lance_namespace::error::ErrorCode::NamespaceNotEmpty) 
| Some(lance_namespace::error::ErrorCode::InvalidTableState) => StatusCode::CONFLICT, @@ -847,6 +854,7 @@ async fn list_table_versions( page_token: params.page_token, limit: params.limit, descending: params.descending, + branch: params.branch, identity: extract_identity(&headers), ..Default::default() }; @@ -872,6 +880,7 @@ async fn create_table_version( manifest_size: body.manifest_size, e_tag: body.e_tag, metadata: body.metadata, + branch: body.branch, ..Default::default() }; @@ -891,6 +900,7 @@ async fn describe_table_version( let request = DescribeTableVersionRequest { id: Some(parse_id(&id, query.delimiter.as_deref())), version: body.version, + branch: body.branch, identity: extract_identity(&headers), ..Default::default() }; @@ -912,6 +922,7 @@ async fn batch_delete_table_versions( id: Some(parse_id(&id, params.delimiter.as_deref())), identity: extract_identity(&headers), ranges: body.ranges, + branch: body.branch, ..Default::default() }; @@ -1261,6 +1272,62 @@ async fn update_table_tag( } } +// ============================================================================ +// Branch Operation Handlers +// ============================================================================ + +async fn create_table_branch( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.create_table_branch(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_branches( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableBranchesRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + identity: extract_identity(&headers), + 
..Default::default() + }; + + match backend.list_table_branches(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_table_branch( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.delete_table_branch(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + // ============================================================================ // Query Plan Operation Handlers // ============================================================================ @@ -1456,15 +1523,25 @@ mod tests { impl RestServerFixture { async fn new() -> Self { + Self::build(false).await + } + + /// Like [`Self::new`], with managed versioning (table version + /// tracking) enabled on the backend. 
+ async fn new_managed() -> Self { + Self::build(true).await + } + + async fn build(managed_versioning: bool) -> Self { let temp_dir = TempDir::new().unwrap(); let temp_path = temp_dir.path().to_str().unwrap().to_string(); // Create DirectoryNamespace backend with manifest enabled - let backend = DirectoryNamespaceBuilder::new(&temp_path) - .manifest_enabled(true) - .build() - .await - .unwrap(); + let mut builder = DirectoryNamespaceBuilder::new(&temp_path).manifest_enabled(true); + if managed_versioning { + builder = builder.table_version_tracking_enabled(true); + } + let backend = builder.build().await.unwrap(); let backend = Arc::new(backend); // Start REST server with port 0 (OS assigns available port) @@ -3095,13 +3172,7 @@ mod tests { "context_test_ns".to_string(), "test_table".to_string(), ]), - with_table_uri: None, - load_detailed_metadata: None, - check_declared: None, - vend_credentials: None, - version: None, - identity: None, - context: None, + ..Default::default() }; let result = namespace.describe_table(describe_req).await; assert!(result.is_ok(), "Failed to describe table: {:?}", result); @@ -3195,6 +3266,455 @@ mod tests { ); } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_branch_param_forwarded_end_to_end() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["branch_fwd_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "branch_fwd_ns".to_string(), + "branch_fwd_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + + let id = vec!["branch_fwd_ns".to_string(), "branch_fwd_table".to_string()]; + + // Control: no branch succeeds (resolves the main chain). 
+ assert!( + fixture + .namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(id.clone()), + ..Default::default() + }) + .await + .is_ok() + ); + + // list forwards branch as a query param; a bogus branch 404s at the backend. + assert!( + fixture + .namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(id.clone()), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await + .is_err(), + "branch must be forwarded as a query param and honored by the backend" + ); + + // describe carries branch in the request body; a bogus branch likewise 404s. + assert!( + fixture + .namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(id.clone()), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await + .is_err(), + "branch must be forwarded in the request body and honored by the backend" + ); + + fixture.server_handle.shutdown(); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_branch_crud_end_to_end() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["branch_crud_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "branch_crud_ns".to_string(), + "branch_crud_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + + let id = vec![ + "branch_crud_ns".to_string(), + "branch_crud_table".to_string(), + ]; + + // create -> list shows it (client -> server -> directory backend) + fixture + .namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let listed = fixture + .namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(id.clone()), + ..Default::default() + }) + 
.await + .unwrap(); + assert!( + listed.branches.contains_key("dev"), + "created branch should appear in list: {:?}", + listed.branches + ); + + // duplicate create -> 409 Conflict + let port = fixture.server_handle.port(); + let client = reqwest::Client::new(); + let table_path = "branch_crud_ns%24branch_crud_table"; + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/{}/branches/create", + port, table_path + )) + .query(&[("delimiter", "$")]) + .json(&serde_json::json!({ "name": "dev" })) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 409, + "duplicate branch create should map to 409, got {}", + resp.status() + ); + + // delete -> list no longer shows it + fixture + .namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let listed = fixture + .namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert!( + !listed.branches.contains_key("dev"), + "deleted branch must not appear in list: {:?}", + listed.branches + ); + + // delete missing -> 404 Not Found (raw HTTP validates TableBranchNotFound -> 404). + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/{}/branches/delete", + port, table_path + )) + .query(&[("delimiter", "$")]) + .json(&serde_json::json!({ "name": "dev" })) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 404, + "deleting a missing branch should map to 404, got {}", + resp.status() + ); + + fixture.server_handle.shutdown(); + } + + /// The managed (manifest-tracked) branch flow over REST: create a + /// managed table and a branch through the RestNamespace client, open + /// the branch via `from_namespace(...).with_branch`, commit on it, + /// check out across branches at an overlapping version number, and + /// round-trip a branch-pointing tag. 
Mirrors the DirectoryNamespace + /// e2e to prove the REST layer forwards everything the managed commit + /// store needs. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_namespace_managed_branch_e2e() { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use arrow::record_batch::{RecordBatch, RecordBatchIterator}; + use futures::TryStreamExt; + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance::dataset::{Dataset, WriteMode, WriteParams}; + use lance_namespace::LanceNamespace; + + async fn scan_ids(ds: &Dataset) -> Vec { + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort(); + ids + } + + let fixture = RestServerFixture::new_managed().await; + let namespace = Arc::new(fixture.namespace.clone()) as Arc; + let table_id = vec!["mb_table".to_string()]; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = |seed: i32| { + RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![seed]))]) + .unwrap() + }; + + // Managed main: v1 (id=1), v2 (id=2). + let mut main_ds = Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()), + namespace.clone(), + table_id.clone(), + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await + .unwrap(); + main_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()), + None, + ) + .await + .unwrap(); + + // The REST layer must surface managed versioning for the deferred + // commit handler to engage. 
+ let described = namespace + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + described.managed_versioning, + Some(true), + "managed_versioning must survive the REST round trip" + ); + let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + assert_eq!(main_chain_len(namespace.clone(), table_id.clone()).await, 2); + + // Branch via the REST client, then open and commit on it. + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let mut branch_ds = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!(branch_ds.manifest().branch.as_deref(), Some("exp")); + assert!( + branch_ds + .branch_location() + .path + .as_ref() + .ends_with("tree/exp"), + "the branch dataset must be rooted at the branch chain" + ); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()), + None, + ) + .await + .unwrap(); + assert_eq!(branch_ds.manifest().branch.as_deref(), Some("exp")); + assert_eq!(scan_ids(&branch_ds).await, vec![1, 2, 3]); + assert_eq!( + main_chain_len(namespace.clone(), table_id.clone()).await, + 2, + "a branch commit must not advance main's chain" + ); + + // Cross-branch checkout at an overlapping version number must land + // on the branch chain (branch numbering continues from the fork + // point, so both chains have this version). 
+ let overlap_version = branch_ds.version().version; + while main_ds.version().version < overlap_version { + main_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(100))], schema.clone()), + None, + ) + .await + .unwrap(); + } + let on_branch = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(overlap_version))) + .await + .unwrap(); + assert_eq!(on_branch.manifest().branch.as_deref(), Some("exp")); + assert_eq!(scan_ids(&on_branch).await, vec![1, 2, 3]); + let on_branch_latest = main_ds.checkout_branch("exp").await.unwrap(); + assert_eq!(on_branch_latest.manifest().branch.as_deref(), Some("exp")); + + // Branch-pointing tag round trip through the builder. + main_ds + .tags() + .create("exp-tag", ("exp", Some(overlap_version))) + .await + .unwrap(); + let tag_open = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("exp-tag") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest().branch.as_deref(), Some("exp")); + assert_eq!(tag_open.version().version, overlap_version); + assert_eq!(scan_ids(&tag_open).await, vec![1, 2, 3]); + + fixture.server_handle.shutdown(); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_list_branches_bodyless_post() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["list_post_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "list_post_ns".to_string(), + "list_post_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + fixture + .namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(vec![ + "list_post_ns".to_string(), + "list_post_table".to_string(), + ]), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); 
+ + let port = fixture.server_handle.port(); + let client = reqwest::Client::new(); + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/list_post_ns%24list_post_table/branches/list", + port + )) + .query(&[("delimiter", "$")]) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 200, + "bodyless list POST should succeed, got {}", + resp.status() + ); + let body: ListTableBranchesResponse = resp.json().await.unwrap(); + assert!( + body.branches.contains_key("dev"), + "bodyless list should return the branch, got: {:?}", + body.branches + ); + + fixture.server_handle.shutdown(); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_describe_table_version() { let fixture = RestServerFixture::new().await; diff --git a/rust/lance-namespace/src/error.rs b/rust/lance-namespace/src/error.rs index 8a73d4db8e8..5ed05541be3 100644 --- a/rust/lance-namespace/src/error.rs +++ b/rust/lance-namespace/src/error.rs @@ -78,6 +78,10 @@ pub enum ErrorCode { TableSchemaValidationError = 20, /// Request was throttled due to rate limiting or too many concurrent operations Throttling = 21, + /// The specified table branch does not exist + TableBranchNotFound = 22, + /// A table branch with this name already exists + TableBranchAlreadyExists = 23, } impl ErrorCode { @@ -113,6 +117,8 @@ impl ErrorCode { 19 => Some(Self::InvalidTableState), 20 => Some(Self::TableSchemaValidationError), 21 => Some(Self::Throttling), + 22 => Some(Self::TableBranchNotFound), + 23 => Some(Self::TableBranchAlreadyExists), _ => None, } } @@ -143,6 +149,8 @@ impl std::fmt::Display for ErrorCode { Self::InvalidTableState => "InvalidTableState", Self::TableSchemaValidationError => "TableSchemaValidationError", Self::Throttling => "Throttling", + Self::TableBranchNotFound => "TableBranchNotFound", + Self::TableBranchAlreadyExists => "TableBranchAlreadyExists", }; write!(f, "{}", name) } @@ -260,6 +268,14 @@ pub enum NamespaceError { /// Request was throttled due to 
rate limiting or too many concurrent operations. #[snafu(display("Throttling: {message}"))] Throttling { message: String }, + + /// The specified table branch does not exist. + #[snafu(display("Table branch not found: {message}"))] + TableBranchNotFound { message: String }, + + /// A table branch with this name already exists. + #[snafu(display("Table branch already exists: {message}"))] + TableBranchAlreadyExists { message: String }, } impl NamespaceError { @@ -291,7 +307,9 @@ impl NamespaceError { | Self::Internal { message } | Self::InvalidTableState { message } | Self::TableSchemaValidationError { message } - | Self::Throttling { message } => message, + | Self::Throttling { message } + | Self::TableBranchNotFound { message } + | Self::TableBranchAlreadyExists { message } => message, } } @@ -322,6 +340,8 @@ impl NamespaceError { Self::InvalidTableState { .. } => ErrorCode::InvalidTableState, Self::TableSchemaValidationError { .. } => ErrorCode::TableSchemaValidationError, Self::Throttling { .. } => ErrorCode::Throttling, + Self::TableBranchNotFound { .. } => ErrorCode::TableBranchNotFound, + Self::TableBranchAlreadyExists { .. 
} => ErrorCode::TableBranchAlreadyExists, } } @@ -355,6 +375,8 @@ impl NamespaceError { Self::TableSchemaValidationError { message } } Some(ErrorCode::Throttling) => Self::Throttling { message }, + Some(ErrorCode::TableBranchNotFound) => Self::TableBranchNotFound { message }, + Some(ErrorCode::TableBranchAlreadyExists) => Self::TableBranchAlreadyExists { message }, None => Self::Internal { message }, } } @@ -380,7 +402,7 @@ mod tests { #[test] fn test_error_code_roundtrip() { - for code in 0..=21 { + for code in 0..=23 { let error_code = ErrorCode::from_u32(code).unwrap(); assert_eq!(error_code.as_u32(), code); } diff --git a/rust/lance-namespace/src/namespace.rs b/rust/lance-namespace/src/namespace.rs index 0ee93b80b2e..faa7ec57367 100644 --- a/rust/lance-namespace/src/namespace.rs +++ b/rust/lance-namespace/src/namespace.rs @@ -14,11 +14,13 @@ use lance_namespace_reqwest_client::models::{ AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateMaterializedViewRequest, CreateMaterializedViewResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, - CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, - DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + CreateNamespaceResponse, CreateTableBranchRequest, CreateTableBranchResponse, + CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + CreateTableVersionRequest, CreateTableVersionResponse, DeclareTableRequest, + DeclareTableResponse, DeleteFromTableRequest, 
DeleteFromTableResponse, + DeleteTableBranchRequest, DeleteTableBranchResponse, DeleteTableTagRequest, + DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, @@ -27,7 +29,8 @@ use lance_namespace_reqwest_client::models::{ ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, RefreshMaterializedViewRequest, RefreshMaterializedViewResponse, @@ -500,6 +503,44 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { Err(Error::not_supported("update_table_tag not implemented")) } + /// Create a branch for a table. + /// + /// The new branch forks from the source ref selected by `from_branch` and + /// `from_version`, defaulting to the latest version of the main branch when + /// both are omitted. + /// + /// # Errors + /// + /// - Returns [`crate::ErrorCode::TableBranchAlreadyExists`] if a branch with the same name already exists. + /// - Returns [`crate::ErrorCode::TableNotFound`] if the table does not exist. + /// - Returns [`crate::ErrorCode::InvalidInput`] if `from_branch` or `from_version` references a source that does not exist. 
+ async fn create_table_branch( + &self, + _request: CreateTableBranchRequest, + ) -> Result { + Err(Error::not_supported("create_table_branch not implemented")) + } + + /// List all branches for a table. + async fn list_table_branches( + &self, + _request: ListTableBranchesRequest, + ) -> Result { + Err(Error::not_supported("list_table_branches not implemented")) + } + + /// Delete a branch from a table. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::TableBranchNotFound`] if the branch does not exist. + async fn delete_table_branch( + &self, + _request: DeleteTableBranchRequest, + ) -> Result { + Err(Error::not_supported("delete_table_branch not implemented")) + } + /// Return a human-readable unique identifier for this namespace instance. /// /// This is used for equality comparison and hashing when the namespace is diff --git a/rust/lance-select/Cargo.toml b/rust/lance-select/Cargo.toml index 4d72b55b5e2..4cba7f082a8 100644 --- a/rust/lance-select/Cargo.toml +++ b/rust/lance-select/Cargo.toml @@ -18,7 +18,6 @@ arrow-schema = { workspace = true } byteorder = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } -deepsize = { workspace = true } itertools = { workspace = true } lance-core = { workspace = true } roaring = { workspace = true } diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index 86a475866c9..f9df7720441 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -9,11 +9,11 @@ use std::{collections::BTreeMap, io::Read}; use arrow_array::{Array, BinaryArray, GenericBinaryArray}; use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer}; use byteorder::{ReadBytesExt, WriteBytesExt}; -use deepsize::DeepSizeOf; use itertools::Itertools; +use lance_core::deepsize::DeepSizeOf; use roaring::{MultiOps, RoaringBitmap, RoaringTreemap}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use 
lance_core::utils::address::RowAddress; use lance_core::{Error, Result}; @@ -78,6 +78,13 @@ impl RowAddrMask { } } + /// True if every row_id is selected. Lets callers (e.g. the FTS wand + /// loop) skip per-row mask checks entirely, which in turn lets the + /// deferred-row_id scoring path skip loading the row_id column. + pub fn is_select_all(&self) -> bool { + matches!(self, Self::BlockList(b) if b.is_empty()) + } + /// Return the indices of the input row ids that were valid pub fn selected_indices<'a>(&self, row_ids: impl Iterator + 'a) -> Vec { row_ids @@ -301,7 +308,7 @@ pub enum RowAddrSelection { } impl DeepSizeOf for RowAddrSelection { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { match self { Self::Full => 0, Self::Partial(bitmap) => bitmap.serialized_size(), @@ -685,12 +692,17 @@ impl RowAddrTreeMap { } impl CacheCodecImpl for RowAddrTreeMap { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - self.serialize_into(writer) + const TYPE_ID: &'static str = "lance.RowAddrTreeMap"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + // A roaring bitmap has its own stable, portable serialization; it is + // the whole body, so write it raw rather than length-prefixed. 
+ self.serialize_into(w.raw_writer()) } - fn deserialize(data: &bytes::Bytes) -> Result { - Self::deserialize_from(data.as_ref()) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + Self::deserialize_from(r.body().as_ref()) } } @@ -1859,7 +1871,7 @@ mod tests { #[test] fn test_row_addr_selection_deep_size_of() { - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; // Test Full variant - should have minimal size (just the enum discriminant) let full = RowAddrSelection::Full; diff --git a/rust/lance-select/src/mask/nullable.rs b/rust/lance-select/src/mask/nullable.rs index f76838170f3..dd9eb341e6a 100644 --- a/rust/lance-select/src/mask/nullable.rs +++ b/rust/lance-select/src/mask/nullable.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use super::{RowAddrMask, RowAddrTreeMap, RowSetOps}; @@ -62,32 +62,52 @@ impl NullableRowAddrSet { &self.nulls } + /// Get the raw `selected` bitmap. + /// + /// This is the backing field, **not** a semantic "TRUE ∪ NULL" set: a NULL + /// row may be stored only in `nulls` without appearing in `selected`. Use + /// this when you want a zero-copy view of the raw representation (e.g. + /// wire serialization that sends `selected` and `nulls` as separate sets). + /// For "TRUE rows only", use [`Self::true_rows`]. 
+ pub fn selected_rows(&self) -> &RowAddrTreeMap { + &self.selected + } + /// Get the TRUE rows (selected but not null) pub fn true_rows(&self) -> RowAddrTreeMap { self.selected.clone() - self.nulls.clone() } pub fn union_all(selections: &[Self]) -> Self { - let true_rows = selections - .iter() - .map(|s| s.true_rows()) - .collect::>(); - let true_rows_refs = true_rows.iter().collect::>(); - let selected = RowAddrTreeMap::union_all(&true_rows_refs); + let selected = RowAddrTreeMap::union_all( + &selections + .iter() + .map(|s| &s.selected) + .collect::>(), + ); let nulls = RowAddrTreeMap::union_all( &selections .iter() .map(|s| &s.nulls) .collect::>(), ); - // TRUE | NULL = TRUE, so remove any TRUE rows from nulls - let nulls = nulls - &selected; + // TRUE | NULL = TRUE, so remove any TRUE rows from nulls. + // A row is TRUE in some input iff it's in that input's (selected - nulls). + let any_true = selections + .iter() + .map(|s| s.selected.clone() - &s.nulls) + .fold(RowAddrTreeMap::new(), |acc, t| acc | t); + let nulls = nulls - &any_true; Self { selected, nulls } } } impl PartialEq for NullableRowAddrSet { fn eq(&self, other: &Self) -> bool { + // Semantic equality: two sets are equal iff they decode to the same + // Kleene state on every row. Comparing raw `selected` would be wrong + // because a NULL row can be represented either inside or outside the + // `selected` bitmap. self.true_rows() == other.true_rows() && self.nulls == other.nulls } } @@ -473,6 +493,135 @@ mod tests { assert_eq!(result.null_rows(), &rows(&[5, 6, 8])); } + #[test] + fn test_partial_eq_semantic_equivalence() { + // Two representations of "row 5 is NULL, nothing is TRUE": + // a: selected={5}, nulls={5} (NULL row also in selected) + // b: selected={}, nulls={5} (NULL row only in nulls) + // Both decode to the same Kleene state on every row, so they must + // compare equal under semantic PartialEq. 
+ let a = NullableRowAddrSet::new(rows(&[5]), rows(&[5])); + let b = NullableRowAddrSet::new(rows(&[]), rows(&[5])); + assert_eq!(a, b); + assert_eq!(a.true_rows(), b.true_rows()); + assert_eq!(a.null_rows(), b.null_rows()); + } + + #[test] + fn test_union_all_true_overrides_null() { + // Critical conflict case: row 5 is TRUE in set1 but NULL in set2. + // Kleene: TRUE ∨ NULL = TRUE → row 5 must end up TRUE, not NULL. + let set1 = nullable_set(&[5], &[]); + let set2 = nullable_set(&[5], &[5]); + + let result = NullableRowAddrSet::union_all(&[set1, set2]); + + assert_eq!(result.true_rows(), rows(&[5])); + assert!(result.null_rows().is_empty()); + } + + #[test] + fn test_union_all_null_only_input() { + // Input where a row is NULL but NOT in `selected` (the type allows + // `selected` to be a strict subset of TRUE ∪ NULL). + let set1 = NullableRowAddrSet::new(rows(&[]), rows(&[5])); + let set2 = NullableRowAddrSet::new(rows(&[1]), rows(&[])); + + let result = NullableRowAddrSet::union_all(&[set1, set2]); + + assert_eq!(result.true_rows(), rows(&[1])); + assert_eq!(result.null_rows(), &rows(&[5])); + } + + #[test] + fn test_union_all_all_null_for_a_row() { + // Every input marks row 7 as NULL; nothing makes it TRUE. + let set1 = nullable_set(&[7], &[7]); + let set2 = nullable_set(&[7], &[7]); + let set3 = NullableRowAddrSet::new(rows(&[]), rows(&[7])); + + let result = NullableRowAddrSet::union_all(&[set1, set2, set3]); + + assert!(result.true_rows().is_empty()); + assert_eq!(result.null_rows(), &rows(&[7])); + } + + #[test] + fn test_union_all_empty_inputs() { + let result = NullableRowAddrSet::union_all(&[]); + assert!(result.true_rows().is_empty()); + assert!(result.null_rows().is_empty()); + } + + #[test] + fn test_union_all_single_input() { + // One input → state of every row preserved. 
+ let set = nullable_set(&[1, 2, 3, 4], &[2, 4]); + let result = NullableRowAddrSet::union_all(std::slice::from_ref(&set)); + + assert_eq!(result.true_rows(), rows(&[1, 3])); + assert_eq!(result.null_rows(), &rows(&[2, 4])); + } + + #[test] + fn test_union_all_all_empty_inputs() { + let inputs = [ + NullableRowAddrSet::empty(), + NullableRowAddrSet::empty(), + NullableRowAddrSet::empty(), + ]; + let result = NullableRowAddrSet::union_all(&inputs); + assert!(result.true_rows().is_empty()); + assert!(result.null_rows().is_empty()); + } + + #[test] + fn test_union_all_disjoint_inputs() { + // No row appears in more than one input. + let set1 = nullable_set(&[1, 2], &[2]); + let set2 = nullable_set(&[10, 11], &[11]); + let set3 = nullable_set(&[20], &[]); + + let result = NullableRowAddrSet::union_all(&[set1, set2, set3]); + + assert_eq!(result.true_rows(), rows(&[1, 10, 20])); + assert_eq!(result.null_rows(), &rows(&[2, 11])); + } + + #[test] + fn test_union_all_three_state_row() { + // Same row 42 across three inputs in three different states: + // set1: TRUE set2: NULL set3: FALSE + // Kleene OR: TRUE ∨ NULL ∨ FALSE = TRUE. + let set1 = nullable_set(&[42], &[]); + let set2 = nullable_set(&[42], &[42]); + let set3 = NullableRowAddrSet::empty(); + + let result = NullableRowAddrSet::union_all(&[set1, set2, set3]); + + assert_eq!(result.true_rows(), rows(&[42])); + assert!(result.null_rows().is_empty()); + } + + #[test] + fn test_union_all_matches_repeated_bitor() { + // union_all(a, b, c) must equal ((a | b) | c) — same Kleene operator, + // applied pairwise via the independently-implemented BitOrAssign. 
+ let set1 = nullable_set(&[1, 2, 3, 4], &[4, 5, 6]); + let set2 = nullable_set(&[1, 4, 7, 8], &[2, 5, 8]); + let set3 = nullable_set(&[2, 6, 10], &[6, 10]); + + let via_union_all = + NullableRowAddrSet::union_all(&[set1.clone(), set2.clone(), set3.clone()]); + + let mut via_bitor = set1; + via_bitor |= &set2; + via_bitor |= &set3; + + assert_eq!(via_union_all.true_rows(), via_bitor.true_rows()); + assert_eq!(via_union_all.null_rows(), via_bitor.null_rows()); + } + #[test] fn test_nullable_row_addr_set_with_nulls() { let set = NullableRowAddrSet::new(rows(&[1, 2, 3]), RowAddrTreeMap::new()); diff --git a/rust/lance-table/Cargo.toml b/rust/lance-table/Cargo.toml index 836b54e942f..042ae92c618 100644 --- a/rust/lance-table/Cargo.toml +++ b/rust/lance-table/Cargo.toml @@ -28,7 +28,6 @@ aws-sdk-dynamodb = { workspace = true, optional = true, default-features = false byteorder.workspace = true bytes.workspace = true chrono.workspace = true -deepsize.workspace = true futures.workspace = true log.workspace = true object_store.workspace = true diff --git a/rust/lance-table/benches/manifest_intern.rs b/rust/lance-table/benches/manifest_intern.rs index aa798a0bfc2..78b7e352207 100644 --- a/rust/lance-table/benches/manifest_intern.rs +++ b/rust/lance-table/benches/manifest_intern.rs @@ -11,7 +11,7 @@ //! `RowDatasetVersionMeta::Inline` bytes across many fragments. 
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use lance_table::format::pb; diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index dc5c94b388a..431e466dbd4 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -5,8 +5,8 @@ use std::collections::HashMap; use std::num::NonZero; use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Error; +use lance_core::deepsize::DeepSizeOf; use lance_file::format::{MAJOR_VERSION, MINOR_VERSION}; use lance_file::version::LanceFileVersion; use lance_io::utils::CachedFileSize; diff --git a/rust/lance-table/src/format/index.rs b/rust/lance-table/src/format/index.rs index 945d8364123..f603536a3eb 100644 --- a/rust/lance-table/src/format/index.rs +++ b/rust/lance-table/src/format/index.rs @@ -7,14 +7,15 @@ use std::collections::HashMap; use std::sync::Arc; use chrono::{DateTime, Utc}; -use deepsize::DeepSizeOf; use futures::StreamExt; +use lance_core::deepsize::DeepSizeOf; use lance_io::object_store::ObjectStore; use object_store::path::Path; use roaring::RoaringBitmap; use uuid::Uuid; use super::pb; +use lance_core::cache::{CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; /// Metadata about a single file within an index segment. @@ -121,7 +122,7 @@ impl IndexMetadata { } impl DeepSizeOf for IndexMetadata { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.uuid.as_bytes().deep_size_of_children(context) + self.fields.deep_size_of_children(context) + self.name.deep_size_of_children(context) @@ -235,24 +236,26 @@ impl From<&IndexMetadata> for pb::IndexMetadata { /// orphan rule prevents `impl CacheCodecImpl for Vec`. 
type ArcAny = Arc; +/// Stable type identifier for the `Vec` cache entry. +const INDEX_METADATA_TYPE_ID: &str = "lance.table.IndexMetadataList"; +/// Body schema version written by this build. +const INDEX_METADATA_VERSION: u32 = 1; + fn serialize_index_metadata( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> { - use prost::Message; let vec = any .downcast_ref::>() .expect("index_metadata_codec: wrong type (this is a bug in the cache layer)"); let section = pb::IndexSection { indices: vec.iter().map(pb::IndexMetadata::from).collect(), }; - writer.write_all(§ion.encode_to_vec())?; - Ok(()) + writer.write_header(§ion) } -fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result { - use prost::Message; - let section = pb::IndexSection::decode(data.as_ref())?; +fn deserialize_index_metadata(reader: &mut CacheEntryReader<'_>) -> lance_core::Result { + let section: pb::IndexSection = reader.read_header()?; let indices: Vec = section .indices .into_iter() @@ -262,7 +265,12 @@ fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result } pub fn index_metadata_codec() -> lance_core::cache::CacheCodec { - lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata) + lance_core::cache::CacheCodec::new( + INDEX_METADATA_TYPE_ID, + INDEX_METADATA_VERSION, + serialize_index_metadata, + deserialize_index_metadata, + ) } /// List all files in an index directory with their sizes. 
@@ -348,7 +356,8 @@ mod tests { let bytes = store.get(&key).unwrap(); let recovered = codec .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .unwrap(); + .hit() + .expect("entry should decode as a hit"); let recovered = recovered .downcast::>() .expect("downcast should succeed"); diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index d2b5f2d31c6..9845061b7e4 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -3,7 +3,7 @@ use async_trait::async_trait; use chrono::prelude::*; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_file::datatypes::{Fields, FieldsWithMeta, populate_schema_dictionary}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_file::version::{LEGACY_FORMAT_VERSION, LanceFileVersion}; @@ -588,7 +588,7 @@ impl BasePath { } impl DeepSizeOf for BasePath { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.name.deep_size_of_children(context) + self.path.deep_size_of_children(context) * 2 + size_of::() diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index 3784e84a785..e1a4086730b 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -798,6 +798,26 @@ pub trait CommitHandler: Debug + Send + Sync { default_resolve_version(base_path, version, object_store).await } + /// Check whether an attached manifest version exists without loading it. + /// + /// The default implementation probes the deterministic manifest path for + /// the given naming scheme. Commit handlers with an external source of + /// truth should override this method. 
+ async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. }) => Ok(false), + Err(e) => Err(e.into()), + } + } + /// List detached manifest locations. /// /// Returns a stream of detached manifest locations in arbitrary order. diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs index 75993ca8d1f..22ebaa10b4a 100644 --- a/rust/lance-table/src/io/commit/external_manifest.rs +++ b/rust/lance-table/src/io/commit/external_manifest.rs @@ -8,6 +8,8 @@ use std::sync::Arc; use async_trait::async_trait; +use bytes::Bytes; +use futures::StreamExt; use lance_core::utils::tracing::{ AUDIT_MODE_CREATE, AUDIT_MODE_DELETE, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT, }; @@ -123,7 +125,7 @@ pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync { // Step 2: Copy staging to final path let final_path = naming_scheme.manifest_path(base_path, version); - let copied = match object_store.copy(staging_path, &final_path).await { + let copied = match copy_size_aware(object_store, staging_path, &final_path, size).await { Ok(_) => true, Err(ObjectStoreError::NotFound { .. }) => false, Err(e) => return Err(e.into()), @@ -213,6 +215,129 @@ pub(crate) fn detect_naming_scheme_from_path(path: &Path) -> Result5 GiB copy is slower than a native copy would be. +const MAX_SERVER_SIDE_COPY_BYTES: u64 = 5 * 1024 * 1024 * 1024; + +/// Part size for the read+rewrite fallback. Multipart-capable stores +/// (S3, GCS) require every part except the last to be ≥5 MB and allow up to +/// 10,000 parts. 100 MB sits comfortably inside both bounds and keeps the +/// part count low (~140 parts for a 14 GB manifest) without large per-part +/// RAM. 
+const COPY_REWRITE_PART_SIZE: usize = 100 * 1024 * 1024; + +/// Copy `from` to `to`, falling back to a multipart-equivalent read+rewrite +/// when the source exceeds the server-side-copy size limit +/// (`MAX_SERVER_SIDE_COPY_BYTES`). +/// +/// For sources below the limit, this is the same fast server-side +/// `store.copy()` as before. For larger sources, the source is streamed +/// through the client and re-uploaded as a multipart upload at `to`. This +/// doubles bytes-on-the-wire for the rare large case while preserving the +/// cheap fast path for the common small case. +/// +/// `size` is the known source size. It is required: the only caller already +/// has it, and the alternative (an extra `head(from)` round-trip) is work +/// the caller can avoid by passing what it already knows. +/// +/// `NotFound` errors on `from` propagate unchanged so callers can keep +/// existing `Err(NotFound { .. })` arms. +/// +/// This is a workaround for the missing `UploadPartCopy` primitive in the +/// upstream `object_store` crate. Once that lands, this helper can be +/// deleted and the call sites can go back to plain `store.copy()`. +async fn copy_size_aware( + store: &dyn OSObjectStore, + from: &Path, + to: &Path, + size: u64, +) -> std::result::Result<(), ObjectStoreError> { + if size < MAX_SERVER_SIDE_COPY_BYTES { + store.copy(from, to).await + } else { + copy_via_read_rewrite(store, from, to).await + } +} + +// NOTE: parts are uploaded sequentially. This could be parallelized (a +// bounded JoinSet, like lance-io/src/object_writer.rs's +// LANCE_UPLOAD_CONCURRENCY) or sidestepped entirely by switching to +// `object_store::WriteMultipart` (which also handles abort-on-drop). Left +// sequential here: this is a cold path (only >5 GiB manifests) and the +// helper is itself a stopgap until `object_store` exposes UploadPartCopy. 
+async fn copy_via_read_rewrite( + store: &dyn OSObjectStore, + from: &Path, + to: &Path, +) -> std::result::Result<(), ObjectStoreError> { + // NotFound here propagates upward unchanged. + let mut stream = store.get(from).await?.into_stream(); + + // From here on, errors must `abort()` the upload to avoid leaving an + // orphan multipart upload on stores that support them (e.g. S3, GCS), + // which would otherwise incur storage charges until the bucket's + // lifecycle policy cleans it up. + // + // Note: this does NOT cover task cancellation — `MultipartUpload`'s + // upstream Drop is documented as a no-op for S3/GCS. Callers that + // need cancellation cleanliness should run this with a guard or + // switch to `object_store::WriteMultipart` (planned follow-up). + let mut upload = store.put_multipart(to).await?; + let mut part_buf: Vec = Vec::with_capacity(COPY_REWRITE_PART_SIZE); + + while let Some(chunk) = stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + let _ = upload.abort().await; + return Err(e); + } + }; + // Append the chunk in COPY_REWRITE_PART_SIZE-bounded slices so a + // single oversized chunk (e.g., LocalFileSystem returning a whole + // file) cannot push part_buf past the backend's per-part size limit + // (5 GiB on S3/GCS). COPY_REWRITE_PART_SIZE is well under every + // backend's cap, so each flushed part is always valid. + let mut offset = 0; + while offset < chunk.len() { + let want = COPY_REWRITE_PART_SIZE - part_buf.len(); + let take = want.min(chunk.len() - offset); + part_buf.extend_from_slice(&chunk[offset..offset + take]); + offset += take; + + if part_buf.len() >= COPY_REWRITE_PART_SIZE { + let payload = + std::mem::replace(&mut part_buf, Vec::with_capacity(COPY_REWRITE_PART_SIZE)); + if let Err(e) = upload.put_part(Bytes::from(payload).into()).await { + let _ = upload.abort().await; + return Err(e); + } + } + } + } + + // Flush the final (possibly-short) part. 
The last part of a multipart + // upload is exempt from the per-part minimum on S3/GCS. + if !part_buf.is_empty() + && let Err(e) = upload.put_part(Bytes::from(part_buf).into()).await + { + let _ = upload.abort().await; + return Err(e); + } + + if let Err(e) = upload.complete().await { + let _ = upload.abort().await; + return Err(e); + } + Ok(()) +} + /// External manifest commit handler /// This handler is used to commit a manifest to an external store /// for detailed design, see @@ -245,14 +370,12 @@ impl ExternalManifestCommitHandler { // step 1: copy the manifest to the final location let final_manifest_path = naming_scheme.manifest_path(base_path, version); - let copied = match store - .copy(staging_manifest_path, &final_manifest_path) - .await - { - Ok(_) => true, - Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it. - Err(e) => return Err(e.into()), - }; + let copied = + match copy_size_aware(store, staging_manifest_path, &final_manifest_path, size).await { + Ok(_) => true, + Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it. + Err(e) => return Err(e.into()), + }; if copied { info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_manifest_path.as_ref()); } @@ -456,6 +579,31 @@ impl CommitHandler for ExternalManifestCommitHandler { .await } + async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + match self + .external_manifest_store + .get_manifest_location(base_path.as_ref(), version) + .await + { + Ok(_) => Ok(true), + Err(Error::NotFound { .. }) => { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. 
}) => Ok(false), + Err(e) => Err(e.into()), + } + } + Err(e) => Err(e), + } + } + async fn commit( &self, manifest: &mut Manifest, diff --git a/rust/lance-table/src/lib.rs b/rust/lance-table/src/lib.rs index ebe892ba534..89b424adc61 100644 --- a/rust/lance-table/src/lib.rs +++ b/rust/lance-table/src/lib.rs @@ -5,4 +5,5 @@ pub mod feature_flags; pub mod format; pub mod io; pub mod rowids; +pub mod system_index; pub mod utils; diff --git a/rust/lance-table/src/rowids.rs b/rust/lance-table/src/rowids.rs index 0b56be84f56..6975d798143 100644 --- a/rust/lance-table/src/rowids.rs +++ b/rust/lance-table/src/rowids.rs @@ -22,7 +22,7 @@ pub mod segment; mod serde; pub mod version; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; // These are the public API. pub use index::FragmentRowIdIndex; pub use index::RowIdIndex; diff --git a/rust/lance-table/src/rowids/bitmap.rs b/rust/lance-table/src/rowids/bitmap.rs index 9001c04c3a0..ce7eadd5634 100644 --- a/rust/lance-table/src/rowids/bitmap.rs +++ b/rust/lance-table/src/rowids/bitmap.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; #[derive(PartialEq, Eq, Clone, DeepSizeOf)] pub struct Bitmap { diff --git a/rust/lance-table/src/rowids/encoded_array.rs b/rust/lance-table/src/rowids/encoded_array.rs index 06614765af1..7564cb6bb21 100644 --- a/rust/lance-table/src/rowids/encoded_array.rs +++ b/rust/lance-table/src/rowids/encoded_array.rs @@ -3,7 +3,7 @@ use std::ops::Range; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; /// Encoded array of u64 values. 
/// diff --git a/rust/lance-table/src/rowids/index.rs b/rust/lance-table/src/rowids/index.rs index 718f2b8f2a9..66720ed1f25 100644 --- a/rust/lance-table/src/rowids/index.rs +++ b/rust/lance-table/src/rowids/index.rs @@ -5,8 +5,8 @@ use std::ops::RangeInclusive; use std::sync::Arc; use super::{RowIdSequence, U64Segment}; -use deepsize::DeepSizeOf; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_core::utils::deletion::DeletionVector; use rangemap::RangeInclusiveMap; @@ -120,7 +120,7 @@ impl RowIdIndex { } impl DeepSizeOf for RowIdIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0 .iter() .map(|(_, (row_id_segment, address_segment))| { diff --git a/rust/lance-table/src/rowids/segment.rs b/rust/lance-table/src/rowids/segment.rs index a02acd8a573..6fba8599016 100644 --- a/rust/lance-table/src/rowids/segment.rs +++ b/rust/lance-table/src/rowids/segment.rs @@ -4,7 +4,7 @@ use std::ops::{Range, RangeInclusive}; use super::{bitmap::Bitmap, encoded_array::EncodedU64Array}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; /// Convert an estimated serialized byte cost from `u128` to `usize`, saturating /// at [`usize::MAX`] when the value does not fit (infeasible encodings). @@ -70,7 +70,7 @@ pub enum U64Segment { } impl DeepSizeOf for U64Segment { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { match self { Self::Range(_) => 0, Self::RangeWithHoles { holes, .. 
} => holes.deep_size_of_children(context), diff --git a/rust/lance-table/src/rowids/version.rs b/rust/lance-table/src/rowids/version.rs index 80f3d06db60..6ddb083c36e 100644 --- a/rust/lance-table/src/rowids/version.rs +++ b/rust/lance-table/src/rowids/version.rs @@ -9,9 +9,9 @@ use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Error; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use serde::de::Deserializer; use serde::ser::Serializer; diff --git a/rust/lance-table/src/system_index.rs b/rust/lance-table/src/system_index.rs new file mode 100644 index 00000000000..021c01a5e52 --- /dev/null +++ b/rust/lance-table/src/system_index.rs @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! System indices: table-level structure persisted as indices. +//! +//! Unlike normal indices, whose internals stay opaque behind +//! [`crate::format::IndexMetadata::index_details`], the table format genuinely +//! interprets the contents of these indices (fragment remapping, row +//! visibility). They therefore live at the table layer. +//! +//! The `Index`-trait adapters for these structs live in `lance-index`, which +//! re-exports the structs defined here. 
+ +pub mod frag_reuse; +pub mod mem_wal; diff --git a/rust/lance-table/src/system_index/frag_reuse.rs b/rust/lance-table/src/system_index/frag_reuse.rs new file mode 100644 index 00000000000..40bbc4f58b6 --- /dev/null +++ b/rust/lance-table/src/system_index/frag_reuse.rs @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::{collections::HashMap, sync::Arc}; + +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array}; +use lance_core::deepsize::{Context, DeepSizeOf}; +use lance_core::{Error, Result}; +use lance_select::RowAddrTreeMap; +use roaring::{RoaringBitmap, RoaringTreemap}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::format::pb::fragment_reuse_index_details::InlineContent; +use crate::format::{ExternalFile, Fragment, pb}; + +pub const FRAG_REUSE_INDEX_NAME: &str = "__lance_frag_reuse"; +pub const FRAG_REUSE_DETAILS_FILE_NAME: &str = "details.binpb"; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragDigest { + pub id: u64, + pub physical_rows: usize, + pub num_deleted_rows: usize, +} + +impl From<&FragDigest> for pb::fragment_reuse_index_details::FragmentDigest { + fn from(digest: &FragDigest) -> Self { + Self { + id: digest.id, + physical_rows: digest.physical_rows as u64, + num_deleted_rows: digest.num_deleted_rows as u64, + } + } +} + +impl From<&Fragment> for FragDigest { + fn from(fragment: &Fragment) -> Self { + Self { + id: fragment.id, + physical_rows: fragment + .physical_rows + .expect("Fragment doesn't have physical rows recorded"), + num_deleted_rows: fragment + .deletion_file + .as_ref() + .and_then(|d| d.num_deleted_rows) + .unwrap_or(0), + } + } +} + +impl TryFrom for FragDigest { + type Error = Error; + + fn try_from(digest: pb::fragment_reuse_index_details::FragmentDigest) -> Result { + Ok(Self { + id: digest.id, + 
physical_rows: digest.physical_rows as usize, + num_deleted_rows: digest.num_deleted_rows as usize, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseGroup { + pub changed_row_addrs: Vec, + pub old_frags: Vec, + pub new_frags: Vec, +} + +impl From<&FragReuseGroup> for pb::fragment_reuse_index_details::Group { + fn from(group: &FragReuseGroup) -> Self { + Self { + changed_row_addrs: group.changed_row_addrs.clone(), + old_fragments: group.old_frags.iter().map(|f| f.into()).collect(), + new_fragments: group.new_frags.iter().map(|f| f.into()).collect(), + } + } +} + +impl TryFrom for FragReuseGroup { + type Error = Error; + + fn try_from(group: pb::fragment_reuse_index_details::Group) -> Result { + Ok(Self { + changed_row_addrs: group.changed_row_addrs, + old_frags: group + .old_fragments + .into_iter() + .map(FragDigest::try_from) + .collect::>()?, + new_frags: group + .new_fragments + .into_iter() + .map(FragDigest::try_from) + .collect::>()?, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseVersion { + pub dataset_version: u64, + pub groups: Vec, +} + +impl From<&FragReuseVersion> for pb::fragment_reuse_index_details::Version { + fn from(version: &FragReuseVersion) -> Self { + Self { + dataset_version: version.dataset_version, + groups: version.groups.iter().map(|g| g.into()).collect(), + } + } +} + +impl TryFrom for FragReuseVersion { + type Error = Error; + + fn try_from(version: pb::fragment_reuse_index_details::Version) -> Result { + Ok(Self { + dataset_version: version.dataset_version, + groups: version + .groups + .into_iter() + .map(FragReuseGroup::try_from) + .collect::>()?, + }) + } +} + +impl FragReuseVersion { + pub fn old_frag_ids(&self) -> Vec { + self.groups + .iter() + .flat_map(|g| g.old_frags.iter().map(|f| f.id)) + .collect::>() + } + + pub fn new_frag_ids(&self) -> Vec { + self.groups + .iter() + .flat_map(|g| 
g.new_frags.iter().map(|f| f.id)) + .collect::>() + } + + pub fn new_frag_bitmap(&self) -> RoaringBitmap { + RoaringBitmap::from_iter(self.new_frag_ids().iter().map(|&id| id as u32)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub enum FragReuseIndexDetailsContentType { + Inline(FragReuseIndexDetails), + External(ExternalFile), +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseIndexDetails { + pub versions: Vec, +} + +impl From<&FragReuseIndexDetails> for InlineContent { + fn from(details: &FragReuseIndexDetails) -> Self { + let mut versions: Vec = + details.versions.iter().map(|m| m.into()).collect(); + // sort from oldest to latest version + versions.sort_by_key(|v| v.dataset_version); + Self { versions } + } +} + +impl TryFrom for FragReuseIndexDetails { + type Error = Error; + + fn try_from(content: InlineContent) -> Result { + Ok(Self { + versions: content + .versions + .into_iter() + .map(|m| m.try_into()) + .collect::>>()?, + }) + } +} + +impl FragReuseIndexDetails { + pub fn new_frag_bitmap(&self) -> RoaringBitmap { + RoaringBitmap::from_iter( + self.versions + .iter() + .flat_map(|v| v.new_frag_ids().into_iter().map(|id| id as u32)), + ) + } +} + +/// An index that stores row ID maps. +/// A row ID map describes the mapping from old row address to new address after compactions. +/// Each version contains the mapping for one round of compaction. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct FragReuseIndex { + pub uuid: Uuid, + pub row_id_maps: Vec>>, + pub details: FragReuseIndexDetails, +} + +impl DeepSizeOf for FragReuseIndex { + fn deep_size_of_children(&self, cx: &mut Context) -> usize { + self.row_id_maps.deep_size_of_children(cx) + self.details.deep_size_of_children(cx) + } +} + +impl FragReuseIndex { + pub fn new( + uuid: Uuid, + row_id_maps: Vec>>, + details: FragReuseIndexDetails, + ) -> Self { + Self { + uuid, + row_id_maps, + details, + } + } + + pub fn remap_row_id(&self, row_id: u64) -> Option { + let mut mapped_value = Some(row_id); + for row_id_map in self.row_id_maps.iter() { + if mapped_value.is_some() { + mapped_value = row_id_map + .get(&mapped_value.unwrap()) + .copied() + .unwrap_or(mapped_value); + } + } + + mapped_value + } + + pub fn remap_row_addrs_tree_map(&self, row_addrs: &RowAddrTreeMap) -> RowAddrTreeMap { + RowAddrTreeMap::from_iter(row_addrs.row_addrs().unwrap().filter_map(|addr| { + let addr_as_u64 = u64::from(addr); + self.remap_row_id(addr_as_u64) + })) + } + + pub fn remap_row_ids_roaring_tree_map(&self, row_ids: &RoaringTreemap) -> RoaringTreemap { + RoaringTreemap::from_iter(row_ids.iter().filter_map(|addr| self.remap_row_id(addr))) + } + + /// Remap a record batch that contains a row_id column at index `row_id_idx` + /// Currently this assumes there are only 2 columns in the schema, + /// which is the case for all indexes. + /// For example, for btree, the schema is (value, row_id). + /// For vector index storage, the schema is (row_id, vector). 
+ pub fn remap_row_ids_record_batch( + &self, + batch: RecordBatch, + row_id_idx: usize, + ) -> Result { + assert_eq!(batch.schema().fields().len(), 2); + let other_column_idx = 1 - row_id_idx; + let row_ids = batch.column(row_id_idx).as_primitive::(); + let (val_indices, new_row_ids): (Vec, Vec) = row_ids + .values() + .iter() + .enumerate() + .filter_map(|(idx, old_id)| { + self.remap_row_id(*old_id) + .map(|new_id| (idx as u64, new_id)) + }) + .unzip(); + let new_val_indices = UInt64Array::from_iter_values(val_indices); + let new_vals = + arrow::compute::take(batch.column(other_column_idx), &new_val_indices, None)?; + + let mut batch_data: Vec<(usize, ArrayRef)> = vec![ + ( + row_id_idx, + Arc::new(UInt64Array::from_iter_values(new_row_ids)) as ArrayRef, + ), + (other_column_idx, Arc::new(new_vals)), + ]; + batch_data.sort_by_key(|(i, _)| *i); + Ok(RecordBatch::try_new( + batch.schema(), + batch_data.into_iter().map(|(_, item)| item).collect(), + )?) + } + + pub fn remap_row_ids_array(&self, array: ArrayRef) -> PrimitiveArray { + let primitive_array = array + .as_any() + .downcast_ref::>() + .expect("expected row IDs to be uint64 array"); + (0..primitive_array.len()) + .map(|i| { + if primitive_array.is_null(i) { + None + } else { + self.remap_row_id(primitive_array.value(i)) + } + }) + .collect() + } + + pub fn remap_fragment_bitmap(&self, fragment_bitmap: &mut RoaringBitmap) -> Result<()> { + for version in self.details.versions.iter() { + for group in version.groups.iter() { + let mut removed = 0; + for old_frag in group.old_frags.iter() { + if fragment_bitmap.remove(old_frag.id as u32) { + removed += 1; + } + } + + if removed > 0 { + if removed != group.old_frags.len() { + // Straddle: the index covered only part of this rewrite + // group. Caused by the bug fixed in + // . 
+ // We've already removed the indexed old_frags from the + // bitmap above; deliberately do NOT insert new_frags, + // since the merged fragment also contains rows that + // were never indexed. Affected rows fall through to + // flat scan until the next optimize_indices. The fix + // is persisted on the next write via build_manifest. + tracing::warn!( + "Healing straddling fragment-reuse rewrite group in index bitmap: \ + group {:?} was only partially indexed ({} of {} old fragments). \ + Affected rows will use flat scan until the next optimize_indices.", + group.old_frags, + removed, + group.old_frags.len(), + ); + continue; + } + + for new_frag in group.new_frags.iter() { + fragment_bitmap.insert(new_frag.id as u32); + } + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[tokio::test] + async fn test_serialize_deserialize_index_details() { + // Create sample FragReuseVersions with different dataset versions + let version1 = FragReuseVersion { + dataset_version: 2, + groups: vec![FragReuseGroup { + changed_row_addrs: vec![1, 2, 3], + old_frags: vec![FragDigest { + id: 1, + physical_rows: 1, + num_deleted_rows: 0, + }], + new_frags: vec![ + FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 3, + physical_rows: 1, + num_deleted_rows: 0, + }, + ], + }], + }; + + let version2 = FragReuseVersion { + dataset_version: 1, + groups: vec![FragReuseGroup { + changed_row_addrs: vec![4, 5, 6], + old_frags: vec![FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }], + new_frags: vec![ + FragDigest { + id: 4, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 5, + physical_rows: 1, + num_deleted_rows: 0, + }, + ], + }], + }; + + // Create FragReuseIndexDetails with versions in reverse order + let details = FragReuseIndexDetails { + versions: vec![version1, version2], + }; + + // Convert to protobuf format + let inline_content: InlineContent = (&details).into(); + + // Convert 
back to FragReuseIndexDetails + let roundtrip_details = FragReuseIndexDetails::try_from(inline_content).unwrap(); + + // Verify the roundtrip + assert_eq!(roundtrip_details.versions.len(), 2); + + // Verify versions are sorted by dataset_version (oldest to latest) + assert_eq!(roundtrip_details.versions[0].dataset_version, 1); + assert_eq!( + roundtrip_details.versions[0].groups[0].changed_row_addrs, + vec![4, 5, 6] + ); + assert_eq!( + roundtrip_details.versions[0].groups[0].new_frags, + vec![ + FragDigest { + id: 4, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 5, + physical_rows: 1, + num_deleted_rows: 0, + } + ] + ); + assert_eq!( + roundtrip_details.versions[0].groups[0].old_frags, + vec![FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }] + ); + + assert_eq!(roundtrip_details.versions[1].dataset_version, 2); + assert_eq!( + roundtrip_details.versions[1].groups[0].changed_row_addrs, + vec![1, 2, 3] + ); + assert_eq!( + roundtrip_details.versions[1].groups[0].new_frags, + vec![ + FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 3, + physical_rows: 1, + num_deleted_rows: 0, + } + ] + ); + assert_eq!( + roundtrip_details.versions[1].groups[0].old_frags, + vec![FragDigest { + id: 1, + physical_rows: 1, + num_deleted_rows: 0, + }] + ); + } +} diff --git a/rust/lance-table/src/system_index/mem_wal.rs b/rust/lance-table/src/system_index/mem_wal.rs new file mode 100644 index 00000000000..3bf279df062 --- /dev/null +++ b/rust/lance-table/src/system_index/mem_wal.rs @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; + +use lance_core::Error; +use lance_core::deepsize::DeepSizeOf; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::format::pb; + +pub const MEM_WAL_INDEX_NAME: &str = "__lance_mem_wal"; + +/// Type alias for shard identifier (UUID v4). 
+pub type ShardId = Uuid; + +/// A flushed MemTable generation and its storage location. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FlushedGeneration { + pub generation: u64, + pub path: String, +} + +impl From<&FlushedGeneration> for pb::FlushedGeneration { + fn from(fg: &FlushedGeneration) -> Self { + Self { + generation: fg.generation, + path: fg.path.clone(), + } + } +} + +impl From for FlushedGeneration { + fn from(fg: pb::FlushedGeneration) -> Self { + Self { + generation: fg.generation, + path: fg.path, + } + } +} + +/// A shard's merged generation, used in MemWalIndexDetails. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, Serialize, Deserialize)] +pub struct MergedGeneration { + pub shard_id: Uuid, + pub generation: u64, +} + +impl DeepSizeOf for MergedGeneration { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { + 0 // UUID is 16 bytes fixed size, no heap allocations + } +} + +impl MergedGeneration { + pub fn new(shard_id: Uuid, generation: u64) -> Self { + Self { + shard_id, + generation, + } + } +} + +impl From<&MergedGeneration> for pb::MergedGeneration { + fn from(mg: &MergedGeneration) -> Self { + Self { + shard_id: Some((&mg.shard_id).into()), + generation: mg.generation, + } + } +} + +impl TryFrom for MergedGeneration { + type Error = Error; + + fn try_from(mg: pb::MergedGeneration) -> lance_core::Result { + let shard_id = mg + .shard_id + .as_ref() + .map(Uuid::try_from) + .ok_or_else(|| Error::invalid_input("Missing shard_id in MergedGeneration"))??; + Ok(Self { + shard_id, + generation: mg.generation, + }) + } +} + +/// Tracks which merged generation a base table index has been rebuilt to cover. +/// Used to determine whether to read from flushed MemTable indexes or base table. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct IndexCatchupProgress { + pub index_name: String, + pub caught_up_generations: Vec, +} + +impl IndexCatchupProgress { + pub fn new(index_name: String, caught_up_generations: Vec) -> Self { + Self { + index_name, + caught_up_generations, + } + } + + /// Get the caught up generation for a specific shard. + /// Returns None if the shard is not present (assumed fully caught up). + pub fn caught_up_generation_for_shard(&self, shard_id: &Uuid) -> Option { + self.caught_up_generations + .iter() + .find(|mg| &mg.shard_id == shard_id) + .map(|mg| mg.generation) + } +} + +impl From<&IndexCatchupProgress> for pb::IndexCatchupProgress { + fn from(icp: &IndexCatchupProgress) -> Self { + Self { + index_name: icp.index_name.clone(), + caught_up_generations: icp + .caught_up_generations + .iter() + .map(|mg| mg.into()) + .collect(), + } + } +} + +impl TryFrom for IndexCatchupProgress { + type Error = Error; + + fn try_from(icp: pb::IndexCatchupProgress) -> lance_core::Result { + Ok(Self { + index_name: icp.index_name, + caught_up_generations: icp + .caught_up_generations + .into_iter() + .map(MergedGeneration::try_from) + .collect::>()?, + }) + } +} + +/// Shard manifest containing epoch-based fencing and WAL state. +/// Each shard has exactly one active writer at any time. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ShardManifest { + pub shard_id: Uuid, + pub version: u64, + pub shard_spec_id: u32, + /// Computed shard field values as raw Arrow scalar bytes, keyed by field id. + /// The byte encoding follows Arrow's little-endian convention: int32 is 4 LE + /// bytes, utf8 is raw UTF-8 bytes, etc. The result_type in the corresponding + /// ShardingField from the ShardingSpec determines how to interpret each value. + pub shard_field_values: HashMap>, + pub writer_epoch: u64, + /// The most recent WAL entry position flushed to a MemTable. 
+ /// Recovery replays from `replay_after_wal_entry_position + 1`. The + /// default value 0 means "no flush has ever stamped this shard" — WAL + /// positions themselves are 1-based, so 0 is never a valid covered + /// position. + pub replay_after_wal_entry_position: u64, + /// The most recent WAL entry position observed at manifest write time. + /// Default 0 means "no entry has been written yet"; WAL positions are + /// 1-based. + pub wal_entry_position_last_seen: u64, + pub current_generation: u64, + pub flushed_generations: Vec, +} + +impl DeepSizeOf for ShardManifest { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.shard_field_values.deep_size_of_children(context) + + self.flushed_generations.deep_size_of_children(context) + } +} + +impl From<&ShardManifest> for pb::ShardManifest { + fn from(rm: &ShardManifest) -> Self { + Self { + shard_id: Some((&rm.shard_id).into()), + version: rm.version, + shard_spec_id: rm.shard_spec_id, + shard_field_entries: rm + .shard_field_values + .iter() + .map(|(k, v)| pb::ShardFieldEntry { + field_id: k.clone(), + value: v.clone(), + }) + .collect(), + writer_epoch: rm.writer_epoch, + replay_after_wal_entry_position: rm.replay_after_wal_entry_position, + wal_entry_position_last_seen: rm.wal_entry_position_last_seen, + current_generation: rm.current_generation, + flushed_generations: rm.flushed_generations.iter().map(|fg| fg.into()).collect(), + } + } +} + +impl TryFrom for ShardManifest { + type Error = Error; + + fn try_from(rm: pb::ShardManifest) -> lance_core::Result { + let shard_id = rm + .shard_id + .as_ref() + .map(Uuid::try_from) + .ok_or_else(|| Error::invalid_input("Missing shard_id in ShardManifest"))??; + let shard_field_values = rm + .shard_field_entries + .into_iter() + .map(|e| (e.field_id, e.value)) + .collect(); + Ok(Self { + shard_id, + version: rm.version, + shard_spec_id: rm.shard_spec_id, + shard_field_values, + writer_epoch: rm.writer_epoch, + 
replay_after_wal_entry_position: rm.replay_after_wal_entry_position, + wal_entry_position_last_seen: rm.wal_entry_position_last_seen, + current_generation: rm.current_generation, + flushed_generations: rm + .flushed_generations + .into_iter() + .map(FlushedGeneration::from) + .collect(), + }) + } +} + +/// Sharding field definition. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct ShardingField { + pub field_id: String, + pub source_ids: Vec, + pub transform: Option, + pub expression: Option, + pub result_type: String, + pub parameters: HashMap, +} + +impl From<&ShardingField> for pb::ShardingField { + fn from(rf: &ShardingField) -> Self { + Self { + field_id: rf.field_id.clone(), + source_ids: rf.source_ids.clone(), + transform: rf.transform.clone(), + expression: rf.expression.clone(), + result_type: rf.result_type.clone(), + parameters: rf.parameters.clone(), + } + } +} + +impl From for ShardingField { + fn from(rf: pb::ShardingField) -> Self { + Self { + field_id: rf.field_id, + source_ids: rf.source_ids, + transform: rf.transform, + expression: rf.expression, + result_type: rf.result_type, + parameters: rf.parameters, + } + } +} + +/// Sharding spec definition. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct ShardingSpec { + pub spec_id: u32, + pub fields: Vec, +} + +impl From<&ShardingSpec> for pb::ShardingSpec { + fn from(rs: &ShardingSpec) -> Self { + Self { + spec_id: rs.spec_id, + fields: rs.fields.iter().map(|f| f.into()).collect(), + } + } +} + +impl From for ShardingSpec { + fn from(rs: pb::ShardingSpec) -> Self { + Self { + spec_id: rs.spec_id, + fields: rs.fields.into_iter().map(ShardingField::from).collect(), + } + } +} + +/// Index details for MemWAL Index, stored in IndexMetadata.index_details. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct MemWalIndexDetails { + pub snapshot_ts_millis: i64, + pub num_shards: u32, + pub inline_snapshots: Option>, + pub sharding_specs: Vec, + pub maintained_indexes: Vec, + pub merged_generations: Vec, + pub index_catchup: Vec, + /// Default `ShardWriter` configuration values for this MemWAL index. + /// + /// Persisted so every writer — across processes and restarts — starts + /// from the same default writer configuration. These are defaults only; + /// an individual writer may still override any value at runtime in its + /// own (non-persisted) `ShardWriterConfig`. + pub writer_config_defaults: HashMap, +} + +impl From<&MemWalIndexDetails> for pb::MemWalIndexDetails { + fn from(details: &MemWalIndexDetails) -> Self { + Self { + snapshot_ts_millis: details.snapshot_ts_millis, + num_shards: details.num_shards, + inline_snapshots: details.inline_snapshots.clone(), + sharding_specs: details.sharding_specs.iter().map(|rs| rs.into()).collect(), + maintained_indexes: details.maintained_indexes.clone(), + merged_generations: details + .merged_generations + .iter() + .map(|mg| mg.into()) + .collect(), + index_catchup: details.index_catchup.iter().map(|icp| icp.into()).collect(), + writer_config_defaults: details.writer_config_defaults.clone(), + } + } +} + +impl TryFrom for MemWalIndexDetails { + type Error = Error; + + fn try_from(details: pb::MemWalIndexDetails) -> lance_core::Result { + Ok(Self { + snapshot_ts_millis: details.snapshot_ts_millis, + num_shards: details.num_shards, + inline_snapshots: details.inline_snapshots, + sharding_specs: details + .sharding_specs + .into_iter() + .map(ShardingSpec::from) + .collect(), + maintained_indexes: details.maintained_indexes, + merged_generations: details + .merged_generations + .into_iter() + .map(MergedGeneration::try_from) + .collect::>()?, + index_catchup: details + .index_catchup + .into_iter() + 
.map(IndexCatchupProgress::try_from) + .collect::>()?, + writer_config_defaults: details.writer_config_defaults, + }) + } +} + +/// MemWAL Index provides access to MemWAL configuration and state. +#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)] +pub struct MemWalIndex { + pub details: MemWalIndexDetails, +} + +impl MemWalIndex { + pub fn new(details: MemWalIndexDetails) -> Self { + Self { details } + } + + pub fn merged_generation_for_shard(&self, shard_id: &Uuid) -> Option { + self.details + .merged_generations + .iter() + .find(|mg| &mg.shard_id == shard_id) + .map(|mg| mg.generation) + } + + /// Get the caught up generation for a specific index and shard. + /// Returns None if the index is not tracked (assumed fully caught up). + pub fn index_caught_up_generation(&self, index_name: &str, shard_id: &Uuid) -> Option { + self.details + .index_catchup + .iter() + .find(|icp| icp.index_name == index_name) + .and_then(|icp| icp.caught_up_generation_for_shard(shard_id)) + } + + /// Check if an index is fully caught up for a shard. + /// Returns true if the index covers all merged data for the shard. 
+ pub fn is_index_caught_up(&self, index_name: &str, shard_id: &Uuid) -> bool { + let merged_gen = self.merged_generation_for_shard(shard_id).unwrap_or(0); + let caught_up_gen = self.index_caught_up_generation(index_name, shard_id); + + // If not tracked in index_catchup, assumed fully caught up + caught_up_gen.is_none_or(|generation| generation >= merged_gen) + } +} diff --git a/rust/lance-table/src/utils.rs b/rust/lance-table/src/utils.rs index 01c64f78710..0c37ef1e001 100644 --- a/rust/lance-table/src/utils.rs +++ b/rust/lance-table/src/utils.rs @@ -45,3 +45,5 @@ impl Iterator for ExactSize { (self.size, Some(self.size)) } } + +impl ExactSizeIterator for ExactSize {} diff --git a/rust/lance-tokenizer/Cargo.toml b/rust/lance-tokenizer/Cargo.toml index 5edfe4a9f16..e1006cd93c7 100644 --- a/rust/lance-tokenizer/Cargo.toml +++ b/rust/lance-tokenizer/Cargo.toml @@ -17,6 +17,7 @@ jieba-rs = { workspace = true, optional = true } lindera = { workspace = true, optional = true } rust-stemmers = "1.2.0" serde = { workspace = true, features = ["derive"] } +stop-words = { version = "0.10.0", default-features = false, features = ["iso", "nltk"] } unicode-normalization = "0.1.25" [features] diff --git a/rust/lance-tokenizer/src/ascii_folding_filter.rs b/rust/lance-tokenizer/src/ascii_folding_filter.rs index 376c0e1ebdb..8800545f1fb 100644 --- a/rust/lance-tokenizer/src/ascii_folding_filter.rs +++ b/rust/lance-tokenizer/src/ascii_folding_filter.rs @@ -49,9 +49,10 @@ impl TokenStream for AsciiFoldingFilterTokenStream<'_, T> { if !self.tail.advance() { return false; } - if !self.token_mut().text.is_ascii() { - to_ascii(&self.tail.token().text, self.buffer); - mem::swap(&mut self.tail.token_mut().text, self.buffer); + let token = self.tail.token_mut(); + if !token.text.is_ascii() { + to_ascii(&token.text, self.buffer); + mem::swap(&mut token.text, self.buffer); } true } @@ -67,6 +68,7 @@ impl TokenStream for AsciiFoldingFilterTokenStream<'_, T> { fn to_ascii(text: &str, output: 
&mut String) { output.clear(); + output.reserve(text.len()); for ch in text.chars() { if ch.is_ascii() { output.push(ch); @@ -149,4 +151,10 @@ mod tests { let tokens = collect_tokens("straße"); assert_eq!(tokens[0].text, "strasse"); } + + #[test] + fn test_ascii_folding_cjk_unchanged() { + let tokens = collect_tokens("こんにちは世界"); + assert_eq!(tokens[0].text, "こんにちは世界"); + } } diff --git a/rust/lance-tokenizer/src/lower_caser.rs b/rust/lance-tokenizer/src/lower_caser.rs index a041ac04e1f..3ad430f2f5a 100644 --- a/rust/lance-tokenizer/src/lower_caser.rs +++ b/rust/lance-tokenizer/src/lower_caser.rs @@ -47,22 +47,30 @@ pub struct LowerCaserTokenStream<'a, T> { fn to_lowercase_unicode(text: &str, output: &mut String) { output.clear(); - output.reserve(50); + output.reserve(text.len()); for ch in text.chars() { output.extend(ch.to_lowercase()); } } +fn is_lowercase_stable(text: &str) -> bool { + text.chars().all(|ch| { + let mut lower = ch.to_lowercase(); + lower.next() == Some(ch) && lower.next().is_none() + }) +} + impl TokenStream for LowerCaserTokenStream<'_, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; } - if self.token_mut().text.is_ascii() { - self.token_mut().text.make_ascii_lowercase(); - } else { - to_lowercase_unicode(&self.tail.token().text, self.buffer); - mem::swap(&mut self.tail.token_mut().text, self.buffer); + let token = self.tail.token_mut(); + if token.text.is_ascii() { + token.text.make_ascii_lowercase(); + } else if !is_lowercase_stable(&token.text) { + to_lowercase_unicode(&token.text, self.buffer); + mem::swap(&mut token.text, self.buffer); } true } @@ -75,3 +83,30 @@ impl TokenStream for LowerCaserTokenStream<'_, T> { self.tail.token_mut() } } + +#[cfg(test)] +mod tests { + use crate::{LowerCaser, RawTokenizer, TextAnalyzer, Token}; + + fn collect_tokens(text: &str) -> Vec { + let mut analyzer = TextAnalyzer::builder(RawTokenizer::default()) + .filter(LowerCaser) + .build(); + let mut stream = 
analyzer.token_stream(text); + let mut tokens = Vec::new(); + stream.process(&mut |token| tokens.push(token.clone())); + tokens + } + + #[test] + fn test_lower_caser_unicode_changed() { + let tokens = collect_tokens("İSTANBUL"); + assert_eq!(tokens[0].text, "i\u{307}stanbul"); + } + + #[test] + fn test_lower_caser_unicode_unchanged() { + let tokens = collect_tokens("こんにちは世界"); + assert_eq!(tokens[0].text, "こんにちは世界"); + } +} diff --git a/rust/lance-tokenizer/src/stop_word_filter.rs b/rust/lance-tokenizer/src/stop_word_filter.rs index 0c49330a619..2acf0b3dbd5 100644 --- a/rust/lance-tokenizer/src/stop_word_filter.rs +++ b/rust/lance-tokenizer/src/stop_word_filter.rs @@ -12,6 +12,34 @@ use std::sync::Arc; use crate::{Language, Token, TokenFilter, TokenStream, Tokenizer}; +fn all_stop_words() -> impl Iterator { + [ + stop_words::get("ar"), + stopwords::DANISH, + stopwords::DUTCH, + stopwords::ENGLISH, + stopwords::FINNISH, + stopwords::FRENCH, + stopwords::GERMAN, + stop_words::get("el"), + stopwords::HUNGARIAN, + stopwords::ITALIAN, + stopwords::NORWEGIAN, + stopwords::PORTUGUESE, + stop_words::get("ro"), + stopwords::RUSSIAN, + stopwords::SPANISH, + stopwords::SWEDISH, + stop_words::get("ta"), + stop_words::get("tr"), + stop_words::get("zh"), + stop_words::get("ja"), + stop_words::get("ko"), + ] + .into_iter() + .flat_map(|words| words.iter().copied()) +} + #[derive(Clone)] pub struct StopWordFilter { words: Arc>, @@ -20,28 +48,32 @@ pub struct StopWordFilter { impl StopWordFilter { pub fn new(language: Language) -> Option { let words = match language { + Language::Arabic => stop_words::get("ar"), Language::Danish => stopwords::DANISH, Language::Dutch => stopwords::DUTCH, - Language::English => &[ - "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", - "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", - "there", "these", "they", "this", "to", "was", "will", "with", - ], + Language::English => 
stopwords::ENGLISH, Language::Finnish => stopwords::FINNISH, Language::French => stopwords::FRENCH, Language::German => stopwords::GERMAN, + Language::Greek => stop_words::get("el"), Language::Hungarian => stopwords::HUNGARIAN, Language::Italian => stopwords::ITALIAN, Language::Norwegian => stopwords::NORWEGIAN, Language::Portuguese => stopwords::PORTUGUESE, + Language::Romanian => stop_words::get("ro"), Language::Russian => stopwords::RUSSIAN, Language::Spanish => stopwords::SPANISH, Language::Swedish => stopwords::SWEDISH, - _ => return None, + Language::Tamil => stop_words::get("ta"), + Language::Turkish => stop_words::get("tr"), }; Some(Self::remove(words.iter().map(|word| (*word).to_owned()))) } + pub fn all() -> Self { + Self::remove(all_stop_words().map(str::to_owned)) + } + pub fn remove>(words: W) -> Self { Self { words: Arc::new(words.into_iter().collect()), @@ -49,6 +81,42 @@ impl StopWordFilter { } } +#[cfg(test)] +mod tests { + use super::all_stop_words; + use crate::StopWordFilter; + use std::collections::HashSet; + + #[test] + fn test_external_stop_word_lists_are_available() { + let words = all_stop_words().collect::>(); + for word in ["إلى", "και", "acesta", "அவர்", "ama", "的", "ある", "그리고"] + { + assert!( + words.contains(word), + "built-in stop words should contain {word}" + ); + } + } + + #[test] + fn test_language_stop_word_lists_are_available() { + for (language, word) in [ + (crate::Language::Arabic, "إلى"), + (crate::Language::Greek, "και"), + (crate::Language::Romanian, "acesta"), + (crate::Language::Tamil, "அவர்"), + (crate::Language::Turkish, "ama"), + ] { + let filter = StopWordFilter::new(language).unwrap(); + assert!( + filter.words.contains(word), + "{language:?} should contain {word}" + ); + } + } +} + impl TokenFilter for StopWordFilter { type Tokenizer = StopWordFilterWrapper; diff --git a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs index 2ac3f4a28aa..227556ba527 
100644 --- a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs +++ b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs @@ -37,6 +37,12 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +pub const ENGLISH: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with", +]; + pub const DANISH: &[&str] = &[ "og", "i", "jeg", "det", "at", "en", "den", "til", "er", "som", "på", "de", "med", "han", "af", "for", "ikke", "der", "var", "mig", "sig", "men", "et", "har", "om", "vi", "min", "havde", diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index e83353e9a63..100aa42ea20 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -56,7 +56,6 @@ crossbeam-queue = { workspace = true } crossbeam-skiplist.workspace = true # This is already used by datafusion dashmap = "6" -deepsize.workspace = true # matches arrow-rs use half.workspace = true # Fast non-cryptographic hasher for the hot FTS mem-index insert path. 
@@ -104,6 +103,7 @@ prost-build.workspace = true protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] +pprof.workspace = true # Need this so we can prevent dynamic linking in binaries (see cli feature) lzma-sys = { version = "0.1" } @@ -138,7 +138,7 @@ parquet = { version = "58", default-features = false, features = ["arrow", "asyn reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } [features] -default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "geo"] +default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "tos", "goosefs", "geo"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] @@ -157,6 +157,8 @@ gcp = ["lance-io/gcp"] azure = ["lance-io/azure"] oss = ["lance-io/oss"] tencent = ["lance-io/tencent"] +goosefs = ["lance-io/goosefs"] +tos = ["lance-io/tos"] huggingface = ["lance-io/huggingface"] geo = ["lance-datafusion/geo", "lance-index/geo"] # Enable slow integration tests (disabled by default in CI) @@ -173,6 +175,10 @@ required-features = ["cli"] name = "scalar_index" harness = false +[[bench]] +name = "regex_ngram" +harness = false + [[bench]] name = "merge_insert" harness = false @@ -181,6 +187,10 @@ harness = false name = "scan" harness = false +[[bench]] +name = "count_pushdown" +harness = false + [[bench]] name = "vector_index" harness = false @@ -294,5 +304,9 @@ harness = false name = "concurrent_append" harness = false +[[bench]] +name = "hamming" +harness = false + [lints] workspace = true diff --git a/rust/lance/benches/count_pushdown.rs b/rust/lance/benches/count_pushdown.rs new file mode 100644 index 00000000000..4f633d489dc --- /dev/null +++ b/rust/lance/benches/count_pushdown.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! 
Benchmarks for `COUNT(*)` via the scanner aggregate plan (the path the +//! `count_pushdown` rule rewrites into `CountFromMaskExec`). +//! +//! The dataset uses stable row ids, multiple fragments, and scattered +//! cross-fragment deletions, with a BTree scalar index on the filter column. +//! Run on two revisions to compare (e.g. before/after a change to the rule): +//! +//! ```text +//! cargo bench -p lance --bench count_pushdown +//! ``` + +use std::sync::Arc; + +use arrow_array::types::UInt32Type; +use criterion::{Criterion, criterion_group, criterion_main}; +use lance::Dataset; +use lance::dataset::WriteParams; +use lance::index::DatasetIndexExt; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{BatchCount, RowCount, array, gen_batch}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +const ROWS_PER_FRAGMENT: usize = 100_000; +const NUM_FRAGMENTS: usize = 50; +const TOTAL_ROWS: u32 = (ROWS_PER_FRAGMENT * NUM_FRAGMENTS) as u32; // 5,000,000 + +struct Fixture { + _datadir: TempStrDir, + dataset: Arc, +} + +impl Fixture { + async fn open() -> Self { + let datadir = TempStrDir::default(); + // `value` steps 0..TOTAL_ROWS, so `value < k` selects exactly k rows + // (before deletions) and gives precise control over selectivity. + let reader = gen_batch() + .col("value", array::step::()) + .into_reader_rows( + RowCount::from(ROWS_PER_FRAGMENT as u64), + BatchCount::from(NUM_FRAGMENTS as u32), + ); + let mut dataset = Dataset::write( + reader, + datadir.as_str(), + Some(WriteParams { + max_rows_per_file: ROWS_PER_FRAGMENT, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // Scatter deletions across every fragment (~1%) to exercise the + // deletion mask in stable-id space. 
+ dataset.delete("value % 100 = 0").await.unwrap(); + + dataset + .create_index( + &["value"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + Self { + _datadir: datadir, + dataset: Arc::new(dataset), + } + } +} + +async fn count_unfiltered(dataset: &Dataset) -> u64 { + dataset.scan().count_rows().await.unwrap() +} + +async fn count_filtered(dataset: &Dataset, filter: &str) -> u64 { + let mut scanner = dataset.scan(); + scanner.filter(filter).unwrap(); + scanner.count_rows().await.unwrap() +} + +fn bench_count(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let fixture = rt.block_on(Fixture::open()); + let ds = &fixture.dataset; + + c.bench_function("count_unfiltered", |b| { + b.iter(|| rt.block_on(count_unfiltered(ds))) + }); + + // ~1% of rows match. + let filter_1pct = format!("value < {}", TOTAL_ROWS / 100); + c.bench_function("count_filtered_1pct", |b| { + b.iter(|| rt.block_on(count_filtered(ds, &filter_1pct))) + }); + + // ~50% of rows match. 
+ let filter_50pct = format!("value < {}", TOTAL_ROWS / 2); + c.bench_function("count_filtered_50pct", |b| { + b.iter(|| rt.block_on(count_filtered(ds, &filter_50pct))) + }); +} + +#[cfg(target_os = "linux")] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10) + .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_count); + +#[cfg(not(target_os = "linux"))] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_count); + +criterion_main!(benches); diff --git a/rust/lance/benches/distributed_vector_build.rs b/rust/lance/benches/distributed_vector_build.rs index 94ae9007a5d..1e66537c8b4 100644 --- a/rust/lance/benches/distributed_vector_build.rs +++ b/rust/lance/benches/distributed_vector_build.rs @@ -290,7 +290,7 @@ async fn build_partial_fixture(dataset: &mut Dataset, bench_case: BenchCase) -> builder = builder .name("distributed_merge_only".to_string()) .fragments(fragments) - .index_uuid(fixture_uuid.to_string()); + .index_uuid(fixture_uuid); Box::pin(builder.execute_uncommitted()).await.unwrap(); } @@ -416,7 +416,7 @@ fn bench_distributed_merge_only(c: &mut Criterion) { || prepare_iteration_target(&source_index_dir_fs, &target_index_dir_fs), |_| { rt.block_on(dataset.merge_index_metadata( - &target_uuid.to_string(), + &target_uuid, IndexType::IvfPq, None, noop_progress(), diff --git a/rust/lance/benches/hamming.rs b/rust/lance/benches/hamming.rs new file mode 100644 index 00000000000..7e926a795db --- /dev/null +++ b/rust/lance/benches/hamming.rs @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark for hamming distance clustering. +//! +//! This benchmark tests the pairwise hamming distance computation and clustering +//! performance at various scales. +//! +//! Run with: cargo bench -p lance --bench hamming +//! +//! 
Environment variables: +//! - DATASET_URI: Path to a dataset with a hash column (optional, generates random if not set) +//! - HASH_COLUMN: Name of the hash column (default: "hash") +//! - SAMPLE_SIZE: Number of rows to sample (default: 10000) +//! - THRESHOLD: Hamming distance threshold (default: 10) + +#![allow(clippy::print_stdout)] + +use std::env; +use std::sync::Arc; +use std::time::Instant; + +use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array}; +use arrow_schema::{DataType, Field, FieldRef, Schema}; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use lance_arrow::FixedSizeListArrayExt; +use rand::Rng; + +use lance::index::vector::hamming::{ + hamming_clustering_for_sample, hamming_clustering_from_hashes, +}; +use lance::{Dataset, dataset::WriteParams}; +use lance_linalg::distance::pairwise_hamming_distance_parallel; + +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +/// Generate random 64-bit hashes. +fn generate_random_hashes(n: usize) -> Vec { + let mut rng = rand::rng(); + (0..n).map(|_| rng.random()).collect() +} + +/// Generate random hash dataset as Arrow arrays. +fn generate_hash_batch(num_rows: usize) -> RecordBatch { + let mut rng = rand::rng(); + + // Generate random bytes for the hashes (8 bytes per hash) + let bytes: Vec = (0..num_rows * 8).map(|_| rng.random()).collect(); + let values = UInt8Array::from(bytes); + + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + DataType::FixedSizeList(FieldRef::new(Field::new("item", DataType::UInt8, true)), 8), + false, + )])); + + RecordBatch::try_new(schema, vec![Arc::new(hash_array)]).unwrap() +} + +/// Create a test dataset with random hashes. 
+async fn create_hash_dataset(path: &std::path::Path, num_rows: usize) { + let batch = generate_hash_batch(num_rows); + let schema = batch.schema(); + + let write_params = WriteParams { + max_rows_per_file: num_rows, + max_rows_per_group: 10_000, + ..Default::default() + }; + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + Dataset::write(reader, path.to_str().unwrap(), Some(write_params)) + .await + .unwrap(); +} + +/// Benchmark pure pairwise hamming computation (no I/O). +fn bench_pairwise_compute(c: &mut Criterion) { + let mut group = c.benchmark_group("hamming_pairwise_compute"); + + for size in [1_000, 5_000, 10_000, 20_000] { + let hashes = generate_random_hashes(size); + let total_pairs = (size as u64) * (size as u64 - 1) / 2; + + group.throughput(Throughput::Elements(total_pairs)); + group.bench_with_input(BenchmarkId::new("parallel", size), &hashes, |b, hashes| { + b.iter(|| { + pairwise_hamming_distance_parallel(hashes, None, Some(10)); + }); + }); + } + + group.finish(); +} + +/// Benchmark full clustering pipeline (compute + cluster). +fn bench_cluster_hashes(c: &mut Criterion) { + let mut group = c.benchmark_group("hamming_cluster"); + + for size in [1_000, 5_000, 10_000] { + let hashes = generate_random_hashes(size); + + group.bench_with_input( + BenchmarkId::new("full_pipeline", size), + &hashes, + |b, hashes| { + b.iter(|| { + hamming_clustering_from_hashes(hashes, None, 10); + }); + }, + ); + } + + group.finish(); +} + +/// Benchmark with dataset I/O (if DATASET_URI is set). 
+fn bench_dataset_cluster(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + + // Check if we should use an external dataset + let dataset_uri = env::var("DATASET_URI").ok(); + let hash_column = env::var("HASH_COLUMN").unwrap_or_else(|_| "hash".to_string()); + let sample_size: usize = env::var("SAMPLE_SIZE") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(10_000); + let threshold: u32 = env::var("THRESHOLD") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(10); + + let mut group = c.benchmark_group("hamming_dataset"); + + if let Some(uri) = dataset_uri { + // Use external dataset + println!("Using external dataset: {}", uri); + println!( + "Column: {}, Sample: {}, Threshold: {}", + hash_column, sample_size, threshold + ); + + let dataset = rt.block_on(async { Dataset::open(&uri).await.unwrap() }); + + group.bench_function(format!("external_sample_{}", sample_size), |b| { + b.to_async(&rt).iter(|| async { + hamming_clustering_for_sample(&dataset, &hash_column, Some(sample_size), threshold) + .await + .unwrap() + }); + }); + } else { + // Create temporary dataset with random hashes + let temp_dir = tempfile::tempdir().unwrap(); + let uri = temp_dir.path().join("bench_hashes.lance"); + + rt.block_on(async { + create_hash_dataset(&uri, 100_000).await; + }); + + let dataset = rt.block_on(async { Dataset::open(uri.to_str().unwrap()).await.unwrap() }); + + for sample in [1_000, 5_000, 10_000] { + group.bench_function(format!("generated_sample_{}", sample), |b| { + let ds = dataset.clone(); + b.to_async(&rt).iter(|| { + let ds = ds.clone(); + async move { + hamming_clustering_for_sample(&ds, "hash", Some(sample), 10) + .await + .unwrap() + } + }); + }); + } + } + + group.finish(); +} + +/// Quick standalone benchmark that prints results (for quick testing). 
+#[allow(dead_code)] +fn run_quick_bench() { + println!("=== Hamming Distance Clustering Benchmark ===\n"); + + let sizes = [1_000, 5_000, 10_000, 20_000]; + + for &size in &sizes { + let hashes = generate_random_hashes(size); + let total_pairs = (size as u64) * (size as u64 - 1) / 2; + + println!("Size: {} rows, {} pairs", size, total_pairs); + let start = Instant::now(); + let reader = hamming_clustering_from_hashes(&hashes, None, 10); + // Consume the reader to count clusters + let cluster_count: usize = reader.map(|b| b.unwrap().num_rows()).sum(); + let elapsed = start.elapsed(); + + let pairs_per_sec = total_pairs as f64 / elapsed.as_secs_f64(); + println!( + " Total time: {:?} ({:.2}M pairs/sec)", + elapsed, + pairs_per_sec / 1_000_000.0 + ); + println!(" Total clusters: {}", cluster_count); + println!(); + } +} + +#[cfg(target_os = "linux")] +criterion_group! { + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_pairwise_compute, bench_cluster_hashes, bench_dataset_cluster +} + +#[cfg(not(target_os = "linux"))] +criterion_group!( + benches, + bench_pairwise_compute, + bench_cluster_hashes, + bench_dataset_cluster +); + +criterion_main!(benches); diff --git a/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs b/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs index fc0d6984b53..1b2824bcc83 100644 --- a/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs +++ b/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs @@ -542,8 +542,7 @@ async fn run_checkpoint( let uuid = idx_metas .first() .ok_or_else(|| lance_core::Error::io("flushed gen has no vector index".to_string()))? 
- .uuid - .to_string(); + .uuid; let vidx = gen_ds .open_vector_index(VECTOR_COL, &uuid, &NoOpMetricsCollector) .await?; @@ -590,6 +589,7 @@ async fn run_checkpoint( use_index: true, query_parallelism: 1, dist_q_c: 0.0, + approx_mode: Default::default(), }; // IVFIndex::search is intentionally unimplemented (top-level does // partition-aware search); replicate the ANN exec node: pick the diff --git a/rust/lance/benches/mem_wal/write/mem_wal_write.rs b/rust/lance/benches/mem_wal/write/mem_wal_write.rs index 24f3a0d7c8f..9a5fc71ab17 100644 --- a/rust/lance/benches/mem_wal/write/mem_wal_write.rs +++ b/rust/lance/benches/mem_wal/write/mem_wal_write.rs @@ -649,8 +649,10 @@ fn bench_lance_memwal_write(c: &mut Criterion) { backpressure_log_interval: default_config .backpressure_log_interval, stats_log_interval: default_config.stats_log_interval, + frozen_memtable_grace: default_config.frozen_memtable_grace, enable_memtable, hnsw_params: default_config.hnsw_params, + warmer: None, }; // Get writer through Dataset API (index configs loaded automatically) diff --git a/rust/lance/benches/merge_insert.rs b/rust/lance/benches/merge_insert.rs index b003d03de44..5de190ffde2 100644 --- a/rust/lance/benches/merge_insert.rs +++ b/rust/lance/benches/merge_insert.rs @@ -15,6 +15,9 @@ //! computation. The other shapes (`clean`, `with_new_rows_only`, //! `with_deletions_only`) skip that branch and serve as controls. //! +//! The `composite_key_indexed` shape covers merge_insert joined on two +//! indexed key columns, which probes every index and AND-folds the results. +//! //! Run with `cargo bench --bench merge_insert`. 
use std::sync::Arc; @@ -127,6 +130,90 @@ async fn one_merge_insert(ds: Arc, base_existing: i64, base_new: i64) { let _ = job.execute_reader(reader).await.unwrap(); } +// --- Composite-key shape: join on (a, b) with a BTree index on each key --- +// +// Exercises the multi-index probe path: one `IsIn` query per indexed key +// column, AND-folded inside a single `MapIndexExec`, followed by the +// composite hash join. `b` is a deterministic function of `a` so source +// rows can target existing composite keys without tracking extra state. + +fn composite_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + ])) +} + +fn composite_b(a: i64) -> i64 { + a * 7 + 3 +} + +fn make_composite_batch(start: i64, n: usize) -> RecordBatch { + let a = Int64Array::from_iter_values(start..start + n as i64); + let b = Int64Array::from_iter_values((start..start + n as i64).map(composite_b)); + RecordBatch::try_new(composite_schema(), vec![Arc::new(a), Arc::new(b)]).unwrap() +} + +async fn build_composite_key_indexed() -> (TempStrDir, Arc) { + let dir = TempStrDir::default(); + let path = dir.as_str().to_string(); + let total = ROWS_PER_FRAG * NUM_FRAGS; + let mut batches = Vec::new(); + let mut start = 0i64; + while (start as u64) < total { + let n = (total - start as u64).min(ROWS_PER_FRAG) as usize; + batches.push(make_composite_batch(start, n)); + start += n as i64; + } + let params = WriteParams { + max_rows_per_file: ROWS_PER_FRAG as usize, + max_rows_per_group: ROWS_PER_FRAG as usize, + mode: WriteMode::Create, + ..Default::default() + }; + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), composite_schema()); + Dataset::write(reader, &path, Some(params)).await.unwrap(); + + let mut ds = Dataset::open(&path).await.unwrap(); + for col in ["a", "b"] { + ds.create_index( + &[col], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + } + 
(dir, Arc::new(ds)) +} + +/// Composite-key merge_insert op: 30 updates of existing (a, b) pairs + +/// 70 inserts of new pairs, joined on both columns. +async fn one_composite_merge_insert(ds: Arc, base_existing: i64, base_new: i64) { + let mut a_vals: Vec = (0..30).map(|i| base_existing + i).collect(); + a_vals.extend(base_new..base_new + 70); + let b_vals: Vec = a_vals.iter().copied().map(composite_b).collect(); + let batch = RecordBatch::try_new( + composite_schema(), + vec![ + Arc::new(Int64Array::from(a_vals)), + Arc::new(Int64Array::from(b_vals)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(std::iter::once(Ok(batch)), composite_schema()); + + let mut builder = + MergeInsertBuilder::try_new(ds, vec!["a".to_string(), "b".to_string()]).unwrap(); + builder + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::InsertAll); + let job = builder.try_build().unwrap(); + let _ = job.execute_reader(reader).await.unwrap(); +} + async fn build_clean() -> (TempStrDir, Arc) { let dir = TempStrDir::default(); let path = dir.as_str().to_string(); @@ -161,10 +248,12 @@ async fn build_with_new_rows_and_deletions() -> (TempStrDir, Arc) { (dir, Arc::new(ds)) } -fn bench_one_shape(c: &mut Criterion, name: &str, builder: F) +fn bench_one_shape(c: &mut Criterion, name: &str, builder: F, merge: M) where F: FnOnce() -> Fut, Fut: std::future::Future)>, + M: Fn(Arc, i64, i64) -> MFut + Copy, + MFut: std::future::Future, { let rt = tokio::runtime::Runtime::new().unwrap(); let (_dir, ds) = rt.block_on(builder()); @@ -183,29 +272,40 @@ where bench_ds.restore().await.unwrap(); let arc = Arc::new(bench_ds); // base_existing in the indexed range, base_new beyond all data so it's an insert. 
- one_merge_insert(arc, 100, total + 1_000_000).await; + merge(arc, 100, total + 1_000_000).await; }) }) }); } fn bench_merge_insert(c: &mut Criterion) { - bench_one_shape(c, "merge_insert/clean", build_clean); + bench_one_shape(c, "merge_insert/clean", build_clean, one_merge_insert); bench_one_shape( c, "merge_insert/with_new_rows_only", build_with_new_rows_only, + one_merge_insert, ); bench_one_shape( c, "merge_insert/with_deletions_only", build_with_deletions_only, + one_merge_insert, ); // The shape that exercises the AllowList(Full) - BlockList(Partial) path. bench_one_shape( c, "merge_insert/with_new_rows_and_deletions", build_with_new_rows_and_deletions, + one_merge_insert, + ); + // Composite key joined on (a, b), both BTree-indexed: exercises the + // AND-folded multi-index probe inside MapIndexExec. + bench_one_shape( + c, + "merge_insert/composite_key_indexed", + build_composite_key_indexed, + one_composite_merge_insert, ); } diff --git a/rust/lance/benches/regex_ngram.rs b/rust/lance/benches/regex_ngram.rs new file mode 100644 index 00000000000..76f597ad9cb --- /dev/null +++ b/rust/lance/benches/regex_ngram.rs @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark: regex predicate scans over an ngram-indexed string column. +//! +//! Each query is a `regexp_match(doc, '...')` filter against a dataset that has +//! an NGram index on `doc`. The query set spans a selective AND pattern, an +//! alternation, a plain literal (rewritten to an infix LIKE before it reaches +//! the index), and a deliberately non-accelerable pattern (`a.b`, which yields +//! no trigram) that serves as a regression guard. +//! +//! On `main` none of these use the index (regex falls through to a full scan + +//! recheck); with the ngram-regex acceleration the index prunes candidates for +//! the first three while `a.b` stays a full scan. Capture a baseline on `main` +//! 
with `--save-baseline before_7130`, then compare after the change with +//! `--baseline before_7130`. + +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +use arrow::array::AsArray; +use arrow_array::{RecordBatch, RecordBatchIterator, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::index::DatasetIndexExt; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{RowCount, array}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +const TOTAL: usize = 200_000; + +/// Build the `doc` column: random sentences with rare markers injected into a +/// small fraction of rows so the regex queries have controlled selectivity. +/// The markers (`zqxwvu`, `needlexyz`, `qwerasdf`) are unlikely to appear in +/// the generated English-word sentences. +fn build_docs() -> StringArray { + let mut sentence_gen = array::random_sentence(1, 30, false); + let base = sentence_gen + .generate_default(RowCount::from(TOTAL as u64)) + .unwrap(); + let base = base.as_string::(); + let docs = (0..TOTAL).map(|i| { + let sentence = base.value(i); + if i % 200 == 0 { + // ~0.5% of rows match `zqxwvu.*needlexyz` and `zqxwvu`. + format!("{sentence} zqxwvu needlexyz") + } else if i % 211 == 0 { + // A second marker for the alternation query. 
+ format!("{sentence} qwerasdf") + } else { + sentence.to_string() + } + }); + StringArray::from_iter_values(docs) +} + +async fn build_dataset(tempdir: &TempStrDir) -> Arc { + let schema = Arc::new(Schema::new(vec![Field::new("doc", DataType::Utf8, false)])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(build_docs())]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let mut dataset = Dataset::write(reader, tempdir.as_str(), None) + .await + .unwrap(); + dataset + .create_index( + &["doc"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Arc::new(dataset) +} + +async fn scan_filter(dataset: &Dataset, filter: &str) -> usize { + let mut scanner = dataset.scan(); + scanner.filter(filter).unwrap(); + let stream = scanner.try_into_stream().await.unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + batches.iter().map(|b| b.num_rows()).sum() +} + +fn bench_regex_ngram(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let tempdir = TempStrDir::default(); + let dataset = rt.block_on(build_dataset(&tempdir)); + + let queries = [ + ("selective_and", "regexp_match(doc, 'zqxwvu.*needlexyz')"), + ( + "alternation", + "regexp_match(doc, '(zqxwvu|qwerasdf|needlexyz)')", + ), + ("plain_literal", "regexp_match(doc, 'zqxwvu')"), + ("non_accelerable_a_dot_b", "regexp_match(doc, 'a.b')"), + ]; + + let mut group = c.benchmark_group("regex_ngram"); + group + .sample_size(10) + .measurement_time(Duration::from_secs(15)); + for (name, filter) in queries { + group.bench_function(name, |b| { + b.iter(|| black_box(rt.block_on(scan_filter(&dataset, filter)))); + }); + } + group.finish(); +} + +#[cfg(target_os = "linux")] +criterion_group!( + name = benches; + config = Criterion::default() + .significance_level(0.1) + .sample_size(10) + .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_regex_ngram); + 
+#[cfg(not(target_os = "linux"))] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_regex_ngram); + +criterion_main!(benches); diff --git a/rust/lance/src/blob.rs b/rust/lance/src/blob.rs index 322bf67a04c..58df42b5cd3 100644 --- a/rust/lance/src/blob.rs +++ b/rust/lance/src/blob.rs @@ -7,12 +7,16 @@ //! tagged with `ARROW:extension:name = "lance.blob.v2"`. This module offers a //! type-safe builder to construct that struct without manually wiring metadata +use std::num::NonZeroUsize; use std::sync::Arc; use arrow_array::{ArrayRef, StructArray, builder::LargeBinaryBuilder, builder::StringBuilder}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{DataType, Field}; -use lance_arrow::{ARROW_EXT_NAME_KEY, BLOB_V2_EXT_NAME}; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, +}; use crate::{Error, Result}; @@ -21,9 +25,71 @@ use crate::{Error, Result}; /// Blob v2 expects a column shaped as `Struct` and /// tagged with `ARROW:extension:name = "lance.blob.v2"`. pub fn blob_field(name: &str, nullable: bool) -> Field { - let metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] + blob_field_with_options(name, nullable, BlobFieldOptions::default()) +} + +/// Options for constructing a blob v2 field. +#[derive(Clone, Debug, Default)] +pub struct BlobFieldOptions { + /// Maximum payload size to keep inline in the data file before using packed blob storage. + pub inline_size_threshold: Option, + /// Maximum payload size to store in packed blob storage before using dedicated blob storage. + /// + /// A zero threshold is invalid because dedicated blob storage is selected when + /// the payload size is greater than this value. + pub dedicated_size_threshold: Option, +} + +impl BlobFieldOptions { + /// Set the maximum payload size to keep inline in the data file. 
+ pub fn with_inline_size_threshold(mut self, threshold: usize) -> Self { + self.inline_size_threshold = Some(threshold); + self + } + + /// Set the maximum payload size to store in packed blob storage. + pub fn with_dedicated_size_threshold(mut self, threshold: NonZeroUsize) -> Self { + self.dedicated_size_threshold = Some(threshold); + self + } +} + +/// Construct the Arrow field for a blob v2 column with storage layout options. +/// +/// Blob v2 expects a column shaped as `Struct` and +/// tagged with `ARROW:extension:name = "lance.blob.v2"`. +/// +/// ``` +/// # use lance::{BlobFieldOptions, blob_field_with_options}; +/// let field = blob_field_with_options( +/// "blob", +/// true, +/// BlobFieldOptions::default().with_inline_size_threshold(16 * 1024), +/// ); +/// assert_eq!( +/// field +/// .metadata() +/// .get("lance-encoding:blob-inline-size-threshold") +/// .map(String::as_str), +/// Some("16384"), +/// ); +/// ``` +pub fn blob_field_with_options(name: &str, nullable: bool, options: BlobFieldOptions) -> Field { + let mut metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] .into_iter() - .collect(); + .collect::>(); + if let Some(threshold) = options.inline_size_threshold { + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.to_string(), + ); + } + if let Some(threshold) = options.dedicated_size_threshold { + metadata.insert( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.get().to_string(), + ); + } Field::new( name, DataType::Struct( @@ -142,6 +208,8 @@ impl BlobArrayBuilder { #[cfg(test)] mod tests { + use std::num::NonZeroUsize; + use super::*; use arrow_array::Array; use arrow_array::cast::AsArray; @@ -156,6 +224,31 @@ mod tests { ); } + #[test] + fn test_field_metadata_with_options() { + let field = blob_field_with_options( + "blob", + true, + BlobFieldOptions::default() + .with_inline_size_threshold(16 * 1024) + .with_dedicated_size_threshold(NonZeroUsize::new(2 * 1024 * 
1024).unwrap()), + ); + assert_eq!( + field + .metadata() + .get(BLOB_INLINE_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "16384" + ); + assert_eq!( + field + .metadata() + .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "2097152" + ); + } + #[test] fn test_builder_basic() { let mut b = BlobArrayBuilder::new(4); diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 52e21bd7cba..3e0d77704da 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -8,10 +8,10 @@ use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_schema::DataType; use byteorder::{ByteOrder, LittleEndian}; use chrono::{Duration, prelude::*}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::stream::{self, BoxStream, StreamExt, TryStreamExt}; use futures::{FutureExt, Stream}; +use lance_core::deepsize::DeepSizeOf; use crate::dataset::metadata::UpdateFieldMetadataBuilder; use crate::dataset::transaction::translate_schema_metadata_updates; @@ -24,8 +24,7 @@ use lance_core::datatypes::{OnMissing, OnTypeMismatch, Projectable, Projection}; use lance_core::traits::DatasetTakeRows; use lance_core::utils::address::RowAddress; use lance_core::utils::tracing::{ - DATASET_CLEANING_EVENT, DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, - TRACE_DATASET_EVENTS, + DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, TRACE_DATASET_EVENTS, }; use lance_datafusion::projection::ProjectionPlan; use lance_file::datatypes::populate_schema_dictionary; @@ -69,10 +68,11 @@ use std::sync::Arc; use tracing::{info, instrument}; pub(crate) mod blob; -mod branch_location; +pub(crate) mod branch_location; pub mod builder; pub mod cleanup; pub mod delta; +pub mod files; pub mod fragment; mod hash_joiner; pub mod index; @@ -103,7 +103,7 @@ use self::scanner::{DatasetRecordBatchStream, Scanner}; use self::transaction::{Operation, Transaction, TransactionBuilder, UpdateMapEntry}; use self::write::{cleanup_data_fragments, write_fragments_internal}; use 
crate::dataset::branch_location::BranchLocation; -use crate::dataset::cleanup::{CleanupPolicy, CleanupPolicyBuilder}; +use crate::dataset::cleanup::{CleanupOperation, CleanupPolicy, CleanupPolicyBuilder}; use crate::dataset::refs::{BranchContents, BranchIdentifier, Branches, Tags}; use crate::dataset::sql::SqlQueryBuilder; use crate::datatypes::Schema; @@ -127,6 +127,7 @@ pub use schema_evolution::{ BatchInfo, BatchUDF, ColumnAlteration, NewColumnTransform, UDFCheckpointStore, }; pub use take::TakeBuilder; +use uuid::Uuid; pub use write::merge_insert::{ MergeInsertBuilder, MergeInsertJob, MergeStats, UncommittedMergeInsert, WhenMatched, WhenNotMatched, WhenNotMatchedBySource, @@ -499,17 +500,23 @@ impl Dataset { ) -> Result { let (source_branch, version_number) = self.resolve_reference(version.into()).await?; let branch_location = self.branch_location().find_branch(Some(branch))?; + let source_location = self + .branch_location() + .find_branch(source_branch.as_deref())?; let clone_op = Operation::Clone { is_shallow: true, ref_name: source_branch.clone(), ref_version: version_number, - ref_path: String::from(self.uri()), + ref_path: source_location.uri, branch_name: Some(branch.to_string()), }; let transaction = Transaction::new(version_number, clone_op, None); let builder = CommitBuilder::new(WriteDestination::Uri(branch_location.uri.as_str())) - .with_store_params(store_params.unwrap_or_default()) + // Fall back to the dataset's own store params + .with_store_params( + store_params.unwrap_or(self.store_params.as_deref().cloned().unwrap_or_default()), + ) .with_object_store(Arc::new(self.object_store.as_ref().clone())) .with_commit_handler(self.commit_handler.clone()) .with_storage_format(self.manifest.data_storage_format.lance_file_version()?); @@ -555,6 +562,15 @@ impl Dataset { version_number: Option, branch: Option<&str>, ) -> Result { + // Reject malformed names at the boundary (mirroring the branch CRUD + // paths) so they fail as InvalidRef instead of 
tripping the wrong-chain + // check below + if let Some(branch_name) = branch + && !Branches::is_main_branch(branch) + { + refs::check_valid_branch(branch_name)?; + } + let new_location = self.branch_location().find_branch(branch)?; let manifest_location = if let Some(version_number) = version_number { @@ -582,6 +598,21 @@ impl Dataset { self.session.as_ref(), ) .await?; + + // The resolved manifest must belong to the requested branch. A mismatch + // means the commit handler resolved against a different chain (for + // example an external manifest store that ignores branch-qualified + // paths); error loudly rather than hand back another branch's data. + let requested_branch = branch.and_then(refs::standardize_branch); + if manifest.branch.as_deref() != requested_branch.as_deref() { + return Err(Error::internal(format!( + "checkout of branch '{}' at version {} resolved a manifest belonging to branch '{}'", + refs::normalize_branch(branch), + manifest.version, + refs::normalize_branch(manifest.branch.as_deref()), + ))); + } + Self::checkout_manifest( self.object_store.clone(), new_location.path, @@ -778,12 +809,50 @@ impl Dataset { batches: impl RecordBatchReader + Send + 'static, namespace_client: Arc, table_id: Vec, + params: Option, + ) -> Result { + Self::write_into_namespace_impl(batches, namespace_client, table_id, None, params).await + } + + /// Write into a branch of a namespace client-managed table. + /// + /// Behaves like [`write_into_namespace`](Self::write_into_namespace), but APPEND and + /// OVERWRITE open and commit against `branch` instead of main. CREATE is rejected, + /// since a branch forks from an existing version. 
+ pub async fn write_into_namespace_on_branch( + batches: impl RecordBatchReader + Send + 'static, + namespace_client: Arc, + table_id: Vec, + branch: &str, + params: Option, + ) -> Result { + Self::write_into_namespace_impl( + batches, + namespace_client, + table_id, + Some(branch.to_string()), + params, + ) + .await + } + + async fn write_into_namespace_impl( + batches: impl RecordBatchReader + Send + 'static, + namespace_client: Arc, + table_id: Vec, + branch: Option, mut params: Option, ) -> Result { let mut write_params = params.take().unwrap_or_default(); match write_params.mode { WriteMode::Create => { + if branch.is_some() { + return Err(Error::not_supported_source( + "cannot create a table on a branch; create on main first, then branch it" + .into(), + )); + } let declare_request = DeclareTableRequest { id: Some(table_id.clone()), ..Default::default() @@ -801,10 +870,13 @@ impl Dataset { // Set up commit handler when managed_versioning is enabled if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( + // The store derives the branch a request targets from the + // base path it is handed, resolved against the table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( namespace_client.clone(), table_id.clone(), - ); + &uri, + )?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), @@ -856,18 +928,25 @@ impl Dataset { ))) })?; - // Set up commit handler when managed_versioning is enabled - if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( - namespace_client.clone(), - table_id.clone(), - ); - let commit_handler: Arc = - Arc::new(ExternalManifestCommitHandler { + // Set up commit handler when managed_versioning is enabled. 
+ // It must ride on the dataset opened below: InsertBuilder + // commits through the destination dataset's handler and does + // not consult write params for Dataset destinations. + let commit_handler: Option> = + if response.managed_versioning == Some(true) { + // The store derives the branch a request targets from the + // base path it is handed, resolved against the table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + namespace_client.clone(), + table_id.clone(), + uri.as_str(), + )?; + Some(Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), - }); - write_params.commit_handler = Some(commit_handler); - } + })) + } else { + None + }; // Set initial credentials and provider from namespace_client if let Some(namespace_storage_options) = response.storage_options { @@ -906,6 +985,12 @@ impl Dataset { { builder = builder.with_storage_options_accessor(accessor.clone()); } + if let Some(commit_handler) = commit_handler { + builder = builder.with_commit_handler(commit_handler); + } + if let Some(branch) = &branch { + builder = builder.with_branch(branch, None); + } let dataset = Arc::new(builder.load().await?); Self::write(batches, dataset, Some(write_params)).await @@ -1200,8 +1285,15 @@ impl Dataset { &self, policy: CleanupPolicy, ) -> BoxFuture<'_, Result> { - info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.uri); - cleanup::cleanup_old_versions(self, policy).boxed() + async move { self.cleanup(policy).execute().await }.boxed() + } + + /// Creates a cleanup operation for this dataset. + /// + /// The returned operation can be explained without deleting files, or + /// executed to re-evaluate the current dataset state and remove files. 
+ pub fn cleanup(&self, policy: CleanupPolicy) -> CleanupOperation<'_> { + CleanupOperation::new(self, policy) } #[allow(clippy::too_many_arguments)] @@ -2149,6 +2241,39 @@ impl Dataset { .version) } + /// Return whether the dataset has a newer committed version. + pub async fn is_stale(&self) -> Result { + let latest_version = self.latest_version_id().await?; + Ok(latest_version != self.manifest.version) + } + + /// Return whether the immediate attached successor manifest exists. + /// + /// This is a fast contiguous-history probe. It does not resolve the latest + /// version and may return `false` if intermediate manifests have been + /// removed. Callers that need a general freshness check should use + /// [`Self::is_stale`]. + #[doc(hidden)] + pub async fn has_successor_version(&self) -> Result { + let Some(next_version) = self.manifest.version.checked_add(1) else { + return Ok(false); + }; + if lance_table::format::is_detached_version(next_version) { + return Ok(false); + } + + let exists = self + .commit_handler + .version_exists( + &self.base, + next_version, + self.object_store.inner.as_ref(), + self.manifest_location.naming_scheme, + ) + .await?; + Ok(exists) + } + pub fn count_fragments(&self) -> usize { self.manifest.fragments.len() } @@ -2509,11 +2634,12 @@ impl Dataset { store_params: Option, ) -> Result { let (ref_name, version_number) = self.resolve_reference(version.into()).await?; + let source_location = self.branch_location().find_branch(ref_name.as_deref())?; let clone_op = Operation::Clone { is_shallow: true, ref_name, ref_version: version_number, - ref_path: self.uri.clone(), + ref_path: source_location.uri, branch_name: None, }; let transaction = Transaction::new(version_number, clone_op, None); @@ -3033,13 +3159,13 @@ impl Dataset { /// progress via the supplied callback. 
pub async fn merge_index_metadata( &self, - index_uuid: &str, + index_uuid: &Uuid, index_type: IndexType, _batch_readhead: Option, progress: Arc, ) -> Result<()> { let store = LanceIndexStore::from_dataset_for_new(self, index_uuid)?; - let index_dir = self.indices_dir().join(index_uuid); + let index_dir = self.indices_dir().join(index_uuid.to_string()); match index_type { IndexType::Inverted => { // Call merge_index_files function for inverted index @@ -3054,18 +3180,19 @@ impl Dataset { IndexType::BTree => { Err(Error::invalid_input( "BTree distributed indexing no longer supports merge_index_metadata; \ - build segments, and commit with commit_existing_index_segments(...)" + build segments, optionally merge groups with merge_existing_index_segments(...), \ + and commit with commit_existing_index_segments(...)" .to_string(), )) } IndexType::Bitmap => { - lance_index::scalar::bitmap::merge_index_files( - self.object_store.as_ref(), - &index_dir, - Arc::new(store), - progress, - ) - .await + Err(Error::invalid_input( + "Bitmap distributed indexing no longer supports merge_index_metadata; \ + build segments with create_index_uncommitted(...), merge them with \ + merge_existing_index_segments(...), and commit with \ + commit_existing_index_segments(...)" + .to_string(), + )) } IndexType::IvfFlat | IndexType::IvfPq | IndexType::IvfSq | IndexType::Vector => { Err(Error::invalid_input( diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs index f2c243367ce..8cdde543e4e 100644 --- a/rust/lance/src/dataset/blob.rs +++ b/rust/lance/src/dataset/blob.rs @@ -12,14 +12,18 @@ use std::{ use arrow::array::AsArray; use arrow::datatypes::{UInt8Type, UInt32Type, UInt64Type}; -use arrow_array::Array; use arrow_array::RecordBatch; use arrow_array::builder::{LargeBinaryBuilder, PrimitiveBuilder, StringBuilder}; -use arrow_schema::DataType as ArrowDataType; +use arrow_array::{Array, ArrayRef}; +use arrow_schema::{DataType as ArrowDataType, Field as ArrowField}; use 
bytes::Bytes; +use futures::future::BoxFuture; use futures::stream::BoxStream; use futures::{FutureExt, StreamExt, TryStreamExt, stream}; -use lance_arrow::{BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, FieldExt}; +use lance_arrow::{ + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_INLINE_SIZE_THRESHOLD_META_KEY, FieldExt, + r#struct::StructArrayExt, +}; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; use lance_io::scheduler::{FileScheduler, ScanScheduler, SchedulerConfig}; use object_store::path::Path; @@ -40,6 +44,58 @@ use lance_io::utils::CachedFileSize; const INLINE_MAX: usize = 64 * 1024; // 64KB inline cutoff const DEDICATED_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB dedicated cutoff const PACK_FILE_MAX_SIZE: usize = 1024 * 1024 * 1024; // 1GiB per .pack sidecar + +pub(super) fn blob_inline_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, +) -> Result { + blob_threshold_from_metadata( + metadata, + field_name, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + INLINE_MAX, + true, + ) +} + +pub(super) fn blob_dedicated_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, +) -> Result { + blob_threshold_from_metadata( + metadata, + field_name, + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + DEDICATED_THRESHOLD, + false, + ) +} + +fn blob_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, + key: &str, + default_value: usize, + allow_zero: bool, +) -> Result { + let Some(value) = metadata.get(key) else { + return Ok(default_value); + }; + let threshold = value.parse::().map_err(|_| { + Error::invalid_input(format!( + "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \ + expected a non-negative integer that fits in usize" + )) + })?; + if !allow_zero && threshold == 0 { + return Err(Error::invalid_input(format!( + "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \ + expected a positive integer" + ))); + } + Ok(threshold) +} + #[derive(Clone, Debug, 
PartialEq, Eq)] pub(super) struct ResolvedExternalBase { pub base_id: u32, @@ -205,9 +261,7 @@ pub struct BlobPreprocessor { data_file_key: String, local_counter: u32, pack_writer: PackWriter, - blob_v2_cols: Vec, - dedicated_thresholds: Vec, - writer_metadata: Vec>, + field_processors: Vec, external_base_resolver: Option>, allow_external_blob_outside_bases: bool, external_blob_mode: ExternalBlobMode, @@ -232,6 +286,64 @@ enum BlobWriteSource<'a> { External(&'a ExternalBlobSource), } +#[derive(Clone, Debug)] +struct BlobPreprocessField { + kind: BlobPreprocessFieldKind, +} + +#[derive(Clone, Debug)] +enum BlobPreprocessFieldKind { + BlobV2 { + inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: HashMap, + }, + Struct { + children: Vec, + }, + Passthrough, +} + +impl BlobPreprocessField { + fn new(field: &ArrowField) -> Result { + if field.is_blob_v2() { + return Ok(Self { + kind: BlobPreprocessFieldKind::BlobV2 { + inline_threshold: blob_inline_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + dedicated_threshold: blob_dedicated_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + writer_metadata: field.metadata().clone(), + }, + }); + } + + if let ArrowDataType::Struct(children) = field.data_type() { + let children = children + .iter() + .map(|child| Self::new(child.as_ref())) + .collect::>>()?; + if children.iter().any(|child| child.requires_preprocessing()) { + return Ok(Self { + kind: BlobPreprocessFieldKind::Struct { children }, + }); + } + } + + Ok(Self { + kind: BlobPreprocessFieldKind::Passthrough, + }) + } + + fn requires_preprocessing(&self) -> bool { + !matches!(self.kind, BlobPreprocessFieldKind::Passthrough) + } +} + impl ExternalBlobSource { /// Return the logical payload size after applying any external slice. 
fn size(&self) -> u64 { @@ -313,7 +425,7 @@ impl BlobPreprocessor { source_store_registry: Arc, source_store_params: ObjectStoreParams, pack_file_size_threshold: Option, - ) -> Self { + ) -> Result { let mut pack_writer = PackWriter::new( object_store.clone(), data_dir.clone(), @@ -323,32 +435,25 @@ impl BlobPreprocessor { pack_writer.max_pack_size = max_bytes; } let arrow_schema = arrow_schema::Schema::from(schema); - let fields = arrow_schema.fields(); - let blob_v2_cols = fields.iter().map(|field| field.is_blob_v2()).collect(); - let dedicated_thresholds = fields - .iter() - .map(|field| dedicated_threshold_from_metadata(field.as_ref())) - .collect(); - let writer_metadata = fields + let field_processors = arrow_schema + .fields() .iter() - .map(|field| field.metadata().clone()) - .collect(); - Self { + .map(|field| BlobPreprocessField::new(field.as_ref())) + .collect::>>()?; + Ok(Self { object_store, data_dir, data_file_key, // Start at 1 to avoid a potential all-zero blob_id value. 
local_counter: 1, pack_writer, - blob_v2_cols, - dedicated_thresholds, - writer_metadata, + field_processors, external_base_resolver, allow_external_blob_outside_bases, external_blob_mode, source_store_registry, source_store_params, - } + }) } fn next_blob_id(&mut self) -> u32 { @@ -443,7 +548,7 @@ impl BlobPreprocessor { } pub(crate) async fn preprocess_batch(&mut self, batch: &RecordBatch) -> Result { - let expected_columns = self.blob_v2_cols.len(); + let expected_columns = self.field_processors.len(); if batch.num_columns() != expected_columns { return Err(Error::invalid_input(format!( "Unexpected number of columns: expected {}, got {}", @@ -454,245 +559,340 @@ impl BlobPreprocessor { let batch_schema = batch.schema(); let batch_fields = batch_schema.fields(); + let field_processors = self.field_processors.clone(); let mut new_columns = Vec::with_capacity(batch.num_columns()); let mut new_fields = Vec::with_capacity(batch.num_columns()); - for idx in 0..batch.num_columns() { - let array = batch.column(idx); - let field = &batch_fields[idx]; - if !self.blob_v2_cols[idx] { - new_columns.push(array.clone()); - new_fields.push(field.clone()); + for ((processor, array), field) in field_processors + .iter() + .zip(batch.columns().iter()) + .zip(batch_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(processor, array.clone(), field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( + new_fields + .iter() + .map(|f| f.as_ref().clone()) + .collect::>(), + batch_schema.metadata().clone(), + )); + + RecordBatch::try_new(new_schema, new_columns) + .map_err(|e| Error::invalid_input(e.to_string())) + } + + fn preprocess_field<'a>( + &'a mut self, + processor: &'a BlobPreprocessField, + array: ArrayRef, + field: &'a Arc, + ) -> BoxFuture<'a, Result<(ArrayRef, Arc)>> { + async move { + match &processor.kind { + BlobPreprocessFieldKind::Passthrough => 
Ok((array, field.clone())), + BlobPreprocessFieldKind::BlobV2 { + inline_threshold, + dedicated_threshold, + writer_metadata, + } => { + self.preprocess_blob_array( + array, + field.as_ref(), + *inline_threshold, + *dedicated_threshold, + writer_metadata, + ) + .await + } + BlobPreprocessFieldKind::Struct { children } => { + self.preprocess_struct_array(array, field.as_ref(), children) + .await + } + } + } + .boxed() + } + + async fn preprocess_struct_array( + &mut self, + array: ArrayRef, + field: &ArrowField, + children: &[BlobPreprocessField], + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Struct field was not a struct array"))?; + if struct_arr.num_columns() != children.len() { + return Err(Error::invalid_input(format!( + "Struct field '{}' expected {} children, got {}", + field.name(), + children.len(), + struct_arr.num_columns() + ))); + } + + let struct_arr = struct_arr.normalize_slicing()?; + let parent_nulls = struct_arr.nulls().cloned(); + let pushed_down = struct_arr.pushdown_nulls()?; + let child_fields = pushed_down.fields().clone(); + let child_columns = pushed_down.columns().to_vec(); + + let mut new_columns = Vec::with_capacity(children.len()); + let mut new_fields = Vec::with_capacity(children.len()); + for ((child_processor, child_array), child_field) in children + .iter() + .zip(child_columns.into_iter()) + .zip(child_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(child_processor, child_array, child_field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let struct_array = + StructArray::try_new(new_fields.clone().into(), new_columns, parent_nulls)?; + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(new_fields.into()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + Ok((Arc::new(struct_array), field)) + } + + async fn preprocess_blob_array( + 
&mut self, + array: ArrayRef, + field: &ArrowField, + inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: &HashMap, + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; + + let data_col = struct_arr + .column_by_name("data") + .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? + .as_binary::(); + let uri_col = struct_arr + .column_by_name("uri") + .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? + .as_string::(); + let position_col = struct_arr + .column_by_name("position") + .map(|col| col.as_primitive::()); + let size_col = struct_arr + .column_by_name("size") + .map(|col| col.as_primitive::()); + + let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); + let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); + let mut blob_id_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut blob_size_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut position_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + + let struct_nulls = struct_arr.nulls(); + + for i in 0..struct_arr.len() { + if struct_arr.is_null(i) { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); continue; } - let struct_arr = array - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; - - let data_col = struct_arr - .column_by_name("data") - .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? 
- .as_binary::(); - let uri_col = struct_arr - .column_by_name("uri") - .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? - .as_string::(); - let position_col = struct_arr - .column_by_name("position") - .map(|col| col.as_primitive::()); - let size_col = struct_arr - .column_by_name("size") - .map(|col| col.as_primitive::()); - - let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); - let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); - let mut blob_id_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut blob_size_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut position_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - - let struct_nulls = struct_arr.nulls(); - - for i in 0..struct_arr.len() { - if struct_arr.is_null(i) { - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); - kind_builder.append_null(); - position_builder.append_null(); - continue; - } + let has_data = !data_col.is_null(i); + let has_uri = !uri_col.is_null(i); + let has_position = position_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let has_size = size_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let data_len = if has_data { data_col.value(i).len() } else { 0 }; - let has_data = !data_col.is_null(i); - let has_uri = !uri_col.is_null(i); - let has_position = position_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let has_size = size_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let data_len = if has_data { data_col.value(i).len() } else { 0 }; - - let dedicated_threshold = self.dedicated_thresholds[idx]; - if has_data && data_len > dedicated_threshold { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, 
BlobWriteSource::Bytes(data_col.value(i))) - .await?; - - kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_null(); - continue; - } + if has_data && data_len > dedicated_threshold { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_data && data_len > INLINE_MAX { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::Bytes(data_col.value(i))) - .await?; + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_null(); + continue; + } - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_value(position); - continue; - } + if has_data && data_len > inline_threshold { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_uri { - let uri_val = uri_col.value(i); - if self.external_blob_mode == ExternalBlobMode::Ingest { - let position = if has_position { - Some( - position_col - .as_ref() - .expect("position column must exist") - .value(i), - ) - } else { - None - }; - let size = if has_size { - Some(size_col.as_ref().expect("size column must exist").value(i)) - } else { - None - }; - let source = self.open_external_source(uri_val, position, size).await?; - let data_len = source.size(); - - if data_len > dedicated_threshold as u64 { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, BlobWriteSource::External(&source)) - .await?; - - 
kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_null(); - continue; - } + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_value(position); + continue; + } - if data_len > INLINE_MAX as u64 { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::External(&source)) - .await?; - - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_value(position); - continue; - } + if has_uri { + let uri_val = uri_col.value(i); + if self.external_blob_mode == ExternalBlobMode::Ingest { + let position = if has_position { + Some( + position_col + .as_ref() + .expect("position column must exist") + .value(i), + ) + } else { + None + }; + let size = if has_size { + Some(size_col.as_ref().expect("size column must exist").value(i)) + } else { + None + }; + let source = self.open_external_source(uri_val, position, size).await?; + let data_len = source.size(); - let data = source.read_all().await?; + if data_len > dedicated_threshold as u64 { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::External(&source)) + .await?; - kind_builder.append_value(BlobKind::Inline as u8); - data_builder.append_value(data.as_ref()); + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len); position_builder.append_null(); 
continue; } - let (external_base_id, external_uri_or_path) = - self.resolve_external_reference(uri_val).await?; - kind_builder.append_value(BlobKind::External as u8); - data_builder.append_null(); - uri_builder.append_value(external_uri_or_path); - blob_id_builder.append_value(external_base_id); - if has_position && has_size { - let position = position_col - .as_ref() - .expect("position column must exist") - .value(i); - let size = size_col.as_ref().expect("size column must exist").value(i); - blob_size_builder.append_value(size); + if data_len > inline_threshold as u64 { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::External(&source)) + .await?; + + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len); position_builder.append_value(position); - } else { - blob_size_builder.append_null(); - position_builder.append_null(); + continue; } - continue; - } - if has_data { + let data = source.read_all().await?; + kind_builder.append_value(BlobKind::Inline as u8); - let value = data_col.value(i); - data_builder.append_value(value); + data_builder.append_value(data.as_ref()); uri_builder.append_null(); blob_id_builder.append_null(); blob_size_builder.append_null(); position_builder.append_null(); + continue; + } + + let (external_base_id, external_uri_or_path) = + self.resolve_external_reference(uri_val).await?; + kind_builder.append_value(BlobKind::External as u8); + data_builder.append_null(); + uri_builder.append_value(external_uri_or_path); + blob_id_builder.append_value(external_base_id); + if has_position && has_size { + let position = position_col + .as_ref() + .expect("position column must exist") + .value(i); + let size = size_col.as_ref().expect("size column must exist").value(i); + blob_size_builder.append_value(size); + position_builder.append_value(position); } else { - 
data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_null(); blob_size_builder.append_null(); - kind_builder.append_null(); position_builder.append_null(); } + continue; } - let child_fields = vec![ - arrow_schema::Field::new("kind", ArrowDataType::UInt8, true), - arrow_schema::Field::new("data", ArrowDataType::LargeBinary, true), - arrow_schema::Field::new("uri", ArrowDataType::Utf8, true), - arrow_schema::Field::new("blob_id", ArrowDataType::UInt32, true), - arrow_schema::Field::new("blob_size", ArrowDataType::UInt64, true), - arrow_schema::Field::new("position", ArrowDataType::UInt64, true), - ]; - - let struct_array = arrow_array::StructArray::try_new( - child_fields.clone().into(), - vec![ - Arc::new(kind_builder.finish()), - Arc::new(data_builder.finish()), - Arc::new(uri_builder.finish()), - Arc::new(blob_id_builder.finish()), - Arc::new(blob_size_builder.finish()), - Arc::new(position_builder.finish()), - ], - struct_nulls.cloned(), - )?; - - new_columns.push(Arc::new(struct_array)); - new_fields.push(Arc::new( - arrow_schema::Field::new( - field.name(), - ArrowDataType::Struct(child_fields.into()), - field.is_nullable(), - ) - .with_metadata(self.writer_metadata[idx].clone()), - )); + if has_data { + kind_builder.append_value(BlobKind::Inline as u8); + let value = data_col.value(i); + data_builder.append_value(value); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + position_builder.append_null(); + } else { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); + } } - let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( - new_fields - .iter() - .map(|f| f.as_ref().clone()) - .collect::>(), - batch_schema.metadata().clone(), - )); + let child_fields = vec![ + ArrowField::new("kind", ArrowDataType::UInt8, true), + 
ArrowField::new("data", ArrowDataType::LargeBinary, true), + ArrowField::new("uri", ArrowDataType::Utf8, true), + ArrowField::new("blob_id", ArrowDataType::UInt32, true), + ArrowField::new("blob_size", ArrowDataType::UInt64, true), + ArrowField::new("position", ArrowDataType::UInt64, true), + ]; - RecordBatch::try_new(new_schema, new_columns) - .map_err(|e| Error::invalid_input(e.to_string())) + let struct_array = StructArray::try_new( + child_fields.clone().into(), + vec![ + Arc::new(kind_builder.finish()), + Arc::new(data_builder.finish()), + Arc::new(uri_builder.finish()), + Arc::new(blob_id_builder.finish()), + Arc::new(blob_size_builder.finish()), + Arc::new(position_builder.finish()), + ], + struct_nulls.cloned(), + )?; + + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(child_fields.into()), + field.is_nullable(), + ) + .with_metadata(writer_metadata.clone()), + ); + Ok((Arc::new(struct_array), field)) } pub(crate) async fn finish(&mut self) -> Result<()> { @@ -700,16 +900,6 @@ impl BlobPreprocessor { } } -fn dedicated_threshold_from_metadata(field: &arrow_schema::Field) -> usize { - field - .metadata() - .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) - .and_then(|value| value.parse::().ok()) - .filter(|value| *value > 0) - .and_then(|value| usize::try_from(value).ok()) - .unwrap_or(DEDICATED_THRESHOLD) -} - pub async fn preprocess_blob_batches( batches: &[RecordBatch], pre: &mut BlobPreprocessor, @@ -2103,7 +2293,7 @@ mod tests { }; use arrow_array::RecordBatch; use arrow_array::{ - ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, + Array, ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -2111,7 +2301,8 @@ mod tests { use chrono::Utc; use futures::{StreamExt, TryStreamExt, future::try_join_all}; use lance_arrow::{ - ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, 
BLOB_V2_EXT_NAME, DataTypeExt, + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, }; use lance_core::datatypes::BlobKind; use lance_io::object_store::{ @@ -2142,7 +2333,7 @@ mod tests { use crate::{ Dataset, blob::{BlobArrayBuilder, blob_field}, - dataset::{ExternalBlobMode, WriteParams}, + dataset::{ExternalBlobMode, WriteMode, WriteParams}, utils::test::TestDatasetGenerator, }; @@ -2158,6 +2349,32 @@ mod tests { expected: Vec, } + fn nested_blob_v2_batch(blob_array: ArrayRef) -> (Arc, RecordBatch) { + let blob_field = blob_field("blob", true); + let info_fields = vec![Field::new("name", DataType::Utf8, false), blob_field]; + let info_array: ArrayRef = Arc::new( + StructArray::try_new( + info_fields.clone().into(), + vec![ + Arc::new(StringArray::from_iter_values( + (0..blob_array.len()).map(|idx| format!("name-{idx}")), + )) as ArrayRef, + blob_array, + ], + None, + ) + .unwrap(), + ); + + let schema = Arc::new(Schema::new(vec![Field::new( + "info", + DataType::Struct(info_fields.into()), + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![info_array]).unwrap(); + (schema, batch) + } + #[cfg(feature = "azure")] fn azure_store_params(account_name: &str) -> ObjectStoreParams { ObjectStoreParams { @@ -3045,6 +3262,114 @@ mod tests { assert_eq!(second.as_ref(), b"world"); } + #[tokio::test] + async fn test_write_and_take_nested_blob_v2() { + let test_dir = TempStrDir::default(); + let packed_payload = vec![0x4A; super::INLINE_MAX + 1024]; + + let mut blob_builder = BlobArrayBuilder::new(3); + blob_builder.push_bytes(b"hello").unwrap(); + blob_builder.push_bytes(&packed_payload).unwrap(); + blob_builder.push_null().unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( 
+ reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let info_batch = dataset + .scan() + .project(&["info"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let blob_desc = info_batch + .column(0) + .as_struct() + .column_by_name("blob") + .unwrap() + .as_struct(); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0), + BlobKind::Inline as u8 + ); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(1), + BlobKind::Packed as u8 + ); + + let blobs = dataset + .take_blobs_by_indices(&[0, 1], "info.blob") + .await + .unwrap(); + assert_eq!(blobs.len(), 2); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), b"hello"); + assert_eq!( + blobs[1].read().await.unwrap().as_ref(), + packed_payload.as_slice() + ); + + let null_blobs = dataset + .take_blobs_by_indices(&[2], "info.blob") + .await + .unwrap(); + assert!(null_blobs.is_empty()); + } + + #[tokio::test] + async fn test_nested_blob_v2_requires_v2_2() { + let test_dir = TempStrDir::default(); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(b"hello").unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let result = Dataset::write( + reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_1), + ..Default::default() + }), + ) + .await; + + assert!( + result.is_err(), + "Nested blob v2 should be rejected for file version 2.1" + ); + assert!( + result + .unwrap_err() + .to_string() + .contains("Blob v2 requires file version >= 2.2") + ); + } + #[tokio::test] async fn test_blob_file_read_empty_range_returns_empty_bytes() { let store = reject_empty_range_store(); @@ -3621,6 +3946,50 @@ mod tests { 
assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); } + #[tokio::test] + async fn test_blob_v2_external_ingest_respects_inline_threshold() { + let dataset_dir = TempDir::default(); + let external_dir = TempDir::default(); + let external_path = external_dir.std_path().join("external.bin"); + let payload = vec![0x5A; 2048]; + std::fs::write(&external_path, &payload).unwrap(); + let external_uri = format!("file://{}", external_path.display()); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_uri(external_uri).unwrap(); + let blob_array: arrow_array::ArrayRef = blob_builder.finish().unwrap(); + + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + let batch = RecordBatch::try_new(schema.clone(), vec![blob_array]).unwrap(); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + external_blob_mode: ExternalBlobMode::Ingest, + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let blobs = dataset.take_blobs_by_indices(&[0], "blob").await.unwrap(); + assert_eq!(blobs.len(), 1); + assert_eq!(blobs[0].kind(), BlobKind::Packed); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); + } + #[tokio::test] async fn test_blob_v2_external_ingest_dedicated() { let dataset_dir = TempDir::default(); @@ -3713,7 +4082,10 @@ mod tests { ); } - async fn preprocess_kind_with_schema_metadata(metadata_value: &str, data_len: usize) -> u8 { + async fn try_preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> Result { let (object_store, base_path) = 
ObjectStore::from_uri_and_params( Arc::new(ObjectStoreRegistry::default()), "memory://blob_preprocessor", @@ -3726,10 +4098,9 @@ mod tests { let mut field = blob_field("blob", true); let mut metadata = field.metadata().clone(); - metadata.insert( - BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), - metadata_value.to_string(), - ); + for (key, value) in metadata_entries { + metadata.insert(key.to_string(), value); + } field = field.with_metadata(metadata); let writer_arrow_schema = Schema::new(vec![field.clone()]); @@ -3746,7 +4117,7 @@ mod tests { Arc::new(ObjectStoreRegistry::default()), ObjectStoreParams::default(), None, - ); + )?; let mut blob_builder = BlobArrayBuilder::new(1); blob_builder.push_bytes(vec![0u8; data_len]).unwrap(); @@ -3757,36 +4128,442 @@ mod tests { let batch_schema = Arc::new(Schema::new(vec![field_without_metadata])); let batch = RecordBatch::try_new(batch_schema, vec![blob_array]).unwrap(); - let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let out = preprocessor.preprocess_batch(&batch).await?; let struct_arr = out .column(0) .as_any() .downcast_ref::() .unwrap(); - struct_arr + Ok(struct_arr .column_by_name("kind") .unwrap() .as_primitive::() - .value(0) + .value(0)) + } + + async fn preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> u8 { + try_preprocess_kind_with_blob_metadata(metadata_entries, data_len) + .await + .unwrap() } #[tokio::test] - async fn test_blob_v2_dedicated_threshold_ignores_non_positive_metadata() { - let kind = preprocess_kind_with_schema_metadata("0", 256 * 1024).await; - assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + async fn test_blob_v2_dedicated_threshold_rejects_non_positive_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "0".to_string())], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("expected a positive 
integer")); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_rejects_invalid_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + "not-a-number".to_string(), + )], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); + } + + #[tokio::test] + async fn test_blob_v2_write_rejects_invalid_inline_threshold_metadata() { + let dataset_dir = TempDir::default(); + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "not-a-number".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(vec![0u8; 256]).unwrap(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(blob_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let result = Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("write with invalid blob threshold metadata should fail"); + }; + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_smaller_metadata() { - let kind = preprocess_kind_with_schema_metadata("131072", 256 * 1024).await; + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "131072".to_string())], + 256 * 1024, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_larger_metadata() { - let kind = - 
preprocess_kind_with_schema_metadata("8388608", super::DEDICATED_THRESHOLD + 1024) - .await; + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + "8388608".to_string(), + )], + super::DEDICATED_THRESHOLD + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_smaller_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 2048, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_larger_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + (super::INLINE_MAX + 8192).to_string(), + )], + super::INLINE_MAX + 4096, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_dedicated_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "2048".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "1024".to_string()), + ], + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_does_not_override_dedicated_threshold() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "8192".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "4096".to_string()), + ], + 6144, + ) + .await; + assert_eq!(kind, 
lance_core::datatypes::BlobKind::Dedicated as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_is_per_column() { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "memory://blob_preprocessor", + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + let object_store = object_store.as_ref().clone(); + let data_dir = base_path.clone().join("data"); + + let mut inline_field = blob_field("inline_blob", true); + let mut inline_metadata = inline_field.metadata().clone(); + inline_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "4096".to_string(), + ); + inline_field = inline_field.with_metadata(inline_metadata); + + let mut packed_field = blob_field("packed_blob", true); + let mut packed_metadata = packed_field.metadata().clone(); + packed_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + packed_field = packed_field.with_metadata(packed_metadata); + + let writer_arrow_schema = Schema::new(vec![inline_field.clone(), packed_field.clone()]); + let writer_schema = lance_core::datatypes::Schema::try_from(&writer_arrow_schema).unwrap(); + + let mut preprocessor = super::BlobPreprocessor::new( + object_store.clone(), + data_dir, + "data_file_key".to_string(), + &writer_schema, + None, + false, + ExternalBlobMode::Reference, + Arc::new(ObjectStoreRegistry::default()), + ObjectStoreParams::default(), + None, + ) + .unwrap(); + + let mut inline_builder = BlobArrayBuilder::new(1); + inline_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let inline_array: arrow_array::ArrayRef = inline_builder.finish().unwrap(); + + let mut packed_builder = BlobArrayBuilder::new(1); + packed_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let packed_array: arrow_array::ArrayRef = packed_builder.finish().unwrap(); + + let batch_schema = Arc::new(Schema::new(vec![ + Field::new( + "inline_blob", + inline_field.data_type().clone(), + 
inline_field.is_nullable(), + ), + Field::new( + "packed_blob", + packed_field.data_type().clone(), + packed_field.is_nullable(), + ), + ])); + let batch = RecordBatch::try_new(batch_schema, vec![inline_array, packed_array]).unwrap(); + + let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let inline_kind = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + let packed_kind = out + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + + assert_eq!(inline_kind, lance_core::datatypes::BlobKind::Inline as u8); + assert_eq!(packed_kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_explicit_inline_threshold_mismatch() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let 
append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with explicit blob threshold mismatch should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_threshold_mismatch_with_non_blob_input_extension() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + ARROW_EXT_NAME_KEY.to_string(), + "some.other.extension".to_string(), + ); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + 
append_builder.push_bytes(payload).unwrap(); + let append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with ignored blob threshold metadata should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_accepts_explicit_default_inline_threshold() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + super::INLINE_MAX.to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let append_batch = 
RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let dataset = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.count_rows(None).await.unwrap(), 2); + } } diff --git a/rust/lance/src/dataset/branch_location.rs b/rust/lance/src/dataset/branch_location.rs index 2dd9f3aa860..7ebce36ec86 100644 --- a/rust/lance/src/dataset/branch_location.rs +++ b/rust/lance/src/dataset/branch_location.rs @@ -31,14 +31,20 @@ impl BranchLocation { } fn get_root_path(path_str: &str, branch_name: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // the branch suffix sits on the path part, before the query. + let (path_part, query) = match path_str.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (path_str, None), + }; let branch_suffix = format!("{}/{}", BRANCH_DIR, branch_name); let branch_suffix = branch_suffix.as_str(); - let root_path_str = path_str + let root_path_str = path_part .strip_suffix(branch_suffix) .or_else(|| { if cfg!(windows) { let windows_suffix = branch_suffix.replace('/', "\\"); - path_str.strip_suffix(&windows_suffix) + path_part.strip_suffix(&windows_suffix) } else { None } @@ -59,7 +65,42 @@ impl BranchLocation { root_path_str, path_str, ))); }; - Ok(root_path_str) + Ok(match query { + Some(query) => format!("{}?{}", root_path_str, query), + None => root_path_str, + }) + } + + /// The branch a location under `root` targets: the inverse of + /// [`Self::find_branch`]. `location` must be either `root` itself (main) + /// or `/tree/`; anything else is rejected so a caller never + /// misattributes an unrelated location to a branch. 
+ pub fn branch_of(root: &str, location: &str) -> Result> { + if location == root { + return Ok(None); + } + // Require the `/` component boundary after the root so a sibling path + // that merely shares the root as a string prefix is rejected. + let branch = location + .strip_prefix(root) + .and_then(|rel| { + if root.is_empty() { + Some(rel) + } else { + rel.strip_prefix('/') + } + }) + .and_then(|rel| rel.strip_prefix(BRANCH_DIR)) + .and_then(|rel| rel.strip_prefix('/')) + .filter(|name| !name.is_empty()); + + match branch { + Some(name) => Ok(Some(name.to_string())), + None => Err(Error::invalid_input(format!( + "cannot derive a branch for location '{}': expected the table root '{}' or a branch chain under '{}/{}'", + location, root, root, BRANCH_DIR + ))), + } } /// Find the target branch location @@ -100,13 +141,23 @@ impl BranchLocation { } fn join_str(base: &str, segment: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // path segments must be appended before it. 
+ let (path_part, query) = match base.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (base, None), + }; let normalized_segment = segment.trim_start_matches('/'); - let is_base_dir = base.ends_with("/"); - if is_base_dir { - Ok(format!("{}{}", base, normalized_segment)) + let is_base_dir = path_part.ends_with("/"); + let joined = if is_base_dir { + format!("{}{}", path_part, normalized_segment) } else { - Ok(format!("{}/{}", base, normalized_segment)) - } + format!("{}/{}", path_part, normalized_segment) + }; + Ok(match query { + Some(query) => format!("{}?{}", joined, query), + None => joined, + }) } } @@ -223,6 +274,59 @@ mod tests { assert!(fs::create_dir_all(std::path::Path::new(new_location.uri.as_str())).is_ok()); } + #[test] + fn test_branch_location_with_query_uri() { + // Uris like `s3+ddb://...?ddbTableName=t` carry the commit handler + // config in the query string; branch path segments must be inserted + // before it and the query must survive the round trip. + let location = BranchLocation { + path: Path::parse("bucket/table.lance").unwrap(), + uri: "s3+ddb://bucket/table.lance?ddbTableName=t".to_string(), + branch: None, + }; + let dev = location.find_branch(Some("dev")).unwrap(); + assert_eq!( + dev.uri, + "s3+ddb://bucket/table.lance/tree/dev?ddbTableName=t" + ); + assert_eq!(dev.path.as_ref(), "bucket/table.lance/tree/dev"); + assert_eq!(dev.branch.as_deref(), Some("dev")); + + let main = dev.find_main().unwrap(); + assert_eq!(main.uri, "s3+ddb://bucket/table.lance?ddbTableName=t"); + assert_eq!(main.path.as_ref(), "bucket/table.lance"); + assert_eq!(main.branch, None); + } + + #[test] + fn test_branch_of() { + let derive = |root: &str, location: &str| BranchLocation::branch_of(root, location); + + // The table root targets main. + assert_eq!(derive("data/t.lance", "data/t.lance").unwrap(), None); + + // Branch chains, including multi-segment branch names. 
+ assert_eq!( + derive("data/t.lance", "data/t.lance/tree/exp").unwrap(), + Some("exp".to_string()) + ); + assert_eq!( + derive("data/t.lance", "data/t.lance/tree/bugfix/issue-123").unwrap(), + Some("bugfix/issue-123".to_string()) + ); + + // A sibling path sharing the root as a string prefix is not a branch. + assert!(derive("data/t", "data/tx/tree/exp").is_err()); + // Neither is a sub-path outside the branch directory. + assert!(derive("data/t.lance", "data/t.lance/other/exp").is_err()); + // Nor a path missing the component boundary after the branch dir. + assert!(derive("data/t.lance", "data/t.lance/treex").is_err()); + // An empty branch name is invalid. + assert!(derive("data/t.lance", "data/t.lance/tree").is_err()); + // An unrelated location is invalid. + assert!(derive("data/t.lance", "elsewhere/u.lance").is_err()); + } + #[test] fn test_find_empty_branch() { let root_path = TempStdDir::default().to_owned(); diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 393ff45c4ea..23db7254b15 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use lance_core::cache::CacheBackend; -use super::refs::{Ref, Refs}; +use super::refs::{Branches, Ref, Refs, check_valid_branch, normalize_branch, standardize_branch}; use super::{DEFAULT_INDEX_CACHE_SIZE, DEFAULT_METADATA_CACHE_SIZE, ReadParams, WriteParams}; use crate::dataset::branch_location::BranchLocation; use crate::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; @@ -53,6 +53,12 @@ pub struct DatasetBuilder { storage_options_override: Option>, /// Runtime-only exact object store bindings keyed by base path URI. base_store_params: HashMap, + /// Namespace-managed table info `(client, table_id)`, set by `from_namespace` + /// when the table uses managed versioning. 
The commit handler is built in + /// `build_object_store`, rooted at the resolved table path; the branch a + /// namespace request targets is derived per call from the base path the + /// handler is handed. + namespace_managed: Option<(Arc, Vec)>, } impl std::fmt::Debug for DatasetBuilder { @@ -71,6 +77,7 @@ impl std::fmt::Debug for DatasetBuilder { &self.storage_options_override.is_some(), ) .field("base_store_params", &!self.base_store_params.is_empty()) + .field("namespace_managed", &self.namespace_managed.is_some()) .finish() } } @@ -90,6 +97,7 @@ impl DatasetBuilder { file_reader_options: None, storage_options_override: None, base_store_params: HashMap::new(), + namespace_managed: None, } } @@ -149,16 +157,11 @@ impl DatasetBuilder { let mut builder = Self::from_uri(&table_uri); - // Check managed_versioning flag to determine if namespace-managed commits should be used + // Defer building the commit handler to load(): the manifest store is + // rooted at the resolved table path, which is only known once the + // object store is built. 
if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( - namespace_client.clone(), - table_id.clone(), - ); - let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { - external_manifest_store: Arc::new(external_store), - }); - builder.commit_handler = Some(commit_handler); + builder.namespace_managed = Some((namespace_client.clone(), table_id.clone())); } // Use namespace storage options if available @@ -524,13 +527,8 @@ impl DatasetBuilder { /// Build a lance object store for the given config pub async fn build_object_store( - self, + mut self, ) -> Result<(Arc, Path, Arc)> { - let commit_handler = match self.commit_handler { - Some(commit_handler) => Ok(commit_handler), - None => commit_handler_from_url(&self.table_uri, &Some(self.options.clone())).await, - }?; - let storage_options = self .options .storage_options() @@ -546,13 +544,13 @@ impl DatasetBuilder { .unwrap_or_default(); #[allow(deprecated)] - match &self.options.object_store { - Some(store) => Ok(( + let (object_store, base_path) = match &self.options.object_store { + Some(store) => ( Arc::new(ObjectStore::new( store.0.clone(), store.1.clone(), self.options.block_size, - self.options.object_store_wrapper, + self.options.object_store_wrapper.clone(), self.options.use_constant_size_upload_parts, store.1.scheme() != "file", // If user supplied an object store then we just assume it's probably @@ -562,18 +560,35 @@ impl DatasetBuilder { None, // No storage_options available here )), Path::from(store.1.path()), - commit_handler, - )), + ), None => { - let (store, path) = ObjectStore::from_uri_and_params( - store_registry, - &self.table_uri, - &self.options, - ) - .await?; - Ok((store, path, commit_handler)) + ObjectStore::from_uri_and_params(store_registry, &self.table_uri, &self.options) + .await? 
} - } + }; + + // Resolve the commit handler: an explicitly set one wins; otherwise a + // namespace-managed table builds a manifest store rooted at the resolved + // table path (the branch a request targets is derived per call from the + // base path the handler is handed); otherwise fall back to the default + // for the uri. Resolving here (not in load) keeps this pub method + // consistent for every caller. + let commit_handler: Arc = + if let Some(commit_handler) = self.commit_handler.take() { + commit_handler + } else if let Some((namespace_client, table_id)) = self.namespace_managed.take() { + Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new(LanceNamespaceExternalManifestStore::new( + namespace_client, + table_id, + base_path.clone(), + )), + }) + } else { + commit_handler_from_url(&self.table_uri, &Some(self.options.clone())).await? + }; + + Ok((object_store, base_path, commit_handler)) } #[instrument(skip_all)] @@ -656,6 +671,14 @@ impl DatasetBuilder { let store_params = self.options.clone(); let base_store_params = (!self.base_store_params.is_empty()) .then(|| Arc::new(std::mem::take(&mut self.base_store_params))); + + // A namespace-managed table is always addressed at its root uri, so the + // effective branch is resolvable before loading: the base path is + // qualified up front and the manifest store derives the branch from it. + // An explicitly supplied commit handler opts out of the managed flow. 
+ let managed_store_active = + self.namespace_managed.is_some() && self.commit_handler.is_none(); + let (object_store, base_path, commit_handler) = self.build_object_store().await?; // Two cases that need to check out after loading the manifest: @@ -667,7 +690,7 @@ impl DatasetBuilder { let mut need_delay_checkout = false; let (mut branch, mut version_number) = match target_ref.clone() { Some(Ref::Version(branch, version_number)) => { - if branch.is_some() { + if branch.is_some() && !managed_store_active { need_delay_checkout = true; } (branch, version_number) @@ -687,17 +710,57 @@ impl DatasetBuilder { branch: None, }, ); - let tag_content = refs.tags().get(&tag_name).await; - if let Ok(tag_content) = tag_content { - (tag_content.branch.clone(), Some(tag_content.version)) - } else { - need_delay_checkout = true; - (None, None) + match refs.tags().get(&tag_name).await { + Ok(tag_content) => { + if tag_content.branch.is_some() && !managed_store_active { + // The tag's chain lives under a different base path + // and the unmanaged handler resolves versions by + // base path only, so load the root's latest first + // and check the tag's branch/version out from it. + need_delay_checkout = true; + (tag_content.branch, None) + } else { + (tag_content.branch.clone(), Some(tag_content.version)) + } + } + Err(e) => { + // A managed table is always rooted at the namespace + // location, so a tag missing here is missing. 
+ if managed_store_active { + return Err(e); + } + need_delay_checkout = true; + (None, None) + } } } None => (None, None), }; + // Reject malformed branch names at the boundary (mirroring the branch + // CRUD paths) so they fail as InvalidRef instead of resolving oddly + if let Some(branch_name) = branch.as_deref() + && !Branches::is_main_branch(Some(branch_name)) + { + check_valid_branch(branch_name)?; + } + + // For a managed table the branch is known before loading; point the base + // path and uri at the branch chain so the loaded dataset is rooted there + // (data placement, refs and the path-derived store branch all follow the + // base path). + let (base_path, table_uri) = if managed_store_active && branch.is_some() { + let branch_location = BranchLocation { + path: base_path, + uri: table_uri, + branch: None, + } + .find_branch(branch.as_deref())?; + (branch_location.path, branch_location.uri) + } else { + (base_path, table_uri) + }; + let dataset = Self::load_by_uri( session, manifest, @@ -712,6 +775,20 @@ impl DatasetBuilder { ) .await?; + if managed_store_active { + // The base path was qualified above, so the loaded manifest must + // already be on the requested branch; a mismatch means the namespace + // resolved another chain. 
+ let requested_branch = branch.as_deref().and_then(standardize_branch); + if dataset.manifest.branch.as_deref() != requested_branch.as_deref() { + return Err(Error::internal(format!( + "open of branch '{}' resolved a manifest belonging to branch '{}'", + normalize_branch(branch.as_deref()), + normalize_branch(dataset.manifest.branch.as_deref()), + ))); + } + } + if need_delay_checkout { if let Some(Ref::Tag(tag_name)) = target_ref { let tag_content = dataset.tags().get(tag_name.as_str()).await?; diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs index b3ca60cfa0f..65928038cea 100644 --- a/rust/lance/src/dataset/cleanup.rs +++ b/rust/lance/src/dataset/cleanup.rs @@ -46,7 +46,8 @@ use lance_core::{ Error, Result, utils::tracing::{ AUDIT_MODE_DELETE, AUDIT_MODE_DELETE_UNVERIFIED, AUDIT_TYPE_DATA, AUDIT_TYPE_DELETION, - AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT, + AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, DATASET_CLEANING_EVENT, TRACE_DATASET_EVENTS, + TRACE_FILE_AUDIT, }, }; use lance_table::{ @@ -78,7 +79,7 @@ struct ReferencedFiles { index_uuids: HashSet, } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct RemovalStats { pub bytes_removed: u64, pub old_versions: u64, @@ -88,12 +89,194 @@ pub struct RemovalStats { pub deletion_files_removed: u64, } -#[derive(Clone, Copy, Debug)] -enum RemovedFileType { +/// A read-only explanation of what a cleanup operation would remove. +/// +/// This is an explanation, not a deletion plan. Calling +/// [`CleanupOperation::execute`] re-evaluates the current dataset and reference +/// state before deleting files. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupExplanation { + /// Dataset version observed when the explanation was produced. + pub read_version: u64, + /// Aggregate statistics for files that would be removed. + pub stats: RemovalStats, + /// Candidate files that would be removed, capped by `candidate_file_limit`. 
+ pub candidate_files: Vec, + /// True if more candidate files were found than are included. + pub candidate_files_truncated: bool, + /// Maximum number of candidate files included in this explanation. + pub candidate_file_limit: usize, + /// Referenced child branches and whether cleanup would cascade into them. + pub referenced_branches: Vec, + /// Non-fatal warnings about the explanation. + pub warnings: Vec, +} + +/// A file that cleanup identified as removable. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupCandidateFile { + /// Dataset-relative or storage path for the candidate file. + pub path: String, + /// Kind of file identified by cleanup. + pub kind: CleanupFileKind, + /// True if the file is removable only because it aged past the unverified + /// retention threshold or `delete_unverified` is enabled. + pub unverified: bool, + /// Candidate file size in bytes. + pub size_bytes: u64, +} + +/// A branch that references the current branch lineage. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupReferencedBranch { + /// Branch name. + pub name: String, + /// Version of the current lineage referenced by this branch. + pub referenced_version: u64, + /// True if this branch would be cleaned when cascading cleanup is enabled. + pub cleanup_candidate: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CleanupFileKind { + Manifest, Data, Transaction, Index, Deletion, + /// A leftover `_versions/.tmp` manifest from a failed transaction. These + /// are deleted but excluded from per-kind `RemovalStats` counts and audit + /// logs to match the long-standing cleanup behavior. Their bytes + /// are still included in `bytes_removed`. 
+ TemporaryManifest, +} + +impl CleanupCandidateFile { + fn from_cleanup_file(file: &CleanupFile) -> Self { + Self { + path: file.path.to_string(), + kind: file.kind, + unverified: file.unverified, + size_bytes: file.size_bytes, + } + } +} + +fn cleanup_file( + path: Path, + kind: CleanupFileKind, + unverified: bool, + size_bytes: u64, +) -> Option { + Some(CleanupFile { + path, + kind, + unverified, + size_bytes, + }) +} + +#[derive(Clone, Debug)] +struct CleanupFile { + path: Path, + kind: CleanupFileKind, + /// True when the file was kept on disk past its referenced lifetime + /// because we could not verify it was safe to remove (e.g. produced by an + /// unfinished commit) and is being deleted only because it has aged past + /// the unverified-retention threshold or `delete_unverified` is set. + unverified: bool, + size_bytes: u64, +} + +impl RemovalStats { + fn record_file(&mut self, file: &CleanupFile) { + self.bytes_removed += file.size_bytes; + match file.kind { + CleanupFileKind::Manifest => self.old_versions += 1, + CleanupFileKind::Data => self.data_files_removed += 1, + CleanupFileKind::Transaction => self.transaction_files_removed += 1, + CleanupFileKind::Index => self.index_files_removed += 1, + CleanupFileKind::Deletion => self.deletion_files_removed += 1, + CleanupFileKind::TemporaryManifest => {} + } + } + + fn merge(&mut self, other: &Self) { + self.bytes_removed += other.bytes_removed; + self.old_versions += other.old_versions; + self.data_files_removed += other.data_files_removed; + self.transaction_files_removed += other.transaction_files_removed; + self.index_files_removed += other.index_files_removed; + self.deletion_files_removed += other.deletion_files_removed; + } +} + +#[derive(Debug, Default)] +struct CleanupRunResult { + stats: RemovalStats, + removed_manifests: HashSet, + candidate_files: Vec, + candidate_files_truncated: bool, + referenced_branches: Vec, +} + +impl CleanupRunResult { + fn record_file( + &mut self, + file: 
&CleanupFile, + candidate_file_limit: Option, + track_removed_manifests: bool, + ) { + self.stats.record_file(file); + if track_removed_manifests && matches!(file.kind, CleanupFileKind::Manifest) { + self.removed_manifests.insert(file.path.clone()); + } + if let Some(limit) = candidate_file_limit { + if self.candidate_files.len() < limit { + self.candidate_files + .push(CleanupCandidateFile::from_cleanup_file(file)); + } else { + self.candidate_files_truncated = true; + } + } + } + + fn merge(&mut self, other: Self, candidate_file_limit: Option) { + self.stats.merge(&other.stats); + self.removed_manifests.extend(other.removed_manifests); + self.referenced_branches.extend(other.referenced_branches); + if let Some(limit) = candidate_file_limit { + for file in other.candidate_files { + if self.candidate_files.len() < limit { + self.candidate_files.push(file); + } else { + self.candidate_files_truncated = true; + } + } + self.candidate_files_truncated |= other.candidate_files_truncated; + } + } +} + +#[derive(Clone, Copy, Debug)] +enum CleanupAction { + Execute, + Explain { max_candidate_files: usize }, +} + +impl CleanupAction { + fn deletes_files(self) -> bool { + matches!(self, Self::Execute) + } + + fn candidate_file_limit(self) -> Option { + match self { + Self::Execute => None, + Self::Explain { + max_candidate_files, + } => Some(max_candidate_files), + } + } } fn remove_prefix(path: &Path, prefix: &Path) -> Path { @@ -108,6 +291,11 @@ fn remove_prefix(path: &Path, prefix: &Path) -> Path { struct CleanupTask<'a> { dataset: &'a Dataset, policy: CleanupPolicy, + action: CleanupAction, + read_version: u64, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, } /// Information about the dataset that we learn by inspecting all of the manifests @@ -131,21 +319,131 @@ struct CleanupInspection { const UNVERIFIED_THRESHOLD_DAYS: i64 = 7; const S3_DELETE_STREAM_BATCH_SIZE: u64 = 1_000; const AZURE_DELETE_STREAM_BATCH_SIZE: u64 
= 256; +const DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES: usize = 1_000; + +/// Builder-style cleanup operation. +/// +/// Call [`Self::explain`] for a read-only explanation of what cleanup would +/// remove, or [`Self::execute`] to re-evaluate the current dataset state and +/// delete files. +pub struct CleanupOperation<'a> { + dataset: &'a Dataset, + policy: CleanupPolicy, + max_candidate_files: usize, +} + +impl<'a> CleanupOperation<'a> { + pub(crate) fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { + Self { + dataset, + policy, + max_candidate_files: DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES, + } + } + + /// Set the maximum number of candidate files included in explanations. + /// + /// The aggregate [`RemovalStats`] in [`CleanupExplanation`] still include + /// all files that would be removed. + pub fn with_max_candidate_files(mut self, max_candidate_files: usize) -> Self { + self.max_candidate_files = max_candidate_files; + self + } + + /// Explain what cleanup would remove without deleting files. + pub async fn explain(&self) -> Result { + let cleanup = CleanupTask::new( + self.dataset, + self.policy.clone(), + CleanupAction::Explain { + max_candidate_files: self.max_candidate_files, + }, + ); + let read_version = cleanup.read_version; + let result = cleanup.run().await?; + let warnings = if result.candidate_files_truncated { + vec![format!( + "candidate_files truncated to {} entries", + self.max_candidate_files + )] + } else { + Vec::new() + }; + Ok(CleanupExplanation { + read_version, + stats: result.stats, + candidate_files: result.candidate_files, + candidate_files_truncated: result.candidate_files_truncated, + candidate_file_limit: self.max_candidate_files, + referenced_branches: result.referenced_branches, + warnings, + }) + } + + /// Execute cleanup by re-evaluating the current dataset state. 
+ pub async fn execute(&self) -> Result { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.dataset.uri); + let cleanup = CleanupTask::new(self.dataset, self.policy.clone(), CleanupAction::Execute); + Ok(cleanup.run().await?.stats) + } +} impl<'a> CleanupTask<'a> { - fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { - Self { dataset, policy } + fn new(dataset: &'a Dataset, policy: CleanupPolicy, action: CleanupAction) -> Self { + let track_removed_manifests = policy.clean_referenced_branches; + let include_referenced_branches = action.candidate_file_limit().is_some(); + Self::new_with_ignored_manifests( + dataset, + policy, + action, + HashSet::new(), + track_removed_manifests, + include_referenced_branches, + ) + } + + fn new_with_ignored_manifests( + dataset: &'a Dataset, + policy: CleanupPolicy, + action: CleanupAction, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, + ) -> Self { + Self { + dataset, + policy, + action, + read_version: dataset.version().version, + ignored_manifests, + track_removed_manifests, + include_referenced_branches, + } } - async fn run(self) -> Result { - let mut final_stats = RemovalStats::default(); + async fn run(self) -> Result { + let mut final_result = CleanupRunResult::default(); + let candidate_file_limit = self.action.candidate_file_limit(); // First check if we need to clean referenced branches // For cases that referenced branches never clean and the current cleanup cannot clean anything // This must happen before cleaning the current branch if the setting is enabled. 
let referenced_branches: Vec<(String, u64)> = self.find_referenced_branches().await?; + if self.include_referenced_branches { + final_result.referenced_branches = referenced_branches + .iter() + .map(|(name, referenced_version)| CleanupReferencedBranch { + name: name.clone(), + referenced_version: *referenced_version, + cleanup_candidate: self.policy.clean_referenced_branches, + }) + .collect(); + } if self.policy.clean_referenced_branches { - self.clean_referenced_branches(&referenced_branches).await?; + final_result.merge( + self.clean_referenced_branches(&referenced_branches).await?, + candidate_file_limit, + ); } // we process all manifest files in parallel to figure @@ -179,19 +477,21 @@ impl<'a> CleanupTask<'a> { } if !referenced_branches.is_empty() { + let ignored_manifests: HashSet<_> = final_result + .removed_manifests + .union(&self.ignored_manifests) + .cloned() + .collect(); inspection = self - .retain_branch_lineage_files(inspection, &referenced_branches) + .retain_branch_lineage_files(inspection, &referenced_branches, &ignored_manifests) .await? 
}; - let stats = self.delete_unreferenced_files(inspection).await?; - final_stats.bytes_removed += stats.bytes_removed; - final_stats.old_versions += stats.old_versions; - final_stats.data_files_removed += stats.data_files_removed; - final_stats.transaction_files_removed += stats.transaction_files_removed; - final_stats.index_files_removed += stats.index_files_removed; - final_stats.deletion_files_removed += stats.deletion_files_removed; - Ok(final_stats) + final_result.merge( + self.delete_unreferenced_files(inspection).await?, + candidate_file_limit, + ); + Ok(final_result) } #[instrument(level = "debug", skip_all)] @@ -203,6 +503,7 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&self.dataset.base, &self.dataset.object_store, false) + .try_filter(|location| future::ready(!self.ignored_manifests.contains(&location.path))) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_manifest_file(location, &inspection, tagged_versions) }) @@ -224,12 +525,10 @@ impl<'a> CleanupTask<'a> { let manifest = read_manifest(&self.dataset.object_store, &location.path, location.size).await?; - let dataset_version = self.dataset.version().version; - // Don't delete the latest version, even if it is old. Don't delete tagged versions, // regardless of age. Don't delete manifests if their version is newer than the dataset // version. These are either in-progress or newly added since we started. 
- let is_latest = dataset_version <= manifest.version; + let is_latest = self.read_version <= manifest.version; let is_tagged = tagged_versions.contains(&manifest.version); let in_working_set = is_latest || !self.policy.should_clean(&manifest) || is_tagged; let indexes = @@ -319,8 +618,10 @@ impl<'a> CleanupTask<'a> { async fn delete_unreferenced_files( &self, inspection: CleanupInspection, - ) -> Result { - let removal_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let cleanup_result = Mutex::new(CleanupRunResult::default()); + let deletes_files = self.action.deletes_files(); + let candidate_file_limit = self.action.candidate_file_limit(); let verification_threshold = utc_now() - TimeDelta::try_days(UNVERIFIED_THRESHOLD_DAYS).expect("TimeDelta::try_days"); @@ -335,9 +636,8 @@ impl<'a> CleanupTask<'a> { ) }; // Build stream for a managed subtree - let build_listing_stream = |dir: Path, file_type: Option| { + let build_listing_stream = |dir: Path| { let inspection_ref = &inspection; - let removal_stats_ref = &removal_stats; self.dataset .object_store .read_dir_all(&dir, inspection.earliest_retained_manifest_time) @@ -356,118 +656,133 @@ impl<'a> CleanupTask<'a> { // delete it if we can verify it is part of an old version. 
let maybe_in_progress = !self.policy.delete_unverified && obj_meta.last_modified >= verification_threshold; - let path_to_remove = self.path_if_not_referenced( - obj_meta.location, + let file_to_remove = self.cleanup_file_if_not_referenced( + obj_meta, maybe_in_progress, inspection_ref, ); - if matches!(path_to_remove, Ok(Some(..))) { - let mut stats = removal_stats_ref.lock().unwrap(); - stats.bytes_removed += obj_meta.size; - if let Some(file_type) = file_type { - match file_type { - RemovedFileType::Data => stats.data_files_removed += 1, - RemovedFileType::Transaction => { - stats.transaction_files_removed += 1 - } - RemovedFileType::Index => stats.index_files_removed += 1, - RemovedFileType::Deletion => stats.deletion_files_removed += 1, - } - } - } - future::ready(path_to_remove) + future::ready(file_to_remove) }) .boxed() }; // Restrict scanning to Lance-managed subtrees for safety and performance. let streams = vec![ - build_listing_stream(self.dataset.versions_dir(), None), - build_listing_stream( - self.dataset.transactions_dir(), - Some(RemovedFileType::Transaction), - ), - build_listing_stream(self.dataset.data_dir(), Some(RemovedFileType::Data)), - build_listing_stream(self.dataset.indices_dir(), Some(RemovedFileType::Index)), - build_listing_stream( - self.dataset.deletions_dir(), - Some(RemovedFileType::Deletion), - ), + build_listing_stream(self.dataset.versions_dir()), + build_listing_stream(self.dataset.transactions_dir()), + build_listing_stream(self.dataset.data_dir()), + build_listing_stream(self.dataset.indices_dir()), + build_listing_stream(self.dataset.deletions_dir()), ]; - let unreferenced_paths = stream::iter(streams).flatten().boxed(); + let unreferenced_files = stream::iter(streams).flatten().boxed(); let old_manifests = inspection.old_manifests.clone(); - let num_old_manifests = old_manifests.len(); - - // Ideally this collect shouldn't be needed here but it seems necessary - // to avoid https://github.com/rust-lang/rust/issues/102211 - 
let manifest_bytes_removed = stream::iter(old_manifests.keys()) - .map(|path| self.dataset.object_store.size(path)) - .collect::>() - .await; - let manifest_bytes_removed = stream::iter(manifest_bytes_removed) - .buffer_unordered(self.dataset.object_store.io_parallelism()) - .try_fold(0, |acc, size| async move { Ok(acc + (size)) }) - .await; - - let old_manifests_stream = stream::iter(old_manifests.into_keys()) - .map(|path| { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = path.as_ref()); - Ok(path) + let manifest_files = stream::iter(old_manifests) + .map(|(path, _version)| async move { + let size_bytes = self.dataset.object_store.size(&path).await?; + Ok::(CleanupFile { + path, + kind: CleanupFileKind::Manifest, + unverified: false, + size_bytes, + }) }) + .buffer_unordered(self.dataset.object_store.io_parallelism()) .boxed(); - let all_paths_to_remove = - stream::iter(vec![unreferenced_paths, old_manifests_stream]).flatten(); - - let paths_to_delete: BoxStream> = if let Some(rate) = - self.policy.delete_rate_limit - { - let duration = calculate_duration(self.dataset.object_store.scheme().to_string(), rate); - let mut ticker = interval(duration); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); - IntervalStream::new(ticker) - .zip(all_paths_to_remove) - .map(|(_, path)| path) - .boxed() - } else { - all_paths_to_remove.boxed() - }; - let delete_fut = self - .dataset - .object_store - .remove_stream(paths_to_delete) - .try_for_each(|_| future::ready(Ok(()))); + let all_files = stream::iter(vec![unreferenced_files, manifest_files]).flatten(); + let all_paths_to_remove = all_files.map(|file| { + let file = file?; + if deletes_files { + let mode = if file.unverified { + AUDIT_MODE_DELETE_UNVERIFIED + } else { + AUDIT_MODE_DELETE + }; + let path_str = file.path.as_ref(); + match file.kind { + CleanupFileKind::Manifest => { + info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, 
path = path_str); + } + CleanupFileKind::Data => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DATA, path = path_str); + } + CleanupFileKind::Deletion => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DELETION, path = path_str); + } + CleanupFileKind::Index => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_INDEX, path = path_str); + } + CleanupFileKind::Transaction | CleanupFileKind::TemporaryManifest => {} + } + } + cleanup_result + .lock() + .unwrap() + .record_file(&file, candidate_file_limit, self.track_removed_manifests); + Ok(file.path) + }); + + if deletes_files { + let paths_to_delete: BoxStream> = + if let Some(rate) = self.policy.delete_rate_limit { + let duration = + calculate_duration(self.dataset.object_store.scheme().to_string(), rate); + let mut ticker = interval(duration); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + IntervalStream::new(ticker) + .zip(all_paths_to_remove) + .map(|(_, path)| path) + .boxed() + } else { + all_paths_to_remove.boxed() + }; - delete_fut.await?; + self.dataset + .object_store + .remove_stream(paths_to_delete) + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } else { + // Drain the stream to populate stats, but do not call remove_stream. 
+ all_paths_to_remove + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } - let mut removal_stats = removal_stats.into_inner().unwrap(); - removal_stats.old_versions = num_old_manifests as u64; - removal_stats.bytes_removed += manifest_bytes_removed?; + let cleanup_result = cleanup_result.into_inner().unwrap(); let span = Span::current(); - span.record("bytes_removed", removal_stats.bytes_removed); - span.record("data_files_removed", removal_stats.data_files_removed); + span.record("bytes_removed", cleanup_result.stats.bytes_removed); + span.record( + "data_files_removed", + cleanup_result.stats.data_files_removed, + ); span.record( "transaction_files_removed", - removal_stats.transaction_files_removed, + cleanup_result.stats.transaction_files_removed, + ); + span.record( + "index_files_removed", + cleanup_result.stats.index_files_removed, ); - span.record("index_files_removed", removal_stats.index_files_removed); span.record( "deletion_files_removed", - removal_stats.deletion_files_removed, + cleanup_result.stats.deletion_files_removed, ); - Ok(removal_stats) + Ok(cleanup_result) } - fn path_if_not_referenced( + fn cleanup_file_if_not_referenced( &self, - path: Path, + obj_meta: ObjectMeta, maybe_in_progress: bool, inspection: &CleanupInspection, - ) -> Result> { + ) -> Result> { + let path = obj_meta.location; let relative_path = remove_prefix(&path, &self.dataset.base); + let size_bytes = obj_meta.size; if relative_path.as_ref().starts_with("_versions/.tmp") { // This is a temporary manifest file. 
// @@ -476,7 +791,12 @@ impl<'a> CleanupTask<'a> { if maybe_in_progress { return Ok(None); } else { - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::TemporaryManifest, + true, + size_bytes, + )); } } if relative_path.as_ref().starts_with("_indices") { @@ -490,15 +810,18 @@ impl<'a> CleanupTask<'a> { { return Ok(None); } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file(path, CleanupFileKind::Index, true, size_bytes)); } else if inspection .verified_files .index_uuids .contains(uuid.as_ref()) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::Index, + false, + size_bytes, + )); } } else { return Ok(None); @@ -514,15 +837,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -587,15 +908,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&parent_data_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - 
Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -613,15 +932,23 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + true, + size_bytes, + )) } else if inspection .verified_files .delete_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + false, + size_bytes, + )) } else { Ok(None) } @@ -640,7 +967,14 @@ impl<'a> CleanupTask<'a> { } else if !maybe_in_progress || inspection.verified_files.tx_paths.contains(&relative_path) { - Ok(Some(path)) + let unverified = + !inspection.verified_files.tx_paths.contains(&relative_path); + Ok(cleanup_file( + path, + CleanupFileKind::Transaction, + unverified, + size_bytes, + )) } else { Ok(None) } @@ -709,8 +1043,8 @@ impl<'a> CleanupTask<'a> { async fn clean_referenced_branches( &self, referenced_branches: &[(String, u64)], - ) -> Result { - let final_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let final_result = Mutex::new(CleanupRunResult::default()); // Group branches by their lineage identifier (BranchIdentifier). 
// Branches with the same identifier share a lineage and must be cleaned sequentially @@ -722,30 +1056,32 @@ impl<'a> CleanupTask<'a> { .or_insert_with(Vec::new) .push(branch.clone()); } + let action = self.action; + let candidate_file_limit = self.action.candidate_file_limit(); let tasks: Vec<_> = branches_chains .values() .map(|branch_chain| { - let final_stats = &final_stats; + let final_result = &final_result; async move { for branch in branch_chain { let branch_dataset = self .dataset .checkout_version((branch.as_str(), None)) .await?; - if let Some(stats) = cleanup_cascade_branch( + let ignored_manifests = + final_result.lock().unwrap().removed_manifests.clone(); + if let Some(result) = cleanup_cascade_branch_run( &branch_dataset, branch_dataset.manifest.as_ref(), + action, + ignored_manifests, ) .await? { - let mut stats_guard = final_stats.lock().unwrap(); - stats_guard.bytes_removed += stats.bytes_removed; - stats_guard.old_versions += stats.old_versions; - stats_guard.data_files_removed += stats.data_files_removed; - stats_guard.transaction_files_removed += - stats.transaction_files_removed; - stats_guard.index_files_removed += stats.index_files_removed; - stats_guard.deletion_files_removed += stats.deletion_files_removed; + final_result + .lock() + .unwrap() + .merge(result, candidate_file_limit); } } Ok::<(), Error>(()) @@ -753,7 +1089,7 @@ impl<'a> CleanupTask<'a> { }) .collect(); try_join_all(tasks).await?; - Ok(final_stats.into_inner().unwrap()) + Ok(final_result.into_inner().unwrap()) } // Retain manifests containing files referenced by descendant branches. 
@@ -762,6 +1098,7 @@ impl<'a> CleanupTask<'a> { &self, inspection: CleanupInspection, referenced_branches: &[(String, u64)], + removed_branch_manifests: &HashSet, ) -> Result { let inspection = Mutex::new(inspection); for (branch, root_version_number) in referenced_branches { @@ -772,6 +1109,9 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&branch_location.path, &self.dataset.object_store, false) + .try_filter(|location| { + future::ready(!removed_branch_manifests.contains(&location.path)) + }) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_branch_referenced_manifests( location, @@ -1020,8 +1360,7 @@ pub async fn cleanup_old_versions( dataset: &Dataset, policy: CleanupPolicy, ) -> Result { - let cleanup = CleanupTask::new(dataset, policy); - cleanup.run().await + CleanupOperation::new(dataset, policy).execute().await } /// If the dataset config has `lance.auto_cleanup` parameters set, @@ -1048,11 +1387,35 @@ pub async fn cleanup_cascade_branch( dataset: &Dataset, manifest: &Manifest, ) -> Result> { + Ok( + cleanup_cascade_branch_run(dataset, manifest, CleanupAction::Execute, HashSet::new()) + .await? 
+ .map(|result| result.stats), + ) +} + +async fn cleanup_cascade_branch_run( + dataset: &Dataset, + manifest: &Manifest, + action: CleanupAction, + ignored_manifests: HashSet, +) -> Result> { let policy = build_cleanup_policy(dataset, manifest).await?; if let Some(mut policy) = policy { policy.clean_referenced_branches = false; policy.error_if_tagged_old_versions = false; - Ok(Some(dataset.cleanup_with_policy(policy).await?)) + if action.deletes_files() { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&dataset.uri); + } + let cleanup = CleanupTask::new_with_ignored_manifests( + dataset, + policy, + action, + ignored_manifests, + true, + false, + ); + Ok(Some(cleanup.run().await?)) } else { Ok(None) } @@ -1443,6 +1806,14 @@ mod tests { cleanup_old_versions(&db, policy).await } + async fn explain_cleanup_with_policy( + &self, + policy: CleanupPolicy, + ) -> Result { + let db = self.open().await?; + db.cleanup(policy).explain().await + } + async fn run_cleanup_with_override( &self, before: DateTime, @@ -1670,6 +2041,51 @@ mod tests { assert_gt!(after_count.num_tx_files, 0); } + #[tokio::test] + async fn explain_cleanup_does_not_delete_files() { + let fixture = MockDatasetFixture::try_new().unwrap(); + fixture.create_some_data().await.unwrap(); + MockClock::set_system_time(TimeDelta::try_seconds(1).unwrap().to_std().unwrap()); + fixture.overwrite_some_data().await.unwrap(); + + let before_count = fixture.count_files().await.unwrap(); + let policy = CleanupPolicyBuilder::default() + .before_timestamp(utc_now()) + .build(); + + let explanation = fixture + .explain_cleanup_with_policy(policy.clone()) + .await + .unwrap(); + let after_preview_count = fixture.count_files().await.unwrap(); + + // Files are not actually removed when explaining cleanup. 
+ assert_eq!(before_count, after_preview_count); + assert_eq!(explanation.read_version, 2); + assert_eq!(explanation.stats.old_versions, 1); + assert_eq!(explanation.stats.data_files_removed, 1); + assert_eq!(explanation.stats.transaction_files_removed, 1); + assert_gt!(explanation.stats.bytes_removed, 0); + assert!(!explanation.candidate_files.is_empty()); + assert!(!explanation.candidate_files_truncated); + + // Running cleanup with the same policy should remove the same files the + // explanation reported for this unchanged dataset. + let removed = fixture.run_cleanup_with_policy(policy).await.unwrap(); + let after_cleanup_count = fixture.count_files().await.unwrap(); + + assert_eq!( + removed.bytes_removed, + before_count.num_bytes - after_cleanup_count.num_bytes + ); + assert_eq!(removed.old_versions, explanation.stats.old_versions); + assert_eq!( + removed.data_files_removed, + explanation.stats.data_files_removed + ); + assert_eq!(removed.bytes_removed, explanation.stats.bytes_removed); + } + #[tokio::test] async fn cleanup_blob_v2_sidecar_files() { let fixture = MockDatasetFixture::try_new().unwrap(); @@ -3073,6 +3489,17 @@ mod tests { self.run_cleanup_inner(policy).await } + async fn explain_cleanup_with_referenced_branches(&mut self) -> Result { + let policy = CleanupPolicyBuilder::default() + .error_if_tagged_old_versions(false) + .clean_referenced_branches(true) + .retain_n_versions(&self.dataset, 1) + .await? 
+ .build(); + self.dataset.checkout_latest().await?; + self.dataset.cleanup(policy).explain().await + } + async fn run_cleanup_inner(&mut self, policy: CleanupPolicy) -> Result { let pre_count = self.count_data().await?; self.dataset.checkout_latest().await?; @@ -3653,6 +4080,74 @@ mod tests { setup.assert_unchanged(&["branch4"]).await; } + #[tokio::test] + async fn explain_cleanup_with_referenced_branches_matches_cleanup() { + let mut setup = build_lineage_datasets().await.unwrap(); + + setup.enable_auto_cleanup().await.unwrap(); + setup.main.write_data().await.unwrap(); + setup.main.compact().await.unwrap(); + setup.branch4.compact().await.unwrap(); + setup.branch1.write_data().await.unwrap(); + setup.branch1.compact().await.unwrap(); + setup.branch2.write_data().await.unwrap(); + setup.branch2.compact().await.unwrap(); + setup.branch3.write_data().await.unwrap(); + setup.branch3.compact().await.unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + let main_counts_before = setup.main.counts; + let branch1_counts_before = setup.branch1.counts; + let branch2_counts_before = setup.branch2.counts; + let branch3_counts_before = setup.branch3.counts; + let branch4_counts_before = setup.branch4.counts; + + let explanation = setup + .main + .explain_cleanup_with_referenced_branches() + .await + .unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts, main_counts_before); + assert_eq!(setup.branch1.counts, branch1_counts_before); + assert_eq!(setup.branch2.counts, branch2_counts_before); + assert_eq!(setup.branch3.counts, branch3_counts_before); + assert_eq!(setup.branch4.counts, branch4_counts_before); + + let 
removed = setup + .main + .run_cleanup_with_referenced_branches() + .await + .unwrap(); + + assert!(!explanation.referenced_branches.is_empty()); + assert!( + explanation + .referenced_branches + .iter() + .any(|branch| branch.cleanup_candidate) + ); + assert_eq!(explanation.stats, removed); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts.num_manifest_files, 1); + assert_eq!(setup.branch1.counts.num_manifest_files, 1); + assert_eq!(setup.branch2.counts.num_manifest_files, 1); + assert_eq!(setup.branch3.counts.num_manifest_files, 1); + assert_eq!(setup.branch4.counts.num_manifest_files, 1); + } + #[tokio::test] async fn auto_clean_referenced_branches_with_tags() { let mut setup = build_lineage_datasets().await.unwrap(); diff --git a/rust/lance/src/dataset/files.rs b/rust/lance/src/dataset/files.rs new file mode 100644 index 00000000000..848add7e4a8 --- /dev/null +++ b/rust/lance/src/dataset/files.rs @@ -0,0 +1,1169 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Dataset file inspection APIs. 
+ +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arrow_array::RecordBatch; +use arrow_array::builder::{ + Int64Builder, StringBuilder, StringDictionaryBuilder, TimestampMicrosecondBuilder, +}; +use arrow_array::types::Int32Type; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use either::Either; +use futures::stream::FuturesUnordered; +use futures::{Future, StreamExt, TryStreamExt}; +use lance_table::format::IndexMetadata; +use lance_table::utils::LanceIteratorExtension; +use object_store::path::Path; +use uuid::Uuid; + +use crate::Dataset; +use crate::dataset::files::arrow::{TRACKED_FILES_SCHEMA, TrackedFileBatch}; +use crate::dataset::files::file_types::FileType; +use crate::dataset::{DATA_DIR, INDICES_DIR, TRANSACTIONS_DIR}; +use lance_core::Result; +use lance_table::io::deletion::relative_deletion_file_path; +use lance_table::io::manifest::{read_manifest, read_manifest_indexes}; + +mod arrow; +mod file_types; + +const BATCH_SIZE: usize = 4096; +/// Memory budget for in-flight manifests (estimated in-memory size). +const MANIFEST_MEMORY_BUDGET: usize = 1024 * 1024 * 1024; // 1 GB +/// Estimated ratio of in-memory size to on-disk size for manifests. Found +/// empirically; manifests are protobuf with significant decompression and +/// allocator overhead once parsed. +const MANIFEST_DECOMPRESSION_RATIO: usize = 4; + +fn remove_prefix(path: &Path, prefix: &Path) -> Path { + match path.prefix_match(prefix) { + Some(parts) => Path::from_iter(parts), + None => path.clone(), + } +} + +/// A single row destined for the `tracked_files` output. +struct FileRow<'a> { + version: u64, + base_uri: Cow<'a, str>, + path: Cow<'a, str>, + file_type: FileType, +} + +/// Resolve the base URI a file lives under. 
Files referenced from a shallow +/// clone carry a `base_id` pointing into `manifest.base_paths`; otherwise they +/// live under this dataset's own `base_uri`. +fn resolve_base_uri<'a>( + manifest: &'a lance_table::format::Manifest, + base_id: Option, + base_uri: &'a str, +) -> &'a str { + base_id + .and_then(|id| manifest.base_paths.get(&id).map(|bp| bp.path.as_str())) + .unwrap_or(base_uri) +} + +fn manifest_file_rows<'a>( + manifest: &'a lance_table::format::Manifest, + base_uri: &'a str, + manifest_path: &'a str, +) -> Box> + Send + 'a> { + let mut files = 1; + let manifest_row = FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Borrowed(manifest_path), + file_type: FileType::Manifest, + }; + let iter = std::iter::once(manifest_row); + + let iter = if let Some(txn_file) = &manifest.transaction_file { + files += 1; + let txn_row = FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Owned(format!("{}/{}", TRANSACTIONS_DIR, txn_file)), + file_type: FileType::TransactionFile, + }; + Either::Left(iter.chain(std::iter::once(txn_row))) + } else { + Either::Right(iter) + }; + + for fragment in manifest.fragments.iter() { + files += fragment.files.len(); + + if fragment.deletion_file.is_some() { + files += 1; + } + } + + let data_files = manifest.fragments.iter().flat_map(move |fragment| { + fragment.files.iter().map(move |data_file| { + let effective_base_uri = resolve_base_uri(manifest, data_file.base_id, base_uri); + FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(effective_base_uri), + path: Cow::Owned(format!("{}/{}", DATA_DIR, data_file.path)), + file_type: FileType::DataFile, + } + }) + }); + + let deletion_files = manifest.fragments.iter().filter_map(|fragment| { + fragment.deletion_file.as_ref().map(|del_file| FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(resolve_base_uri(manifest, del_file.base_id, base_uri)), + path: 
Cow::Owned(relative_deletion_file_path(fragment.id, del_file)), + file_type: FileType::DeletionFile, + }) + }); + + Box::new( + iter.chain(data_files) + .chain(deletion_files) + .exact_size(files), + ) +} + +fn manifest_file_batches<'a>( + manifest: &'a lance_table::format::Manifest, + base_uri: &'a str, + manifest_path: &'a str, +) -> Box> + Send + 'a> { + let mut builder = TrackedFileBatch::with_capacity(BATCH_SIZE); + + let mut iter = manifest_file_rows(manifest, base_uri, manifest_path); + let size = iter.len().div_ceil(BATCH_SIZE); + + let mut flushed = false; + Box::new( + std::iter::from_fn(move || { + if flushed { + return None; + } + while let Some(row) = iter.next() { + builder.append(&row); + if builder.len() == BATCH_SIZE { + let next_size = iter.len().div_ceil(BATCH_SIZE); + let old_builder = + std::mem::replace(&mut builder, TrackedFileBatch::with_capacity(next_size)); + return Some(old_builder.finish()); + } + } + // Flush the remaining partial batch. + flushed = true; + if builder.len() != 0 { + let partial = std::mem::replace(&mut builder, TrackedFileBatch::with_capacity(0)); + Some(partial.finish()) + } else { + None + } + }) + .exact_size(size), + ) +} + +async fn get_index_files( + uuids: impl IntoIterator, + base: &Path, + object_store: &lance_io::object_store::ObjectStore, + cache: &mut HashMap>, +) -> Result> { + let uuids: Vec = uuids.into_iter().collect(); + + // Phase 1: list uncached UUID directories concurrently. + let uncached: Vec = uuids + .iter() + .filter(|uuid| !cache.contains_key(*uuid)) + .copied() + .collect(); + if !uncached.is_empty() { + let parallelism = object_store.io_parallelism(); + // Clone for use in async move closures (ObjectStore is Arc-backed). 
+ let base_owned = base.clone(); + let os = object_store.clone(); + let new_entries: Vec<(Uuid, Vec)> = + futures::stream::iter(uncached) + .map(|uuid| { + let base = base_owned.clone(); + let os = os.clone(); + async move { + let prefix = base.join(INDICES_DIR).join(uuid.to_string()); + let files: Vec = + os.list(Some(prefix)).try_collect().await?; + lance_core::Result::Ok((uuid, files)) + } + }) + .buffer_unordered(parallelism) + .try_collect() + .await?; + + // Phase 2: insert results into cache (serial, no contention). + cache.extend(new_entries); + } + + // Phase 3: collect paths for the requested UUIDs in order. + let mut paths = Vec::new(); + for uuid in &uuids { + paths.extend( + cache[uuid] + .iter() + .map(|meta| remove_prefix(&meta.location, base)), + ); + } + Ok(paths) +} + +async fn index_file_batch(version: u64, base_uri: &str, paths: &[Path]) -> Result { + let mut builder = TrackedFileBatch::with_capacity(paths.len()); + for path in paths { + builder.append(&FileRow { + version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Owned(path.to_string()), + file_type: FileType::IndexFile, + }); + } + builder.finish() +} + +/// Progress update for [`Dataset::tracked_files_with_options`]. +#[derive(Debug, Clone)] +pub struct TrackedFilesProgress { + /// Number of manifests processed so far. + pub manifests_processed: usize, + /// Total number of manifests, if known. This becomes `Some` once the + /// listing stream is exhausted; until then it is `None`. + pub manifests_total: Option, +} + +/// Options for [`Dataset::tracked_files_with_options`]. +#[derive(Default)] +pub struct TrackedFilesOptions { + /// If set, only include manifests with `version >= min_version`. + pub min_version: Option, + /// If set, called each time a manifest has been fully processed. The + /// callback runs on a background tokio task, so it must not block (it + /// will stall the manifest reader pipeline). 
Order is the order in which + /// manifests finish processing, which is not the version order. + pub progress: Option>, +} + +// A `ManifestLocation` is ~100 bytes, so a 50k-slot mpsc channel costs ~5 MB +// in the worst case. That's enough headroom for the lister to run well ahead +// of the reader on datasets with hundreds of thousands of manifests, while +// still bounding memory. +const MAX_BUFFERED_LOCATIONS: usize = 50_000; + +impl Dataset { + /// Returns one row per (version, file) for every file referenced in any manifest. + /// + /// Each row contains the manifest version, the storage root URI, the file path + /// relative to that URI, and the file type. + /// + /// # Schema + /// + /// | Column | Type | Notes | + /// |------------|-----------------------------------|-------| + /// | `version` | `Int64` (non-null) | Manifest version number | + /// | `base_uri` | `Dictionary(Int32, Utf8)` (non-null) | Storage root for this file | + /// | `path` | `Utf8` (non-null) | Relative to `base_uri` | + /// | `type` | `Dictionary(Int8, Utf8)` (non-null) | One of: `data file`, `manifest`, `deletion file`, `transaction file`, `index file` | + /// + /// Output order is non-deterministic. + pub async fn tracked_files(&self) -> SendableRecordBatchStream { + self.tracked_files_with_options(TrackedFilesOptions::default()) + .await + } + + /// Like [`Self::tracked_files`], but with additional options for filtering + /// and progress reporting. 
+ pub async fn tracked_files_with_options( + &self, + options: TrackedFilesOptions, + ) -> SendableRecordBatchStream { + use lance_table::io::commit::ManifestLocation; + + let base = self.base.clone(); + let uri = self.uri().to_string(); + let object_store = self.object_store.clone(); + let commit_handler = self.commit_handler.clone(); + + // Pipeline architecture: + // + // Lister ──► tx_locations ──► Reader ──┬──► tx_manifest ──► Emitter ──► tx (output) + // └──► tx_indexes ──► IndexLister ──► tx (output) + + // Output channel: Emitter and IndexLister both send batches here. + let (tx, rx) = tokio::sync::mpsc::channel::>(4); + // Location channel: Lister -> Reader. Large buffer since locations are + // small (~100 bytes each) and we want the lister to run ahead. + let (tx_locations, mut rx_locations) = + tokio::sync::mpsc::channel::(MAX_BUFFERED_LOCATIONS); + // Manifest channel: Reader -> Emitter (small buffer for backpressure + // since manifests can be large). + let (tx_manifest, mut rx_manifest) = + tokio::sync::mpsc::channel::<(Arc, String, usize)>(2); + // Index channel: Reader -> IndexLister. + let (tx_indexes, mut rx_indexes) = + tokio::sync::mpsc::channel::<(u64, Vec)>(8); + + // Tracks estimated in-memory size of in-flight manifests. Reader adds + // before sending; Emitter subtracts after processing. + let inflight_mem = Arc::new(AtomicUsize::new(0)); + let mem_notify = Arc::new(tokio::sync::Notify::new()); + + // Progress: total is set by Lister once listing finishes, read by Emitter. + let total_manifests: Arc> = Arc::new(std::sync::OnceLock::new()); + + // --- Lister task --- + // Lists manifest locations, applies min_version filter, and counts the + // total. Locations are lightweight so we buffer up to MAX_BUFFERED_LOCATIONS. 
+ let tx_err_lister = tx.clone(); + let os_lister = object_store.clone(); + let base_lister = base.clone(); + let total_manifests_lister = total_manifests.clone(); + let min_version = options.min_version; + tokio::spawn(async move { + let result: lance_core::Result<()> = async { + let mut locations = + commit_handler.list_manifest_locations(&base_lister, &os_lister, false); + let mut count = 0usize; + while let Some(loc) = locations.next().await { + let loc = loc?; + if let Some(min_v) = min_version + && loc.version < min_v + { + continue; + } + count += 1; + if tx_locations.send(loc).await.is_err() { + return Ok(()); + } + } + let _ = total_manifests_lister.set(count); + Ok(()) + } + .await; + if let Err(e) = result { + let _ = tx_err_lister + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + } + }); + + // --- Reader task --- + // Reads manifests with memory-aware parallelism and fans out to + // Emitter (file batches) and IndexLister (index metadata). + let tx_err_reader = tx.clone(); + let os_reader = object_store.clone(); + let base_reader = base.clone(); + let inflight_mem_reader = inflight_mem.clone(); + let mem_notify_reader = mem_notify.clone(); + tokio::spawn(async move { + let result: lance_core::Result<()> = async { + let max_parallelism = os_reader.io_parallelism(); + + type ManifestResult = lance_core::Result<( + Arc, + String, + Vec, + usize, + )>; + let mut in_flight: FuturesUnordered< + std::pin::Pin + Send>>, + > = FuturesUnordered::new(); + let mut locations_exhausted = false; + + loop { + let can_launch = !locations_exhausted + && in_flight.len() < max_parallelism + && (in_flight.is_empty() + || inflight_mem_reader.load(Ordering::Acquire) + < MANIFEST_MEMORY_BUDGET); + + if in_flight.is_empty() && !can_launch { + break; + } + + tokio::select! { + biased; + // Always drain completed reads first. 
+ Some(item) = in_flight.next(), if !in_flight.is_empty() => { + let (manifest, manifest_path, indexes, estimated) = item?; + let version = manifest.version; + if tx_manifest + .send((manifest, manifest_path, estimated)) + .await + .is_err() + { + return Ok(()); + } + if !indexes.is_empty() + && tx_indexes.send((version, indexes)).await.is_err() + { + return Ok(()); + } + } + // Receive next location and start a read. + loc = rx_locations.recv(), if can_launch => { + match loc { + Some(loc) => { + let estimated = + loc.size.unwrap_or(0) as usize + * MANIFEST_DECOMPRESSION_RATIO; + inflight_mem_reader.fetch_add(estimated, Ordering::AcqRel); + + let os = os_reader.clone(); + let base = base_reader.clone(); + in_flight.push(Box::pin(async move { + let manifest = + read_manifest(&os, &loc.path, loc.size).await?; + let indexes = + read_manifest_indexes(&os, &loc, &manifest).await?; + let manifest_path = + remove_prefix(&loc.path, &base).to_string(); + lance_core::Result::Ok(( + Arc::new(manifest), + manifest_path, + indexes, + estimated, + )) + })); + } + None => { + locations_exhausted = true; + } + } + } + // Wake up when Emitter frees memory. + _ = mem_notify_reader.notified(), + if !can_launch && !in_flight.is_empty() => {} + } + } + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx_err_reader + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + } + }); + + // --- Emitter task --- + // Converts manifests into file-row batches, releases memory budget, + // and reports progress. 
+ let tx_emitter = tx.clone(); + let uri_emitter = uri.clone(); + let progress_cb = options.progress; + tokio::spawn(async move { + let mut processed = 0usize; + while let Some((manifest, manifest_path, estimated)) = rx_manifest.recv().await { + let batches = manifest_file_batches(&manifest, &uri_emitter, &manifest_path); + for batch_result in batches { + let df_result = batch_result.map_err(datafusion::error::DataFusionError::from); + if tx_emitter.send(df_result).await.is_err() { + return; + } + } + drop(manifest); + inflight_mem.fetch_sub(estimated, Ordering::AcqRel); + mem_notify.notify_one(); + + processed += 1; + if let Some(ref cb) = progress_cb { + cb(TrackedFilesProgress { + manifests_processed: processed, + manifests_total: total_manifests.get().copied(), + }); + } + } + }); + + // --- IndexLister task --- + // Lists index directories and emits index file batches. + let tx_idx = tx; + let uri_idx = uri; + let os_idx = object_store; + let base_idx = base; + tokio::spawn(async move { + let mut uuid_cache: HashMap> = HashMap::new(); + while let Some((version, indexes)) = rx_indexes.recv().await { + let uuids: Vec = indexes.iter().map(|idx| idx.uuid).collect(); + match get_index_files(uuids, &base_idx, &os_idx, &mut uuid_cache).await { + Ok(index_paths) if !index_paths.is_empty() => { + match index_file_batch(version, &uri_idx, &index_paths).await { + Ok(batch) => { + if tx_idx.send(Ok(batch)).await.is_err() { + return; + } + } + Err(e) => { + let _ = tx_idx + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + return; + } + } + } + Err(e) => { + let _ = tx_idx + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + return; + } + _ => {} + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + + Box::pin(RecordBatchStreamAdapter::new( + TRACKED_FILES_SCHEMA.clone(), + stream, + )) + } + + /// Returns one row per file that physically exists at the dataset's base URI. 
+ /// + /// This scans the primary object store root only. Additional `base_paths` + /// entries in the manifest (for externally-located data files) are not + /// scanned by this method. + /// + /// # Schema + /// + /// | Column | Type | Notes | + /// |-----------------|--------------------------------------------|-------| + /// | `base_uri` | `Dictionary(Int32, Utf8)` (non-null) | Storage root | + /// | `path` | `Utf8` (non-null) | Relative to `base_uri` | + /// | `size_bytes` | `Int64` (non-null) | File size in bytes | + /// | `last_modified` | `Timestamp(Microsecond, "UTC")` (non-null) | Last modification time | + pub async fn all_files(&self) -> SendableRecordBatchStream { + let base = self.base.clone(); + let uri = self.uri().to_string(); + let object_store = self.object_store.clone(); + + let stream = object_store + .list(Some(base.clone())) + .try_chunks(4000) + .map_err(|err| err.1) + .and_then( + move |chunk| match build_all_files_batch(&chunk, &base, &uri) { + Ok(batch) => futures::future::ok(batch), + Err(e) => futures::future::err(e), + }, + ) + .map_err(datafusion::error::DataFusionError::from); + + Box::pin(RecordBatchStreamAdapter::new( + arrow::ALL_FILES_SCHEMA.clone(), + stream, + )) + } +} + +fn build_all_files_batch( + chunk: &[object_store::ObjectMeta], + base: &Path, + uri: &str, +) -> Result { + let n = chunk.len(); + let mut base_uri_builder = StringDictionaryBuilder::::with_capacity(n, 1, uri.len()); + let path_capacity = chunk.iter().map(|m| m.location.as_ref().len()).sum(); + let mut path_builder = StringBuilder::with_capacity(n, path_capacity); + let mut size_builder = Int64Builder::with_capacity(n); + let mut ts_builder = TimestampMicrosecondBuilder::with_capacity(n).with_timezone("UTC"); + + for meta in chunk { + let rel = remove_prefix(&meta.location, base); + base_uri_builder.append_value(uri); + path_builder.append_value(rel.as_ref()); + size_builder.append_value(meta.size as i64); + 
ts_builder.append_value(meta.last_modified.timestamp_micros()); + } + + RecordBatch::try_new( + arrow::ALL_FILES_SCHEMA.clone(), + vec![ + Arc::new(base_uri_builder.finish()), + Arc::new(path_builder.finish()), + Arc::new(size_builder.finish()), + Arc::new(ts_builder.finish()), + ], + ) + .map_err(Into::into) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Dataset; + use crate::index::DatasetIndexExt; + use crate::index::vector::VectorIndexParams; + use arrow_array::{Array, Int32Array, RecordBatchIterator, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use futures::TryStreamExt; + use lance_index::IndexType; + use lance_linalg::distance::MetricType; + use lance_testing::datagen::some_batch; + use std::collections::HashSet; + + async fn collect_rows(stream: SendableRecordBatchStream) -> Vec { + stream.try_collect::>().await.unwrap() + } + + fn count_rows(batches: &[RecordBatch]) -> usize { + batches.iter().map(|b| b.num_rows()).sum() + } + + fn dict_value_at(col: &dyn arrow_array::Array, i: usize) -> String { + if let Some(dict) = col + .as_any() + .downcast_ref::>() + { + let values = dict + .values() + .as_any() + .downcast_ref::() + .unwrap(); + values.value(dict.keys().value(i) as usize).to_string() + } else if let Some(dict) = col + .as_any() + .downcast_ref::>() + { + let values = dict + .values() + .as_any() + .downcast_ref::() + .unwrap(); + values.value(dict.keys().value(i) as usize).to_string() + } else { + panic!("expected a dictionary array with Int8 or Int32 keys"); + } + } + + fn collect_column_values(batches: &[RecordBatch], col: &str) -> Vec { + batches + .iter() + .flat_map(|b| { + let col = b.column_by_name(col).unwrap(); + (0..col.len()).map(|i| dict_value_at(col.as_ref(), i)) + }) + .collect() + } + + fn make_simple_batch() -> impl arrow_array::RecordBatchReader { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = 
RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + RecordBatchIterator::new(vec![Ok(batch)], schema) + } + + #[tokio::test] + async fn test_tracked_files_basic() { + let uri = "memory://test_tracked_files_basic"; + + // Create then append twice to get 3 manifest versions. + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let stream = ds.tracked_files().await; + let schema = stream.schema(); + let batches = collect_rows(stream).await; + + // Schema is correct. + assert_eq!(schema.field(0).name(), "version"); + assert_eq!(schema.field(1).name(), "base_uri"); + assert_eq!(schema.field(2).name(), "path"); + assert_eq!(schema.field(3).name(), "type"); + + let n = count_rows(&batches); + // At minimum: 3 manifests + 3 data files = 6 rows + assert!(n >= 6, "expected at least 6 rows, got {n}"); + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!(types.contains("manifest"), "missing 'manifest' rows"); + assert!(types.contains("data file"), "missing 'data file' rows"); + } + + #[tokio::test] + async fn test_tracked_files_deletion() { + let uri = "memory://test_tracked_files_deletion"; + + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.delete("id = 2").await.unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("deletion file"), + "missing 'deletion file' rows after delete; got types: {:?}", + types + ); + } + + #[tokio::test] + async fn test_tracked_files_transaction() { + let uri = "memory://test_tracked_files_transaction"; + + // Normal writes record transaction files by default. 
+ let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("transaction file"), + "expected 'transaction file' rows; got types: {:?}", + types + ); + } + + #[tokio::test] + async fn test_tracked_files_index() { + let uri = "memory://test_tracked_files_index"; + + let mut ds = Dataset::write(some_batch(), uri, None).await.unwrap(); + let params = VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 5); + ds.create_index(&["indexable"], IndexType::Vector, None, ¶ms, true) + .await + .unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("index file"), + "expected 'index file' rows after vector index creation; got types: {:?}", + types + ); + } + + fn collect_versions(batches: &[RecordBatch]) -> Vec { + batches + .iter() + .flat_map(|b| { + let col = b + .column_by_name("version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i)).collect::>() + }) + .collect() + } + + #[tokio::test] + async fn test_tracked_files_min_version() { + let uri = "memory://test_tracked_files_min_version"; + + // Create 3 versions. + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + // Without filter: should have rows from versions 1, 2, 3. 
+ let stream = ds.tracked_files().await; + let all_batches = collect_rows(stream).await; + let all_versions: HashSet = collect_versions(&all_batches).into_iter().collect(); + assert!(all_versions.contains(&1)); + assert!(all_versions.contains(&2)); + assert!(all_versions.contains(&3)); + + // With min_version=3: should only have version 3. + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + min_version: Some(3), + ..Default::default() + }) + .await; + let filtered_batches = collect_rows(stream).await; + let filtered_versions: HashSet = + collect_versions(&filtered_batches).into_iter().collect(); + assert_eq!(filtered_versions, HashSet::from([3])); + + // With min_version=2: should have versions 2 and 3. + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + min_version: Some(2), + ..Default::default() + }) + .await; + let filtered_batches = collect_rows(stream).await; + let filtered_versions: HashSet = + collect_versions(&filtered_batches).into_iter().collect(); + assert_eq!(filtered_versions, HashSet::from([2, 3])); + } + + #[tokio::test] + async fn test_tracked_files_progress() { + let uri = "memory://test_tracked_files_progress"; + + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let updates = Arc::new(std::sync::Mutex::new(Vec::new())); + let updates_clone = updates.clone(); + + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + progress: Some(Box::new(move |p| { + updates_clone.lock().unwrap().push(p); + })), + ..Default::default() + }) + .await; + // Consume the full stream to drive all tasks to completion. + let _batches = collect_rows(stream).await; + + let updates = updates.lock().unwrap(); + // Should have exactly 3 progress updates (one per manifest). 
+ assert_eq!(updates.len(), 3, "expected 3 progress updates"); + // Processed counts should be monotonically increasing. + for (i, u) in updates.iter().enumerate() { + assert_eq!(u.manifests_processed, i + 1); + } + // The last update should know the total. + let last = updates.last().unwrap(); + assert_eq!(last.manifests_total, Some(3)); + } + + fn make_multi_row_batch(rows: usize) -> impl arrow_array::RecordBatchReader { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..rows as i32))], + ) + .unwrap(); + RecordBatchIterator::new(vec![Ok(batch)], schema) + } + + /// Multi-fragment scenario: write 6 rows split across 3 fragments, delete + /// one row to produce a deletion file, then assert that every path + /// `tracked_files` emits for the latest version actually exists in the + /// `all_files` listing of the dataset directory. + #[tokio::test] + async fn test_tracked_files_paths_match_disk() { + use crate::dataset::WriteParams; + + let uri = "memory://test_tracked_files_paths_match_disk"; + + let write_params = WriteParams { + max_rows_per_file: 2, + ..Default::default() + }; + let mut ds = Dataset::write(make_multi_row_batch(6), uri, Some(write_params)) + .await + .unwrap(); + // Triggers a deletion file on one of the fragments. + ds.delete("id = 1").await.unwrap(); + let latest_version = ds.version().version as i64; + + // Sanity-check the multi-fragment setup: 3 data files in the latest manifest. 
+ assert_eq!( + ds.get_fragments().len(), + 3, + "expected 3 fragments from max_rows_per_file=2 over 6 rows" + ); + + let tracked = collect_rows(ds.tracked_files().await).await; + let all = collect_rows(ds.all_files().await).await; + + let all_paths: HashSet = all + .iter() + .flat_map(|b| { + let col = b + .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i).to_string()) + }) + .collect(); + + // Collect tracked paths grouped by type for the latest version only. + let mut tracked_at_latest: HashMap> = HashMap::new(); + for batch in &tracked { + let versions = batch + .column_by_name("version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let paths = batch + .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let types = batch.column_by_name("type").unwrap(); + for i in 0..batch.num_rows() { + if versions.value(i) == latest_version { + tracked_at_latest + .entry(dict_value_at(types.as_ref(), i)) + .or_default() + .push(paths.value(i).to_string()); + } + } + } + + // Every file tracked at the latest version must exist on disk. + for (file_type, paths) in &tracked_at_latest { + for p in paths { + assert!( + all_paths.contains(p), + "tracked {file_type} path {p:?} not present in all_files (got {all_paths:?})" + ); + } + } + + // The latest manifest references one manifest, 3 data files, and 1 deletion file. + assert_eq!( + tracked_at_latest.get("manifest").map(Vec::len), + Some(1), + "expected 1 manifest row at latest version" + ); + assert_eq!( + tracked_at_latest.get("data file").map(Vec::len), + Some(3), + "expected 3 data files at latest version" + ); + assert_eq!( + tracked_at_latest.get("deletion file").map(Vec::len), + Some(1), + "expected 1 deletion file at latest version" + ); + + // Path shapes are as documented (relative to base_uri, no leading slash). 
+ for p in tracked_at_latest.get("data file").unwrap() { + assert!( + p.starts_with("data/"), + "data file path {p:?} should start with data/" + ); + } + let manifest_path = &tracked_at_latest.get("manifest").unwrap()[0]; + assert!( + manifest_path.starts_with("_versions/") && manifest_path.ends_with(".manifest"), + "manifest path {manifest_path:?} should match _versions/.manifest" + ); + let deletion_path = &tracked_at_latest.get("deletion file").unwrap()[0]; + assert!( + deletion_path.starts_with("_deletions/"), + "deletion path {deletion_path:?} should start with _deletions/" + ); + } + + /// Each `DataFile` inside a fragment carries its own `base_id`; the + /// emitted `base_uri` must be looked up per file, not per fragment. + #[test] + fn test_manifest_file_rows_per_file_base_id() { + use lance_core::datatypes::{Field as LanceField, Schema as LanceSchema}; + use lance_io::utils::CachedFileSize; + use lance_table::format::{ + BasePath, DataFile, DataStorageFormat, DeletionFile, DeletionFileType, Fragment, + Manifest, + }; + + let schema = LanceSchema { + fields: vec![LanceField::try_from(&Field::new("id", DataType::Int32, false)).unwrap()], + metadata: Default::default(), + }; + + let mk_file = |path: &str, base_id: Option| DataFile { + path: path.to_string(), + fields: Arc::from(vec![0]), + column_indices: Arc::from(Vec::::new()), + file_major_version: 2, + file_minor_version: 0, + file_size_bytes: CachedFileSize::unknown(), + base_id, + }; + + let fragment = Fragment { + id: 0, + files: vec![ + mk_file("a.lance", Some(1)), + mk_file("b.lance", Some(2)), + // No base_id -> falls back to the dataset base_uri. + mk_file("c.lance", None), + ], + // Deletion files also carry a base_id when they originate from a + // shallow clone, and must resolve against base_paths too. 
+ deletion_file: Some(DeletionFile { + read_version: 1, + id: 7, + file_type: DeletionFileType::Bitmap, + num_deleted_rows: Some(1), + base_id: Some(2), + }), + row_id_meta: None, + physical_rows: Some(3), + last_updated_at_version_meta: None, + created_at_version_meta: None, + }; + + let mut base_paths = HashMap::new(); + base_paths.insert( + 1, + BasePath::new(1, "s3://bucket-a/root".to_string(), None, false), + ); + base_paths.insert( + 2, + BasePath::new(2, "s3://bucket-b/root".to_string(), None, false), + ); + + let manifest = Manifest::new( + schema, + Arc::new(vec![fragment]), + DataStorageFormat::default(), + base_paths, + ); + + let rows: Vec<_> = + manifest_file_rows(&manifest, "memory://main", "_versions/1.manifest").collect(); + let by_path: HashMap<&str, &str> = rows + .iter() + .filter(|r| matches!(r.file_type, FileType::DataFile)) + .map(|r| (r.path.as_ref(), r.base_uri.as_ref())) + .collect(); + + assert_eq!(by_path.get("data/a.lance"), Some(&"s3://bucket-a/root")); + assert_eq!(by_path.get("data/b.lance"), Some(&"s3://bucket-b/root")); + assert_eq!(by_path.get("data/c.lance"), Some(&"memory://main")); + + let deletion = rows + .iter() + .find(|r| matches!(r.file_type, FileType::DeletionFile)) + .expect("deletion file row"); + assert_eq!(deletion.path.as_ref(), "_deletions/0-1-7.bin"); + assert_eq!(deletion.base_uri.as_ref(), "s3://bucket-b/root"); + } + + #[tokio::test] + async fn test_all_files_basic() { + let uri = "memory://test_all_files_basic"; + let ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + + let stream = ds.all_files().await; + let schema = stream.schema(); + let batches = collect_rows(stream).await; + + assert_eq!(schema.field(0).name(), "base_uri"); + assert_eq!(schema.field(1).name(), "path"); + assert_eq!(schema.field(2).name(), "size_bytes"); + assert_eq!(schema.field(3).name(), "last_modified"); + + let n = count_rows(&batches); + // A dataset always has at least a manifest and a data file. 
+ assert!(n >= 2, "expected at least 2 physical files, got {n}"); + + // Verify sizes and timestamps are populated (non-zero). + for batch in &batches { + let sizes = batch + .column_by_name("size_bytes") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..sizes.len() { + assert!( + sizes.value(i) > 0, + "size_bytes should be positive, got {}", + sizes.value(i) + ); + } + + let ts = batch + .column_by_name("last_modified") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..ts.len() { + assert!( + ts.value(i) > 0, + "last_modified should be positive, got {}", + ts.value(i) + ); + } + } + } + + #[tokio::test] + async fn test_all_files_schema() { + let uri = "memory://test_all_files_schema"; + let ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + + let stream = ds.all_files().await; + let schema = stream.schema(); + + assert_eq!(schema.fields().len(), 4); + assert_eq!(schema.field(0).name(), "base_uri"); + assert!(matches!( + schema.field(0).data_type(), + DataType::Dictionary(_, _) + )); + assert_eq!(schema.field(1).name(), "path"); + assert_eq!(schema.field(1).data_type(), &DataType::Utf8); + assert_eq!(schema.field(2).name(), "size_bytes"); + assert_eq!(schema.field(2).data_type(), &DataType::Int64); + assert_eq!(schema.field(3).name(), "last_modified"); + assert!(matches!( + schema.field(3).data_type(), + DataType::Timestamp(TimeUnit::Microsecond, _) + )); + } +} diff --git a/rust/lance/src/dataset/files/arrow.rs b/rust/lance/src/dataset/files/arrow.rs new file mode 100644 index 00000000000..22d767fe13b --- /dev/null +++ b/rust/lance/src/dataset/files/arrow.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::{Arc, LazyLock}; + +use arrow::array::{ArrayBuilder, Int8Builder}; +use arrow::datatypes::Int8Type; +use arrow_array::builder::{Int64Builder, StringBuilder, StringDictionaryBuilder}; +use 
arrow_array::types::Int32Type; +use arrow_array::{ArrayRef, DictionaryArray, RecordBatch}; +use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use lance_core::Result; + +use super::FileRow; +use super::file_types::FileType; + +pub static FILE_TYPE_DICT_ARRAY: LazyLock = LazyLock::new(|| { + let mut builder = StringBuilder::with_capacity(5, 20); + builder.append_value(FileType::Manifest.to_string()); + builder.append_value(FileType::DataFile.to_string()); + builder.append_value(FileType::DeletionFile.to_string()); + builder.append_value(FileType::TransactionFile.to_string()); + builder.append_value(FileType::IndexFile.to_string()); + Arc::new(builder.finish()) +}); + +pub struct FileTypeArrayBuilder { + builder: Int8Builder, +} + +impl FileTypeArrayBuilder { + pub fn with_capacity(capacity: usize) -> Self { + Self { + builder: Int8Builder::with_capacity(capacity), + } + } + + pub fn append_value(&mut self, file_type: FileType) { + let value = file_type.into(); + self.builder.append_value(value); + } + + pub fn finish(mut self) -> DictionaryArray { + let indices = self.builder.finish(); + DictionaryArray::new(indices, FILE_TYPE_DICT_ARRAY.clone()) + } +} + +pub(super) static TRACKED_FILES_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(Schema::new(vec![ + Field::new("version", DataType::Int64, false), + Field::new( + "base_uri", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("path", DataType::Utf8, false), + Field::new( + "type", + DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), + false, + ), + ])) +}); + +pub(super) static ALL_FILES_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(Schema::new(vec![ + Field::new( + "base_uri", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("path", DataType::Utf8, false), + Field::new("size_bytes", DataType::Int64, false), + Field::new( + "last_modified", + 
DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + false, + ), + ])) +}); + +/// Arrow batch builder for the `tracked_files` schema. +/// +/// Construct with [`with_capacity`](Self::with_capacity) to pre-size the +/// underlying buffers, then call [`extend`](Self::extend) to fill rows in bulk. +pub(super) struct TrackedFileBatch { + version: Int64Builder, + base_uri: StringDictionaryBuilder, + path: StringBuilder, + file_type: FileTypeArrayBuilder, +} + +impl TrackedFileBatch { + pub fn with_capacity(capacity: usize) -> Self { + Self { + version: Int64Builder::with_capacity(capacity), + // Most of the time, there is only 1 base_uri + base_uri: StringDictionaryBuilder::with_capacity(capacity, 1, 20), + path: StringBuilder::with_capacity(capacity, capacity * 50), + file_type: FileTypeArrayBuilder::with_capacity(capacity), + } + } + + pub fn append(&mut self, row: &FileRow) { + self.version.append_value(row.version as i64); + self.base_uri.append_value(&row.base_uri); + self.path.append_value(&row.path); + self.file_type.append_value(row.file_type); + } + + pub fn len(&self) -> usize { + self.version.len() + } + + pub fn finish(mut self) -> Result { + RecordBatch::try_new( + TRACKED_FILES_SCHEMA.clone(), + vec![ + Arc::new(self.version.finish()), + Arc::new(self.base_uri.finish()), + Arc::new(self.path.finish()), + Arc::new(self.file_type.finish()), + ], + ) + .map_err(Into::into) + } +} diff --git a/rust/lance/src/dataset/files/file_types.rs b/rust/lance/src/dataset/files/file_types.rs new file mode 100644 index 00000000000..7c20a81eae5 --- /dev/null +++ b/rust/lance/src/dataset/files/file_types.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// Discriminants are the dictionary keys used in the `tracked_files` output +// schema; they must stay in sync with `FILE_TYPE_DICT_ARRAY` in `arrow.rs`. 
+#[repr(i8)] +#[derive(Debug, Clone, Copy)] +pub enum FileType { + Manifest = 0, + DataFile = 1, + DeletionFile = 2, + TransactionFile = 3, + IndexFile = 4, +} + +impl std::fmt::Display for FileType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::Manifest => "manifest", + Self::DataFile => "data file", + Self::DeletionFile => "deletion file", + Self::TransactionFile => "transaction file", + Self::IndexFile => "index file", + }; + write!(f, "{s}") + } +} + +impl From for i8 { + fn from(file_type: FileType) -> Self { + file_type as Self + } +} diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 11851e8846e..eb165e5f612 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1792,7 +1792,7 @@ impl FileFragment { read_columns: Option>, batch_size: Option, ) -> Result<(Fragment, Schema)> { - let (fragments, schema) = schema_evolution::add_columns_to_fragments( + let (fragments, schema, _) = schema_evolution::add_columns_to_fragments( self.dataset.as_ref(), transforms, read_columns, diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index b10158224f9..9731be0c0eb 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -12,6 +12,7 @@ use lance_file::previous::writer::FileWriter as PreviousFileWriter; use lance_file::version::LanceFileVersion; use lance_file::writer::FileWriterOptions; use lance_io::object_store::ObjectStore; +use lance_io::utils::CachedFileSize; use lance_table::format::{DataFile, Fragment}; use lance_table::io::manifest::ManifestDescribing; use std::borrow::Cow; @@ -165,7 +166,8 @@ impl<'a> FragmentCreateBuilder<'a> { writer.write_batches(batch_chunk.iter()).await?; } - fragment.physical_rows = Some(writer.finish().await? 
as usize); + let write_summary = writer.finish().await?; + fragment.physical_rows = Some(write_summary.num_rows as usize); if matches!(fragment.physical_rows, Some(0)) { return Err(Error::invalid_input("Input data was empty.")); @@ -186,6 +188,7 @@ impl<'a> FragmentCreateBuilder<'a> { fragment.files[0].fields = field_ids; fragment.files[0].column_indices = column_indices; + fragment.files[0].file_size_bytes = CachedFileSize::new(write_summary.size_bytes); progress.complete(&fragment).await?; diff --git a/rust/lance/src/dataset/index.rs b/rust/lance/src/dataset/index.rs index 354cdaf7f86..770c68b89e9 100644 --- a/rust/lance/src/dataset/index.rs +++ b/rust/lance/src/dataset/index.rs @@ -21,6 +21,7 @@ use lance_index::pb::VectorIndexDetails; use lance_index::scalar::lance_format::LanceIndexStore; use lance_table::format::IndexMetadata; use serde::{Deserialize, Serialize}; +use uuid::Uuid; use super::optimize::{IndexRemapper, IndexRemapperOptions}; @@ -121,7 +122,7 @@ impl IndexRemapper for DatasetIndexRemapper { #[async_trait] pub trait LanceIndexStoreExt { /// Create an index store for a new index (will always be absolute with no base id) - fn from_dataset_for_new(dataset: &Dataset, uuid: &str) -> Result + fn from_dataset_for_new(dataset: &Dataset, uuid: &Uuid) -> Result where Self: Sized; @@ -147,8 +148,8 @@ pub(crate) fn dataset_format_version(dataset: &Dataset) -> LanceFileVersion { #[async_trait] impl LanceIndexStoreExt for LanceIndexStore { - fn from_dataset_for_new(dataset: &Dataset, uuid: &str) -> Result { - let index_dir = dataset.indices_dir().join(uuid); + fn from_dataset_for_new(dataset: &Dataset, uuid: &Uuid) -> Result { + let index_dir = dataset.indices_dir().join(uuid.to_string()); let cache = dataset.metadata_cache.file_metadata_cache(&index_dir); let format_version = dataset_format_version(dataset); Ok(Self::with_format_version( @@ -223,7 +224,7 @@ mod tests { let built_index = dataset .create_index_builder(&["vector"], IndexType::Vector, ¶ms) 
.name("vector_idx".to_string()) - .index_uuid(first_segment_uuid.to_string()) + .index_uuid(first_segment_uuid) .execute_uncommitted() .await .unwrap(); diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index 4fbefcd4725..ceebe456bbf 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -243,4 +243,198 @@ mod tests { Err(Error::RetryableCommitConflict { .. }) )); } + + /// With more than one index on the table, remapping every index must catch + /// all of them up so the reuse index can be trimmed. + /// + /// Regression: `remap_column_index` used to decide whether to remap an + /// index's data from the presence of the old fragments in its fragment + /// bitmap. But `load_indices` coverage-remaps the bitmap onto the new + /// fragments in memory, and remapping the *first* index commits a manifest + /// that persists that cleaned bitmap for the others — so remapping the + /// remaining indexes became a silent no-op (their data was never remapped + /// and their `dataset_version` never advanced), and the reuse index could + /// never be trimmed. 
+ #[tokio::test] + async fn test_cleanup_frag_reuse_index_multiple_indices() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .col("j", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + for col in ["i", "j"] { + dataset + .create_index( + &[col], + IndexType::Scalar, + Some(format!("{col}_idx")), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + } + + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 1); + + for col in ["i", "j"] { + remapping::remap_column_index(&mut dataset, &[col], Some(format!("{col}_idx"))) + .await + .unwrap(); + } + + // Every index must now be caught up (data remapped, version advanced). + let indices = dataset.load_indices().await.unwrap(); + for col in ["i", "j"] { + let index = indices + .iter() + .find(|idx| idx.name == format!("{col}_idx")) + .unwrap(); + assert!( + is_index_remap_caught_up(&frag_reuse_details.versions[0], index).unwrap(), + "index {col}_idx was not caught up after remap" + ); + } + + // ... so the reuse index trims down to zero versions. 
+ cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 0); + + // Data correctness, not just version bookkeeping: with the reuse index + // trimmed there is no auto-remap safety net, so each index must resolve + // to LIVE rows. An index whose data was not actually remapped (e.g. one + // whose bitmap was coverage-remapped by a sibling's commit before its + // own data remap) points at compacted-away fragments and errors on take. + use futures::TryStreamExt; + for col in ["i", "j"] { + let rows: usize = dataset + .scan() + .filter(&format!("{col} >= 2000 AND {col} < 3000")) + .unwrap() + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum(); + assert_eq!( + rows, 1000, + "index {col}_idx must resolve to live rows after remap+trim" + ); + } + } + + /// When the reuse index has accumulated several versions, a single remap + /// must compose them and rebuild + commit the index exactly ONCE, not once + /// per version. + #[tokio::test] + async fn test_remap_index_batches_multiple_reuse_versions() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(8), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("i_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Accumulate multiple reuse versions: each round deletes a prefix, which + // shrinks fragments below target and forces another deferred compaction. 
+ let options = CompactionOptions { + target_rows_per_fragment: 4_000, + defer_index_remap: true, + ..Default::default() + }; + for round in 0..4 { + dataset + .delete(&format!("i < {}", 1_000 * (round + 1))) + .await + .unwrap(); + compact_files(&mut dataset, options.clone(), None) + .await + .unwrap(); + } + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let num_versions = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(); + assert!( + num_versions >= 2, + "test needs multiple reuse versions to exercise batching, got {num_versions}" + ); + + // A single remap must commit exactly once, regardless of version count. + let version_before = dataset.manifest.version; + remapping::remap_column_index(&mut dataset, &["i"], Some("i_idx".into())) + .await + .unwrap(); + let commits = dataset.manifest.version - version_before; + assert_eq!( + commits, 1, + "batched remap must commit once, not once per reuse version ({num_versions})" + ); + + // ... and the reuse index then trims to zero. 
+ cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + assert_eq!( + load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(), + 0 + ); + } } diff --git a/rust/lance/src/dataset/mem_wal.rs b/rust/lance/src/dataset/mem_wal.rs index 5f3bc2ed483..7eaa8ffb83f 100644 --- a/rust/lance/src/dataset/mem_wal.rs +++ b/rust/lance/src/dataset/mem_wal.rs @@ -51,6 +51,7 @@ pub use sharding::{ evaluate_sharding_spec, evaluate_sharding_spec_with_embedded_columns, evaluate_sharding_spec_with_source_columns, }; -pub use wal::{WalAppendResult, WalAppender, WalReadEntry, WalTailer}; +pub use wal::{BatchDurableWatcher, WalAppendResult, WalAppender, WalReadEntry, WalTailer}; pub use write::ShardWriter; pub use write::ShardWriterConfig; +pub use write::WriteResult; diff --git a/rust/lance/src/dataset/mem_wal/api.rs b/rust/lance/src/dataset/mem_wal/api.rs index ec78ff89c72..79184c13ec8 100644 --- a/rust/lance/src/dataset/mem_wal/api.rs +++ b/rust/lance/src/dataset/mem_wal/api.rs @@ -26,7 +26,7 @@ use crate::index::mem_wal::{load_mem_wal_index_details, new_mem_wal_index_meta}; use super::ShardWriterConfig; use super::scanner::flushed_cache::open_flushed_dataset; -use super::scanner::{FlushedMemTableCache, ShardSnapshot}; +use super::scanner::{DatasetCache, ShardSnapshot}; use super::write::MemIndexConfig; use super::write::ShardWriter; @@ -500,7 +500,7 @@ pub trait DatasetMemWalExt { async fn prewarm_mem_wal( &self, _snapshots: &[ShardSnapshot], - _cache: Option<&Arc>, + _cache: Option<&Arc>, ) -> Result<()> { Ok(()) } @@ -586,7 +586,7 @@ impl DatasetMemWalExt for Dataset { async fn prewarm_mem_wal( &self, snapshots: &[ShardSnapshot], - cache: Option<&Arc>, + cache: Option<&Arc>, ) -> Result<()> { let session = self.session(); // Resolve flushed paths exactly as the LSM collector 
does, so the @@ -601,7 +601,8 @@ impl DatasetMemWalExt for Dataset { snapshot.flushed_generations.iter().map(move |flushed| { let path = format!("{}/_mem_wal/{}/{}", base_path, shard_id, flushed.path); async move { - let dataset = open_flushed_dataset(&path, Some(session), cache).await?; + let dataset = + open_flushed_dataset(&path, Some(session), cache, None).await?; prewarm_all_indexes(&dataset).await } }) @@ -738,7 +739,7 @@ async fn load_vector_index_config( // bake this metric into their on-disk metadata, so a wrong default would // be durable corruption. let distance_type = dataset - .open_vector_index(&column, &index_meta.uuid.to_string(), &NoOpMetricsCollector) + .open_vector_index(&column, &index_meta.uuid, &NoOpMetricsCollector) .await .map_err(|e| { Error::invalid_input(format!( @@ -762,6 +763,7 @@ async fn load_vector_index_config( #[cfg(test)] mod tests { + use super::super::scanner::FlushedMemTableCache; use super::*; use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator}; @@ -831,7 +833,7 @@ mod tests { .with_current_generation(2) .with_flushed_generation(1, folder.to_string()); - let cache = Arc::new(FlushedMemTableCache::new(4)); + let cache: Arc = Arc::new(FlushedMemTableCache::new(4)); base.prewarm_mem_wal(std::slice::from_ref(&snapshot), Some(&cache)) .await .expect("prewarm must open the generation and warm its index"); diff --git a/rust/lance/src/dataset/mem_wal/index.rs b/rust/lance/src/dataset/mem_wal/index.rs index 116ea6c60ce..208971f7be6 100644 --- a/rust/lance/src/dataset/mem_wal/index.rs +++ b/rust/lance/src/dataset/mem_wal/index.rs @@ -18,10 +18,14 @@ mod arena_skiplist; mod btree; mod fts; mod hnsw; +mod pk_key; use std::collections::HashMap; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use datafusion::common::ScalarValue; + use super::memtable::batch_store::StoredBatch; use arrow_array::RecordBatch; use lance_core::datatypes::Schema as LanceSchema; @@ -44,6 +48,32 @@ pub type RowPosition = u64; pub 
use btree::{BTreeIndexConfig, BTreeMemIndex}; pub use fts::{FtsIndexConfig, FtsMemIndex, FtsQueryExpr, SearchOptions}; pub use hnsw::{HnswIndexConfig, HnswMemIndex}; +pub use pk_key::encode_pk_tuple; + +use pk_key::encode_pk_batch; + +/// Synthetic column the composite PK index is keyed on: the order-preserving +/// encoded tuple (see [`encode_pk_tuple`]), stored as `Binary` so a +/// [`BTreeMemIndex`]'s byte backend indexes it directly. +const PK_KEY_COLUMN: &str = "__pk_key__"; + +/// The memtable's primary-key index, used to answer "newest visible version of +/// this key" for dedup. Single-column PKs reuse the column's compact typed +/// [`BTreeMemIndex`] (no second copy); composite PKs key a `BTreeMemIndex` on +/// the order-preserving encoded tuple ([`encode_pk_tuple`]) instead. Either way +/// the lookup is a single seek on one `BTreeMemIndex`. +enum PkIndex { + /// Arity 1: aliases a `btree_indexes` entry, so the insert loop maintains it. + Single(Arc), + /// Arity >= 2: a `BTreeMemIndex` over the encoded-tuple `Binary` key, + /// maintained explicitly in the insert paths (the original batch lacks the + /// synthetic key column). `columns` are the PK columns in order, resolved + /// against each batch's schema at insert time. + Composite { + index: Arc, + columns: Vec, + }, +} // ============================================================================ // Index Store @@ -195,12 +225,17 @@ impl MemIndexConfig { /// therefore safe for scanners to read. Scanners snapshot this at plan /// construction time so every plan keys on a stable MVCC cursor. pub struct IndexStore { - /// BTree indexes keyed by index name. - btree_indexes: HashMap, + /// BTree indexes keyed by index name. `Arc` so the primary-key BTrees can be + /// shared into [`Self::pk_btrees`] without a second copy or a second insert. + btree_indexes: HashMap>, /// HNSW vector indexes keyed by index name. hnsw_indexes: HashMap, /// FTS indexes keyed by index name. 
fts_indexes: HashMap, + /// The primary-key index (single-column or composite), or `None` without a + /// primary key. Queried via [`Self::pk_newest_visible`] (see + /// [`Self::enable_pk_index`]). + pk_index: Option, /// Maximum batch position that is durable in the WAL and therefore /// visible to scanners. Advanced unconditionally after a WAL append /// succeeds; not gated on whether any indexes are configured. @@ -213,6 +248,7 @@ impl Default for IndexStore { btree_indexes: HashMap::new(), hnsw_indexes: HashMap::new(), fts_indexes: HashMap::new(), + pk_index: None, max_visible_batch_position: AtomicUsize::new(0), } } @@ -230,6 +266,16 @@ impl std::fmt::Debug for IndexStore { &self.hnsw_indexes.keys().collect::>(), ) .field("fts_indexes", &self.fts_indexes.keys().collect::>()) + .field( + "pk_index", + &match &self.pk_index { + None => "none".to_string(), + Some(PkIndex::Single(b)) => format!("single({})", b.column_name()), + Some(PkIndex::Composite { columns, .. }) => { + format!("composite({})", columns.join(", ")) + } + }, + ) .field( "max_visible_batch_position", &self.max_visible_batch_position.load(Ordering::Acquire), @@ -264,7 +310,7 @@ impl IndexStore { for config in configs { match config { MemIndexConfig::BTree(c) => { - let index = BTreeMemIndex::new(c.field_id, c.column.clone()); + let index = Arc::new(BTreeMemIndex::new(c.field_id, c.column.clone())); registry.btree_indexes.insert(c.name.clone(), index); } MemIndexConfig::Hnsw(c) => { @@ -293,7 +339,7 @@ impl IndexStore { /// the production memtable path goes through [`Self::from_configs`]. pub fn add_btree(&mut self, name: String, field_id: i32, column: String) { self.btree_indexes - .insert(name, BTreeMemIndex::new(field_id, column)); + .insert(name, Arc::new(BTreeMemIndex::new(field_id, column))); } /// Add an HNSW vector index with default build parameters. 
@@ -362,6 +408,158 @@ impl IndexStore { .insert(name, FtsMemIndex::with_params(field_id, column, params)); } + /// Maintain a primary-key index so the memtable can answer "newest visible + /// version of this key" (see [`Self::pk_newest_visible`]). + /// + /// Single-column PKs reuse an existing BTree on the field, else auto-create + /// one under a `__pk__*` name so the normal insert loop maintains it (no + /// second copy). Composite (arity >= 2) PKs key a `BTreeMemIndex` on the + /// order-preserving encoded tuple (synthetic `PK_KEY_COLUMN`), maintained + /// explicitly in the insert paths. Call once at construction, after + /// [`Self::from_configs`] and before any inserts; a no-op when `pk_columns` + /// is empty. + pub fn enable_pk_index(&mut self, pk_columns: &[(String, i32)]) { + self.pk_index = match pk_columns { + [] => None, + [(column, field_id)] => { + let btree = match self + .btree_indexes + .values() + .find(|b| b.field_id() == *field_id) + { + Some(existing) => existing.clone(), + None => { + let btree = Arc::new(BTreeMemIndex::new(*field_id, column.clone())); + self.btree_indexes + .insert(format!("__pk__{column}"), btree.clone()); + btree + } + }; + Some(PkIndex::Single(btree)) + } + multi => Some(PkIndex::Composite { + // Synthetic field id (-1): the composite index is held directly, + // never resolved by field id. + index: Arc::new(BTreeMemIndex::new(-1, PK_KEY_COLUMN.to_string())), + columns: multi.iter().map(|(c, _)| c.clone()).collect(), + }), + }; + } + + /// Whether the memtable has a primary-key index. + pub fn has_pk_index(&self) -> bool { + self.pk_index.is_some() + } + + /// Sorted `(value, row_id)` training batches for the flushed on-disk PK + /// BTree (the sidecar dedup index). Single-column emits the typed PK value; + /// composite emits the order-preserving `Binary` encoded tuple. Empty when + /// there is no primary key. 
Row positions line up 1:1 with the forward- + /// written data file, so they are the flushed row ids directly. + pub fn pk_training_batches(&self, batch_size: usize) -> Result> { + match &self.pk_index { + None => Ok(Vec::new()), + Some(PkIndex::Single(btree)) => btree.to_training_batches(batch_size), + Some(PkIndex::Composite { index, .. }) => index.to_training_batches(batch_size), + } + } + + /// Resolve the PK columns' positions in `batch` (composite insert helper). + fn pk_batch_indices(batch: &RecordBatch, columns: &[String]) -> Result> { + columns + .iter() + .map(|c| { + batch + .schema() + .column_with_name(c) + .map(|(i, _)| i) + .ok_or_else(|| { + Error::invalid_input(format!("PK column '{c}' not found in batch")) + }) + }) + .collect() + } + + /// Maintain the composite PK index for `batch` (no-op for single/no PK): + /// encode the PK columns into the synthetic `PK_KEY_COLUMN` `Binary` column + /// and feed that to the keyed `BTreeMemIndex`. + fn insert_composite_pk(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { + if let Some(PkIndex::Composite { index, columns }) = &self.pk_index { + let pk_indices = Self::pk_batch_indices(batch, columns)?; + let encoded = encode_pk_batch(batch, &pk_indices)?; + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + PK_KEY_COLUMN, + arrow_schema::DataType::Binary, + false, + )])); + let key_batch = RecordBatch::try_new(schema, vec![Arc::new(encoded)]) + .map_err(|e| Error::invalid_input(e.to_string()))?; + index.insert(&key_batch, row_offset)?; + } + Ok(()) + } + + /// The newest row position of the primary-key tuple `values` (in PK order) + /// visible at `max_visible_row`, or `None`. A single seek either way: + /// single-column probes the typed BTree; composite probes the encoded-tuple + /// index. Collision-free, since `position` is the row identity. 
+ pub fn pk_newest_visible( + &self, + values: &[ScalarValue], + max_visible_row: RowPosition, + ) -> Option { + match &self.pk_index { + None => None, + Some(PkIndex::Single(btree)) => btree.get_newest_visible(&values[0], max_visible_row), + Some(PkIndex::Composite { index, .. }) => { + // An unsupported PK type would have failed at insert, so the + // index can't hold a tuple this fails to encode. The probe key is + // the same `Binary`-encoded tuple the insert path indexed. + let key = encode_pk_tuple(values).ok()?; + index.get_newest_visible(&ScalarValue::Binary(Some(key)), max_visible_row) + } + } + } + + /// Whether `position` is the newest visible row of `values` — the recency + /// check the active index-search arms apply to drop predicate-crossing + /// stale hits. Callers gate on [`Self::has_pk_index`] first, since this is + /// `false` (drop) when the memtable has no primary-key index. + pub fn pk_is_newest( + &self, + values: &[ScalarValue], + position: RowPosition, + max_visible_row: RowPosition, + ) -> bool { + self.pk_newest_visible(values, max_visible_row) == Some(position) + } + + /// Whether `key` has any version visible at `max_visible_row` — the + /// cross-source block-list's existence query, snapshot-bounded so a + /// not-yet-visible write can't shadow an older visible copy. + /// + /// `key` is already in the index's key space: the typed PK value for a + /// single-column key, the `Binary`-encoded tuple for a composite one (built + /// by `block_list::on_disk_pk_key`, the same key the flushed on-disk index is + /// probed with). Both arities forward it straight to the keyed BTree. + pub fn pk_contains_key(&self, key: &ScalarValue, max_visible_row: RowPosition) -> bool { + match &self.pk_index { + None => false, + Some(PkIndex::Single(btree)) | Some(PkIndex::Composite { index: btree, .. }) => { + btree.get_newest_visible(key, max_visible_row).is_some() + } + } + } + + /// Whether the primary-key index holds no rows (or doesn't exist). 
+ pub fn pk_is_empty(&self) -> bool { + match &self.pk_index { + None => true, + Some(PkIndex::Single(btree)) => btree.is_empty(), + Some(PkIndex::Composite { index, .. }) => index.is_empty(), + } + } + /// Insert a batch into all indexes. pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { self.insert_with_batch_position(batch, row_offset, None) @@ -384,6 +582,9 @@ impl IndexStore { for index in self.fts_indexes.values() { index.insert(batch, row_offset)?; } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + self.insert_composite_pk(batch, row_offset)?; // Update global watermark after all indexes have been updated if let Some(bp) = batch_position { @@ -440,6 +641,12 @@ impl IndexStore { } } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -552,6 +759,14 @@ impl IndexStore { .map(|(name, _idx_type, duration)| (name.to_string(), duration)) .collect(); + // Single-column PK aliases a `btree_indexes` entry — its thread above + // already maintained it (and joined). A composite PK has its own + // index; maintain it here before the watermark advances so the + // visible prefix is fully indexed. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -562,7 +777,7 @@ impl IndexStore { /// Get a BTree index by name. 
pub fn get_btree(&self, name: &str) -> Option<&BTreeMemIndex> { - self.btree_indexes.get(name) + self.btree_indexes.get(name).map(Arc::as_ref) } /// Get an HNSW vector index by name. @@ -583,6 +798,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.field_id() == field_id) + .map(Arc::as_ref) } /// Get an HNSW vector index by field ID. @@ -607,6 +823,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.column_name() == column) + .map(Arc::as_ref) } /// Get an HNSW vector index by column name. @@ -694,6 +911,73 @@ mod tests { .unwrap() } + /// Single-column `id` batch for primary-key lookup tests. + fn id_batch(ids: &[i32]) -> RecordBatch { + RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(ids.to_vec()))], + ) + .unwrap() + } + + #[test] + fn pk_newest_visible_single_column() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0)]); + // id=1 at positions 0 and 2 (an update), id=2 at position 1. + store.insert(&id_batch(&[1, 2]), 0).unwrap(); + store.insert(&id_batch(&[1]), 2).unwrap(); + + let one = [ScalarValue::Int32(Some(1))]; + // Watermark above the update sees the newest position; below it, the older. + assert_eq!(store.pk_newest_visible(&one, 5), Some(2)); + assert_eq!(store.pk_newest_visible(&one, 1), Some(0)); + assert!(store.pk_is_newest(&one, 2, 5)); + assert!(!store.pk_is_newest(&one, 0, 5)); + // Absent key (probed by the typed value, as the block-list does). + assert!(!store.pk_contains_key(&ScalarValue::Int32(Some(9)), 5)); + } + + #[test] + fn pk_newest_visible_composite_seeks_encoded_tuple() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0), ("name".to_string(), 1)]); + // Rows: (1,"a")@0, (1,"b")@1, (1,"a")@2 — an update of (1,"a"). 
+ let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 1, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a"])), + ], + ) + .unwrap(); + store.insert(&batch, 0).unwrap(); + + let tuple_1a = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + let tuple_1b = [ScalarValue::Int32(Some(1)), ScalarValue::from("b")]; + // (1,"a")'s newest visible row is its re-write at position 2. + assert_eq!(store.pk_newest_visible(&tuple_1a, 5), Some(2)); + assert!(store.pk_is_newest(&tuple_1a, 2, 5)); + assert!(!store.pk_is_newest(&tuple_1a, 0, 5)); + // (1,"b") only exists at position 1. + assert_eq!(store.pk_newest_visible(&tuple_1b, 5), Some(1)); + // Watermark below the re-write: the older (1,"a")@0 is the newest visible. + assert_eq!(store.pk_newest_visible(&tuple_1a, 1), Some(0)); + // An absent tuple (probed by its Binary-encoded key, as the block-list + // does). + let tuple_2a = [ScalarValue::Int32(Some(2)), ScalarValue::from("a")]; + let key_2a = ScalarValue::Binary(Some(encode_pk_tuple(&tuple_2a).unwrap())); + assert!(!store.pk_contains_key(&key_2a, 5)); + } + #[test] fn test_index_registry() { let schema = create_test_schema(); diff --git a/rust/lance/src/dataset/mem_wal/index/pk_key.rs b/rust/lance/src/dataset/mem_wal/index/pk_key.rs new file mode 100644 index 00000000000..b31fe42c995 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/index/pk_key.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Composite primary-key encoding for MemWAL dedup. +//! +//! A multi-column primary key is reduced to a single order-preserving byte +//! string ([`encode_pk_tuple`]) so the whole tuple is one comparable key: +//! lexicographic byte order equals tuple order, and distinct tuples never +//! collide. 
Encoded as a `Binary` value, the tuple is indexed directly by a +//! [`super::BTreeMemIndex`] (its byte backend) — both in memory and, after +//! flush, as the on-disk BTree's `Binary` value column — so a probe builds +//! `ScalarValue::Binary(key)` and every layer agrees. +//! +//! Single-column primary keys do **not** use this — they key the typed +//! `BTreeMemIndex` on the column value directly. + +use arrow_array::{BinaryArray, RecordBatch}; +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +/// Sign-flip a signed integer to an order-preserving unsigned key (matches the +/// fixed-int BTree backend). Big-endian bytes of the result sort like the value. +#[inline] +fn encode_signed(v: i64) -> u64 { + (v as u64) ^ (1u64 << 63) +} + +/// Append an order-preserving encoding of one non-null byte string: each `0x00` +/// is escaped to `0x00 0xFF`, then a `0x00 0x00` terminator is appended. The +/// terminator sorts before any escaped content, so a prefix orders before its +/// extensions and no value can forge a column boundary. +fn encode_bytes(out: &mut Vec, bytes: &[u8]) { + for &b in bytes { + out.push(b); + if b == 0x00 { + out.push(0xFF); + } + } + out.extend_from_slice(&[0x00, 0x00]); +} + +/// Append the order-preserving encoding of a single PK column value. A leading +/// tag (`0x00` null / `0x01` non-null) makes nulls sort first and keeps the +/// per-column encoding self-delimiting (fixed-width for ints, terminated for +/// bytes), so concatenating columns stays injective and order-preserving. +fn encode_value(out: &mut Vec, value: &ScalarValue) -> Result<()> { + if value.is_null() { + out.push(0x00); + return Ok(()); + } + out.push(0x01); + macro_rules! 
be_signed { + ($v:expr) => { + out.extend_from_slice(&encode_signed($v as i64).to_be_bytes()) + }; + } + match value { + ScalarValue::Int8(Some(v)) => be_signed!(*v), + ScalarValue::Int16(Some(v)) => be_signed!(*v), + ScalarValue::Int32(Some(v)) => be_signed!(*v), + ScalarValue::Int64(Some(v)) => be_signed!(*v), + ScalarValue::Date32(Some(v)) => be_signed!(*v), + ScalarValue::Date64(Some(v)) => be_signed!(*v), + ScalarValue::UInt8(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt16(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt32(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt64(Some(v)) => out.extend_from_slice(&v.to_be_bytes()), + ScalarValue::Boolean(Some(b)) => out.push(*b as u8), + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + encode_bytes(out, s.as_bytes()) + } + ScalarValue::Binary(Some(b)) + | ScalarValue::LargeBinary(Some(b)) + | ScalarValue::FixedSizeBinary(_, Some(b)) => encode_bytes(out, b), + other => { + return Err(Error::invalid_input(format!( + "Unsupported primary-key column type for composite key: {other:?}" + ))); + } + } + Ok(()) +} + +/// Encode a PK tuple (values in PK column order) to one order-preserving key. +pub fn encode_pk_tuple(values: &[ScalarValue]) -> Result> { + let mut out = Vec::with_capacity(values.len() * 9); + for value in values { + encode_value(&mut out, value)?; + } + Ok(out) +} + +/// Encode row `row` of `batch`'s PK columns (at `pk_indices`) to one key. 
+fn encode_pk_row(batch: &RecordBatch, pk_indices: &[usize], row: usize) -> Result> { + let mut out = Vec::with_capacity(pk_indices.len() * 9); + for &col in pk_indices { + let value = ScalarValue::try_from_array(batch.column(col), row)?; + encode_value(&mut out, &value)?; + } + Ok(out) +} + +/// Encode every row of `batch`'s PK columns (at `pk_indices`) into a `Binary` +/// column of order-preserving composite keys — the form a [`super::BTreeMemIndex`] +/// indexes directly (its byte backend), so the composite PK reuses the same +/// index as a single-column one. +pub fn encode_pk_batch(batch: &RecordBatch, pk_indices: &[usize]) -> Result { + let mut keys: Vec> = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keys.push(encode_pk_row(batch, pk_indices, row)?); + } + Ok(BinaryArray::from_iter_values(keys.iter())) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn tuple(a: i32, b: &str) -> Vec { + vec![ScalarValue::Int32(Some(a)), ScalarValue::from(b)] + } + + #[test] + fn encoding_is_order_preserving_and_injective() { + // Sorting tuples by their encoding must match tuple order, and distinct + // tuples must produce distinct bytes. + let tuples = [ + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + tuple(-1, "z"), + ]; + let mut encoded: Vec<(Vec, &Vec)> = tuples + .iter() + .map(|t| (encode_pk_tuple(t).unwrap(), t)) + .collect(); + encoded.sort_by(|x, y| x.0.cmp(&y.0)); + let order: Vec<_> = encoded.iter().map(|(_, t)| (*t).clone()).collect(); + // -1 < 1 < 2; within id=1, "a" < "ab" < "b". + assert_eq!( + order, + vec![ + tuple(-1, "z"), + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + ] + ); + // Injective: 5 distinct tuples → 5 distinct keys. 
+ let mut keys: Vec> = tuples.iter().map(|t| encode_pk_tuple(t).unwrap()).collect(); + keys.sort(); + keys.dedup(); + assert_eq!(keys.len(), 5); + } + + #[test] + fn null_sorts_first_and_is_distinct() { + let null_a = vec![ScalarValue::Int32(None), ScalarValue::from("a")]; + let one_a = tuple(1, "a"); + assert!(encode_pk_tuple(&null_a).unwrap() < encode_pk_tuple(&one_a).unwrap()); + assert_ne!( + encode_pk_tuple(&null_a).unwrap(), + encode_pk_tuple(&one_a).unwrap() + ); + } + + #[test] + fn prefix_safety_with_embedded_zero() { + // A string containing 0x00 must not collide with or sort incorrectly + // against a shorter one (escaping + terminator). + let with_zero = vec![ScalarValue::Binary(Some(vec![0x00]))]; + let empty = vec![ScalarValue::Binary(Some(vec![]))]; + assert!(encode_pk_tuple(&empty).unwrap() < encode_pk_tuple(&with_zero).unwrap()); + } + + #[test] + fn encode_pk_batch_matches_per_tuple_encoding() { + // Each row of the encoded `Binary` column equals `encode_pk_tuple` of + // that row's PK values — so the column a BTreeMemIndex indexes is exactly + // what a probe builds. + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![2, 1])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let encoded = encode_pk_batch(&batch, &[0, 1]).unwrap(); + assert_eq!(encoded.value(0), encode_pk_tuple(&tuple(2, "a")).unwrap()); + assert_eq!(encoded.value(1), encode_pk_tuple(&tuple(1, "b")).unwrap()); + // (1,"b") encodes below (2,"a"). 
+ assert!(encoded.value(1) < encoded.value(0)); + } +} diff --git a/rust/lance/src/dataset/mem_wal/memtable.rs b/rust/lance/src/dataset/mem_wal/memtable.rs index fd23fd6d577..cb95a4ab531 100644 --- a/rust/lance/src/dataset/mem_wal/memtable.rs +++ b/rust/lance/src/dataset/mem_wal/memtable.rs @@ -14,8 +14,8 @@ use std::time::{Duration, Instant}; use arrow_array::{Array, RecordBatch, RecordBatchIterator}; use arrow_schema::Schema as ArrowSchema; use lance_core::datatypes::Schema; +use lance_core::utils::bloomfilter::sbbf::Sbbf; use lance_core::{Error, Result}; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; use tokio::sync::RwLock; use tracing::instrument; use uuid::Uuid; diff --git a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs index f4d4d797acc..054d9b1630e 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs @@ -615,6 +615,22 @@ impl BatchStore { (0..end).collect() } + /// The inclusive maximum visible *row* position at `max_visible_batch_position`, + /// or `None` when no rows are visible. The visible batches are the committed + /// prefix `[0, last_visible_idx]`; each batch carries its cumulative + /// `row_offset`, so this is the end of the last visible batch minus one. + /// Used to bound MVCC seeks against the maintained PK-position index. + pub fn max_visible_row(&self, max_visible_batch_position: usize) -> Option { + let len = self.committed_len.load(Ordering::Acquire); + if len == 0 { + return None; + } + let last_visible_idx = max_visible_batch_position.min(len - 1); + let last = self.get(last_visible_idx)?; + let visible_end = last.row_offset + last.num_rows as u64; // exclusive + visible_end.checked_sub(1) + } + /// Check if a specific batch is visible at a given visibility position. 
#[inline] pub fn is_batch_visible( @@ -910,6 +926,37 @@ mod tests { assert!(!store.is_batch_visible(3, 10)); } + #[test] + fn test_max_visible_row() { + // (1) Empty store: no rows are visible at any position. + let store = BatchStore::with_capacity(10); + assert_eq!(store.max_visible_row(0), None); + assert_eq!(store.max_visible_row(100), None); + + // Three batches → rows [0,10) [10,30) [30,60); row_offsets 0, 10, 30. + store.append(create_test_batch(10)).unwrap(); // position 0 + store.append(create_test_batch(20)).unwrap(); // position 1 + store.append(create_test_batch(30)).unwrap(); // position 2 + + // (2) A position within range yields the inclusive end of that prefix. + assert_eq!(store.max_visible_row(0), Some(9)); // batch 0: 0..10 + assert_eq!(store.max_visible_row(1), Some(29)); // batch 1: 10..30 + assert_eq!(store.max_visible_row(2), Some(59)); // batch 2: 30..60 + + // (3) A position beyond the committed range clamps to the last batch, + // i.e. the inclusive max over all rows. + assert_eq!(store.max_visible_row(100), Some(59)); + + // (4) An empty leading batch contributes no rows: at its own position + // the inclusive end underflows to None, while a later non-empty batch + // is reported correctly. 
+ let store = BatchStore::with_capacity(10); + store.append(create_test_batch(0)).unwrap(); // position 0: rows [0,0) + store.append(create_test_batch(5)).unwrap(); // position 1: rows [0,5) + assert_eq!(store.max_visible_row(0), None); // empty prefix → no rows + assert_eq!(store.max_visible_row(1), Some(4)); // through batch 1 + } + #[test] fn test_recommended_capacity() { // 64MB memtable, 64KB avg batch = 1024 batches * 1.2 = ~1228 diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index 8f0e34db5ec..ebcc06cab44 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -18,7 +18,7 @@ use lance_io::object_store::ObjectStore; use lance_table::format::IndexMetadata; use lance_table::io::commit::write_manifest_file_to_path; use lance_table::io::deletion::write_deletion_file; -use log::info; +use log::{info, warn}; use object_store::ObjectStoreExt; use object_store::path::Path; use roaring::RoaringBitmap; @@ -29,6 +29,7 @@ use super::super::index::MemIndexConfig; use super::super::memtable::MemTable; use crate::Dataset; use crate::dataset::mem_wal::manifest::ShardManifestStore; +use crate::dataset::mem_wal::scanner::GenerationWarmer; use crate::dataset::mem_wal::scanner::exec::{compute_pk_hash, validate_pk_types}; use crate::dataset::mem_wal::util::{flushed_memtable_path, generate_random_hash}; @@ -68,6 +69,9 @@ pub struct MemTableFlusher { base_uri: String, shard_id: Uuid, manifest_store: Arc, + /// When present, each new generation is warmed before it is committed, so + /// the first query sees zero cold reads. `None` => no warming. + warmer: Option>, } impl MemTableFlusher { @@ -84,6 +88,26 @@ impl MemTableFlusher { base_uri: base_uri.into(), shard_id, manifest_store, + warmer: None, + } + } + + /// Attach the warmer fired pre-commit for each new generation. 
+ pub fn with_warmer(mut self, warmer: Option>) -> Self { + self.warmer = warmer; + self + } + + /// Warm a just-written generation before it is committed. Best-effort: a + /// failure is logged and the flush proceeds — warming is never a commit + /// gate. No-op without a warmer. `uri` must be the resolved reader path + /// (`path_to_uri(gen_path)`) so warmed entries key-match later queries. + async fn warm_generation(&self, uri: &str) { + let Some(warmer) = &self.warmer else { + return; + }; + if let Err(e) = warmer.warm(uri).await { + warn!("pre-commit warm failed for generation {uri}; committing cold: {e}"); } } @@ -178,6 +202,16 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Write the standalone primary-key dedup sidecar. A primary key needs + // no secondary index, so this is required on the plain-flush path too — + // the LSM scanner opens it to dedup the generation. (`flush_with_indexes` + // writes it on the indexed path.) No-op when the memtable has no PK. + self.create_pk_index(&gen_path, memtable.indexes()).await?; + + // Warm before commit (zero cold window); no-op without a warmer. + let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -332,7 +366,7 @@ impl MemTableFlusher { async fn write_bloom_filter( &self, path: &Path, - bloom: &lance_index::scalar::bloomfilter::sbbf::Sbbf, + bloom: &lance_core::utils::bloomfilter::sbbf::Sbbf, ) -> Result<()> { let data = bloom.to_bytes(); self.object_store @@ -449,6 +483,10 @@ impl MemTableFlusher { all_indexes.extend(fts_indexes); } + // Write the standalone primary-key dedup index (sidecar, not a manifest + // index — the block-list opens it directly by path). 
+ self.create_pk_index(&gen_path, memtable.indexes()).await?; + // Write a single manifest that records the fragments, the // within-generation deletion vector, and all indexes, overwriting the // data-only v1 manifest created by Dataset::write. @@ -459,6 +497,10 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Warm before commit (zero cold window); no-op without a warmer. + let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -543,6 +585,49 @@ impl MemTableFlusher { Ok(created_indexes) } + /// Write the standalone primary-key dedup index for this generation. + /// + /// Unlike user indexes, this is a **sidecar**: it is not registered in the + /// manifest. The block-list opens it directly by path + /// ([`pk_index_path`]) and probes it with `Equals`. Single-column primary + /// keys index the typed value; composite keys index the order-preserving + /// `Binary` encoded tuple (see [`super::super::index::encode_pk_tuple`]). + /// Row positions line up 1:1 with the forward-written data file, so they are + /// the flushed row ids directly. No-op without a primary-key index. 
+ async fn create_pk_index( + &self, + gen_path: &Path, + mem_indexes: Option<&super::super::index::IndexStore>, + ) -> Result<()> { + use datafusion::physical_plan::SendableRecordBatchStream; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_index::scalar::btree::train_btree_index; + use lance_index::scalar::lance_format::LanceIndexStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let Some(registry) = mem_indexes else { + return Ok(()); + }; + let batches = registry.pk_training_batches(8192)?; + if batches.is_empty() { + return Ok(()); + } + + let schema = batches[0].schema(); + let store = LanceIndexStore::new( + self.object_store.clone(), + pk_index_path(gen_path), + Arc::new(LanceCache::no_cache()), + ); + let stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(batches.into_iter().map(Ok)), + )); + train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) + } + /// Create FTS (Full-Text Search) indexes from in-memory data (uncommitted). /// /// Writes the FTS index files and returns index metadata without committing. @@ -965,21 +1050,30 @@ impl MemTableFlusher { } } -/// Message to trigger flush of a frozen memtable to Lance storage. -pub struct TriggerMemTableFlush { - /// The frozen memtable to flush. - pub memtable: Arc, - /// Optional channel to notify when flush completes. - pub done: Option>>, +/// Message driving the background memtable-flush task. +pub enum TriggerMemTableFlush { + /// Flush a frozen memtable to Lance storage. + Flush { + /// The frozen memtable to flush. + memtable: Arc, + /// Optional channel to notify when flush completes. + done: Option>>, + }, + /// Periodic tick: evict frozen memtables whose post-flush grace has elapsed. 
+ SweepExpired, } impl std::fmt::Debug for TriggerMemTableFlush { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TriggerMemTableFlush") - .field("memtable_gen", &self.memtable.generation()) - .field("memtable_rows", &self.memtable.row_count()) - .field("has_done", &self.done.is_some()) - .finish() + match self { + Self::Flush { memtable, done } => f + .debug_struct("TriggerMemTableFlush::Flush") + .field("memtable_gen", &memtable.generation()) + .field("memtable_rows", &memtable.row_count()) + .field("has_done", &done.is_some()) + .finish(), + Self::SweepExpired => f.write_str("TriggerMemTableFlush::SweepExpired"), + } } } @@ -1139,6 +1233,79 @@ mod tests { assert_eq!(updated_manifest.flushed_generations.len(), 1); } + /// A `GenerationWarmer` that counts calls and optionally fails. + #[derive(Debug)] + struct CountingWarmer { + calls: Arc, + fail: bool, + } + + #[async_trait::async_trait] + impl GenerationWarmer for CountingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + if self.fail { + Err(Error::io("simulated warm failure".to_string())) + } else { + Ok(()) + } + } + } + + /// Warming is a best-effort optimization, never a commit gate: a warmer that + /// errors pre-commit must still let the flush commit the generation. The + /// warm fires exactly once on the pre-commit path. 
+ #[tokio::test] + async fn test_flusher_commits_when_warm_fails() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + let schema = create_test_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![]).unwrap(); + let frag_id = memtable + .insert(create_test_batch(&schema, 10)) + .await + .unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let calls = Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let warmer: Arc = Arc::new(CountingWarmer { + calls: calls.clone(), + fail: true, + }); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path, + base_uri, + shard_id, + manifest_store.clone(), + ) + .with_warmer(Some(warmer)); + // Flush must succeed despite the warmer erroring. + let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + assert_eq!(result.generation.generation, 1); + assert_eq!( + calls.load(std::sync::atomic::Ordering::SeqCst), + 1, + "pre-commit warm fires exactly once" + ); + let updated = manifest_store.read_latest().await.unwrap().unwrap(); + assert_eq!( + updated.flushed_generations.len(), + 1, + "generation still committed after a failed warm" + ); + } + /// Flushing a generation with within-generation duplicate PKs writes a /// deletion vector so the flushed dataset exposes newest-per-PK on scan. #[tokio::test] @@ -1227,6 +1394,202 @@ mod tests { assert_eq!(rows.get(&3), Some(&"c2".to_string())); } + /// Flushing a memtable with a primary-key index writes a standalone sidecar + /// BTree at `{gen}/_pk_index` that the block-list can reopen by path and + /// probe by value — including for a within-gen-superseded PK (existence, + /// not visibility). 
+ #[tokio::test] + async fn flushed_pk_index_sidecar_is_probeable() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + // id=1 updated in-gen (a -> a2); id=2 unique. + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a2"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + let result = flusher + .flush_with_indexes(&memtable, epoch, &[], 1) + .await + .unwrap(); + + // Reopen the sidecar directly by path (the block-list's route). 
+ let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + // Both PKs present (id=1 even though its first version was superseded); + // an absent PK is not. + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + + /// Regression: production dispatches a PK-only flush (a primary key, no + /// secondary index) to `flush`, not `flush_with_indexes`. `flush` must still + /// write the PK dedup sidecar, otherwise cross-generation dedup fails with + /// `page_lookup.lance not found`. 
+ #[tokio::test] + async fn plain_flush_writes_pk_sidecar() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + // The plain-flush path — what the writer dispatches to with no indexes. 
+ let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + /// Covers `finalize_generation` writing both a deletion vector *and* /// indexes into the same manifest — the deletion-only and index-only /// paths are exercised by sibling tests. diff --git a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs index 2c5192e28a1..17fa9c76a65 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs @@ -366,6 +366,14 @@ impl MemTableScanner { self } + /// The `max_visible_batch_position` snapshot this scanner latched at + /// construction. A downstream recency filter must key on this same snapshot + /// (not a fresh read of the IndexStore watermark, which a concurrent append + /// could have advanced) so it stays consistent with the rows the search saw. 
+ pub fn max_visible_batch_position(&self) -> usize { + self.max_visible_batch_position + } + /// Include the _rowaddr column in output. /// /// Same value as _rowid but named for compatibility with LSM scanner. diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs index b1766f8525f..fe14bd82dd8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner.rs @@ -43,12 +43,15 @@ mod point_lookup; mod projection; mod vector_search; +pub use block_list::write_pk_sidecar; pub use builder::LsmScanner; pub use collector::{ ActiveMemTableRef, InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector, }; -pub use data_source::{FlushedGeneration, LsmDataSource, LsmGeneration, ShardSnapshot}; -pub use flushed_cache::FlushedMemTableCache; +pub use data_source::{ + FlushedGeneration, FreshTierWatermark, LsmDataSource, LsmGeneration, ShardSnapshot, +}; +pub use flushed_cache::{DatasetCache, FlushedMemTableCache, GenerationWarmer}; pub use fts_search::{LsmFtsSearchPlanner, SCORE_COLUMN}; pub use point_lookup::LsmPointLookupPlanner; pub use projection::DISTANCE_COLUMN; diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index 684fde48da1..69d16930888 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -3,37 +3,151 @@ //! Per-source block-list construction for LSM vector search. //! -//! A generation's membership is an `Arc>` of PK hashes -//! ([`compute_pk_hash`]), built once (immutable gens cached). Each source gets a -//! `Vec>>` of the newer generations' sets (`NEWER(G)`; base: all -//! of them) — referenced, never merged. The KNN drops candidates whose PK is in -//! any (see [`super::exec::PkHashFilterExec`]). +//! A generation's membership is a [`GenMembership`]: in-memory generations +//! 
(active / frozen) are probed by value against their maintained primary-key +//! index (no per-query set), while flushed generations are probed against their +//! standalone on-disk PK BTree (the sidecar written at flush, opened by path). +//! Probing is batched — [`GenMembership::contains_keys`] tests a whole batch of +//! keys per generation in one pass. Each source gets a `Vec` of +//! the newer generations (`NEWER(G)`; base: all of them); the KNN drops a +//! candidate whose PK any of them contains (see +//! [`super::exec::PkBlockFilterExec`]). //! -//! Cross-generation only: within-gen dups share a hash and fall to the global -//! dedup's `(generation, freshness)` tiebreaker. +//! Cross-generation only: within-gen dups collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. + +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::btree::BTreeIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::{ + IndexStore as ScalarIndexStore, SargableQuery, ScalarIndex, SearchResult, +}; +use uuid::Uuid; -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; +use super::data_source::{FreshTierWatermark, LsmDataSource, LsmGeneration}; +use super::flushed_cache::{DatasetCache, open_flushed_dataset}; +use crate::dataset::mem_wal::index::encode_pk_tuple; +use crate::dataset::mem_wal::util::PK_INDEX_DIR; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; +use crate::session::Session; -use arrow_array::RecordBatch; -use futures::TryStreamExt; -use lance_core::Result; +/// Default-plugin registry, used only to load the standalone PK BTree by its +/// `BTreeIndexDetails` type. Built once. 
+static PK_BTREE_REGISTRY: LazyLock> = + LazyLock::new(IndexPluginRegistry::with_default_plugins); + +/// One newer generation's PK membership, used to decide whether it shadows an +/// older source's row. +#[derive(Clone, Debug)] +pub enum GenMembership { + /// Probe the in-memory memtable's primary-key index, bounded to its visible + /// prefix (so a not-yet-visible write can't shadow an older visible copy). + InMemory { + index_store: Arc, + /// Inclusive visible row watermark; `None` when no rows are visible. + max_visible_row: Option, + }, + /// Probe the flushed generation's standalone on-disk PK BTree. + OnDisk(Arc), +} -use uuid::Uuid; +impl GenMembership { + /// Whether this generation visibly contains the primary `key` — the typed + /// value for a single-column PK, the encoded `Binary` tuple for a composite + /// one (built by [`on_disk_pk_key`]). The same key probes the in-memory + /// BTree and the flushed on-disk BTree, which now share a key space. + pub async fn contains(&self, key: &ScalarValue) -> Result { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))), + Self::OnDisk(index) => { + let result = index + .search(&SargableQuery::Equals(key.clone()), &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string()))?; + Ok(!search_is_empty(&result)) + } + } + } -use super::data_source::{LsmDataSource, LsmGeneration}; -use super::exec::{compute_pk_hash, resolve_pk_indices}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; -use crate::dataset::Dataset; -use crate::dataset::mem_wal::write::BatchStore; -use crate::session::Session; + /// Batched [`Self::contains`]: for each key in `keys`, whether this + /// generation visibly contains it, returned as a mask aligned to `keys`. + /// + /// One probe replaces N. 
The on-disk arm issues a single + /// [`BTreeIndex::contains_keys`] (no per-key `SearchResult` allocation); the + /// in-memory arm maps the sync, allocation-free PK lookup over the slice. + /// Keys are in the index's key space (see [`on_disk_pk_key`]). + pub async fn contains_keys(&self, keys: &[ScalarValue]) -> Result> { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(keys + .iter() + .map(|key| max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))) + .collect()), + Self::OnDisk(index) => { + // The flushed PK sidecar is always a BTree (built via + // `PK_BTREE_REGISTRY`); downcast to reach the batched probe. + let btree = index.as_any().downcast_ref::().ok_or_else(|| { + Error::io("flushed PK dedup index is not a BTree".to_string()) + })?; + btree + .contains_keys(keys, &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string())) + } + } + } + + /// Whether this generation has no (visible) membership — used to skip adding + /// an empty blocked set. A flushed generation always has rows (flush rejects + /// an empty memtable), so it is never empty. + fn is_empty(&self) -> bool { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => max_visible_row.is_none() || index_store.pk_is_empty(), + Self::OnDisk(_) => false, + } + } +} -/// Per-source blocked PK-hash sets, keyed by `(shard_id, generation)`. Each -/// value is the membership sets of the generations newer than that source. -pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>>>; +/// Whether a scalar search returned no rows (existence test for the block-list). +fn search_is_empty(result: &SearchResult) -> bool { + match result { + SearchResult::Exact(set) | SearchResult::AtMost(set) | SearchResult::AtLeast(set) => { + set.is_empty() + } + } +} -/// A shard's generations paired with their PK-hash membership, before sorting. 
-type ShardGenSets = HashMap>)>>; +/// The probe key for the on-disk PK BTree: a single-column PK indexes its typed +/// value directly; a composite PK indexes the order-preserving encoded tuple as +/// `Binary` (matching what flush wrote — see [`encode_pk_tuple`]). +pub fn on_disk_pk_key(values: &[ScalarValue]) -> Result { + match values { + [single] => Ok(single.clone()), + _ => Ok(ScalarValue::Binary(Some(encode_pk_tuple(values)?))), + } +} + +/// Per-source blocked memberships, keyed by `(shard_id, generation)`. Each value +/// is the memberships of the generations newer than that source. +pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>; + +/// A shard's generations paired with their membership, before sorting. +type ShardGenSets = HashMap>; /// Per-source `NEWER(G)`, keyed by `(shard_id, generation)`. Generations are /// per-shard, so a source is superseded only by strictly-newer generations of @@ -42,59 +156,64 @@ type ShardGenSets = HashMap>)>>; /// Only superseded sources get an entry; the newest of each shard never does. pub async fn compute_source_block_lists( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, + flushed_cache: Option<&Arc>, ) -> Result { - // Hash each non-base source's membership, grouped by shard (generations are + // Membership per non-base source, grouped by shard (generations are // per-shard, so supersession is within-shard only). let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; + // Flushed PK-BTree opens are cold S3 reads; overlap them with + // `try_join_all`. Order is irrelevant — gens are sorted per-shard below. + let mut flushed_loads = Vec::new(); for source in sources { match source { LsmDataSource::BaseTable { .. } => has_base = true, LsmDataSource::ActiveMemTable { batch_store, + index_store, shard_id, generation, .. 
} => { - let hashes = Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?); + let membership = in_memory_membership(batch_store, index_store); by_shard .entry(*shard_id) .or_default() - .push((*generation, hashes)); + .push((*generation, membership)); } LsmDataSource::FlushedMemTable { path, shard_id, generation, .. - } => { - // Cached by immutable path so repeated searches skip the PK scan. - let hashes = flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?; - by_shard - .entry(*shard_id) - .or_default() - .push((*generation, hashes)); - } + } => flushed_loads.push(async move { + let index = open_pk_index(path, session, flushed_cache).await?; + Ok::<_, Error>((*shard_id, *generation, GenMembership::OnDisk(index))) + }), } } + for (shard_id, generation, membership) in futures::future::try_join_all(flushed_loads).await? { + by_shard + .entry(shard_id) + .or_default() + .push((generation, membership)); + } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. - let mut base_blocked: Vec>> = Vec::new(); + let mut base_blocked: Vec = Vec::new(); for (shard, mut gens) in by_shard { // Newest-first: a gen's blocked list is its own shard's newer gens. 
gens.sort_by_key(|(generation, _)| std::cmp::Reverse(*generation)); - let mut newer: Vec>> = Vec::new(); - for (generation, hashes) in gens { + let mut newer: Vec = Vec::new(); + for (generation, membership) in gens { if !newer.is_empty() { blocked.insert((Some(shard), generation), newer.clone()); } - if !hashes.is_empty() { - base_blocked.push(hashes.clone()); - newer.push(hashes); + if !membership.is_empty() { + base_blocked.push(membership.clone()); + newer.push(membership); } } } @@ -104,260 +223,355 @@ pub async fn compute_source_block_lists( Ok(blocked) } -/// The fresh-tier block-list: one membership set per generation that shadows the -/// base table — active + frozen memtables (hashed now) and flushed generations -/// (from the cache). Same `Vec>>` shape the vector-search filter -/// consumes; a base/external reader can drop any row whose PK is in one of them. -/// The base source, if present, is skipped (it is what gets shadowed). +/// The fresh-tier block-list: one [`GenMembership`] per generation that shadows +/// the base table — active + frozen memtables (probed against their index) and +/// flushed generations (probed against their on-disk PK BTree). A base/external +/// reader can test any PK against these (via [`GenMembership::contains`]) to +/// decide whether the fresh tier shadows it. The base source, if present, is +/// skipped (it is what gets shadowed). +/// +/// When `watermarks` carries a watermark for a source's shard, membership is +/// bounded to it (see [`FreshTierWatermark`]): higher generations are excluded, +/// the active generation is bounded to its first `active_batch_count` batches, +/// and lower generations (frozen and flushed) are immutable and included whole. +/// A shard absent from `watermarks` (or `watermarks == None`) uses the live tier. 
pub async fn fresh_tier_block_list( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, -) -> Result>>> { - let mut sets = Vec::new(); + flushed_cache: Option<&Arc>, + watermarks: Option<&HashMap>, +) -> Result> { + // Membership per source, in source order (`None` = skipped). Flushed + // PK-BTree opens are cold S3 reads, so collect them tagged with their slot + // and overlap with `try_join_all` rather than opening one at a time. + let mut slots: Vec> = Vec::with_capacity(sources.len()); + let mut flushed_loads = Vec::new(); for source in sources { - let set = match source { - LsmDataSource::BaseTable { .. } => continue, - LsmDataSource::ActiveMemTable { batch_store, .. } => { - Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?) + match source { + LsmDataSource::BaseTable { .. } => slots.push(None), + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + shard_id, + generation, + .. + } => { + let membership = match watermarks.and_then(|m| m.get(shard_id)) { + None => Some(in_memory_membership(batch_store, index_store)), + Some(watermark) => { + let g = generation.as_u64(); + if g > watermark.active_generation { + // Rolled in after the snapshot; the arm never saw it. + None + } else if g == watermark.active_generation { + // Bound the active generation to the batches the arm saw. + Some(bounded_in_memory_membership( + batch_store, + index_store, + watermark.active_batch_count, + )) + } else { + // Lower (frozen) generations are immutable — include all. + Some(in_memory_membership(batch_store, index_store)) + } + } + }; + slots.push(membership); } - LsmDataSource::FlushedMemTable { path, .. } => { - flushed_pk_hashes(path, pk_columns, session, flushed_cache).await? + LsmDataSource::FlushedMemTable { + path, + shard_id, + generation, + .. + } => { + // A generation at or above the active one was flushed after the + // snapshot; exclude it. Lower generations are immutable. 
The + // `==` case is the active generation flushed between the two + // reads: excluding the flushed copy loses nothing, since its + // rows are already captured by the in-memory arm above (bounded + // to `active_batch_count`). + let flushed_after_snapshot = watermarks + .and_then(|m| m.get(shard_id)) + .is_some_and(|watermark| generation.as_u64() >= watermark.active_generation); + if flushed_after_snapshot { + slots.push(None); + } else { + let slot = slots.len(); + slots.push(None); + flushed_loads.push(async move { + let index = open_pk_index(path, session, flushed_cache).await?; + Ok::<_, Error>((slot, GenMembership::OnDisk(index))) + }); + } } - }; - if !set.is_empty() { - sets.push(set); } } - Ok(sets) + for (slot, membership) in futures::future::try_join_all(flushed_loads).await? { + slots[slot] = Some(membership); + } + Ok(slots + .into_iter() + .flatten() + .filter(|membership| !membership.is_empty()) + .collect()) } -/// Hash the PK membership of an in-memory memtable (active or frozen) from its -/// committed `BatchStore` rows. -pub fn pk_hashes_from_batch_store( - store: &BatchStore, - pk_columns: &[String], -) -> Result> { - let mut batches: Vec = Vec::with_capacity(store.len()); - for i in 0..store.len() { - if let Some(stored) = store.get(i) { - batches.push(stored.data.clone()); - } +/// Cross-source membership of an in-memory (active / frozen) memtable: a +/// snapshot-bounded probe of its maintained primary-key index. A memtable +/// without a primary-key index can't be probed, so it blocks nothing — the +/// production vector-search path always enables the index. 
+fn in_memory_membership( + batch_store: &Arc, + index_store: &Arc, +) -> GenMembership { + let max_visible_row = batch_store.max_visible_row(index_store.max_visible_batch_position()); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - pk_hashes_from_batches(&batches, pk_columns) } -/// Hash every row's primary key across `batches` into a membership set. -fn pk_hashes_from_batches(batches: &[RecordBatch], pk_columns: &[String]) -> Result> { - let mut pk_hashes = HashSet::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row_idx in 0..batch.num_rows() { - pk_hashes.insert(compute_pk_hash(batch, &pk_indices, row_idx)); - } +/// As-of variant of [`in_memory_membership`] for the active generation under a +/// watermark: bounds visibility to the first `batch_count` batches — those a +/// prior scan observed before the memtable grew. A later append lands at a +/// higher row position and is excluded by the probe, so it can't shadow a base +/// row whose replacement the scan never delivered. `batch_count == 0` leaves the +/// membership empty. +fn bounded_in_memory_membership( + batch_store: &Arc, + index_store: &Arc, + batch_count: u64, +) -> GenMembership { + let max_visible_row = batch_count + .checked_sub(1) + .and_then(|last_batch| batch_store.max_visible_row(last_batch as usize)); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - Ok(pk_hashes) } -/// Build (or fetch the cached) PK-hash membership for one flushed generation. -/// Cached by immutable path (single-flight); the build scans the flushed -/// dataset's PK columns. -async fn flushed_pk_hashes( +/// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed +/// generation. 
Reuses the flushed dataset's (session-configured) object store +/// and **its index cache**, then loads the sidecar directly by path through the +/// BTree plugin — it is not a manifest index. The opened index and its pages +/// are cached in the session's index cache (keyed by the immutable flushed +/// path), so repeated probes reuse them with no separate cache path and no +/// upfront scan; concurrent first-opens may each load before the cache fills. +/// A stable cache UUID for a non-manifest index identified only by its path. +/// +/// `DSIndexCache::for_index` keys by `&Uuid`, but the flushed PK sidecar has no +/// manifest UUID — its identity is its immutable path. Derive a deterministic +/// UUID from the path so the cache namespace is per-path and stable across +/// probes (the `uuid` crate lacks the `v5` "name-based" feature here, so hash to +/// a `u128` instead). +fn path_cache_uuid(path: &str) -> Uuid { + use std::hash::{Hash, Hasher}; + let mut lo = std::collections::hash_map::DefaultHasher::new(); + path.hash(&mut lo); + let mut hi = std::collections::hash_map::DefaultHasher::new(); + // Seed the high half differently so it never equals the low half. + "lance/flushed-pk-index".hash(&mut hi); + path.hash(&mut hi); + Uuid::from_u128(((hi.finish() as u128) << 64) | lo.finish() as u128) +} + +async fn open_pk_index( path: &str, - pk_columns: &[String], session: Option<&Arc>, - flushed_cache: Option<&Arc>, -) -> Result>> { - match flushed_cache { - Some(cache) => { - let build_cache = cache.clone(); - let build_path = path.to_string(); - let build_session = session.cloned(); - let build_pk = pk_columns.to_vec(); - cache - .get_or_build_pk_hashes( - path, - // `Box::pin` keeps this build future off the caller's future - // (avoids `clippy::large_futures`). 
- Box::pin(async move { - let dataset = open_flushed_dataset( - &build_path, - build_session.as_ref(), - Some(&build_cache), - ) - .await?; - scan_pk_hashes(&dataset, &build_pk).await - }), - ) - .await - } - None => { - let dataset = open_flushed_dataset(path, session, None).await?; - Ok(Arc::new(scan_pk_hashes(&dataset, pk_columns).await?)) - } + flushed_cache: Option<&Arc>, +) -> Result> { + let dataset = open_flushed_dataset(path, session, flushed_cache, None).await?; + // Namespace the session index cache by the (immutable) flushed path so this + // sidecar's pages live alongside every other index instead of a bespoke + // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse. + let index_cache = dataset.index_cache.for_index(&path_cache_uuid(path), None); + let index_dir = dataset.base.clone().join(PK_INDEX_DIR); + let store: Arc = Arc::new(LanceIndexStore::new( + dataset.object_store.clone(), + index_dir, + Arc::new(index_cache.clone()), + )); + + let plugin = PK_BTREE_REGISTRY.get_plugin_by_name("BTree")?; + // Cache the opened index in the session cache (mirrors `open_scalar_index`). + if let Some(index) = plugin + .get_from_cache(store.clone(), None, &index_cache) + .await? + { + return Ok(index); } + let details = prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()) + .map_err(|e| Error::io(e.to_string()))?; + let index = plugin + .load_index(store, &details, None, &index_cache) + .await?; + plugin.put_in_cache(&index_cache, index.clone()).await?; + Ok(index) } -/// Scan a dataset's PK columns and fold them into a membership set, one batch -/// resident at a time (no full PK-column buffer). 
-async fn scan_pk_hashes(dataset: &Dataset, pk_columns: &[String]) -> Result> { - let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); - let mut scanner = dataset.scan(); - scanner.project(&pk_refs)?; - let mut stream = scanner.try_into_stream().await?; - let mut hashes = HashSet::new(); - while let Some(batch) = stream.try_next().await? { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(&batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row in 0..batch.num_rows() { - hashes.insert(compute_pk_hash(&batch, &pk_indices, row)); - } +/// Write a flushed generation's standalone PK sidecar at `{uri}/_pk_index` from +/// `batches`, mirroring what flush does in production. `pk_columns` are the +/// primary-key column names (field ids are synthesized by position — `insert` +/// resolves columns by name). A no-op when no batch carries the PK columns. +/// +/// Used by Rust scanner tests and by the Python test-support binding to stage +/// faithful flushed generations (a flushed dataset alone, with no sidecar, is +/// not a state production ever produces). 
+pub async fn write_pk_sidecar( + uri: &str, + batches: &[arrow_array::RecordBatch], + pk_columns: &[&str], +) -> Result<()> { + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_core::cache::LanceCache; + use lance_index::scalar::btree::train_btree_index; + use lance_io::object_store::ObjectStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let pk: Vec<(String, i32)> = pk_columns + .iter() + .enumerate() + .map(|(i, c)| (c.to_string(), i as i32)) + .collect(); + let mut index = IndexStore::new(); + index.enable_pk_index(&pk); + let mut offset = 0u64; + for batch in batches { + index.insert(batch, offset)?; + offset += batch.num_rows() as u64; + } + + let training = index.pk_training_batches(8192)?; + if training.is_empty() { + return Ok(()); } - Ok(hashes) + let schema = training[0].schema(); + let (object_store, base_path) = ObjectStore::from_uri(uri).await?; + let store = LanceIndexStore::new( + object_store, + pk_index_path(&base_path), + Arc::new(LanceCache::no_cache()), + ); + let stream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(training.into_iter().map(Ok)), + )); + // `train_btree_index` now returns the written index files; the sidecar + // writer only needs success/failure. 
+ train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) } #[cfg(test)] mod tests { use super::*; - use arrow_array::Int32Array; + use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; + use crate::dataset::mem_wal::write::IndexStore; + use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; use std::sync::Arc; + use uuid::Uuid; fn id_batch(ids: &[i32]) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() } - /// Hash a single Int32 `id` PK the way the planner does, so a test can probe - /// a returned blocked set by value. - fn hash_id(id: i32) -> u64 { - let batch = id_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - /// Whether `id`'s PK hash is blocked by any of a source's newer-gen sets. - fn blocks(sets: &[Arc>], id: i32) -> bool { - sets.iter().any(|s| s.contains(&hash_id(id))) - } - - #[test] - fn pk_hashes_collapse_within_gen_duplicates() { - // Two rows share pk=1 (a within-gen duplicate); pk=2 is unique. - let hashes = pk_hashes_from_batches(&[id_batch(&[1, 2, 1])], &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 2); // distinct pks: 1, 2 + /// An active/frozen memtable source whose PK index holds one row per id in + /// `ids` (positions 0..n), all committed and visible. 
+ fn active_source(shard: Uuid, generation: u64, ids: &[i32]) -> LsmDataSource { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = id_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + LsmDataSource::ActiveMemTable { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: id_batch(&[1]).schema(), + shard_id: shard, + generation: LsmGeneration::memtable(generation), + } } - #[test] - fn empty_batches_yield_empty_membership() { - let hashes = pk_hashes_from_batches(&[id_batch(&[])], &["id".to_string()]).unwrap(); - assert!(hashes.is_empty()); + /// Whether `id`'s PK is blocked by any of a source's newer-gen memberships. + async fn blocks(memberships: &[GenMembership], id: i32) -> bool { + let key = on_disk_pk_key(&[ScalarValue::Int32(Some(id))]).unwrap(); + for m in memberships { + if m.contains(&key).await.unwrap() { + return true; + } + } + false } #[test] - fn batch_store_membership_collapses_within_gen_dups() { - let store = BatchStore::with_capacity(8); - // Two single-row batches, both pk=1 (a within-gen update). - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[1])).unwrap(); - // A two-row batch: pk=2, pk=3. - store.append(id_batch(&[2, 3])).unwrap(); - - let hashes = pk_hashes_from_batch_store(&store, &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 3); // distinct pks: 1, 2, 3 + fn on_disk_key_is_typed_for_single_and_binary_for_composite() { + // Single-column → the typed value; composite → encoded Binary. 
+ let single = [ScalarValue::Int32(Some(7))]; + assert_eq!( + on_disk_pk_key(&single).unwrap(), + ScalarValue::Int32(Some(7)) + ); + let composite = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + assert!(matches!( + on_disk_pk_key(&composite).unwrap(), + ScalarValue::Binary(Some(_)) + )); } #[tokio::test] - async fn fresh_tier_block_list_one_set_per_in_memory_gen() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - + async fn fresh_tier_block_list_one_membership_per_in_memory_gen() { let shard = Uuid::new_v4(); - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; // Active gen 2: pk=1,2. Frozen gen 1: pk=3. - let sources = vec![mk(&[1, 2], 2), mk(&[3], 1)]; + let sources = vec![ + active_source(shard, 2, &[1, 2]), + active_source(shard, 1, &[3]), + ]; - let sets = fresh_tier_block_list(&sources, &["id".to_string()], None, None) + let memberships = fresh_tier_block_list(&sources, None, None, None) .await .unwrap(); - // One set per generation; together they cover pk=1,2,3 (not 4). - assert_eq!(sets.len(), 2); + // One membership per generation; together they cover pk=1,2,3 (not 4). 
+ assert_eq!(memberships.len(), 2); for id in [1, 2, 3] { - assert!(blocks(&sets, id)); + assert!(blocks(&memberships, id).await); } - assert!(!blocks(&sets, 4)); + assert!(!blocks(&memberships, 4).await); } #[tokio::test] async fn block_lists_suppress_stale_across_in_memory_gens() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - let shard = Uuid::new_v4(); - let mk = |batches: &[&[i32]], generation: u64| { - let store = BatchStore::with_capacity(8); - for ids in batches { - store.append(id_batch(ids)).unwrap(); - } - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Frozen gen 1: stale pk=1. - // Active gen 2: pk=1 re-written, pk=2 new. - let sources = vec![mk(&[&[1]], 1), mk(&[&[1], &[2]], 2)]; + // Frozen gen 1: stale pk=1. Active gen 2: pk=1 re-written, pk=2 new. + let sources = vec![ + active_source(shard, 1, &[1]), + active_source(shard, 2, &[1, 2]), + ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // The newer active write supersedes the frozen copy: gen 1 is blocked on // pk=1, so its KNN drops pk=1. - assert!(blocks(&blocked[&(Some(shard), g1)], 1)); + assert!(blocks(&blocked[&(Some(shard), g1)], 1).await); // The active (newest) generation is superseded by nothing — no entry. 
assert!(!blocked.contains_key(&(Some(shard), g2))); } #[tokio::test] async fn block_lists_suppress_stale_base_row() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; use crate::dataset::{Dataset, WriteParams}; use arrow_array::RecordBatchIterator; - use uuid::Uuid; // Base (gen 0): pk=1 (stale), pk=3 (live). let base_batch = id_batch(&[1, 3]); @@ -372,89 +586,239 @@ mod tests { ); // Active gen 1: pk=1 re-written, pk=2 new. - let store = BatchStore::with_capacity(8); - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[2])).unwrap(); - let sources = vec![ LsmDataSource::BaseTable { dataset: base }, - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema, - shard_id: Uuid::new_v4(), - generation: LsmGeneration::memtable(1), - }, + active_source(Uuid::new_v4(), 1, &[1, 2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); // Base is blocked by every newer gen: pk=1 (re-written in gen 1) is - // blocked, pk=3 (base-only) is not. End-to-end drop: vector_search specs. + // blocked, pk=3 (base-only) is not. let base_blocked = blocked .get(&(None, LsmGeneration::BASE_TABLE)) .expect("base has a blocked set"); - assert!(blocks(base_blocked, 1)); - assert!(!blocks(base_blocked, 3)); + assert!(blocks(base_blocked, 1).await); + assert!(!blocks(base_blocked, 3).await); } #[tokio::test] async fn block_lists_are_keyed_per_shard() { // Regression: generations are per-shard, so a source must only be blocked - // by newer generations of its OWN shard. A generation-only key would - // cross-block same-generation sources from different shards. 
- use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - - let mk = |shard: Uuid, ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). - // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). + // by newer generations of its OWN shard. let a = Uuid::new_v4(); let b = Uuid::new_v4(); + // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). + // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). let sources = vec![ - mk(a, &[1], 1), - mk(a, &[1], 2), - mk(b, &[2], 1), - mk(b, &[2], 2), + active_source(a, 1, &[1]), + active_source(a, 2, &[1]), + active_source(b, 1, &[2]), + active_source(b, 2, &[2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // Each shard's gen 1 is blocked by its OWN gen 2 only. - assert!(blocks(&blocked[&(Some(a), g1)], 1)); - assert!(!blocks(&blocked[&(Some(a), g1)], 2)); - assert!(blocks(&blocked[&(Some(b), g1)], 2)); - assert!(!blocks(&blocked[&(Some(b), g1)], 1)); + assert!(blocks(&blocked[&(Some(a), g1)], 1).await); + assert!(!blocks(&blocked[&(Some(a), g1)], 2).await); + assert!(blocks(&blocked[&(Some(b), g1)], 2).await); + assert!(!blocks(&blocked[&(Some(b), g1)], 1).await); // The newest generation of each shard is superseded by nothing. 
assert!(!blocked.contains_key(&(Some(a), g2))); assert!(!blocked.contains_key(&(Some(b), g2))); } + + #[tokio::test] + async fn index_membership_is_snapshot_bounded() { + // The index-sourced membership only counts a PK whose version is visible + // at the source's watermark, so a newer generation's not-yet-visible + // write can't shadow an older generation's visible copy. + let shard = Uuid::new_v4(); + let schema = id_batch(&[1]).schema(); + + // Older frozen gen 1: pk=1. + let g1 = active_source(shard, 1, &[1]); + + // Newer active gen 2: pk=99 visible at position 0, then pk=1 written at + // position 1 but with the watermark left at batch 0 (so pk=1 is in the + // index yet not visible) — the concurrent-write race. + let g2_store = BatchStore::with_capacity(8); + let mut g2_index = IndexStore::new(); + g2_index.enable_pk_index(&[("id".to_string(), 0)]); + let b0 = id_batch(&[99]); + let (bp0, off0, _) = g2_store.append(b0.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b0, off0, Some(bp0)) // advances watermark to 0 + .unwrap(); + let b1 = id_batch(&[1]); + let (_, off1, _) = g2_store.append(b1.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b1, off1, None) // index updated, watermark unchanged + .unwrap(); + let g2 = LsmDataSource::ActiveMemTable { + batch_store: Arc::new(g2_store), + index_store: Arc::new(g2_index), + schema, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }; + + let blocked = Box::pin(compute_source_block_lists(&[g1, g2], None, None)) + .await + .unwrap(); + + let g1_block = &blocked[&(Some(shard), LsmGeneration::memtable(1))]; + // pk=99 is visible in gen 2 → it blocks gen 1's pk=99. + assert!(blocks(g1_block, 99).await); + // pk=1's only gen-2 copy is not yet visible → it must NOT shadow gen 1. 
+ assert!( + !blocks(g1_block, 1).await, + "a not-yet-visible newer write must not shadow an older visible copy" + ); + } + + /// A fresh-tier watermark bounds the active generation to the first + /// `active_batch_count` batches — those the arm observed before the memtable + /// grew. A later append is invisible, so a base row is never dropped without + /// the arm having delivered its replacement. + #[tokio::test] + async fn fresh_tier_watermark_bounds_active_memtable_by_batch_count() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // Three single-row batches: pk=1 at batch 0, pk=2 at batch 1, pk=3 at + // batch 2 (appended after the arm). + let sources = vec![active_source(shard, 1, &[1, 2, 3])]; + + // Watermark at 2 batches of gen 1: pk=1,2 are members; pk=3 (batch 2) is not. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: 2, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); + assert!(blocks(&sets, 2).await); + assert!(!blocks(&sets, 3).await); + + // No watermark → live tier: all three are members. + let sets = fresh_tier_block_list(&sources, None, None, None) + .await + .unwrap(); + for id in [1, 2, 3] { + assert!(blocks(&sets, id).await); + } + } + + /// A generation above the active one rolled in after the snapshot and is + /// excluded whole; a lower one is immutable (frozen) and included whole + /// regardless of the active batch count. + #[tokio::test] + async fn fresh_tier_watermark_excludes_newer_gen_includes_lower_gen() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // gen 3 newer (after snapshot), gen 2 == active (bounded to 1 batch), + // gen 1 lower/immutable (whole). 
Each id is its own batch. + let sources = vec![ + active_source(shard, 3, &[100]), + active_source(shard, 2, &[20, 21]), + active_source(shard, 1, &[1, 2]), + ]; + + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: 1, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); // gen 1, whole + assert!(blocks(&sets, 2).await); // gen 1, whole + assert!(blocks(&sets, 20).await); // gen 2, batch 0 + assert!(!blocks(&sets, 21).await); // gen 2, batch 1 — past the watermark + assert!(!blocks(&sets, 100).await); // gen 3 — after the snapshot + } + + /// A flushed generation at or above the active generation was produced by a + /// flush after the snapshot and is excluded; one strictly below it is + /// immutable and included. + #[tokio::test] + async fn fresh_tier_watermark_excludes_flushed_at_or_above_active() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use crate::dataset::{Dataset, WriteParams}; + use arrow_array::RecordBatchIterator; + use std::collections::HashMap; + + // A flushed generation 2 holding pk=5, staged as a flushed dataset with + // its standalone PK sidecar (what the on-disk membership probes). + let flushed_batch = id_batch(&[5]); + let schema = flushed_batch.schema(); + let tmp = tempfile::tempdir().unwrap(); + let path = format!("{}/gen2", tmp.path().to_str().unwrap()); + let reader = RecordBatchIterator::new(vec![Ok(flushed_batch.clone())], schema.clone()); + Dataset::write(reader, &path, Some(WriteParams::default())) + .await + .unwrap(); + write_pk_sidecar(&path, &[flushed_batch], &["id"]) + .await + .unwrap(); + + let shard = Uuid::new_v4(); + let sources = vec![LsmDataSource::FlushedMemTable { + path, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }]; + + // active_generation 2 (gen 2 flushed at/after the snapshot): excluded. 
+ let at: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&at)) + .await + .unwrap(); + assert!(!blocks(&sets, 5).await); + + // active_generation 3 (gen 2 strictly below, immutable): included. + let above: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 3, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&above)) + .await + .unwrap(); + assert!(blocks(&sets, 5).await); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index ade4164d485..a006257493b 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -20,8 +20,8 @@ use lance_core::{Error, Result, is_system_column}; use uuid::Uuid; use super::collector::{InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector}; -use super::data_source::ShardSnapshot; -use super::flushed_cache::FlushedMemTableCache; +use super::data_source::{FreshTierWatermark, ShardSnapshot}; +use super::flushed_cache::{DatasetCache, GenerationWarmer}; use super::planner::LsmScanPlanner; use super::point_lookup::LsmPointLookupPlanner; use crate::dataset::Dataset; @@ -124,7 +124,12 @@ pub struct LsmScanner { session: Option>, /// Cache of opened flushed-generation datasets. When set, repeated /// queries against the same generation skip the manifest read entirely. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, + /// Over-fetch multiple for block-listed sources in search plans + /// (see [`super::LsmFtsSearchPlanner::with_overfetch_factor`]). 
+ overfetch_factor: Option, } impl LsmScanner { @@ -160,6 +165,8 @@ impl LsmScanner { pk_columns, session, flushed_cache: None, + warmer: None, + overfetch_factor: None, } } @@ -198,6 +205,8 @@ impl LsmScanner { pk_columns, session: None, flushed_cache: None, + warmer: None, + overfetch_factor: None, } } @@ -246,13 +255,29 @@ impl LsmScanner { /// /// With a cache, repeated queries against the same generation become a /// pure `Arc::clone` with no manifest read or object-store I/O. The cache - /// is owned and sized by the caller (see [`FlushedMemTableCache`]); not - /// set by default, so behavior is unchanged unless opted in. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + /// is owned and sized by the caller (any [`DatasetCache`] impl, e.g. + /// [`FlushedMemTableCache`](super::FlushedMemTableCache)); not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. Not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + + /// Set the over-fetch multiple block-listed sources use in search plans + /// so they still yield `k` live rows after cross-generation dedup. + /// Threaded into [`super::LsmFtsSearchPlanner`]; clamped to `>= 1.0`. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = Some(factor); + self + } + /// Project specific columns. /// /// If not called, all columns from the base schema are included. 
@@ -354,6 +379,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } let plan = planner .plan_point_lookup(&keys, self.projection.as_deref()) .await?; @@ -370,6 +398,12 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_scan( @@ -405,6 +439,12 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_search(column, query, k, self.projection.as_deref()) .await @@ -454,24 +494,65 @@ impl LsmScanner { /// the primary-key columns; the returned `Vec` is aligned with its /// rows. Hashing matches the scanner's internal dedup, so the caller never /// hashes PKs itself. Flushed membership comes from the injected - /// [`FlushedMemTableCache`] when one is set. + /// [`DatasetCache`] when one is set. pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { + self.contains_pks_at(pks, None).await + } + + /// As-of variant of [`Self::contains_pks`]. Membership is evaluated against + /// a per-shard watermark on the fresh tier, supplied via `watermarks` (see + /// [`FreshTierWatermark`]), matching the tier a prior scan observed and + /// avoiding the two-snapshot skew that would drop a base row with no + /// delivered replacement. `None` evaluates against the live tier. 
+ pub async fn contains_pks_at( + &self, + pks: &RecordBatch, + watermarks: Option<&HashMap>, + ) -> Result> { let sources = self.build_collector().collect()?; - let sets = super::block_list::fresh_tier_block_list( + let memberships = super::block_list::fresh_tier_block_list( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), + watermarks, ) .await?; let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns) .map_err(|e| Error::invalid_input(e.to_string()))?; - Ok((0..pks.num_rows()) + // One key per row, in the index key space (typed value, or encoded + // `Binary` tuple for a composite PK). + let keys: Vec = (0..pks.num_rows()) .map(|row| { - let hash = super::exec::compute_pk_hash(pks, &pk_indices, row); - sets.iter().any(|set| set.contains(&hash)) + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) + .collect::>() + .map_err(|e| Error::invalid_input(e.to_string()))?; + super::block_list::on_disk_pk_key(&values) }) - .collect()) + .collect::>()?; + + // A row is contained if any generation contains its key. Probe each + // generation once (batched), narrowing to still-unfound rows. + let mut contained = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in &memberships { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + contained[row] = true; + } else { + next_live.push(row); + } + } + live = next_live; + } + Ok(contained) } /// Build the data source collector. 
@@ -572,35 +653,42 @@ mod tests { assert_eq!(memtable_ref.generation, 10); } - #[tokio::test] - async fn contains_pks_reports_fresh_tier_membership() { - use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; - use arrow_array::Int32Array; + /// Single-column `id: Int32` schema used by the PK-membership tests. + fn pk_schema() -> SchemaRef { use arrow_schema::{DataType, Field, Schema}; + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])) + } - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - let id_batch = |ids: &[i32]| { - RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from(ids.to_vec()))], - ) - .unwrap() - }; - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - InMemoryMemTableRef { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: schema.clone(), - generation, - } - }; + /// A `RecordBatch` of `id` values against [`pk_schema`]. + fn id_pk_batch(ids: &[i32]) -> RecordBatch { + use arrow_array::Int32Array; + RecordBatch::try_new(pk_schema(), vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An active/frozen memtable holding `ids` at `generation`, with a single + /// batch and a maintained primary-key index on `id`. 
+ fn mk_pk_memtable(ids: &[i32], generation: u64) -> InMemoryMemTableRef { + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + let store = BatchStore::with_capacity(8); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + let b = id_pk_batch(ids); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + InMemoryMemTableRef { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: pk_schema(), + generation, + } + } + #[tokio::test] + async fn contains_pks_reports_fresh_tier_membership() { // Fresh-tier only: active gen 2 (pk=1,2) + frozen gen 1 (pk=3). let shard = Uuid::new_v4(); let scanner = LsmScanner::without_base_table( - schema.clone(), + pk_schema(), "memory://t", vec![], vec!["id".to_string()], @@ -608,16 +696,68 @@ mod tests { .with_in_memory_memtables( shard, InMemoryMemTables { - active: mk(&[1, 2], 2), - frozen: vec![mk(&[3], 1)], + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3], 1)], }, ); // pk=1 (active), pk=4 (absent), pk=3 (frozen). - let result = scanner.contains_pks(&id_batch(&[1, 4, 3])).await.unwrap(); + let result = scanner + .contains_pks(&id_pk_batch(&[1, 4, 3])) + .await + .unwrap(); assert_eq!(result, vec![true, false, true]); } + /// `contains_pks_at` probes each generation once over the still-unfound + /// rows, so a multi-PK batch spanning several generations resolves to the + /// right per-row mask — and a watermark bounds which generations count. + #[tokio::test] + async fn contains_pks_at_batched_probe_respects_watermark() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + + // active gen 2 (pk=1,2) + frozen gen 1 (pk=3,4). 
+ let shard = Uuid::new_v4(); + let scanner = LsmScanner::without_base_table( + pk_schema(), + "memory://t", + vec![], + vec!["id".to_string()], + ) + .with_in_memory_memtables( + shard, + InMemoryMemTables { + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3, 4], 1)], + }, + ); + + // Duplicate and out-of-order keys exercise the live-row narrowing: each + // generation only re-probes the rows earlier generations didn't claim. + let probe = id_pk_batch(&[4, 1, 9, 3, 2, 1]); + + // watermark=None → live tier: every PK present in either generation. + let live = scanner.contains_pks_at(&probe, None).await.unwrap(); + assert_eq!(live, vec![true, true, false, true, true, true]); + + // watermark at gen 1 → active gen 2 rolled in after the snapshot and is + // excluded; only the frozen gen 1 keys (3,4) remain members. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let bounded = scanner + .contains_pks_at(&probe, Some(&watermarks)) + .await + .unwrap(); + assert_eq!(bounded, vec![true, false, false, true, false, false]); + } + /// One active memtable with a maintained BTree on `id`, all rows visible. fn mk_indexed_memtable(schema: &SchemaRef, ids: &[i32], names: &[&str]) -> InMemoryMemTableRef { use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; diff --git a/rust/lance/src/dataset/mem_wal/scanner/collector.rs b/rust/lance/src/dataset/mem_wal/scanner/collector.rs index 2db4b4f277d..6645f159b12 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/collector.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/collector.rs @@ -229,6 +229,19 @@ impl LsmDataSourceCollector { .collect() } + /// True when `generation` for `shard_id` is still pinned in memory as a + /// frozen memtable. 
During the post-flush grace window a generation is both + /// committed to the manifest (a flushed source) and held in memory (an + /// in-memory source); it must be served only from memory — which preserves + /// the per-batch boundaries the flushed dataset has lost, so as-of reads + /// stay snapshot-bounded — and its on-disk copy skipped to avoid scanning + /// the generation twice. See `ShardWriterConfig::frozen_memtable_grace`. + fn flushed_gen_pinned_in_memory(&self, shard_id: &Uuid, generation: u64) -> bool { + self.in_memory_memtables + .get(shard_id) + .is_some_and(|mems| mems.frozen.iter().any(|f| f.generation == generation)) + } + /// Collect all data sources. /// /// Returns sources in a consistent order: @@ -246,6 +259,9 @@ impl LsmDataSourceCollector { for snapshot in &self.shard_snapshots { for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -284,6 +300,9 @@ impl LsmDataSourceCollector { } for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -443,4 +462,53 @@ mod tests { 3 ); } + + /// During the post-flush grace window a generation is both committed to the + /// manifest (a flushed source) and still pinned in memory (a frozen + /// source). The collector must emit it once, from memory — so as-of reads + /// keep batch-resolved membership — and skip the on-disk copy. Flushed + /// generations NOT pinned in memory are still emitted from disk. 
+ #[test] + fn test_collect_suppresses_flushed_gen_pinned_in_memory() { + let shard = Uuid::new_v4(); + // Manifest lists gens 1 and 2 as flushed; gen 2 is still pinned in + // memory (just flushed, within grace), gen 1 has been swept. + let snapshot = ShardSnapshot { + shard_id: shard, + spec_id: 0, + current_generation: 3, + flushed_generations: vec![ + FlushedGeneration { + generation: 1, + path: "gen_1".to_string(), + }, + FlushedGeneration { + generation: 2, + path: "gen_2".to_string(), + }, + ], + }; + let mems = InMemoryMemTables { + active: memtable_ref(3), + frozen: vec![memtable_ref(2)], + }; + let collector = LsmDataSourceCollector::without_base_table("/tmp/x", vec![snapshot]) + .with_in_memory_memtables(shard, mems); + + let sources = collector.collect().unwrap(); + // gen 1: on-disk (not pinned). gen 2: in-memory only (pinned, disk + // copy suppressed). gen 3: active. No duplicate gen 2. + let flushed: Vec = sources + .iter() + .filter(|s| !s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + let in_memory: Vec = sources + .iter() + .filter(|s| s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + assert_eq!(flushed, vec![1], "only the unpinned flushed gen from disk"); + assert_eq!(in_memory, vec![2, 3], "pinned gen 2 served from memory"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs index 1a6207f27e3..0d5f3fdc925 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs @@ -11,6 +11,29 @@ use uuid::Uuid; use crate::dataset::Dataset; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; +/// A watermark marking how far into one shard's fresh tier a prior scan +/// observed, so membership can be evaluated as of that point (see +/// [`super::builder::LsmScanner::contains_pks_at`]). 
+/// +/// Only the active memtable grows between two reads (appended batches, and a new +/// generation when it rolls); everything at a lower generation — frozen and +/// flushed — is immutable and was fully observed. The watermark includes lower +/// generations whole, the active generation up to `active_batch_count` batches, +/// and excludes higher generations (which appeared after it). It uses only the +/// batch count and generation — both always available, unlike per-batch WAL +/// positions, which the write path does not track. The bound only excludes rows +/// the scan did not observe, so a stale watermark under-counts (a tolerable +/// stale read) rather than dropping a row with no replacement. +#[derive(Debug, Clone, Copy)] +pub struct FreshTierWatermark { + /// Active generation the scan observed. Higher generations are excluded; + /// lower ones are immutable and included whole. + pub active_generation: u64, + /// Active-memtable batch count at snapshot time. Within the active + /// generation, only batches at index `< active_batch_count` were observed. + pub active_batch_count: u64, +} + /// Generation number in LSM tree. /// /// The base table has generation 0. MemTables have positive integers diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec.rs b/rust/lance/src/dataset/mem_wal/scanner/exec.rs index 88fd617dc0a..115cffccc81 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec.rs @@ -9,22 +9,22 @@ //! - [`MemtableGenTagExec`]: Wraps a scan to add `_memtable_gen` column //! - [`BloomFilterGuardExec`]: Guards child execution with bloom filter check //! - [`CoalesceFirstExec`]: Returns first non-empty result with short-circuit -//! - [`WithinSourceDedupExec`]: Deduplicates rows with the same PK from a single source -//! - [`PkHashFilterExec`]: Drops rows whose PK hash was superseded by a newer generation (the cross-generation block-list) +//! 
- [`PkBlockFilterExec`]: Drops rows whose PK was superseded by a newer generation (the cross-generation block-list) +//! - [`NewestPkFilterExec`]: Drops active-memtable hits that aren't the newest visible version of their PK (the within-source recency filter) mod bloom_guard; mod coalesce_first; mod generation_tag; +mod newest_pk_filter; mod pk; -mod pk_hash_filter; -mod within_source_dedup; +mod pk_block_filter; pub use bloom_guard::{BloomFilterGuardExec, compute_pk_hash_from_scalars}; pub use coalesce_first::CoalesceFirstExec; pub use generation_tag::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec}; +pub use newest_pk_filter::NewestPkFilterExec; pub use pk::{ ROW_ADDRESS_COLUMN, compute_pk_hash, is_supported_pk_type, resolve_pk_indices, validate_pk_types, }; -pub use pk_hash_filter::PkHashFilterExec; -pub use within_source_dedup::{DedupDirection, WithinSourceDedupExec}; +pub use pk_block_filter::PkBlockFilterExec; diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs index 6039eed1629..632b08a753f 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs @@ -21,7 +21,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, }; use futures::Stream; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; +use lance_core::utils::bloomfilter::sbbf::Sbbf; /// Guards a child execution node with a bloom filter check. /// diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs new file mode 100644 index 00000000000..e1495cb0bb1 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop predicate-crossing stale rows from an active-memtable index search. +//! +//! 
The active memtable's HNSW / inverted index are append-only, so an updated +//! row's old entries stay live. When an update moves a row out of the query's +//! match set, the fresh version isn't in the index result, so a result-set +//! dedup (keep-newest among the returned rows) has nothing to suppress the +//! stale version against — and it leaks. +//! +//! This node closes that hole with a predicate-independent recency check: for +//! each hit it asks the memtable's maintained primary-key index +//! ([`IndexStore::pk_is_newest`]) whether the hit's own row position is the +//! newest version of its primary key visible at the query's `max_visible` +//! watermark, and keeps the hit **iff so**. A stale hit (some +//! newer version exists) is dropped even when that newer version never appears +//! in the result. This is exactly the seek point-lookup already does; the index +//! search arms simply didn't do it. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::{Stream, StreamExt}; + +use super::pk::resolve_pk_indices; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + +/// Keeps only the index hits that are the newest visible version of their PK. +/// +/// The input must expose all `pk_columns` and the `row_id_column` (`UInt64`, +/// the BatchStore row position). The output schema is unchanged. 
+pub struct NewestPkFilterExec { + input: Arc, + pk_columns: Vec, + row_id_column: String, + /// Holds the maintained primary-key index, queried per hit via + /// [`IndexStore::pk_is_newest`]. + index_store: Arc, + /// Resolves the `max_visible` row watermark from the visible batch prefix. + batch_store: Arc, + /// The MVCC batch-position snapshot the index search latched. Captured once + /// at plan time and shared with the search so the recency check keys on the + /// same snapshot the hits came from. + max_visible_batch_position: usize, + properties: Arc, +} + +impl fmt::Debug for NewestPkFilterExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // `BatchStore` / `IndexStore` aren't `Debug`; show only the knobs. + f.debug_struct("NewestPkFilterExec") + .field("pk_columns", &self.pk_columns) + .field("row_id_column", &self.row_id_column) + .field( + "max_visible_batch_position", + &self.max_visible_batch_position, + ) + .finish() + } +} + +impl NewestPkFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + row_id_column: impl Into, + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + row_id_column: row_id_column.into(), + index_store, + batch_store, + max_visible_batch_position, + properties, + } + } + + /// The inclusive max visible row position for this snapshot, or `None` when + /// no rows are visible. 
+ fn max_visible_row(&self) -> Option { + self.batch_store + .max_visible_row(self.max_visible_batch_position) + } +} + +impl DisplayAs for NewestPkFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "NewestPkFilterExec: pk=[{}], row_id={}, max_visible_batch={}", + self.pk_columns.join(", "), + self.row_id_column, + self.max_visible_batch_position, + ) + } + } + } +} + +impl ExecutionPlan for NewestPkFilterExec { + fn name(&self) -> &str { + "NewestPkFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "NewestPkFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.row_id_column.clone(), + self.index_store.clone(), + self.batch_store.clone(), + self.max_visible_batch_position, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(NewestPkFilterStream { + input: input_stream, + pk_columns: self.pk_columns.clone(), + row_id_column: self.row_id_column.clone(), + index_store: self.index_store.clone(), + max_visible_row: self.max_visible_row(), + schema: self.schema(), + })) + } +} + +struct NewestPkFilterStream { + input: SendableRecordBatchStream, + pk_columns: Vec, + row_id_column: String, + index_store: Arc, + /// Inclusive watermark snapshot; `None` when no rows are visible. 
+ max_visible_row: Option, + schema: SchemaRef, +} + +impl NewestPkFilterStream { + fn filter_batch(&self, batch: RecordBatch) -> DFResult { + // No primary-key index (memtable without a primary key), no visible + // rows, or an empty batch: nothing to dedup against, so pass it through. + if !self.index_store.has_pk_index() { + return Ok(batch); + } + let Some(max_visible_row) = self.max_visible_row else { + return Ok(batch); + }; + if batch.num_rows() == 0 { + return Ok(batch); + } + + let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; + let row_ids = batch + .column_by_name(&self.row_id_column) + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' not found in NewestPkFilterExec input", + self.row_id_column + )) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' is not UInt64", + self.row_id_column + )) + })?; + + let mut keep = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + // A null row position can't be ordered; keep it rather than guess + // (callers always project a real position here). + if row_ids.is_null(row) { + keep.push(true); + continue; + } + let position = row_ids.value(row); + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + // Keep iff this hit is the newest visible version of its PK. 
+ keep.push( + self.index_store + .pk_is_newest(&values, position, max_visible_row), + ); + } + filter_record_batch(&batch, &BooleanArray::from(keep)) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } +} + +impl Stream for NewestPkFilterStream { + type Item = DFResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.filter_batch(batch))), + other => other, + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for NewestPkFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + /// Single-column `id` PK batch, one per append so a caller can control + /// row-level visibility via `max_visible_batch_position`. + fn id_batch(id: i32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![id]))]).unwrap() + } + + /// Index-search "hits": `(id, _rowid)` pairs the filter evaluates. + fn hits(rows: &[(i32, u64)]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new(lance_core::ROW_ID, DataType::UInt64, true), + ])); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + let rowids: Vec = rows.iter().map(|(_, p)| *p).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(UInt64Array::from(rowids)), + ], + ) + .unwrap() + } + + /// Build an active memtable whose PK index + BatchStore hold one row per + /// `id` in `appended` (positions 0..n), all committed. 
+ fn active(appended: &[i32]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in appended { + let b = id_batch(id); + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + (Arc::new(index), batch_store) + } + + async fn run( + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + hits_batch: RecordBatch, + ) -> Vec<(i32, u64)> { + let input = + TestMemoryExec::try_new_exec(&[vec![hits_batch.clone()]], hits_batch.schema(), None) + .unwrap(); + let exec = NewestPkFilterExec::new( + input, + vec!["id".to_string()], + lance_core::ROW_ID, + index_store, + batch_store, + max_visible_batch_position, + ); + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + let mut rows = Vec::new(); + for b in &out { + let ids = b.column(0).as_any().downcast_ref::().unwrap(); + let pos = b.column(1).as_any().downcast_ref::().unwrap(); + for i in 0..b.num_rows() { + rows.push((ids.value(i), pos.value(i))); + } + } + rows + } + + #[tokio::test] + async fn keeps_only_the_newest_visible_position_per_pk() { + // id=1 written at positions 0 and 2 (an update), id=2 at position 1; all + // visible. A stale hit (id=1 @ 0) is dropped; the newest (id=1 @ 2) and + // the unrelated id=2 survive — even though all three were "returned" by + // the index search. + let (index, store) = active(&[1, 2, 1]); + let rows = run(index, store, 2, hits(&[(1, 0), (2, 1), (1, 2)])).await; + assert_eq!(rows, vec![(2, 1), (1, 2)]); + } + + #[tokio::test] + async fn does_not_vanish_a_visible_row_under_a_newer_invisible_write() { + // The store/index hold id=1 at positions 0 and 2, but the query latched + // `max_visible_batch_position = 0` (only position 0 visible) — i.e. 
the + // update at position 2 was committed *after* this query's snapshot. The + // visible older row (id=1 @ 0) must be KEPT (its newest *visible* version + // is itself), not dropped because of the not-yet-visible position 2. + let (index, store) = active(&[1, 2, 1]); + let kept = run(index.clone(), store.clone(), 0, hits(&[(1, 0)])).await; + assert_eq!(kept, vec![(1, 0)], "visible row must not vanish"); + + // And the not-yet-visible position is itself dropped (outside snapshot). + let dropped = run(index, store, 0, hits(&[(1, 2)])).await; + assert!( + dropped.is_empty(), + "row beyond the snapshot must be dropped" + ); + } + + #[tokio::test] + async fn passes_through_when_no_pk_index() { + // A memtable without a primary-key index can't be deduped here, so the + // filter is a pass-through rather than dropping everything. + let batch_store = Arc::new(BatchStore::with_capacity(16)); + batch_store.append(id_batch(1)).unwrap(); + let index = Arc::new(IndexStore::new()); // no enable_pk_index + let rows = run(index, batch_store, 0, hits(&[(1, 0), (1, 9)])).await; + assert_eq!(rows, vec![(1, 0), (1, 9)]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs index 523dd30bf82..0707eb5e8dd 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs @@ -4,7 +4,7 @@ //! Shared primary-key helpers for the LSM scanner execution nodes. //! //! Centralizes PK column resolution and per-row hashing so that every -//! consumer (e.g. [`super::WithinSourceDedupExec`], [`super::PkHashFilterExec`]) +//! consumer (e.g. [`super::PkBlockFilterExec`], [`super::NewestPkFilterExec`]) //! resolves and hashes a primary key the same way. The row hash is kept //! consistent with the variants supported by [`super::compute_pk_hash_from_scalars`] //! so a single PK produces the same hash regardless of which exec consumes it. 
diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs new file mode 100644 index 00000000000..c5b8f959d26 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop superseded rows from a per-source result by primary-key membership. +//! +//! Drops a row when any newer generation's membership ([`GenMembership`]) +//! contains its primary key — in-memory generations probe their PK index by +//! value, flushed generations probe their on-disk PK BTree. Each generation is +//! probed once per batch (see the perf note below). Used both as the KNN +//! post-filter (vector search, with over-fetch) and the cross-generation scan +//! filter (`k = 0`). +//! +//! Cross-generation only: within-gen duplicates collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. +//! +//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns +//! when a source had >= k candidates but < k survived (over-fetch too small). +//! +//! Perf note: each generation is probed once per batch via +//! [`GenMembership::contains_keys`] — a batched existence check over the +//! batch's keys — not once per row. The on-disk arm issues a single +//! `BTreeIndex::contains_keys` (one page pass, no per-key `SearchResult` +//! allocation); the in-memory arm maps a sync PK lookup over the keys. Probes +//! are not disk-bound in steady state: the opened index and its (small, +//! memtable-sized) pages are held by the injected `FlushedMemTableCache` / +//! `LanceCache`, so after the first touch every probe is memory-resident. +//! Already-blocked rows are dropped from the key set before probing older +//! generations, preserving the per-row short-circuit. 
+ +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::future::BoxFuture; +use futures::{FutureExt, Stream, StreamExt}; +use tracing::warn; + +use super::super::block_list::{GenMembership, on_disk_pk_key}; +use super::pk::resolve_pk_indices; + +/// Filters out rows whose PK is contained in any newer generation's membership. +#[derive(Debug)] +pub struct PkBlockFilterExec { + input: Arc, + pk_columns: Vec, + /// Newer generations' membership; a row is blocked if any contains its PK. + blocked: Vec, + /// Target neighbor count, used only to warn on a per-source under-fetch. + k: usize, + properties: Arc, +} + +impl PkBlockFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + blocked: Vec, + k: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. 
+ let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + blocked, + k, + properties, + } + } +} + +impl DisplayAs for PkBlockFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "PkBlockFilterExec: pk_cols=[{}], gens={}", + self.pk_columns.join(", "), + self.blocked.len(), + ) + } + } + } +} + +impl ExecutionPlan for PkBlockFilterExec { + fn name(&self) -> &str { + "PkBlockFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "PkBlockFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.blocked.clone(), + self.k, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(PkBlockFilterStream { + input: input_stream, + config: Arc::new(FilterConfig { + pk_columns: self.pk_columns.clone(), + blocked: self.blocked.clone(), + }), + k: self.k, + schema: self.schema(), + pending: None, + input_seen: 0, + kept: 0, + warned: false, + })) + } +} + +/// Immutable per-stream filter config. Shared into each batch's `'static` async +/// future by a single `Arc` clone, rather than deep-cloning the PK columns and +/// memberships per batch. 
+struct FilterConfig { + pk_columns: Vec, + blocked: Vec, +} + +struct PkBlockFilterStream { + input: SendableRecordBatchStream, + config: Arc, + k: usize, + schema: SchemaRef, + /// The in-flight filter for the batch currently being processed (the probe + /// is async, so a batch is filtered off-poll and resumed here). + pending: Option>>, + input_seen: usize, + kept: usize, + warned: bool, +} + +/// Keep only the rows no newer-gen membership contains. Async because flushed +/// generations are probed against their on-disk PK BTree. +async fn filter_batch(batch: RecordBatch, config: Arc) -> DFResult { + let FilterConfig { + pk_columns, + blocked, + } = config.as_ref(); + if blocked.is_empty() || batch.num_rows() == 0 { + return Ok(batch); + } + let pk_indices = resolve_pk_indices(&batch, pk_columns)?; + let to_df = |e: lance_core::Error| DataFusionError::Execution(e.to_string()); + + // One key per row, in the index key space. + let keys: Vec = (0..batch.num_rows()) + .map(|row| { + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + on_disk_pk_key(&values).map_err(to_df) + }) + .collect::>()?; + + // A row is dropped if any newer generation contains its key. Probe each + // generation once (batched) rather than once per row, narrowing to the + // still-live rows so an already-blocked row isn't re-probed against older + // generations. 
+ let mut blocked_row = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in blocked { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await.map_err(to_df)?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + blocked_row[row] = true; + } else { + next_live.push(row); + } + } + live = next_live; + } + + let keep = BooleanArray::from_iter(blocked_row.into_iter().map(|b| Some(!b))); + filter_record_batch(&batch, &keep).map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) +} + +impl Stream for PkBlockFilterStream { + type Item = DFResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + // Drive an in-flight filter to completion before pulling more input. + if let Some(fut) = this.pending.as_mut() { + return match fut.as_mut().poll(cx) { + Poll::Ready(Ok(out)) => { + this.pending = None; + this.kept += out.num_rows(); + Poll::Ready(Some(Ok(out))) + } + Poll::Ready(Err(e)) => { + this.pending = None; + Poll::Ready(Some(Err(e))) + } + Poll::Pending => Poll::Pending, + }; + } + + match this.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + this.input_seen += batch.num_rows(); + this.pending = Some(filter_batch(batch, this.config.clone()).boxed()); + // Loop to poll the just-created future. + } + Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))), + Poll::Ready(None) => { + // >= k candidates in, < k out: over-fetch missed superseded rows. + if !this.warned && this.input_seen >= this.k && this.kept < this.k { + warn!( + k = this.k, + fetched = this.input_seen, + kept = this.kept, + "LSM vector search: < k live rows survived the PK post-filter; \ + raise the over-fetch factor or use a true KNN prefilter." 
+ ); + this.warned = true; + } + return Poll::Ready(None); + } + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for PkBlockFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + fn int_batch(ids: &[i32]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An in-memory membership whose PK index holds `ids` (positions 0..n). + fn membership(ids: &[i32]) -> GenMembership { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = int_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + let max_visible_row = store.max_visible_row(index.max_visible_batch_position()); + GenMembership::InMemory { + index_store: Arc::new(index), + max_visible_row, + } + } + + async fn run(exec: PkBlockFilterExec) -> Vec { + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + out.iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect() + } + + #[tokio::test] + async fn drops_rows_blocked_by_a_newer_generation() { + let b = int_batch(&[10, 20, 30]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = + PkBlockFilterExec::new(input, 
vec!["id".to_string()], vec![membership(&[20])], 1); + assert_eq!(run(exec).await, vec![10, 30]); + } + + #[tokio::test] + async fn blocks_a_pk_present_in_any_generation() { + // Two newer-gen memberships: a row is dropped if either contains its PK. + let b = int_batch(&[10, 20, 30]); + let blocked = vec![membership(&[10]), membership(&[30])]; + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], blocked, 1); + assert_eq!(run(exec).await, vec![20]); + } + + #[tokio::test] + async fn empty_blocked_keeps_all_rows() { + let b = int_batch(&[1, 2, 3]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); + assert_eq!(run(exec).await, vec![1, 2, 3]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs deleted file mode 100644 index ee473047d01..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs +++ /dev/null @@ -1,350 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Drop superseded rows from a per-source KNN result by primary-key hash. -//! -//! Drops a row when its PK hash ([`super::compute_pk_hash`]) is in any `blocked` -//! set — the newer generations' membership (`Arc`, shared, never merged; -//! base table: all generations). Only the KNN output is hashed. -//! -//! Cross-generation only: within-gen duplicates share a hash, so the global -//! dedup's `(generation, freshness)` tiebreaker collapses those instead. -//! -//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns -//! when a source had >= k candidates but < k survived (over-fetch too small). 
- -use std::any::Any; -use std::collections::HashSet; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow::compute::filter_record_batch; -use arrow_array::{BooleanArray, RecordBatch}; -use arrow_schema::SchemaRef; -use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt}; -use tracing::warn; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Filters out rows whose PK hash is in any set of `blocked`. -#[derive(Debug)] -pub struct PkHashFilterExec { - input: Arc, - pk_columns: Vec, - /// Newer generations' membership; a row is blocked if any set holds its hash. - blocked: Vec>>, - /// Target neighbor count, used only to warn on a per-source under-fetch. - k: usize, - properties: Arc, -} - -impl PkHashFilterExec { - pub fn new( - input: Arc, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - ) -> Self { - // A filter preserves the input schema and partitioning. 
- let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(input.schema()), - input.output_partitioning().clone(), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - blocked, - k, - properties, - } - } -} - -impl DisplayAs for PkHashFilterExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - let total: usize = self.blocked.iter().map(|s| s.len()).sum(); - write!( - f, - "PkHashFilterExec: pk_cols=[{}], gens={}, blocked={}", - self.pk_columns.join(", "), - self.blocked.len(), - total, - ) - } - } - } -} - -impl ExecutionPlan for PkHashFilterExec { - fn name(&self) -> &str { - "PkHashFilterExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input.schema() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(DataFusionError::Internal( - "PkHashFilterExec requires exactly one child".to_string(), - )); - } - Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.blocked.clone(), - self.k, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(PkHashFilterStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - blocked: self.blocked.clone(), - k: self.k, - schema: self.schema(), - input_seen: 0, - kept: 0, - warned: false, - })) - } -} - -struct PkHashFilterStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - schema: SchemaRef, - input_seen: usize, - kept: usize, - warned: bool, -} - -impl PkHashFilterStream { - fn filter_batch(&self, batch: 
RecordBatch) -> DFResult { - if self.blocked.is_empty() || batch.num_rows() == 0 { - return Ok(batch); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let keep: BooleanArray = (0..batch.num_rows()) - .map(|row| { - let hash = compute_pk_hash(&batch, &pk_indices, row); - !self.blocked.iter().any(|set| set.contains(&hash)) - }) - .collect(); - filter_record_batch(&batch, &keep) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for PkHashFilterStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match self.input.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(batch))) => { - self.input_seen += batch.num_rows(); - match self.filter_batch(batch) { - Ok(out) => { - self.kept += out.num_rows(); - Poll::Ready(Some(Ok(out))) - } - Err(e) => Poll::Ready(Some(Err(e))), - } - } - Poll::Ready(None) => { - // >= k candidates in, < k out: the over-fetch missed superseded rows. - if !self.warned && self.input_seen >= self.k && self.kept < self.k { - warn!( - k = self.k, - fetched = self.input_seen, - kept = self.kept, - "LSM vector search: < k live rows survived the PK-hash post-filter; \ - raise the over-fetch factor or use a true KNN prefilter." - ); - self.warned = true; - } - Poll::Ready(None) - } - other => other, - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for PkHashFilterStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - /// Hash a single-column Int32 PK value the way the exec does, so a test can - /// build blocked sets from values rather than hand-computed hashes. 
- fn hash_int_pk(id: i32) -> u64 { - let batch = int_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - fn int_batch(ids: &[i32]) -> RecordBatch { - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() - } - - fn blocked(ids: &[i32]) -> Vec>> { - vec![Arc::new(ids.iter().map(|&id| hash_int_pk(id)).collect())] - } - - async fn run(exec: PkHashFilterExec) -> Vec { - let ctx = SessionContext::new(); - let out: Vec = exec - .execute(0, ctx.task_ctx()) - .unwrap() - .try_collect() - .await - .unwrap(); - out.iter() - .flat_map(|b| { - b.column_by_name("id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .values() - .to_vec() - }) - .collect() - } - - #[tokio::test] - async fn drops_rows_with_blocked_pk_hash() { - let b = int_batch(&[10, 20, 30]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], blocked(&[20]), 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn blocks_a_pk_present_in_any_generation_set() { - // Two newer-gen sets: a row is dropped if either contains its PK. 
- let b = int_batch(&[10, 20, 30]); - let sets = vec![ - Arc::new(HashSet::from([hash_int_pk(10)])), - Arc::new(HashSet::from([hash_int_pk(30)])), - ]; - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], sets, 1); - assert_eq!(run(exec).await, vec![20]); - } - - #[tokio::test] - async fn empty_blocked_keeps_all_rows() { - let b = int_batch(&[1, 2, 3]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); - assert_eq!(run(exec).await, vec![1, 2, 3]); - } - - #[tokio::test] - async fn null_pk_is_hashed_consistently_and_blockable() { - // A null PK hashes deterministically (compute_pk_hash hashes is_null), - // so a superseded null-key row can be dropped like any other. - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)])); - let with_null = |ids: Vec>| { - RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap() - }; - let pk = vec!["id".to_string()]; - let null_row = with_null(vec![None]); - let pk_indices = resolve_pk_indices(&null_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &null_row, - &pk_indices, - 0, - )]))]; - - // Rows: 10, NULL, 30 — only the NULL-key row is dropped. - let b = with_null(vec![Some(10), None, Some(30)]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn composite_pk_hash_matches_block_set() { - // Composite PK (id, name): block the (2, "b") tuple only. 
- let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); - let mk = |ids: &[i32], names: &[&str]| { - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - ], - ) - .unwrap() - }; - let pk = vec!["id".to_string(), "name".to_string()]; - let one_row = mk(&[2], &["b"]); - let pk_indices = resolve_pk_indices(&one_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &one_row, - &pk_indices, - 0, - )]))]; - - // (1,"a") and (2,"a") survive; only the exact (2,"b") tuple is dropped. - let b = mk(&[1, 2, 2], &["a", "a", "b"]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![1, 2]); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs deleted file mode 100644 index be5dae6a668..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs +++ /dev/null @@ -1,432 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! WithinSourceDedupExec - Deduplicates rows with the same primary key from a -//! single LSM source, keeping the newest insert. -//! -//! In MemWAL/LSM mode the same primary key can be written multiple times into -//! the same memtable. The active memtable stores rows in insert order (larger -//! `_rowaddr` = newer), while flushed memtables are reverse-written so that -//! within a flushed file the smallest `_rowid` is the newest insert (see -//! `memtable/flush.rs:152` and `hnsw/storage.rs:307`). Point lookup uses this -//! node to collapse such duplicates *within a single source* so that the -//! 
downstream `CoalesceFirstExec` / `LIMIT` sees at most one row per primary -//! key per source. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow_array::{Array, RecordBatch, UInt64Array}; -use arrow_schema::SchemaRef; -use datafusion::error::Result as DFResult; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt, ready}; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Among rows that share a primary key, which row-address extreme identifies -/// the newest insert to keep. The kept row is always the freshest; only the -/// row address (`_rowaddr`/`_rowid`) used to find it differs by source. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DedupDirection { - /// Keep the row with the largest row-address value (active memtable: larger - /// `_rowaddr` = inserted later). - KeepMaxRowAddr, - /// Keep the row with the smallest row-address value (flushed memtable under - /// reverse-write: smaller `_rowid` = inserted later). - KeepMinRowAddr, -} - -/// Deduplicates rows from a single source by primary key, keeping the row -/// whose `row_addr_column` value wins per [`DedupDirection`]. -/// -/// # Required columns -/// -/// The input must expose: -/// - All `pk_columns` -/// - `row_addr_column` of `UInt64` type -/// -/// The output schema is unchanged from the input. Callers that need to hide -/// the row-address column from downstream consumers should compose this node -/// with `project_to_canonical` or `null_columns`. -/// -/// # Performance -/// -/// Memory: `O(unique primary keys in input)`. 
For point lookup the input is -/// already filtered to a single primary key so the map holds at most one -/// entry. -#[derive(Debug)] -pub struct WithinSourceDedupExec { - input: Arc, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - properties: Arc, -} - -impl WithinSourceDedupExec { - pub fn new( - input: Arc, - pk_columns: Vec, - row_addr_column: impl Into, - direction: DedupDirection, - ) -> Self { - let schema = input.schema(); - let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema.clone()), - Partitioning::UnknownPartitioning(1), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - row_addr_column: row_addr_column.into(), - direction, - schema, - properties, - } - } - - pub fn pk_columns(&self) -> &[String] { - &self.pk_columns - } - - pub fn row_addr_column(&self) -> &str { - &self.row_addr_column - } - - pub fn direction(&self) -> DedupDirection { - self.direction - } -} - -impl DisplayAs for WithinSourceDedupExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - write!( - f, - "WithinSourceDedupExec: pk=[{}], row_addr={}, direction={:?}", - self.pk_columns.join(", "), - self.row_addr_column, - self.direction, - ) - } - } - } -} - -impl ExecutionPlan for WithinSourceDedupExec { - fn name(&self) -> &str { - "WithinSourceDedupExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(datafusion::error::DataFusionError::Internal( - "WithinSourceDedupExec requires exactly one child".to_string(), - )); - } - 
Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.row_addr_column.clone(), - self.direction, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(WithinSourceDedupStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - row_addr_column: self.row_addr_column.clone(), - direction: self.direction, - schema: self.schema.clone(), - winners: HashMap::new(), - emitted: false, - })) - } -} - -/// One winning row, materialized as a single-row `RecordBatch` so we don't -/// have to keep the source batch alive after we've picked the winner. -struct Winner { - batch: RecordBatch, - row_addr: u64, -} - -struct WithinSourceDedupStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - winners: HashMap, - emitted: bool, -} - -impl WithinSourceDedupStream { - fn consume_batch(&mut self, batch: RecordBatch) -> DFResult<()> { - if batch.num_rows() == 0 { - return Ok(()); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let row_addr_array = batch - .column_by_name(&self.row_addr_column) - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' not found in batch", - self.row_addr_column - )) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' is not UInt64", - self.row_addr_column - )) - })?; - - for row_idx in 0..batch.num_rows() { - if row_addr_array.is_null(row_idx) { - // A NULL row address can't be ordered against a real one. Skip - // rather than guess — callers should always project a real - // row-address column for dedup-eligible sources. 
- continue; - } - let row_addr = row_addr_array.value(row_idx); - let pk_hash = compute_pk_hash(&batch, &pk_indices, row_idx); - - let take_row = match self.winners.get(&pk_hash) { - None => true, - Some(existing) => match self.direction { - DedupDirection::KeepMaxRowAddr => row_addr > existing.row_addr, - DedupDirection::KeepMinRowAddr => row_addr < existing.row_addr, - }, - }; - - if take_row { - let single = batch.slice(row_idx, 1); - self.winners.insert( - pk_hash, - Winner { - batch: single, - row_addr, - }, - ); - } - } - Ok(()) - } - - fn finalize(&mut self) -> DFResult { - if self.winners.is_empty() { - return Ok(RecordBatch::new_empty(self.schema.clone())); - } - let batches: Vec = self.winners.drain().map(|(_, w)| w.batch).collect(); - let batch_refs: Vec<&RecordBatch> = batches.iter().collect(); - arrow_select::concat::concat_batches(&self.schema, batch_refs) - .map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for WithinSourceDedupStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - loop { - if self.emitted { - return Poll::Ready(None); - } - match ready!(self.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - if let Err(e) = self.consume_batch(batch) { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - } - Some(Err(e)) => { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - None => { - self.emitted = true; - return Poll::Ready(Some(self.finalize())); - } - } - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for WithinSourceDedupStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Float32Array, Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - fn create_test_schema() -> 
SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("_distance", DataType::Float32, true), - Field::new("_row_addr", DataType::UInt64, true), - ])) - } - - fn batch(ids: &[i32], names: &[&str], distances: &[f32], row_addr: &[u64]) -> RecordBatch { - let schema = create_test_schema(); - RecordBatch::try_new( - schema, - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - Arc::new(Float32Array::from(distances.to_vec())), - Arc::new(UInt64Array::from(row_addr.to_vec())), - ], - ) - .unwrap() - } - - async fn run(batches: Vec, direction: DedupDirection) -> Vec { - let schema = create_test_schema(); - let input = TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap(); - let exec = - WithinSourceDedupExec::new(input, vec!["id".to_string()], "_row_addr", direction); - let ctx = SessionContext::new(); - let stream = exec.execute(0, ctx.task_ctx()).unwrap(); - stream.try_collect().await.unwrap() - } - - fn extract(batches: &[RecordBatch]) -> Vec<(i32, String, u64)> { - let mut out = Vec::new(); - for b in batches { - let ids = b.column(0).as_any().downcast_ref::().unwrap(); - let names = b.column(1).as_any().downcast_ref::().unwrap(); - let addr = b.column(3).as_any().downcast_ref::().unwrap(); - for i in 0..b.num_rows() { - out.push((ids.value(i), names.value(i).to_string(), addr.value(i))); - } - } - out.sort_by_key(|(id, _, _)| *id); - out - } - - #[tokio::test] - async fn keep_max_picks_largest_row_addr() { - // Active-memtable case: same pk inserted twice; newer = larger _rowaddr. 
- let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[10, 99, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 99)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn keep_min_picks_smallest_row_addr() { - // Flushed-memtable case under reverse-write: newer = smaller _rowid. - let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[99, 10, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMinRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 10)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn dedup_across_batches() { - let b1 = batch(&[1, 2], &["a", "b"], &[0.1, 0.2], &[1, 1]); - let b2 = batch(&[1, 3], &["a_new", "c"], &[0.5, 0.4], &[7, 1]); - let out = run(vec![b1, b2], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 3); - assert_eq!(rows[0], (1, "a_new".to_string(), 7)); - assert_eq!(rows[1], (2, "b".to_string(), 1)); - assert_eq!(rows[2], (3, "c".to_string(), 1)); - } - - #[tokio::test] - async fn empty_input() { - let out = run(vec![], DedupDirection::KeepMaxRowAddr).await; - let total: usize = out.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 0); - } - - #[tokio::test] - async fn null_row_addr_skipped() { - // Rows with NULL row address can't be ordered — they're dropped so they - // don't accidentally become winners against real values. 
- let schema = create_test_schema(); - let b = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 1])), - Arc::new(StringArray::from(vec!["nulladdr", "real"])), - Arc::new(Float32Array::from(vec![0.1, 0.2])), - Arc::new(UInt64Array::from(vec![None, Some(5)])), - ], - ) - .unwrap(); - let out = run(vec![b], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0], (1, "real".to_string(), 5)); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs index 39abf7e8c71..7a5280bedb8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs @@ -22,6 +22,7 @@ use std::collections::HashSet; use std::sync::Arc; +use async_trait::async_trait; use lance_core::{Error, Result}; use crate::dataset::{Dataset, DatasetBuilder}; @@ -41,12 +42,10 @@ use crate::session::Session; pub struct FlushedMemTableCache { // `moka`'s async cache gives a bounded size plus single-flight // `try_get_with`, so concurrent first-queries on a just-flushed - // generation open the dataset exactly once. + // generation open the dataset exactly once. The opened dataset carries the + // session index cache, which also backs each generation's standalone PK + // dedup index (see `block_list::open_pk_index`) — no separate cache path. inner: moka::future::Cache>, - // Per-generation set of PK hashes for the vector-search block-list, keyed by - // the same immutable flushed path. Built lazily on the first query that needs - // it (single-flight) so repeated searches skip re-scanning the PK column. - pk_hashes: moka::future::Cache>>, } impl FlushedMemTableCache { @@ -63,10 +62,6 @@ impl FlushedMemTableCache { // into at build time. 
.support_invalidation_closures() .build(), - pk_hashes: moka::future::Cache::builder() - .max_capacity(max_entries) - .support_invalidation_closures() - .build(), } } @@ -96,21 +91,6 @@ impl FlushedMemTableCache { .map_err(|e: Arc| Error::cloned(e.to_string())) } - /// Get the cached set of PK hashes for `path`, building it (exactly once) on - /// a miss via `build`. The flushed path is immutable, so a cached set is - /// never stale; concurrent first-queries share one build via `moka`'s - /// single-flight `try_get_with`. - pub async fn get_or_build_pk_hashes( - &self, - path: &str, - build: impl std::future::Future>>, - ) -> Result>> { - self.pk_hashes - .try_get_with(path.to_string(), async move { build.await.map(Arc::new) }) - .await - .map_err(|e: Arc| Error::cloned(e.to_string())) - } - /// Drop cached entries whose path is not in `live_paths`. /// /// Called by the consumer after compaction retires generations. Purely a @@ -125,10 +105,6 @@ impl FlushedMemTableCache { let _ = self .inner .invalidate_entries_if(move |path, _| !live.contains(path)); - let live = live_paths.clone(); - let _ = self - .pk_hashes - .invalidate_entries_if(move |path, _| !live.contains(path)); } } @@ -140,29 +116,92 @@ impl std::fmt::Debug for FlushedMemTableCache { } } +/// Caching of opened flushed-generation datasets, keyed by immutable path. The +/// opened dataset carries the session index cache, which also backs each +/// generation's secondary indexes and its PK dedup sidecar (see +/// `block_list::open_pk_index`) — so a single `get_or_open` is the +/// whole caching surface. Implemented by [`FlushedMemTableCache`]; a +/// [`GenerationWarmer`] composes one to warm through it, and a consumer may +/// supply its own implementation. +#[async_trait] +pub trait DatasetCache: Send + Sync + std::fmt::Debug { + async fn get_or_open(&self, path: &str, session: Option>) -> Result>; + + /// Drop cached entries whose path is not in `live_paths`. 
Async so an + /// implementation can evict retired generations' index objects (e.g. + /// `Session::invalidate_index_prefix`) without a later breaking signature + /// change; [`FlushedMemTableCache`]'s own eviction is synchronous. + async fn retain_paths(&self, live_paths: &HashSet); +} + +#[async_trait] +impl DatasetCache for FlushedMemTableCache { + async fn get_or_open(&self, path: &str, session: Option>) -> Result> { + Self::get_or_open(self, path, session).await + } + + async fn retain_paths(&self, live_paths: &HashSet) { + Self::retain_paths(self, live_paths) + } +} + +/// Proactively warms a flushed generation into the shared caches: open the +/// dataset and pre-load its secondary indexes and PK dedup sidecar so the first +/// query sees no cold reads. This is the **seam** the flush and read paths fire +/// — lance defines it; the consumer (e.g. the WAL pod) implements it. `None` => +/// no warming, generations warm lazily on first read. +/// +/// Everything a warmer touches is keyed by the immutable generation `path` +/// (opened dataset, its secondary indexes, its PK dedup sidecar), so `path` is +/// the only input it needs. +/// +/// `warm` is fired fire-and-forget from every read path that opens a generation +/// (all four LSM planners) as well as pre-commit on flush, so the same path may +/// be warmed concurrently and repeatedly. Implementations **must be idempotent +/// and cheap when the path is already warm** (e.g. dedup in-flight and +/// completed paths) — a redundant call must not re-do work or fail. +#[async_trait] +pub trait GenerationWarmer: Send + Sync + std::fmt::Debug { + async fn warm(&self, path: &str) -> Result<()>; +} + /// Open a flushed-generation dataset, shared by all three LSM open sites /// (scan, point lookup, vector search). /// -/// - `cache` present: route through [`FlushedMemTableCache`] (single-flight, -/// shared `Arc`, manifest read amortized across queries). +/// - `cache` present: route through a [`DatasetCache`] (e.g. 
+/// [`FlushedMemTableCache`]: single-flight, shared `Arc`, manifest read +/// amortized across queries). /// - `cache` absent: cold open via [`DatasetBuilder`]. Passing `session` /// still reuses the shared index / metadata caches; `None`/`None` /// reproduces the original per-query cold-open behavior exactly. +/// - `warmer` present: fire a fire-and-forget warm-on-open backstop behind the +/// returned handle (the warmer dedups already-warm paths). `None` => no warming. pub async fn open_flushed_dataset( path: &str, session: Option<&Arc>, - cache: Option<&Arc>, + cache: Option<&Arc>, + warmer: Option<&Arc>, ) -> Result> { - match cache { - Some(cache) => cache.get_or_open(path, session.cloned()).await, + let dataset = match cache { + Some(cache) => cache.get_or_open(path, session.cloned()).await?, None => { let mut builder = DatasetBuilder::from_uri(path); if let Some(session) = session { builder = builder.with_session(session.clone()); } - Ok(Arc::new(builder.load().await?)) + Arc::new(builder.load().await?) } + }; + if let Some(warmer) = warmer { + let warmer = Arc::clone(warmer); + let path = path.to_string(); + tokio::spawn(async move { + if let Err(e) = warmer.warm(&path).await { + tracing::debug!(generation = %path, error = %e, "warm-on-open failed"); + } + }); } + Ok(dataset) } #[cfg(test)] @@ -250,34 +289,6 @@ mod tests { assert_eq!(cache.inner.entry_count(), 1, "exactly one entry cached"); } - #[tokio::test] - async fn pk_hashes_cached_reuses_first_build() { - // The PK-hash set is keyed by the immutable flushed path: a hit returns - // the first-built set and never runs the second build closure. - let cache = FlushedMemTableCache::new(8); - let path = "memory://shard/gen_1"; - let first = cache - .get_or_build_pk_hashes(path, async { Ok(HashSet::from([1u64, 2])) }) - .await - .unwrap(); - let second = cache - .get_or_build_pk_hashes(path, async { - // Different contents; must be ignored because the path is cached. 
- Ok(HashSet::from([9u64])) - }) - .await - .unwrap(); - assert!( - Arc::ptr_eq(&first, &second), - "a PK-hash cache hit must reuse the first-built set" - ); - assert_eq!( - second.len(), - 2, - "cached set keeps the first build's contents" - ); - } - #[tokio::test] async fn test_retain_paths_drops_unreferenced() { let temp_dir = tempfile::tempdir().unwrap(); @@ -310,8 +321,8 @@ mod tests { let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); write_dataset(&uri, &[7, 8, 9]).await; - let a = open_flushed_dataset(&uri, None, None).await.unwrap(); - let b = open_flushed_dataset(&uri, None, None).await.unwrap(); + let a = open_flushed_dataset(&uri, None, None, None).await.unwrap(); + let b = open_flushed_dataset(&uri, None, None, None).await.unwrap(); assert!( !Arc::ptr_eq(&a, &b), "no-cache path must cold-open each call" @@ -319,13 +330,57 @@ mod tests { assert_eq!(a.count_rows(None).await.unwrap(), 3); // With a cache, the second call is a shared clone. - let cache = Arc::new(FlushedMemTableCache::new(8)); - let c = open_flushed_dataset(&uri, None, Some(&cache)) + let cache: Arc = Arc::new(FlushedMemTableCache::new(8)); + let c = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); - let d = open_flushed_dataset(&uri, None, Some(&cache)) + let d = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); assert!(Arc::ptr_eq(&c, &d), "cached path must reuse the Arc"); } + + /// A warmer that records calls and signals each one. 
+ #[derive(Debug)] + struct NotifyingWarmer { + calls: Arc, + notify: Arc, + } + + #[async_trait] + impl GenerationWarmer for NotifyingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, Ordering::SeqCst); + self.notify.notify_one(); + Ok(()) + } + } + + #[tokio::test] + async fn test_open_flushed_dataset_fires_warm_on_open() { + // The warm-on-open backstop fires the warmer (fire-and-forget) when a + // generation is opened, so generations the flusher never warmed still + // get warmed lazily on first read. + let temp_dir = tempfile::tempdir().unwrap(); + let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); + write_dataset(&uri, &[1, 2, 3]).await; + + let calls = Arc::new(AtomicUsize::new(0)); + let notify = Arc::new(tokio::sync::Notify::new()); + let warmer: Arc = Arc::new(NotifyingWarmer { + calls: calls.clone(), + notify: notify.clone(), + }); + + let ds = open_flushed_dataset(&uri, None, None, Some(&warmer)) + .await + .unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 3); + + // The warm is spawned fire-and-forget; wait (bounded) for it to run. 
+ tokio::time::timeout(std::time::Duration::from_secs(5), notify.notified()) + .await + .expect("warm-on-open must fire"); + assert_eq!(calls.load(Ordering::SeqCst), 1, "warmer fired once on open"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index aa086a80e66..e7c8d205d5d 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -49,9 +49,11 @@ use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery; use tracing::instrument; +use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{NewestPkFilterExec, PkBlockFilterExec}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; use crate::session::Session; @@ -61,6 +63,11 @@ use crate::session::Session; /// require an import for one string constant. pub const SCORE_COLUMN: &str = "_score"; +/// Default over-fetch multiple for blocked sources. `1.0` keeps cross-generation +/// dedup on with no over-fetch; callers (e.g. the sophon WAL handler) raise it +/// so a blocked source still yields `k` live rows after the block-list filter. +const DEFAULT_OVERFETCH_FACTOR: f64 = 1.0; + /// Plans local-scoring FTS queries over LSM data. pub struct LsmFtsSearchPlanner { collector: LsmDataSourceCollector, @@ -69,7 +76,11 @@ pub struct LsmFtsSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. 
+ warmer: Option>, + /// Over-fetch multiple for blocked sources (clamped to `>= 1.0`). + overfetch_factor: f64, } impl LsmFtsSearchPlanner { @@ -85,9 +96,18 @@ impl LsmFtsSearchPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, + overfetch_factor: DEFAULT_OVERFETCH_FACTOR, } } + /// Set the over-fetch multiple for blocked sources so they still yield `k` + /// live rows after cross-generation block-list filtering. Clamped to `>= 1.0`. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = factor; + self + } + /// Thread a session into flushed-generation opens so the first open /// populates the shared index / file-metadata caches. pub fn with_session(mut self, session: Arc) -> Self { @@ -97,11 +117,17 @@ impl LsmFtsSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Build the FTS execution plan (local scoring). /// /// # Arguments @@ -137,12 +163,69 @@ impl LsmFtsSearchPlanner { return self.empty_plan(&target_schema); } + // Per-source PK block sets for cross-generation dedup (NEWER(G) per + // shard; base = union of all gens). Query-type-agnostic — same call the + // vector planner makes. `Box::pin` keeps the future off + // `clippy::large_futures`. 
+ let block_lists = Box::pin(compute_source_block_lists( + &sources, + self.session.as_ref(), + self.flushed_cache.as_ref(), + )) + .await?; + let overfetch = self.overfetch_factor.max(1.0); + + // Stage the per-source over-fetch decisions, then build every source + // plan concurrently — the builds are independent and a sequential loop + // was the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists.get(&(source.shard_id(), source.generation())); + // Over-fetch a blocked source so the post-filter still yields k live + // rows. The active arm returns all matches (no builder limit), so its + // within-source dedup needs no over-fetch hint. + let fetch_k = if blocked.is_some() { + ((k as f64) * overfetch).ceil() as usize + } else { + k + }; + (source, is_active, blocked, fetch_k) + }) + .collect(); + let built = + futures::future::try_join_all(arm_inputs.iter().map(|(source, _, _, fetch_k)| { + Box::pin(self.build_source_plan(source, column, &query, *fetch_k, projection)) + })) + .await?; + let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); - for source in &sources { - let plan = self - .build_source_plan(source, column, &query, k, projection) - .await?; - let normalized = project_to_canonical(plan, &target_schema)?; + for ((_, is_active, blocked, _), plan) in arm_inputs.iter().zip(built) { + let is_active = *is_active; + let blocked = *blocked; + // Dedup, mirroring LsmVectorSearchPlanner: + // * active: already wrapped in `NewestPkFilterExec` inside + // `build_source_plan` (drops predicate-crossing stale hits, which a + // result-set dedup can't catch). + // * flushed/base: drop rows superseded by a newer generation via the + // block-list (within-gen is handled by the flushed deletion vector). 
+ let deduped = if is_active { + plan + } else if let Some(set) = blocked { + Arc::new(PkBlockFilterExec::new( + plan, + self.pk_columns.clone(), + set.clone(), + k, + )) as Arc + } else { + plan + }; + + // Normalize to canonical. This also drops the active arm's _rowid, + // which the canonical FTS schema omits — it served only the dedup. + let normalized = project_to_canonical(deduped, &target_schema)?; per_source_plans.push(normalized); } @@ -151,8 +234,11 @@ impl LsmFtsSearchPlanner { per_source_plans.into_iter().next().unwrap() } else { #[allow(deprecated)] - let union: Arc = Arc::new(UnionExec::new(per_source_plans)); - union + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, + // so each arm's per-arm CPU (posting decode, BM25) runs on its own + // task without an extra repartition. + Arc::new(UnionExec::new(per_source_plans)) }; let score_idx = merged.schema().index_of(SCORE_COLUMN).map_err(|_| { @@ -209,9 +295,13 @@ impl LsmFtsSearchPlanner { scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; @@ -232,6 +322,12 @@ impl LsmFtsSearchPlanner { MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone()); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); + // Expose the row position so the recency filter can identify the + // newest visible version of each PK. 
The append-only inverted + // index keeps an updated row's old postings live, so a stale hit + // can match a query the fresh row no longer does; the filter + // drops it. `project_to_canonical` strips `_rowid` afterward. + scanner.with_row_id(); // `MemTableScanner::full_text_search` takes a raw match // string; richer query shapes (phrase/boolean/fuzzy) can // be plumbed through once the MemTable scanner accepts a @@ -250,7 +346,19 @@ impl LsmFtsSearchPlanner { // today; the per-partition Sort+fetch above bounds the // emitted rows. let _ = k; - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Drop predicate-crossing stale hits: keep a hit iff it is the + // newest visible version of its PK (collapses duplicate-PK + // appends too — supersedes the old WithinSourceDedupExec). + let filtered: Arc = Arc::new(NewestPkFilterExec::new( + plan, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store.clone(), + batch_store.clone(), + scanner.max_visible_batch_position(), + )); + Ok(filtered) } } } @@ -404,6 +512,7 @@ mod tests { // Active memtable with its own FTS index, containing a matching row. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); let active_batch = make_batch( &schema, @@ -561,4 +670,179 @@ mod tests { } } } + + #[tokio::test] + async fn local_mode_active_dedups_updated_pk_keeping_newest() { + // The active memtable is an append log and the FTS index is + // append-only, so a PK updated before flush is searchable as two + // row-positions. WithinSourceDedupExec(KeepMaxRowAddr) must collapse + // them to the newest insert. Without it the same PK would surface + // twice (criterion 2 violation). 
+ let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // First append (positions 0,1): id=1 is the stale version of the PK. + let batch_old = make_batch(&schema, &[1, 2], &["lance stale version", "other doc"]); + batch_store.append(batch_old.clone()).unwrap(); + indexes + .insert_with_batch_position(&batch_old, 0, Some(0)) + .unwrap(); + + // Second append (position 2): id=1 updated — same PK, later row. + let batch_new = make_batch(&schema, &[1], &["lance fresh version"]); + batch_store.append(batch_new.clone()).unwrap(); + indexes + .insert_with_batch_position(&batch_new, 2, Some(1)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("lance".to_string()), + 10, + None, + ) + .await + .expect("planner should produce an active-only plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut rows: Vec<(i32, String)> = Vec::new(); + for b in &batches { + let ids = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let texts = b + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + 
rows.push((ids.value(i), texts.value(i).to_string())); + } + } + + // id=1 must appear exactly once, and it must be the *newest* version. + let id1: Vec<&(i32, String)> = rows.iter().filter(|(id, _)| *id == 1).collect(); + assert_eq!( + id1.len(), + 1, + "updated PK id=1 must be deduped to one row; got {rows:?}" + ); + assert_eq!( + id1[0].1, "lance fresh version", + "dedup must keep the newest (max row-position) version" + ); + } + + #[tokio::test] + async fn active_stale_update_predicate_crossing_leaks() { + // A PK update that crosses out of the match set: pk=1 inserted as + // "alpha lance", then updated to "beta lance". The append-only inverted + // index keeps the old "alpha" posting live, so an "alpha" search still + // matches the STALE pk=1 row — and the fresh "beta lance" row isn't even + // a candidate, so a result-set dedup has nothing to suppress it against. + // `NewestPkFilterExec` drops it predicate-independently: pk=1's newest + // visible row is "beta lance", so the "alpha" hit is not the newest. + let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // Insert pk=1 ("alpha lance") and an unrelated live pk=2 ("alpha foo"). + let b1 = make_batch(&schema, &[1, 2], &["alpha lance", "alpha foo"]); + let (bp1, off1, _) = batch_store.append(b1.clone()).unwrap(); + indexes + .insert_with_batch_position(&b1, off1, Some(bp1)) + .unwrap(); + + // Update pk=1 → "beta lance" (no longer matches "alpha"). 
+ let b2 = make_batch(&schema, &[1], &["beta lance"]); + let (bp2, off2, _) = batch_store.append(b2.clone()).unwrap(); + indexes + .insert_with_batch_position(&b2, off2, Some(bp2)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("alpha".to_string()), + 10, + None, + ) + .await + .expect("planner should produce a plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut ids: Vec = Vec::new(); + for b in &batches { + let col = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + ids.push(col.value(i)); + } + } + + assert!( + !ids.contains(&1), + "stale pk=1 (now 'beta lance') leaked on an 'alpha' search; got ids={ids:?}" + ); + assert!( + ids.contains(&2), + "live pk=2 ('alpha foo') must still match 'alpha'; got ids={ids:?}" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index f3f15e2e680..f040428f342 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -15,8 +15,8 @@ use tracing::instrument; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkHashFilterExec, 
ROW_ADDRESS_COLUMN}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, }; @@ -33,7 +33,13 @@ pub struct LsmScanPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, + /// Over-fetch multiple for the per-source limit pushdown: block-listed + /// sources scan `(offset + limit) * factor` rows so cross-gen dedup drops + /// still leave enough live rows. Clamped to `>= 1.0`. + overfetch_factor: f64, } impl LsmScanPlanner { @@ -49,6 +55,8 @@ impl LsmScanPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, + overfetch_factor: 1.0, } } @@ -61,11 +69,24 @@ impl LsmScanPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// queries against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + + /// Set the over-fetch multiple for the per-source limit pushdown + /// (see the field docs). Clamped to `>= 1.0` at use. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = factor; + self + } + /// Create scan plan with deduplication. 
/// /// # Arguments @@ -82,7 +103,7 @@ impl LsmScanPlanner { /// Each source is independently newest-per-PK (active via the fused /// [`MemTableDedupScanExec`](super::super::memtable::scanner), flushed via /// its within-generation deletion vector) and a cross-generation block-list - /// ([`PkHashFilterExec`]) drops any PK superseded by a newer generation. + /// ([`PkBlockFilterExec`]) drops any PK superseded by a newer generation. /// Each PK therefore survives in exactly one source, so a plain /// `UnionExec` carries at most one row per PK — no cross-source dedup, /// sort, or merge needed. `_memtable_gen` / `_rowaddr` are output-only and @@ -119,7 +140,6 @@ impl LsmScanPlanner { // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -130,23 +150,59 @@ impl LsmScanPlanner { // cross-gen block-list, not from output ordering. let sources: Vec<_> = sources.into_iter().rev().collect(); + // Per-source limit pushdown: an unordered LIMIT needs only + // `offset + limit` live rows from EACH source to fill the global + // limit after dedup (any-N semantics), so cap every on-disk source + // instead of scanning whole generations and trimming above the + // union. Block-listed sources over-fetch by `overfetch_factor` so + // cross-gen dedup drops still leave `n_needed` live rows; the + // PkBlockFilter warns when that was not enough. The active memtable + // is in-memory and within-gen append duplicates are resolved by its + // own dedup, so it is never capped here. + let n_needed = limit.map(|l| l.saturating_add(offset.unwrap_or(0))); + let overfetch = self.overfetch_factor.max(1.0); + let mut source_plans = Vec::new(); for source in sources { let is_base = matches!(source, LsmDataSource::BaseTable { .. 
}); - let scan = self.build_source_scan(&source, projection, filter).await?; + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists + .get(&(source.shard_id(), source.generation())) + .cloned(); + let fetch = match (n_needed, is_active) { + (Some(n), false) => Some(if blocked.is_some() { + ((n as f64) * overfetch).ceil() as usize + } else { + n + }), + _ => None, + }; + let scan = self + .build_source_scan(&source, projection, filter, fetch) + .await?; // Drop cross-generation stale rows (PKs superseded by a newer gen). - // `k = 0`: there is no top-k, so the under-fetch warning never fires. - let scan = match block_lists.get(&(source.shard_id(), source.generation())) { - Some(set) => Arc::new(PkHashFilterExec::new( + // With a limit, `k = n_needed` arms the under-fetch warning; with + // no limit `k = 0` keeps it silent. + let scan = match blocked { + Some(set) => Arc::new(PkBlockFilterExec::new( scan, self.pk_columns.clone(), - set.clone(), - 0, + set, + n_needed.unwrap_or(0), )) as Arc, None => scan, }; + // Post-block-list cap: each source contributes at most `n_needed` + // live rows toward the global limit. + let scan: Arc = match n_needed { + Some(n) if !is_active => Arc::new( + datafusion::physical_plan::limit::LocalLimitExec::new(scan, n), + ), + _ => scan, + }; + // When `_rowaddr` is surfaced, NULL it for non-base arms: only base // values are meaningful (e.g. for `take_rows`); per-source addresses // collide with base IDs. @@ -229,6 +285,7 @@ impl LsmScanPlanner { source: &LsmDataSource, projection: Option<&[String]>, filter: Option<&Expr>, + fetch: Option, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -247,13 +304,22 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown (post-filter rows): bounds the + // physical scan instead of trimming above the union. 
+ if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = @@ -264,6 +330,12 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown: flushed generations are + // within-gen live (dedup-on-flush deletion vectors), so any + // `fetch` post-filter rows are valid contributions. + if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } @@ -413,13 +485,36 @@ mod integration_tests { .unwrap() } - /// Create a dataset at the given URI with the provided batches. + /// Create a dataset at the given URI with the provided batches. Also writes + /// the standalone PK sidecar (on `id`) so a flushed-generation source can be + /// probed by the block-list; harmless for a base table (never probed). 
async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + if has_id { + super::super::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset + } + + /// Build an in-memory memtable's `(batch_store, index_store)` with the PK + /// index enabled and populated (mirrors production — the block-list needs + /// the PK index to dedup in-memory generations). + fn pk_indexed(batches: &[RecordBatch]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(100)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for b in batches { + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(b, off, Some(bp)).unwrap(); + } + (batch_store, Arc::new(index)) } /// Setup a multi-level LSM structure with: @@ -470,10 +565,8 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Create active memtable - let batch_store = Arc::new(BatchStore::with_capacity(100)); - let index_store = Arc::new(IndexStore::new()); - let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); - let _ = batch_store.append(active_batch); + let (batch_store, index_store) = + pk_indexed(&[create_test_batch(&schema, &[5, 6, 7], "active")]); let active_memtable = InMemoryMemTables { active: InMemoryMemTableRef { @@ -515,18 +608,18 @@ mod integration_tests { // Verify the plan (gen DESC order: active -> gen2 -> gen1 -> base): // - plain UnionExec at top // - active arm: MemTableDedupScanExec (newest gen, not block-listed) - // - older 
arms: PkHashFilterExec (cross-gen block-list) -> LanceRead + // - older arms: PkBlockFilterExec (cross-gen block-list) -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... CoalescePartitionsExec UnionExec MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -549,9 +642,9 @@ mod integration_tests { // Verify the plan with `_memtable_gen` tags (gen DESC order): // - plain UnionExec at top - // - each arm: MemtableGenTagExec -> (PkHashFilterExec ->) data source + // - each arm: MemtableGenTagExec -> (PkBlockFilterExec ->) data source // - gen3 (active): MemtableGenTagExec -> MemTableDedupScanExec - // - gen2/gen1/base: MemtableGenTagExec -> PkHashFilterExec -> LanceRead + // - gen2/gen1/base: MemtableGenTagExec -> PkBlockFilterExec -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... @@ -560,13 +653,13 @@ mod integration_tests { MemtableGenTagExec: gen=gen3 MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -647,14 +740,14 @@ mod integration_tests { } // base/gen1/gen2 all hold PKs superseded by a newer generation, so each - // is wrapped in a `PkHashFilterExec`; the newest (active) arm is not. 
+ // is wrapped in a `PkBlockFilterExec`; the newest (active) arm is not. let plan = scanner.create_plan().await.unwrap(); let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "filtered-read plan must apply the cross-gen block-list, got:\n{}", plan_str ); @@ -730,21 +823,21 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Frozen gen3 (sealed, NOT in the manifest) and active gen4. - let frozen_store = Arc::new(BatchStore::with_capacity(100)); - let _ = frozen_store.append(create_test_batch(&schema, &[6, 7], "frozen")); + let (frozen_store, frozen_index) = + pk_indexed(&[create_test_batch(&schema, &[6, 7], "frozen")]); let frozen = InMemoryMemTableRef { batch_store: frozen_store, - index_store: Arc::new(IndexStore::new()), + index_store: frozen_index, schema: schema.clone(), generation: 3, }; - let active_store = Arc::new(BatchStore::with_capacity(100)); - let _ = active_store.append(create_test_batch(&schema, &[7, 8], "active")); + let (active_store, active_index) = + pk_indexed(&[create_test_batch(&schema, &[7, 8], "active")]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store: active_store, - index_store: Arc::new(IndexStore::new()), + index_store: active_index, schema: schema.clone(), generation: 4, }, @@ -969,12 +1062,12 @@ mod integration_tests { ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... 
+ PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1037,14 +1130,14 @@ mod integration_tests { MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1113,6 +1206,8 @@ mod integration_tests { let mut index_store = IndexStore::new(); // Add BTree index on id column (field_id=0) index_store.add_btree("id_idx".to_string(), 0, "id".to_string()); + // Reuse it as the PK index so the block-list can dedup this generation. + index_store.enable_pk_index(&[("id".to_string(), 0)]); let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); let _ = batch_store.append(active_batch.clone()); @@ -1177,7 +1272,7 @@ mod integration_tests { // 1. Verify overall structure assert!(plan_str.contains("UnionExec"), "Should have UnionExec"); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "older generations should be block-list filtered" ); assert!( @@ -1365,7 +1460,6 @@ mod integration_tests { // Active memtable: id=10 inserted ("keep") then updated to NULL within // the same generation; id=20 ("active_20") is a control that matches. 
- let batch_store = Arc::new(BatchStore::with_capacity(16)); let active_batch = RecordBatch::try_new( schema.clone(), vec![ @@ -1378,12 +1472,12 @@ mod integration_tests { ], ) .unwrap(); - batch_store.append(active_batch).unwrap(); + let (batch_store, index_store) = pk_indexed(&[active_batch]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store, - index_store: Arc::new(IndexStore::new()), + index_store, schema: schema.clone(), generation: 1, }, diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index a6063c2930c..2da4b5cd9a6 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -9,17 +9,20 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_array::{Array, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{SchemaRef, SortOptions}; use datafusion::common::ScalarValue; use datafusion::execution::TaskContext; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::limit::GlobalLimitExec; +use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{Expr, SessionContext}; use futures::TryStreamExt; +use lance_core::utils::bloomfilter::sbbf::Sbbf; use lance_core::{Result, is_system_column}; use lance_datafusion::exec::OneShotExec; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; use tracing::instrument; use crate::dataset::mem_wal::index::IndexStore; @@ -27,11 +30,8 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{ - BloomFilterGuardExec, CoalesceFirstExec, DedupDirection, WithinSourceDedupExec, - compute_pk_hash_from_scalars, 
-}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, wants_row_address, wants_row_id, @@ -87,7 +87,9 @@ pub struct LsmPointLookupPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Precomputed canonical output schema for the no-projection case, so the /// hot `lookup(.., None)` path clones an `Arc` instead of rebuilding the /// schema on every call. @@ -120,6 +122,7 @@ impl LsmPointLookupPlanner { bloom_filters: std::collections::HashMap::new(), session: None, flushed_cache: None, + warmer: None, none_target, task_ctx: SessionContext::new().task_ctx(), } @@ -137,11 +140,17 @@ impl LsmPointLookupPlanner { /// front during scan setup via /// [`DatasetMemWalExt::prewarm_mem_wal`](crate::dataset::mem_wal::DatasetMemWalExt::prewarm_mem_wal) /// so the first gen-key lookup does not pay the dataset open. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Add a bloom filter for a generation. /// /// Bloom filters are optional but improve performance by skipping @@ -546,9 +555,13 @@ impl LsmPointLookupPlanner { scanner.create_plan().await? } LsmDataSource::FlushedMemTable { path, .. 
} => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; scanner.filter_expr(filter.clone()); @@ -573,19 +586,29 @@ impl LsmPointLookupPlanner { // multiple rows sharing the target primary key. scanner.with_row_id(); let raw = scanner.create_plan().await?; - // Within the active memtable, larger `_rowid` = newer - // insert. After dedup there is exactly one row per PK. - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - raw, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); + // The filter already restricts to the exact PK value, so the + // scan yields that key's insert history. Within the active + // memtable larger `_rowid` = newer insert, so sorting `_rowid` + // DESC and keeping the first row picks the newest version — one + // row per (value-exact) PK. + let rowid_idx = raw.schema().index_of(lance_core::ROW_ID)?; + let ordering = LexOrdering::new(vec![PhysicalSortExpr { + expr: Arc::new(Column::new(lance_core::ROW_ID, rowid_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }]) + .ok_or_else(|| { + lance_core::Error::internal("point-lookup: failed to build _rowid ordering") + })?; + let newest: Arc = + Arc::new(SortExec::new(ordering, raw).with_fetch(Some(1))); // Per-source `_rowid` would collide with the base table's; // NULL it before canonicalization (the value is internal to // this arm). project_to_canonical drops it entirely when // the user didn't request `_rowid` in the projection. - null_columns(deduped, &[lance_core::ROW_ID])? + null_columns(newest, &[lance_core::ROW_ID])? 
} }; project_to_canonical(scan, &target) @@ -642,10 +665,6 @@ fn probe_position( pk_column: &str, pk_value: &ScalarValue, ) -> Result { - let Some(btree) = index_store.get_btree_by_column(pk_column) else { - return Ok(ProbePos::NoIndex); - }; - // Visible batches are the committed prefix [0, last_visible_idx]; each // `StoredBatch` carries its cumulative `row_offset`, so visibility and the // position→batch mapping are O(1)/O(log) with no per-probe allocation. @@ -661,22 +680,37 @@ fn probe_position( if visible_end == 0 { return Ok(ProbePos::Miss); } + let max_visible_row = visible_end - 1; - // Newest visible position of the key — a single seek-and-stop on the - // ordered skiplist (largest key ≤ (value, max_visible_row)). No range - // collect, no allocation. - let Some(pos) = btree.get_newest_visible(pk_value, visible_end - 1) else { + // A single-column primary key always has a value-keyed BTree (reused or + // auto-created — see `IndexStore::enable_pk_index`): collision-free, so one + // seek yields the answer with no re-check. Absent only when the table has no + // PK index, where the caller falls back to the plan path. + let Some(btree) = index_store.get_btree_by_column(pk_column) else { + return Ok(ProbePos::NoIndex); + }; + let Some(pos) = btree.get_newest_visible(pk_value, max_visible_row) else { return Ok(ProbePos::Miss); }; + let (batch_idx, row) = resolve_position(batch_store, last_visible_idx, pos)?; + Ok(ProbePos::Found { batch_idx, row }) +} - // Binary-search the owning batch by `row_offset` (appended in order). +/// Map a global row `position` to its `(batch_idx, row_in_batch)` by binary +/// searching the visible batch prefix on cumulative `row_offset` (batches are +/// appended in order). 
+fn resolve_position( + batch_store: &BatchStore, + last_visible_idx: usize, + position: u64, +) -> Result<(usize, usize)> { let (mut lo, mut hi) = (0usize, last_visible_idx); while lo < hi { let mid = lo + (hi - lo).div_ceil(2); let off = batch_store.get(mid).map(|b| b.row_offset).ok_or_else(|| { lance_core::Error::internal("point-lookup: batch index out of range during search") })?; - if off <= pos { + if off <= position { lo = mid; } else { hi = mid - 1; @@ -685,10 +719,7 @@ fn probe_position( let stored = batch_store .get(lo) .ok_or_else(|| lance_core::Error::internal("point-lookup: resolved batch missing"))?; - Ok(ProbePos::Found { - batch_idx: lo, - row: (pos - stored.row_offset) as usize, - }) + Ok((lo, (position - stored.row_offset) as usize)) } /// Gather `rows` from `batch_store`'s batch `batch_idx` into the `target` @@ -1097,8 +1128,8 @@ mod tests { // Regression: same primary key inserted twice into one active // memtable must return the *newest* row. The bug was that // `FilterExec → LIMIT 1` over an insert-ordered scan returned the - // first (oldest) match. `WithinSourceDedupExec` collapses by PK, - // keeping the row with the largest `_rowid` (insert order). + // first (oldest) match. The plan-path active arm now sorts `_rowid` + // DESC and keeps the first row (largest `_rowid` = newest insert). 
use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use futures::TryStreamExt; @@ -1118,17 +1149,17 @@ mod tests { let b_old = create_test_batch(&schema, &[1], "old"); let b_new = create_test_batch(&schema, &[1], "new"); let b_other = create_test_batch(&schema, &[2], "two"); - let (_, _, bp_old) = batch_store.append(b_old.clone()).unwrap(); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); index_store - .insert_with_batch_position(&b_old, 0, Some(bp_old)) + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) .unwrap(); - let (_, _, bp_new) = batch_store.append(b_new.clone()).unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); index_store - .insert_with_batch_position(&b_new, 1, Some(bp_new)) + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) .unwrap(); - let (_, _, bp_other) = batch_store.append(b_other.clone()).unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); index_store - .insert_with_batch_position(&b_other, 2, Some(bp_other)) + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) .unwrap(); let index_store = Arc::new(index_store); @@ -1168,6 +1199,88 @@ mod tests { ); } + #[tokio::test] + async fn test_point_lookup_probes_auto_created_pk_btree() { + // No user `add_btree` on the PK column — only `enable_pk_index`, which + // auto-creates a BTree on the primary key (the production default). The + // fast probe must resolve the newest visible version through that + // collision-free BTree rather than falling back to the plan path. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + + let schema = create_pk_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + // No `add_btree` — `enable_pk_index` auto-creates the PK BTree. + index_store.enable_pk_index(&[("id".to_string(), 0)]); + + // pk=1 written twice (the newer second), plus an unrelated pk=2. + let b_old = create_test_batch(&schema, &[1], "old"); + let b_new = create_test_batch(&schema, &[1], "new"); + let b_other = create_test_batch(&schema, &[2], "two"); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) + .unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) + .unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + let planner = LsmPointLookupPlanner::new(collector, vec!["id".to_string()], schema); + + // `lookup` takes the fast probe path (single-column PK, no system cols). 
+ let hit = planner + .lookup(&[ScalarValue::Int32(Some(1))], None) + .await + .unwrap() + .expect("pk=1 must be found via the PK-position index probe"); + assert_eq!(hit.num_rows(), 1); + let name = hit + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + name.value(0), + "new_1", + "probe must return the newest version" + ); + + // An absent key resolves to None (no on-disk sources to consult). + assert!( + planner + .lookup(&[ScalarValue::Int32(Some(999))], None) + .await + .unwrap() + .is_none(), + "absent key must miss" + ); + } + #[tokio::test] async fn test_point_lookup_flushed_memtable_returns_newest_duplicate() { // Regression / invariant pin: when a flushed memtable contains two diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index b6b1f952b25..7f849f3d8bf 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -27,8 +27,7 @@ use crate::io::exec::TakeExec; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, WithinSourceDedupExec}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, wants_row_id, @@ -38,10 +37,12 @@ use crate::session::Session; /// Plans vector search queries over LSM data. /// /// Each source is independently newest-per-PK before the union — the active -/// memtable via an over-fetched KNN + within-source dedup, flushed generations -/// via their within-generation deletion vector — and the cross-generation -/// block-list ([`super::exec::PkHashFilterExec`]) drops any PK superseded by a -/// newer generation. 
So each PK reaches the union from exactly one source and a +/// memtable via an over-fetched KNN + a newest-per-PK recency filter +/// ([`super::exec::NewestPkFilterExec`], which drops a hit that isn't the newest +/// visible version of its PK), flushed generations via their within-generation +/// deletion vector — and the cross-generation block-list +/// ([`super::exec::PkBlockFilterExec`]) drops any PK superseded by a newer +/// generation. So each PK reaches the union from exactly one source and a /// distance-ordered merge yields the global top-k; no cross-source dedup is /// needed. /// @@ -54,15 +55,15 @@ use crate::session::Session; /// UnionExec /// ProjectionExec (canonical output schema) /// SortExec(_distance, fetch=k) -/// WithinSourceDedupExec: KeepMaxRowAddr (active) +/// NewestPkFilterExec: newest-per-PK recency (active) /// KNNExec: active memtable, fetch=ceil(k*overfetch) /// ProjectionExec (canonical output schema) /// ProjectionExec (null_columns _rowid) -/// PkHashFilterExec: block-list (flushed) +/// PkBlockFilterExec: block-list (flushed) /// KNNExec: flushed gen N, fetch=ceil(k*overfetch) (fast_search) /// … one per flushed gen … /// ProjectionExec (canonical output schema) -/// PkHashFilterExec: block-list (base) +/// PkBlockFilterExec: block-list (base) /// KNNExec: base table, k (fast_search)[.refine()?] /// ``` /// @@ -92,7 +93,9 @@ pub struct LsmVectorSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. 
+ warmer: Option>, } impl LsmVectorSearchPlanner { @@ -121,6 +124,7 @@ impl LsmVectorSearchPlanner { dataset: None, session: None, flushed_cache: None, + warmer: None, } } @@ -133,11 +137,17 @@ impl LsmVectorSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Set the base dataset for post-rerank take. /// /// After global PK dedup and sort, a `TakeExec` against this dataset @@ -168,7 +178,7 @@ impl LsmVectorSearchPlanner { /// the rows that filtering drops: /// /// - `factor < 1.0` (e.g. `0.0`): **stale filtering off.** The per-source - /// block-list / [`super::exec::PkHashFilterExec`] is not built or applied, + /// block-list / [`super::exec::PkBlockFilterExec`] is not built or applied, /// so rows superseded by a newer generation can surface. The global PK /// dedup still runs, so it still suppresses stale copies in the cases /// where both the stale and the fresh row reach it. @@ -210,11 +220,10 @@ impl LsmVectorSearchPlanner { // live candidates after the post-filter. let overfetch_factor = overfetch_factor.max(1.0); - // Per-source PK-hash block sets (`NEWER(G)`; base = union of all gens). + // Per-source PK block sets (`NEWER(G)`; base = union of all gens). // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -233,49 +242,83 @@ impl LsmVectorSearchPlanner { // `block_lists` is non-empty exactly when a newer generation exists. 
let refine_base = refine_base_table || !block_lists.is_empty(); + // Stage per-source over-fetch decisions, then build every KNN plan + // concurrently — the builds are independent and a sequential loop was + // the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let generation = source.generation(); + let is_base = matches!(source, LsmDataSource::BaseTable { .. }); + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + // Over-fetch when the post-source filter can drop candidates: a + // blocked source loses superseded rows; the active source's + // within-source dedup collapses duplicate-PK HNSW nodes. Block + // lookup is per shard — generations are per-shard. + let blocked = block_lists.get(&(source.shard_id(), generation)); + let fetch_k = if blocked.is_some() || is_active { + ((k as f64) * overfetch_factor).ceil() as usize + } else { + k + }; + (source, is_base, is_active, blocked, fetch_k) + }) + .collect(); + let built = futures::future::try_join_all(arm_inputs.iter().map( + |(source, is_base, _, _, fetch_k)| { + Box::pin(self.build_knn_plan( + source, + query_vector, + *fetch_k, + nprobes, + projection, + *is_base && refine_base, + )) + }, + )) + .await?; + let mut knn_plans = Vec::new(); - for source in &sources { - let generation = source.generation(); - let is_base = matches!(source, LsmDataSource::BaseTable { .. }); - let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); - // Over-fetch when the post-source filter can drop candidates: a - // blocked source loses superseded rows; the active source's - // within-source dedup collapses duplicate-PK HNSW nodes. Block - // lookup is per shard — generations are per-shard. 
- let blocked = block_lists.get(&(source.shard_id(), generation)); - let fetch_k = if blocked.is_some() || is_active { - ((k as f64) * overfetch_factor).ceil() as usize - } else { - k - }; - let knn = Box::pin(self.build_knn_plan( - source, - query_vector, - fetch_k, - nprobes, - projection, - is_base && refine_base, - )) - .await?; + // `build_knn_plan` returns each active arm's max-visible snapshot + // alongside its plan; the active arm's NewestPkFilterExec needs both it + // and `source` (for the batch/index stores), so neither is discarded. + for ((source, is_base, is_active, blocked, _), (knn, active_max_visible)) in + arm_inputs.iter().zip(built) + { + let is_base = *is_base; + let is_active = *is_active; + let blocked = *blocked; // Make each source independently newest-per-PK before the union: // * active: the append-only HNSW returns one node per inserted - // version, so collapse duplicate PKs to the newest insert - // (KeepMaxRowAddr on `_rowid`) and re-sort by distance. This - // stays probabilistic — a fresh version evicted from the - // over-fetched top-k still leaks. + // version *and* leaves stale versions of updated PKs live. The + // recency filter keeps only the hit that is the newest visible + // version of its PK (per the maintained MVCC PK-position index), + // closing the predicate-crossing stale read, then re-sort by + // distance. // * flushed/base: drop cross-gen superseded rows via the // block-list (within-gen is handled by the flushed DV). let knn = if is_active { - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - knn, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); - sort_by_distance(deduped, k)? + let (batch_store, index_store) = match source { + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. 
+ } => (batch_store.clone(), index_store.clone()), + _ => unreachable!("is_active implies ActiveMemTable"), + }; + let filtered: Arc = + Arc::new(super::exec::NewestPkFilterExec::new( + knn, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store, + batch_store, + active_max_visible.expect("active arm returns its max_visible snapshot"), + )); + sort_by_distance(filtered, k)? } else { match blocked { - Some(set) => Arc::new(super::exec::PkHashFilterExec::new( + Some(set) => Arc::new(super::exec::PkBlockFilterExec::new( knn, self.pk_columns.clone(), set.clone(), @@ -301,6 +344,10 @@ impl LsmVectorSearchPlanner { // No cross-source dedup needed (see struct doc): SortExec(per partition) // + SortPreservingMerge does the p-way distance-ordered top-k merge. #[allow(deprecated)] + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, so + // each arm's per-arm CPU (HNSW search, distance refine) runs on its own + // task without an extra repartition. let merged: Arc = Arc::new(UnionExec::new(knn_plans)); let distance_idx = merged.schema().index_of(DISTANCE_COLUMN).map_err(|_| { @@ -364,11 +411,15 @@ impl LsmVectorSearchPlanner { merged_sorted }; - // Under-fetch is warned per-source inside `PkHashFilterExec`. + // Under-fetch is warned per-source inside `PkBlockFilterExec`. Ok(result) } /// Build KNN plan for a single data source. + /// + /// Returns the plan and, for the active memtable, the `max_visible_batch_position` + /// snapshot its scanner latched — threaded into the recency filter so it keys + /// on the same snapshot the search saw (`None` for base / flushed sources). 
async fn build_knn_plan( &self, source: &LsmDataSource, @@ -377,7 +428,7 @@ impl LsmVectorSearchPlanner { nprobes: usize, projection: Option<&[String]>, refine: bool, - ) -> Result> { + ) -> Result<(Arc, Option)> { match source { LsmDataSource::BaseTable { dataset } => { let mut scanner = dataset.scan(); @@ -402,12 +453,16 @@ impl LsmVectorSearchPlanner { if refine { scanner.refine(1); } - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = build_scanner_projection(projection, &self.base_schema, &self.pk_columns); @@ -418,7 +473,7 @@ impl LsmVectorSearchPlanner { scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); scanner.fast_search(); - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::ActiveMemTable { batch_store, @@ -436,8 +491,8 @@ impl LsmVectorSearchPlanner { build_scanner_projection(projection, &self.base_schema, &self.pk_columns); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); // Expose `_rowid` (BatchStore row offset, monotonic with - // insert order) so [`WithinSourceDedupExec`] can collapse - // duplicate-PK rows to the newest insert. The value is + // insert order) so `NewestPkFilterExec` can compare each hit's + // position against the PK-position index. The value is // per-source and NULL'd before reaching the canonical merge. 
// (VectorIndexExec only plumbs `with_row_id`, not // `with_row_address`, but the two yield identical values @@ -447,7 +502,9 @@ impl LsmVectorSearchPlanner { scanner.nearest(&self.vector_column, query_arr, k); scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Capture the scanner's own latched snapshot for the recency filter. + Ok((plan, Some(scanner.max_visible_batch_position()))) } } } @@ -567,10 +624,19 @@ mod tests { async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + // Also write the standalone PK sidecar (on `id`) so a flushed-generation + // source can be probed by the block-list (harmless for a base table). + if has_id { + crate::dataset::mem_wal::scanner::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset } #[tokio::test] @@ -641,6 +707,7 @@ mod tests { // Active memtable with HNSW index over the "vector" column. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -759,6 +826,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -838,6 +906,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -951,6 +1020,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1007,8 +1077,7 @@ mod tests { plan_str ); assert!( - plan_str.contains("WithinSourceDedupExec") - && plan_str.contains("SortPreservingMergeExec"), + plan_str.contains("NewestPkFilterExec") && plan_str.contains("SortPreservingMergeExec"), "expected per-arm dedup + distance merge, got:\n{}", plan_str ); @@ -1091,6 +1160,7 @@ mod tests { // "right" vector close to the query, plus an unrelated pk=2. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1210,6 +1280,7 @@ mod tests { // Active memtable: id=3 with HNSW index. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1436,9 +1507,9 @@ mod tests { #[tokio::test] async fn test_vector_search_dedup_within_active_memtable() { // Regression: same PK inserted twice into one active memtable with - // *different* vectors. HNSW indexes each as a distinct node, so - // without WithinSourceDedupExec a KNN can return both candidates - // for the same PK and pollute top-k. The newer insert must win. + // *different* vectors. HNSW indexes each as a distinct node, so without + // the recency filter a KNN can return both candidates for the same PK + // and pollute top-k. The newer insert must win. use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use datafusion::prelude::SessionContext; @@ -1450,6 +1521,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1513,14 +1585,14 @@ mod tests { .await .unwrap(); - // The active arm collapses duplicate-PK HNSW nodes itself via - // WithinSourceDedupExec — there is no cross-source dedup fallback. + // The active arm collapses duplicate-PK HNSW nodes itself via the + // recency filter — there is no cross-source dedup fallback. 
let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("WithinSourceDedupExec"), + plan_str.contains("NewestPkFilterExec"), "active vector arm must self-dedup, got:\n{}", plan_str ); @@ -1549,10 +1621,120 @@ mod tests { ); } + #[tokio::test] + async fn test_vector_search_active_stale_update_out_of_neighborhood() { + // BUG REPRODUCTION (vector case: a PK update that moves out of the neighborhood). + // + // Within a *single* active memtable, pk=1 is first inserted ON the query + // (distance ~0), then updated to a FAR vector. The append-only HNSW keeps + // both nodes live. A result-set dedup only collapses duplicate PKs that + // are BOTH present in the over-fetched candidate set. + // + // Here the fresh (far) pk=1 is evicted from the candidate set — there are + // enough nearer filler rows that it ranks below the fetch cutoff — so the + // dedup never sees it and the STALE near pk=1 leaks as the nearest hit. + // This is the predicate-crossing hole: the row that *would* suppress the + // stale version isn't in the result set, so result-set dedup can't help. + // + // Desired (NewestPkFilterExec) behaviour: pk=1's newest row-position is + // the far one, computed predicate-independently over the whole memtable, + // so the stale near node is dropped and pk=1 must NOT surface at ~0. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use datafusion::prelude::SessionContext; + use futures::TryStreamExt; + + let schema = create_vector_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); + index_store.add_hnsw( + "vector_hnsw".to_string(), + 1, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + 64, + 8, + ); + + // First append: stale pk=1 ON the query, plus five filler rows strictly + // farther than pk=1 but far nearer than the eventual fresh pk=1. + let q = [0.1, 0.2, 0.3, 0.4]; + let stale_then_fillers = batch_rows( + &schema, + &[ + (1, q), + (10, [0.11, 0.21, 0.31, 0.41]), + (11, [0.13, 0.23, 0.33, 0.43]), + (12, [0.15, 0.25, 0.35, 0.45]), + (13, [0.17, 0.27, 0.37, 0.47]), + (14, [0.19, 0.29, 0.39, 0.49]), + ], + ); + let (bp0, off0, _) = batch_store.append(stale_then_fillers.clone()).unwrap(); + index_store + .insert_with_batch_position(&stale_then_fillers, off0, Some(bp0)) + .unwrap(); + + // Second append: the UPDATE — pk=1 moved far from the query. This is the + // newest version (largest row position) but it sits well outside top-k. 
+ let fresh_pk1 = batch_rows(&schema, &[(1, [9.0, 9.0, 9.0, 9.0])]); + let (bp1, off1, _) = batch_store.append(fresh_pk1.clone()).unwrap(); + index_store + .insert_with_batch_position(&fresh_pk1, off1, Some(bp1)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = uuid::Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmVectorSearchPlanner::new( + collector, + vec!["id".to_string()], + schema, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + ); + + // k=3, no over-fetch: the candidate set is {pk1@near, two nearest + // fillers}; fresh pk1@far ranks 7th and never enters the candidates. + let query = create_query_vector(); + let plan = planner + .plan_search(&query, 3, 1, None, false, 1.0) + .await + .unwrap(); + let ctx = SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let rows = collect_id_dist(&batches); + + assert!( + !rows.iter().any(|&(id, d)| id == 1 && d.abs() < 1e-3), + "stale near pk=1 leaked: its live vector is far from the query, so it \ + must not appear at distance ~0. results={:?}", + rows + ); + } + #[tokio::test] async fn test_vector_search_stale_read_when_fresh_falls_out_of_top_k() { // Regression for the cross-generation stale-read gap that the - // PkHashFilterExec block-list closes. + // PkBlockFilterExec block-list closes. // // Scenario: // * Base (gen 0): stale pk=1 sitting on the query (distance ~0). @@ -1587,6 +1769,7 @@ mod tests { // active arm surfaces pk=2 and drops fresh pk=1. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1783,6 +1966,7 @@ mod tests { // Active (gen 1): pk 1,2,3 re-inserted with a far vector (the fresh value). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1987,6 +2171,7 @@ mod tests { // Active: (1,1) re-inserted far (fresh) + an unrelated nearby (2,2). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id1".to_string(), 0), ("id2".to_string(), 1)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -2091,6 +2276,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, diff --git a/rust/lance/src/dataset/mem_wal/util.rs b/rust/lance/src/dataset/mem_wal/util.rs index d1413b84b2a..3f5090f6b40 100644 --- a/rust/lance/src/dataset/mem_wal/util.rs +++ b/rust/lance/src/dataset/mem_wal/util.rs @@ -169,6 +169,16 @@ pub fn flushed_memtable_path( shard_base_path(base_path, shard_id).join(format!("{}_gen_{}", random_hash, generation)) } +/// Subdirectory of a flushed generation holding its standalone primary-key +/// dedup index (a sidecar BTree, not registered in the manifest). Both the +/// flush writer and the block-list probe join this onto the generation path. +pub const PK_INDEX_DIR: &str = "_pk_index"; + +/// Path to a flushed generation's standalone primary-key dedup index. +pub fn pk_index_path(gen_path: &Path) -> Path { + gen_path.clone().join(PK_INDEX_DIR) +} + /// Generate an 8-character random hex string for flushed MemTable directories. 
pub fn generate_random_hash() -> String { let bytes: [u8; 4] = rand::random(); diff --git a/rust/lance/src/dataset/mem_wal/wal.rs b/rust/lance/src/dataset/mem_wal/wal.rs index 6232ab0d8ae..afccfbc0979 100644 --- a/rust/lance/src/dataset/mem_wal/wal.rs +++ b/rust/lance/src/dataset/mem_wal/wal.rs @@ -39,6 +39,17 @@ use super::memtable::batch_store::{BatchStore, StoredBatch}; /// Key for storing writer epoch in Arrow IPC file schema metadata. pub const WRITER_EPOCH_KEY: &str = "writer_epoch"; +/// Marks a WAL entry as a data-less fence sentinel (observability only; +/// replay skips sentinels via their empty batch list). +pub const FENCE_SENTINEL_KEY: &str = "fence_sentinel"; + +/// True if `error` is the terminal fence emitted by `ManifestStore::check_fenced` +/// (a successor claimed a higher epoch). Matches the message it formats, since +/// fences surface as a plain `Error::io` rather than a typed variant. +fn is_fence_error(error: &Error) -> bool { + error.to_string().contains("Writer fenced") +} + /// Watcher for batch durability using watermark-based tracking. /// /// Uses a shared watch channel that broadcasts the durable watermark. @@ -49,22 +60,36 @@ pub struct BatchDurableWatcher { rx: watch::Receiver, /// Target batch ID to wait for. target_batch_position: usize, + /// Terminal flush failure (e.g. a fence) shared with the flusher. When + /// set, the watermark will never advance to the target, so `wait` + /// returns this error instead of blocking forever. + terminal_error: Arc>>, } impl BatchDurableWatcher { /// Create a new watcher for a specific batch ID. - pub fn new(rx: watch::Receiver, target_batch_position: usize) -> Self { + pub fn new( + rx: watch::Receiver, + target_batch_position: usize, + terminal_error: Arc>>, + ) -> Self { Self { rx, target_batch_position, + terminal_error, } } /// Wait until the batch is durable. /// - /// Returns Ok(()) when `durable_watermark >= target_batch_position`. 
+ /// Returns Ok(()) when `durable_watermark >= target_batch_position`, or + /// Err if a terminal flush failure (e.g. a fence) means the watermark can + /// never reach the target. pub async fn wait(&mut self) -> Result<()> { loop { + if let Some(msg) = self.terminal_error.lock().unwrap().clone() { + return Err(Error::io(msg)); + } let current = *self.rx.borrow(); if current >= self.target_batch_position { return Ok(()); @@ -313,6 +338,11 @@ pub struct WalFlusher { /// Created at construction and recreated after each flush. /// Used by backpressure to wait for WAL flushes. wal_flush_cell: std::sync::Mutex>>, + /// First terminal flush failure (a fence). Shared with every + /// `BatchDurableWatcher` so a fenced flush — which never advances the + /// watermark — wakes durability waiters with the error instead of + /// hanging them forever. + terminal_error: Arc>>, } impl WalFlusher { @@ -334,6 +364,7 @@ impl WalFlusher { shard_id, flush_tx: None, wal_flush_cell: std::sync::Mutex::new(Some(wal_flush_cell)), + terminal_error: Arc::new(StdMutex::new(None)), } } @@ -354,7 +385,27 @@ impl WalFlusher { pub fn track_batch(&self, batch_position: usize) -> BatchDurableWatcher { // Return a watcher that waits for this batch to become durable // batch_position is 0-indexed, so we wait for watermark > batch_position (i.e., >= batch_position + 1) - BatchDurableWatcher::new(self.durable_watermark_rx.clone(), batch_position + 1) + BatchDurableWatcher::new( + self.durable_watermark_rx.clone(), + batch_position + 1, + Arc::clone(&self.terminal_error), + ) + } + + /// Record a terminal flush failure (a fence) and wake every pending + /// durability waiter. A fence is permanent — the watermark will never + /// advance — so waiters must observe the error rather than block forever. + /// Idempotent: only the first failure is retained. 
+ fn mark_terminal_failure(&self, error: &Error) { + { + let mut slot = self.terminal_error.lock().unwrap(); + if slot.is_none() { + *slot = Some(error.to_string()); + } + } + // Wake `wait`ers without advancing the watermark; each re-checks + // `terminal_error` and returns the error. + self.durable_watermark_tx.send_modify(|_| {}); } /// Get the current durable watermark. @@ -427,7 +478,7 @@ impl WalFlusher { source: &WalFlushSource, end_batch_position: usize, ) -> Result { - match source { + let result = match source { WalFlushSource::BatchStore { batch_store, indexes, @@ -436,7 +487,16 @@ impl WalFlusher { .await } WalFlushSource::WalOnly { state } => self.flush_from_wal_only(state).await, + }; + // A fence is terminal: the append will never succeed, so the + // durability watermark can never advance. Wake any waiter (e.g. a + // `durable_write` put) with the fence error instead of hanging it. + if let Err(e) = &result + && is_fence_error(e) + { + self.mark_terminal_failure(e); } + result } async fn flush_from_batch_store( @@ -882,6 +942,52 @@ impl WalAppender { self.manifest_store.check_fenced(self.writer_epoch).await } + /// Drop a data-less sentinel at the WAL tip so the predecessor's next + /// `append` collides on PUT-IF-NOT-EXISTS and learns it is fenced, rather + /// than succeeding into the empty next slot. Call *before* replay: any + /// predecessor entry below the sentinel is then recovered, not orphaned. + /// On a lost slot race, re-probes one past the winner. Seeds next position + /// past the sentinel; returns the sentinel position. 
+ pub(crate) async fn write_fence_sentinel(&self) -> Result { + let sentinel = Bytes::from(serialize_fence_sentinel(self.writer_epoch)?); + let mut next_pos = self.next_entry_position.lock().await; + let mut pos = match *next_pos { + Some(p) => p, + None => self.discover_next_position().await?, + }; + let mut conflicts = 0; + loop { + match atomic_put( + self.object_store.as_ref(), + &self.wal_dir, + &wal_entry_filename(pos), + sentinel.clone(), + ) + .await + { + Ok(()) => { + let next = pos.checked_add(1).ok_or_else(|| { + Error::io(format!("WAL position overflow for shard {}", self.shard_id)) + })?; + *next_pos = Some(next); + self.next_entry_position_hint.store(next, Ordering::SeqCst); + return Ok(pos); + } + Err(AtomicPutError::AlreadyExists) => { + conflicts += 1; + if conflicts >= MAX_APPEND_CREATE_CONFLICTS { + return Err(Error::io(format!( + "fence sentinel write for shard {} failed after {} conflicts", + self.shard_id, conflicts + ))); + } + pos = self.discover_next_position().await?; + } + Err(AtomicPutError::Other(error)) => return Err(error), + } + } + } + async fn discover_next_position(&self) -> Result { if let Ok(Some(manifest)) = self.manifest_store.read_latest().await { let hint = manifest.wal_entry_position_last_seen; @@ -1053,6 +1159,28 @@ fn serialize_appender_batches(batches: &[RecordBatch], writer_epoch: u64) -> Res Ok(buffer) } +/// Data-less sentinel: an empty-schema Arrow IPC stream with the writer epoch +/// and a marker flag, no batches. Reads back as `(epoch, [])` so replay skips +/// it. See [`WalAppender::write_fence_sentinel`]. 
+fn serialize_fence_sentinel(writer_epoch: u64) -> Result> { + let mut metadata = std::collections::HashMap::new(); + metadata.insert(WRITER_EPOCH_KEY.to_string(), writer_epoch.to_string()); + metadata.insert(FENCE_SENTINEL_KEY.to_string(), "true".to_string()); + let ipc_schema = Arc::new(ArrowSchema::new_with_metadata( + arrow_schema::Fields::empty(), + metadata, + )); + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &ipc_schema) + .map_err(|e| Error::io(format!("failed to create fence sentinel IPC writer: {}", e)))?; + writer + .finish() + .map_err(|e| Error::io(format!("failed to finish fence sentinel IPC stream: {}", e)))?; + } + Ok(buffer) +} + fn deserialize_appender_batches(bytes: Bytes) -> Result<(u64, Vec)> { let cursor = Cursor::new(bytes); let reader = StreamReader::try_new(cursor, None) @@ -1584,6 +1712,108 @@ mod tests { ); } + #[tokio::test] + async fn test_fence_sentinel_fences_predecessor_without_successor_write() { + // The race the sentinel closes: a successor claims a higher epoch but + // has NOT yet written any data batch. Without the sentinel, the + // predecessor's next append lands in the empty next slot, succeeds, + // and false-acks. With the sentinel, the predecessor collides. + let (store, base_path, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + + let first = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + let schema = create_test_schema(); + let batch = create_test_batch(&schema, 1); + first.append(vec![batch.clone()]).await.unwrap(); // position 1 + + // Successor claims epoch 2 and drops a sentinel at the tip (position 2) + // — but writes no data of its own. 
+ let second = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + assert_eq!(second.writer_epoch(), 2); + let sentinel_pos = second.write_fence_sentinel().await.unwrap(); + assert_eq!(sentinel_pos, 2, "sentinel should land at the tip"); + + // Predecessor's next append collides with the sentinel and is fenced. + let err = first.append(vec![batch.clone()]).await.unwrap_err(); + assert!( + err.to_string().contains("Writer fenced"), + "expected fence error from append, got: {err}" + ); + + // The sentinel is data-less: a tailer reads it back as zero batches so + // replay skips it. + let tailer = WalTailer::new(store.clone(), base_path.clone(), shard_id); + let entry = tailer.read_entry(sentinel_pos).await.unwrap().unwrap(); + assert_eq!(entry.writer_epoch, 2); + assert!(entry.batches.is_empty(), "sentinel must carry no batches"); + + // Successor's own writes land after the sentinel (position 3). + let res = second.append(vec![batch]).await.unwrap(); + assert_eq!(res.entry_position, 3); + } + + // Regression: a fenced WAL flush never advances the durability watermark. + // A `durable_write` put waits on a `BatchDurableWatcher`, so without + // terminal-failure propagation the watcher blocks forever (the predecessor + // pod's HTTP write hangs until the client times out). The flusher must + // surface the fence through the watcher so the caller fails fast with 410. + #[tokio::test] + async fn test_durable_watcher_aborts_on_fence_instead_of_hanging() { + let (store, base_path, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let schema = create_test_schema(); + + // Predecessor claims epoch 1 and writes one entry (position 1), seeding + // its cached next position at 2. The flusher shares this appender. 
+ let first = Arc::new( + WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(), + ); + assert_eq!(first.writer_epoch(), 1); + first + .append(vec![create_test_batch(&schema, 1)]) + .await + .unwrap(); + let flusher = WalFlusher::new(Arc::clone(&first)); + + // Successor claims epoch 2 and drops a sentinel at the predecessor's + // next slot (position 2) — a rolling-restart pod replacement. + let second = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + assert_eq!(second.writer_epoch(), 2); + assert_eq!(second.write_fence_sentinel().await.unwrap(), 2); + + // A durable put on the predecessor: stage a batch and track it. + let batch_store = Arc::new(BatchStore::with_capacity(10)); + batch_store.append(create_test_batch(&schema, 1)).unwrap(); + let mut watcher = flusher.track_batch(0); + + // Flushing collides with the sentinel and fences. Both the flush result + // and the watcher must report the fence — and the watcher must resolve + // promptly, not block on a watermark that can never advance. 
+ let source = batch_store_source(&batch_store); + let flush_err = flusher.flush(&source, batch_store.len()).await.unwrap_err(); + assert!( + is_fence_error(&flush_err), + "expected fence error from flush, got: {flush_err}" + ); + + let waited = tokio::time::timeout(std::time::Duration::from_secs(5), watcher.wait()).await; + let err = waited + .expect("watcher.wait() hung after a fenced flush") + .expect_err("watcher must surface the fence, not report success"); + assert!( + is_fence_error(&err), + "watcher must report the fence so the HTTP layer maps 410, got: {err}" + ); + } + #[tokio::test] async fn test_wal_appender_rejects_invalid_input() { let (store, base_path, _temp_dir) = create_local_store().await; diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 0788b657366..5faa65d8e7d 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -47,8 +47,10 @@ pub use super::util::{WatchableOnceCell, WatchableOnceCellReader}; pub use super::wal::{WalEntry, WalEntryData, WalFlushResult, WalFlusher}; use super::memtable::flush::TriggerMemTableFlush; +use super::scanner::GenerationWarmer; use super::wal::{ - TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, empty_flush_result, + BatchDurableWatcher, TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, + empty_flush_result, }; use super::manifest::ShardManifestStore; @@ -177,6 +179,21 @@ pub struct ShardWriterConfig { /// Default: 60 seconds pub stats_log_interval: Option, + /// How long a frozen memtable lingers in memory after its flush commits, + /// before it is evicted and served only from the on-disk flushed dataset. + /// + /// `Duration::ZERO` (the default) disables retention: evict on commit, no + /// sweep ticker. Correct for single-shot queries, which can't observe a + /// generation evicted mid-read. + /// + /// A non-zero value is required only for queries split across reads (e.g. 
+ /// fresh tier and base table read separately, then deduped): the flushed + /// dataset loses the per-batch boundaries that bound as-of membership + /// (see [`crate::dataset::mem_wal::scanner::FreshTierWatermark`]), so a + /// generation evicted between a query's reads can serve a stale row. Set it + /// above the worst-case multi-part query latency, with margin. + pub frozen_memtable_grace: Duration, + /// Whether to maintain an in-memory MemTable on top of the WAL. /// /// When `true` (default), the writer maintains an in-memory `MemTable`, @@ -216,6 +233,11 @@ pub struct ShardWriterConfig { /// /// Default: empty. pub hnsw_params: HashMap, + + /// Optional warmer fired pre-commit for each new generation (zero cold reads + /// on first query). Wired to the flusher; supplied by the consumer (e.g. the + /// WAL pod). Default: `None`. + pub warmer: Option>, } impl Default for ShardWriterConfig { @@ -236,8 +258,10 @@ impl Default for ShardWriterConfig { async_index_buffer_rows: 10_000, async_index_interval: Duration::from_secs(1), stats_log_interval: Some(Duration::from_secs(60)), // 1 minute + frozen_memtable_grace: Duration::ZERO, enable_memtable: true, hnsw_params: HashMap::new(), + warmer: None, } } } @@ -335,6 +359,13 @@ impl ShardWriterConfig { self } + /// Set how long a flushed memtable lingers in memory before eviction. MUST + /// exceed the maximum query elapsed time — see `frozen_memtable_grace`. + pub fn with_frozen_memtable_grace(mut self, grace: Duration) -> Self { + self.frozen_memtable_grace = grace; + self + } + /// Toggle the in-memory MemTable layer. See `enable_memtable` for the /// full WAL-only-mode contract. Defaults to `true`. pub fn with_enable_memtable(mut self, enable: bool) -> Self { @@ -708,6 +739,15 @@ pub struct WriteResult { pub batch_positions: std::ops::Range, } +/// A sealed memtable kept queryable in memory. 
`flushed_at_ms` is `None` while +/// the generation is still awaiting (or retrying) its flush, and `Some(t)` once +/// the flush commits — after which it lingers for `frozen_memtable_grace` so +/// in-flight as-of reads keep batch-resolved membership, then is swept. +struct FrozenMemTable { + memtable: Arc, + flushed_at_ms: Option, +} + /// ShardWriter state shared across tasks. struct WriterState { memtable: MemTable, @@ -716,12 +756,13 @@ struct WriterState { frozen_memtable_bytes: usize, /// Flush watchers for frozen memtables (for backpressure). frozen_flush_watchers: VecDeque<(usize, DurabilityWatcher)>, - /// Sealed-but-undrained memtables, kept queryable so a concurrent reader - /// sees no hole between `freeze_memtable` and the flush task's manifest - /// commit. Pushed in `freeze_memtable`; removed by generation in - /// `flush_memtable` on commit success only (retained on failure until a - /// later flush or WAL replay on reopen). - frozen_memtables: VecDeque>, + /// Sealed memtables, kept queryable so a concurrent reader sees no hole + /// between `freeze_memtable` and the flush task's manifest commit, and for + /// `frozen_memtable_grace` beyond it so as-of reads stay batch-resolved. + /// Pushed in `freeze_memtable`; stamped `flushed_at_ms` by `flush_memtable` + /// on commit success only (retained un-stamped on failure until a later + /// flush or WAL replay on reopen); swept after the grace by `SweepExpired`. + frozen_memtables: VecDeque, /// Flag to prevent duplicate memtable flush requests. flush_requested: bool, /// Counter for WAL flush threshold crossings. @@ -806,6 +847,8 @@ async fn replay_memtable_from_wal( position, entry.writer_epoch, our_epoch, shard_id ))); } + // Fence sentinels deserialize to zero batches and are skipped + // here — they carry only a position, no rows. 
if !entry.batches.is_empty() { memtable.insert_batches_only(entry.batches).await?; } @@ -844,6 +887,16 @@ async fn replay_memtable_from_wal( Ok(position) } +/// Pair each primary-key column name with its field id (both derived from the +/// schema's primary key, in the same order) for [`IndexStore::enable_pk_index`]. +fn pk_index_columns(pk_columns: &[String], pk_field_ids: &[i32]) -> Vec<(String, i32)> { + pk_columns + .iter() + .cloned() + .zip(pk_field_ids.iter().copied()) + .collect() +} + /// Shared state for writer operations. struct SharedWriterState { state: Arc>, @@ -853,6 +906,9 @@ struct SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + /// Primary-key column names, used to (re)enable the PK-position index on + /// each fresh active memtable created at freeze. + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -868,6 +924,7 @@ impl SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -880,6 +937,7 @@ impl SharedWriterState { config, schema, pk_field_ids, + pk_columns, max_memtable_batches, max_memtable_rows, index_configs, @@ -905,13 +963,17 @@ impl SharedWriterState { self.max_memtable_batches, )?; - if !self.index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Build an IndexStore when there are user indexes *or* a primary key: + // the PK dedup index (and its flushed on-disk sidecar) is required for + // cross-generation dedup even when no secondary index is configured. 
+ if !self.index_configs.is_empty() || !self.pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( &self.index_configs, self.max_memtable_rows, self.max_memtable_batches, - )?); - new_memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&self.pk_columns, &self.pk_field_ids)); + new_memtable.set_indexes_arc(Arc::new(indexes)); } let mut old_memtable = std::mem::replace(&mut state.memtable, new_memtable); @@ -947,10 +1009,13 @@ impl SharedWriterState { let frozen_memtable = Arc::new(old_memtable); - // Keep this generation queryable until its manifest commit lands - // (dropped in `flush_memtable`, success only). Arc refcount, not a - // copy — the flush task holds it alive for the whole drain anyway. - state.frozen_memtables.push_back(frozen_memtable.clone()); + // Keep this generation queryable past its manifest commit (swept after + // the grace by `SweepExpired`). Arc refcount, not a copy — the flush + // task holds it alive for the whole drain anyway. + state.frozen_memtables.push_back(FrozenMemTable { + memtable: frozen_memtable.clone(), + flushed_at_ms: None, + }); debug!( "Frozen memtable generation {}, pending_count = {}", @@ -958,7 +1023,7 @@ impl SharedWriterState { state.frozen_flush_watchers.len() ); - let _ = self.memtable_flush_tx.send(TriggerMemTableFlush { + let _ = self.memtable_flush_tx.send(TriggerMemTableFlush::Flush { memtable: frozen_memtable, done: None, }); @@ -1208,6 +1273,12 @@ impl ShardWriter { position_hint_seed, )); + // Fence the predecessor before replay (see `write_fence_sentinel`). + // Epoch 1 is a fresh shard with no predecessor to fence. + if epoch >= 2 { + wal_appender.write_fence_sentinel().await?; + } + // Create WAL flusher backed by the shared appender. 
let mut wal_flusher = WalFlusher::new(wal_appender); @@ -1279,11 +1350,9 @@ impl ShardWriter { ) -> Result { // Create MemTable with primary key field IDs from schema let lance_schema = Schema::try_from(schema.as_ref())?; - let pk_field_ids: Vec = lance_schema - .unenforced_primary_key() - .iter() - .map(|f| f.id) - .collect(); + let pk_fields = lance_schema.unenforced_primary_key(); + let pk_field_ids: Vec = pk_fields.iter().map(|f| f.id).collect(); + let pk_columns: Vec = pk_fields.iter().map(|f| f.name.clone()).collect(); let mut memtable = MemTable::with_capacity( schema.clone(), manifest.current_generation, @@ -1292,14 +1361,18 @@ impl ShardWriter { config.max_memtable_batches, )?; - // Create indexes if configured and set them on the MemTable. - if !index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Create indexes if configured and set them on the MemTable. The + // PK-position index is enabled before any WAL replay below so replayed + // rows are recorded in it. A primary key alone (no secondary index) + // still needs the PK index so flush writes its on-disk dedup sidecar. 
+ if !index_configs.is_empty() || !pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( index_configs, config.max_memtable_rows, config.max_memtable_batches, - )?); - memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&pk_columns, &pk_field_ids)); + memtable.set_indexes_arc(Arc::new(indexes)); } // Replay any WAL entries written after the last successfully-flushed @@ -1349,13 +1422,10 @@ impl ShardWriter { let (memtable_flush_tx, memtable_flush_rx) = mpsc::unbounded_channel(); - let flusher = Arc::new(MemTableFlusher::new( - object_store, - base_path, - base_uri, - shard_id, - manifest_store, - )); + let flusher = Arc::new( + MemTableFlusher::new(object_store, base_path, base_uri, shard_id, manifest_store) + .with_warmer(config.warmer.clone()), + ); let backpressure = BackpressureController::new(config.clone()); @@ -1370,8 +1440,14 @@ impl ShardWriter { // Background MemTable flush handler — frozen memtable to Lance file. // It rebuilds the same secondary indexes on each flushed generation. - let memtable_handler = - MemTableFlushHandler::new(state.clone(), flusher, epoch, index_configs.to_vec(), stats); + let memtable_handler = MemTableFlushHandler::new( + state.clone(), + flusher, + epoch, + index_configs.to_vec(), + stats, + config.frozen_memtable_grace, + ); task_executor.add_handler( "memtable_flusher".to_string(), Box::new(memtable_handler), @@ -1387,6 +1463,7 @@ impl ShardWriter { config.clone(), schema.clone(), pk_field_ids, + pk_columns, config.max_memtable_batches, config.max_memtable_rows, index_configs.to_vec(), @@ -1452,14 +1529,7 @@ impl ShardWriter { /// `AlreadyExists`, indicating this writer has been fenced. 
#[instrument(name = "sw_put", level = "info", skip_all, fields(batch_count = batches.len(), shard_id = %self.config.shard_id))] pub async fn put(&self, batches: Vec) -> Result { - if batches.is_empty() { - return Err(Error::invalid_input("Cannot write empty batch list")); - } - for (i, batch) in batches.iter().enumerate() { - if batch.num_rows() == 0 { - return Err(Error::invalid_input(format!("Batch {} is empty", i))); - } - } + Self::validate_non_empty(&batches)?; match &self.mode { WriterMode::MemTable { @@ -1482,6 +1552,51 @@ impl ShardWriter { } } + /// Like [`Self::put`], but returns the durability watcher *without* awaiting + /// it. The row is visible to reads on this writer the instant this returns; + /// the caller awaits durability via the watcher (`None` when `durable_write` + /// is off). + /// + /// This lets a caller hold an *external* lock across only the in-memory + /// read-merge-insert and await durability after releasing it, so concurrent + /// flushes still coalesce. The insert stays guarded by the internal + /// `state_lock`, so `BatchStore`'s single-writer invariant holds regardless. + /// + /// MemTable mode only; errors in WAL-only mode (no in-memory tier). + #[instrument(name = "sw_put_no_wait", level = "info", skip_all, fields(batch_count = batches.len(), shard_id = %self.config.shard_id))] + pub async fn put_no_wait( + &self, + batches: Vec, + ) -> Result<(WriteResult, Option)> { + Self::validate_non_empty(&batches)?; + + match &self.mode { + WriterMode::MemTable { + state, + writer_state, + backpressure, + } => { + self.put_memtable_no_wait(batches, state, writer_state, backpressure) + .await + } + WriterMode::WalOnly { .. 
} => Err(Error::invalid_input( + "put_no_wait is only supported in MemTable mode", + )), + } + } + + fn validate_non_empty(batches: &[RecordBatch]) -> Result<()> { + if batches.is_empty() { + return Err(Error::invalid_input("Cannot write empty batch list")); + } + for (i, batch) in batches.iter().enumerate() { + if batch.num_rows() == 0 { + return Err(Error::invalid_input(format!("Batch {} is empty", i))); + } + } + Ok(()) + } + async fn put_memtable( &self, batches: Vec, @@ -1489,6 +1604,26 @@ impl ShardWriter { writer_state: &Arc, backpressure: &BackpressureController, ) -> Result { + let (result, watcher) = self + .put_memtable_no_wait(batches, state_lock, writer_state, backpressure) + .await?; + // Wait for durability if configured (outside the lock). + if let Some(mut watcher) = watcher { + watcher.wait().await?; + } + Ok(result) + } + + /// In-memory half of [`Self::put_memtable`]: insert under `state_lock`, + /// trigger the WAL flush, and return the watcher un-awaited for the caller + /// to wait on. `None` when `durable_write` is off. See [`Self::put_no_wait`]. + async fn put_memtable_no_wait( + &self, + batches: Vec, + state_lock: &Arc>, + writer_state: &Arc, + backpressure: &BackpressureController, + ) -> Result<(WriteResult, Option)> { // Apply backpressure if needed (before acquiring main lock) backpressure .maybe_apply_backpressure(|| { @@ -1502,7 +1637,7 @@ impl ShardWriter { let start = std::time::Instant::now(); // Acquire write lock for entire operation (atomic approach) - let (batch_positions, mut durable_watcher, batch_store, indexes) = { + let (batch_positions, durable_watcher, batch_store, indexes) = { let mut state = state_lock.write().await; // 1. 
Insert all batches into memtable atomically @@ -1533,8 +1668,9 @@ impl ShardWriter { self.stats.record_put(start.elapsed()); - // Wait for durability if configured (outside the lock) - if self.config.durable_write { + // Trigger the flush here (outside the lock) so the watcher can resolve; + // only the `wait()` is the caller's to schedule. + let watcher = if self.config.durable_write { self.wal_flusher.trigger_flush( WalFlushSource::BatchStore { batch_store, @@ -1543,10 +1679,12 @@ impl ShardWriter { batch_positions.end, None, )?; - durable_watcher.wait().await?; - } + Some(durable_watcher) + } else { + None + }; - Ok(WriteResult { batch_positions }) + Ok((WriteResult { batch_positions }, watcher)) } async fn put_wal_only( @@ -1781,7 +1919,7 @@ impl ShardWriter { frozen: state .frozen_memtables .iter() - .map(|m| in_memory_ref(m)) + .map(|m| in_memory_ref(&m.memtable)) .collect(), }) } @@ -2174,6 +2312,9 @@ struct MemTableFlushHandler { /// at all. index_configs: Vec, stats: SharedWriteStats, + /// How long a frozen memtable lingers in memory after its flush commits + /// before `SweepExpired` evicts it. See `ShardWriterConfig::frozen_memtable_grace`. + grace: Duration, } impl MemTableFlushHandler { @@ -2183,6 +2324,7 @@ impl MemTableFlushHandler { epoch: u64, index_configs: Vec, stats: SharedWriteStats, + grace: Duration, ) -> Self { Self { state, @@ -2190,22 +2332,51 @@ impl MemTableFlushHandler { epoch, index_configs, stats, + grace, } } + + /// Evict frozen memtables whose post-flush grace has elapsed. Un-stamped + /// (not-yet-flushed) entries are always kept. 
+ async fn sweep_expired_frozen(&self) { + let now = now_millis(); + let grace_ms = self.grace.as_millis() as u64; + let mut state = self.state.write().await; + state + .frozen_memtables + .retain(|frozen| match frozen.flushed_at_ms { + Some(flushed_at) => now.saturating_sub(flushed_at) < grace_ms, + None => true, + }); + } } #[async_trait] impl MessageHandler for MemTableFlushHandler { - async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { - let TriggerMemTableFlush { memtable, done } = message; + fn tickers(&mut self) -> Vec<(Duration, MessageFactory)> { + // Zero grace evicts on commit, so no sweeper is needed. + if self.grace.is_zero() { + return vec![]; + } + // Sweep often enough that eviction lags the grace by at most ~1/3, so a + // generation lives no more than ~grace * 4/3 past its flush commit. + let tick = (self.grace / 3).max(Duration::from_millis(100)); + vec![(tick, Box::new(|| TriggerMemTableFlush::SweepExpired))] + } - let result = self.flush_memtable(memtable).await; - if let Some(tx) = done { - // Send result through the channel - caller is waiting for it - let _ = tx.send(result); - } else { - // No done channel, propagate errors - result?; + async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { + match message { + TriggerMemTableFlush::Flush { memtable, done } => { + let result = self.flush_memtable(memtable).await; + if let Some(tx) = done { + // Send result through the channel - caller is waiting for it + let _ = tx.send(result); + } else { + // No done channel, propagate errors + result?; + } + } + TriggerMemTableFlush::SweepExpired => self.sweep_expired_frozen().await, } Ok(()) } @@ -2299,15 +2470,26 @@ impl MemTableFlushHandler { state.frozen_memtable_bytes = state.frozen_memtable_bytes.saturating_sub(memtable_size); } - // Drop the queryable handle ONLY on commit success. 
On failure - // keep it: rows must stay in the read union until a later flush - // or WAL replay, else a transient flush error reopens the hole. - // Keyed by generation, so non-FIFO completion is fine. + // Retire the frozen handle on commit success, keyed by generation + // (non-FIFO completion is fine). Zero grace evicts here; otherwise + // stamp the grace clock so it lingers for multi-part as-of reads + // until `SweepExpired`. On failure leave it un-stamped: rows stay in + // the read union until a later flush or WAL replay, else a transient + // error reopens the hole. if flush_result.is_ok() { let flushed_generation = memtable.generation(); - state - .frozen_memtables - .retain(|m| m.generation() != flushed_generation); + if self.grace.is_zero() { + state + .frozen_memtables + .retain(|frozen| frozen.memtable.generation() != flushed_generation); + } else { + let now = now_millis(); + for frozen in state.frozen_memtables.iter_mut() { + if frozen.memtable.generation() == flushed_generation { + frozen.flushed_at_ms = Some(now); + } + } + } } } @@ -2694,6 +2876,75 @@ mod tests { writer.close().await.unwrap(); } + #[tokio::test] + async fn test_put_no_wait_durable_visible_then_durable() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: true, + sync_indexed_write: false, + max_wal_buffer_size: 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + ..Default::default() + }; + + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let batch = create_test_batch(&schema, 0, 10); + let (result, watcher) = writer.put_no_wait(vec![batch]).await.unwrap(); + assert_eq!(result.batch_positions, 0..1); + + // Row is visible in memory before durability is awaited. 
+ let stats = writer.memtable_stats().await.unwrap(); + assert_eq!(stats.row_count, 10); + + // durable_write is on, so a watcher is returned and resolves once the + // triggered flush lands. + let mut watcher = watcher.expect("durable_write returns a watcher"); + watcher.wait().await.unwrap(); + + writer.close().await.unwrap(); + } + + #[tokio::test] + async fn test_put_no_wait_non_durable_returns_no_watcher() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: false, + sync_indexed_write: false, + max_wal_buffer_size: 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + ..Default::default() + }; + + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let batch = create_test_batch(&schema, 0, 10); + let (result, watcher) = writer.put_no_wait(vec![batch]).await.unwrap(); + assert_eq!(result.batch_positions, 0..1); + assert!(watcher.is_none(), "non-durable put has nothing to await"); + + let stats = writer.memtable_stats().await.unwrap(); + assert_eq!(stats.row_count, 10); + + writer.close().await.unwrap(); + } + #[tokio::test] async fn test_shard_writer_multiple_writes() { let (store, base_path, base_uri, _temp_dir) = create_local_store().await; @@ -4190,10 +4441,12 @@ mod tests { writer.close().await.unwrap(); } - /// On a successful flush commit the sealed generation is dropped from - /// the queryable set (no leak), and its rows land in the manifest. + /// On a successful flush commit the sealed generation's rows land in the + /// manifest immediately, but the in-memory handle is NOT dropped — it + /// lingers for `frozen_memtable_grace` (so in-flight as-of reads keep + /// batch-resolved membership), then is swept by the `SweepExpired` ticker. 
#[tokio::test] - async fn test_frozen_dropped_after_successful_flush() { + async fn test_frozen_retained_during_grace_then_swept() { let (store, base_path, base_uri, _temp_dir) = create_local_store().await; let schema = create_test_schema(); let config = ShardWriterConfig { @@ -4205,6 +4458,8 @@ mod tests { max_wal_flush_interval: None, max_memtable_size: 64 * 1024 * 1024, manifest_scan_batch_size: 2, + // Short grace so the sweep is observable without a slow test. + frozen_memtable_grace: Duration::from_secs(1), ..Default::default() }; let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) @@ -4219,13 +4474,66 @@ mod tests { writer.force_seal_active().await.unwrap(); writer.wait_for_flush_drain().await.unwrap(); + // Recorded in the manifest at commit time. + let manifest = writer.manifest().await.unwrap().expect("manifest exists"); + assert!( + manifest + .flushed_generations + .iter() + .any(|g| g.generation == initial_gen), + "flushed generation must be recorded in the manifest" + ); + + // Still queryable in memory immediately after commit (within grace). + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert_eq!(refs.active.generation, initial_gen + 1); + assert!( + refs.frozen.iter().any(|f| f.generation == initial_gen), + "flushed generation must stay queryable during the grace window" + ); + + // After the grace elapses (plus a sweep tick) the handle is evicted. + tokio::time::sleep(Duration::from_millis(1_500)).await; let refs = writer.in_memory_memtable_refs().await.unwrap(); assert!( refs.frozen.is_empty(), - "frozen handle must be dropped once the flush commit lands" + "frozen handle must be swept once the grace elapses" ); - assert_eq!(refs.active.generation, initial_gen + 1); + writer.close().await.unwrap(); + } + + /// With zero grace (the default) a frozen handle is evicted synchronously on + /// flush commit — no sweep tick, no lingering window. 
+ #[tokio::test] + async fn test_frozen_evicted_immediately_with_zero_grace() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: false, + sync_indexed_write: false, + max_wal_buffer_size: 64 * 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + frozen_memtable_grace: Duration::ZERO, + ..Default::default() + }; + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let initial_gen = writer.memtable_stats().await.unwrap().generation; + writer + .put(vec![create_test_batch(&schema, 0, 10)]) + .await + .unwrap(); + writer.force_seal_active().await.unwrap(); + writer.wait_for_flush_drain().await.unwrap(); + + // Rows are durably in the manifest... let manifest = writer.manifest().await.unwrap().expect("manifest exists"); assert!( manifest @@ -4235,6 +4543,13 @@ mod tests { "flushed generation must be recorded in the manifest" ); + // ...and the in-memory handle is already gone, no sweep tick needed. + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert!( + refs.frozen.is_empty(), + "frozen handle must be evicted on commit when grace is zero" + ); + writer.close().await.unwrap(); } diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index d591e42cc73..87dda8e7e57 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -191,6 +191,13 @@ pub struct CompactionOptions { /// specified then the default (see /// [`crate::dataset::Scanner::batch_size`]) will be used. pub batch_size: Option, + /// The number of bytes to allow to queue up in the I/O buffer when scanning + /// the input fragments. If not specified then the default (see + /// [`crate::dataset::Scanner::io_buffer_size`]) will be used. 
+ /// + /// Increasing this can avoid a deadlock that occurs when a single batch of + /// data is larger than the I/O buffer size. + pub io_buffer_size: Option, /// Whether to defer remapping indices during compaction. If true, indices will /// not be remapped during this compaction operation. Instead, the fragment reuse index /// is updated and will be used to perform remapping later. @@ -237,6 +244,7 @@ impl Default for CompactionOptions { num_threads: None, max_bytes_per_file: None, batch_size: None, + io_buffer_size: None, defer_index_remap: false, compaction_mode: None, enable_binary_copy: false, @@ -264,6 +272,7 @@ impl CompactionOptions { /// - `lance.compaction.materialize_deletions_threshold` /// - `lance.compaction.defer_index_remap` /// - `lance.compaction.batch_size` + /// - `lance.compaction.io_buffer_size` /// - `lance.compaction.compaction_mode` /// - `lance.compaction.binary_copy_read_batch_bytes` /// - `lance.compaction.max_source_fragments` @@ -347,6 +356,14 @@ impl CompactionOptions { )) })?); } + "io_buffer_size" => { + self.io_buffer_size = Some(value.parse().map_err(|_| { + Error::invalid_input(format!( + "Invalid value for {}: '{}' (expected a non-negative integer)", + key, value + )) + })?); + } "compaction_mode" => { self.compaction_mode = Some(CompactionMode::try_from(value.as_str())?); } @@ -1194,6 +1211,8 @@ async fn transform_blob_v2_batch( /// and preserve insertion order. /// - `batch_size`: Optional batch size; if provided, set it on the scanner to control /// read batching. +/// - `io_buffer_size`: Optional I/O buffer size in bytes; if provided, set it on the +/// scanner to control how much data is queued during reads. /// - `with_frags`: Whether to scan only the specified old fragments and force /// in-order reading. 
/// - `capture_row_ids`: When index remapping is needed, include and capture the @@ -1209,6 +1228,7 @@ async fn prepare_reader( dataset: &Dataset, fragments: &[Fragment], batch_size: Option, + io_buffer_size: Option, with_frags: bool, capture_row_ids: bool, ) -> Result<( @@ -1234,6 +1254,9 @@ async fn prepare_reader( if let Some(bs) = batch_size { scanner.batch_size(bs); } + if let Some(io_buffer_size) = io_buffer_size { + scanner.io_buffer_size(io_buffer_size); + } if with_frags { scanner .with_fragments(fragments.to_vec()) @@ -1515,6 +1538,7 @@ async fn rewrite_files( dataset.as_ref(), &fragments, options.batch_size, + options.io_buffer_size, true, needs_remapping, ) @@ -2636,6 +2660,57 @@ mod tests { assert_eq!(scanned_data, data); } + #[rstest] + #[tokio::test] + async fn test_compact_with_io_buffer_size( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) { + // Compaction should succeed and produce correct results when an + // explicit io_buffer_size is provided via CompactionOptions. + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let data = sample_data(); + + // Create a table with 2 small fragments so there is something to compact. + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 5_000, + max_rows_per_group: 1_000, + data_storage_version: Some(data_storage_version), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let options = CompactionOptions { + // A generous buffer so the read does not deadlock on large batches. 
+ io_buffer_size: Some(256 * 1024 * 1024), + ..Default::default() + }; + let plan = plan_compaction(&dataset, &options).await.unwrap(); + assert_eq!(plan.tasks().len(), 1); + + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert_eq!(metrics.fragments_removed, 2); + assert_eq!(metrics.fragments_added, 1); + + // All rows are preserved after compaction. + let scanner = dataset.scan(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let scanned_data = concat_batches(&batches[0].schema(), &batches).unwrap(); + assert_eq!(scanned_data.num_rows(), data.num_rows()); + } + #[rstest] #[tokio::test] async fn test_compact_deletions( @@ -4232,6 +4307,133 @@ mod tests { assert_eq!(scanner.count_rows().await.unwrap(), count3); } + /// Deferred compaction that materializes deletions must not corrupt an + /// inverted (FTS) index read through the fragment-reuse index. The index's + /// posting lists reference doc_ids positionally; if the load-time remap + /// dropped the deleted rows it would renumber the doc_ids and desync the + /// posting lists (out-of-bounds `num_tokens`, wrong/stale row ids). The + /// tombstone-preserve-positions load path must keep results correct in the + /// FRI window and after the physical remap + trim. + #[tokio::test] + async fn test_read_inverted_index_with_defer_index_remap_and_deletions() { + // Enough surviving docs for several compressed posting-list blocks + // (BLOCK_SIZE = 128), split across several fragments so compaction has + // real work — but no larger. + const ROWS: i32 = 1200; + const DELETED: i32 = 400; + + // Every row contains "lance", so the term matches all live rows; `id` + // tells us exactly which rows survive. 
+ let ids = Int32Array::from_iter_values(0..ROWS); + let docs = LargeStringArray::from_iter_values((0..ROWS).map(|_| "lance apple orange")); + let batch = RecordBatch::try_new( + Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("doc", DataType::LargeUtf8, false), + ]) + .into(), + vec![Arc::new(ids) as ArrayRef, Arc::new(docs) as ArrayRef], + ) + .unwrap(); + let schema_ref = batch.schema(); + let stream = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema_ref); + let mut dataset = Dataset::write( + stream, + "memory://test/table", + Some(WriteParams { + max_rows_per_file: 200, // 6 fragments + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["doc"], + IndexType::Inverted, + Some("doc_idx".into()), + &InvertedIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Delete a prefix, then deferred-compact so the deletions are + // materialized into the fragment-reuse index the index is read through. + dataset.delete(&format!("id < {DELETED}")).await.unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!( + dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .is_some(), + "deferred compaction must leave a fragment-reuse index" + ); + + // FTS "lance" → sorted surviving ids. Projecting `id` forces a take, so + // a stale row address would error or return a wrong/dead row. 
+ async fn search_ids(dataset: &Dataset) -> Vec { + let mut scanner = dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("lance".to_owned())) + .unwrap(); + scanner.project::<&str>(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort_unstable(); + ids + } + + let expected = (DELETED..ROWS).collect::>(); + + // FRI window: index read through the reuse index. + let during = search_ids(&dataset).await; + assert_eq!( + during, expected, + "FRI-window FTS must return exactly the surviving rows (no resurrection, no loss, no stale rows)" + ); + + // Physical remap + trim: must still be correct. + remapping::remap_column_index(&mut dataset, &["doc"], Some("doc_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let after = search_ids(&dataset).await; + assert_eq!( + after, expected, + "FTS must stay correct after physical remap + fragment-reuse trim" + ); + } + #[tokio::test] async fn test_read_ngram_index_with_defer_index_remap() { // Generate random words using lance-datagen @@ -4615,6 +4817,668 @@ mod tests { ); } + #[tokio::test] + async fn test_read_ivf_rq_index_v3_with_defer_index_remap() { + use arrow_array::cast::AsArray; + use lance_index::vector::bq::RQBuildParams; + + let mut dataset = lance_datagen::gen_batch() + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(128)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + let stored: Vec> = { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in &batches { 
+ let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let values = vecs.value(i); + let values = values.as_primitive::(); + out.push(values.values().to_vec()); + } + } + out + }; + + let index_name = Some("vec_idx".into()); + dataset + .create_index( + &["vec"], + IndexType::Vector, + index_name.clone(), + &VectorIndexParams { + metric_type: DistanceType::L2, + stages: vec![ + StageParams::Ivf(IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + }), + StageParams::RQ(RQBuildParams::new(1)), + ], + version: crate::index::vector::IndexFileVersion::V3, + skip_transpose: false, + runtime_hints: Default::default(), + }, + false, + ) + .await + .unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let original_index = indices.iter().find(|idx| idx.name == "vec_idx").unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }; + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert!(metrics.fragments_removed > 0); + assert!(metrics.fragments_added > 0); + + let Some(current_index) = dataset.load_index_by_name("vec_idx").await.unwrap() else { + panic!("vec index must be available"); + }; + assert_eq!(current_index.uuid, original_index.uuid); + + let frag_reuse_present = dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME); + assert!( + frag_reuse_present, + "defer_index_remap must record a {} index", + FRAG_REUSE_INDEX_NAME + ); + + let sample_step = (stored.len() / 8).max(1); + let mut checked = 0; + for query in stored.iter().step_by(sample_step) { + let query_vec = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &query_vec, 5).unwrap(); + scanner.project(&["vec"]).unwrap().with_row_id(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + 
.try_collect::>() + .await + .unwrap(); + assert!(!batches.is_empty(), "query returned no batches"); + let top = &batches[0]; + assert!(top.num_rows() > 0, "query returned empty top batch"); + let top_vec = top["vec"].as_fixed_size_list().value(0); + let top_vec = top_vec.as_primitive::(); + assert_eq!( + top_vec.values(), + query.as_slice(), + "top-1 self-recall returned a different vector than the query" + ); + checked += 1; + } + assert!(checked > 0, "expected to check at least one stored vector"); + } + + /// Build an `id` + `vec` dataset, create the given IVF vector index, + /// optionally delete rows, then run deferred compaction (which materializes + /// the deletions into the fragment-reuse index) and assert that KNN over + /// surviving vectors during the FRI window (a) never returns a deleted row + /// and (b) stays consistent with the pre-compaction answer. + /// + /// The deletion path is the interesting one: materialized deletions drop + /// rows from the quantization storage at load time, which shifts storage + /// positions. Flat storage (FLAT/PQ/SQ/RQ) is scanned linearly so this is + /// fine, but the HNSW graph addresses storage positionally and is not + /// frag-reuse aware, so a desync would surface here as recall collapse or a + /// resurrected/again-deleted row. + /// Top-k `id`s for a KNN query against the `vec` column. 
+ async fn vector_knn_ids(dataset: &Dataset, query: &[f32], k: usize) -> Vec { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + let qa = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &qa, k).unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut ids = Vec::new(); + for b in &batches { + ids.extend(b["id"].as_primitive::().values().iter().copied()); + } + ids + } + + async fn check_vector_defer_compaction( + params: VectorIndexParams, + delete_predicate: Option<&str>, + k: usize, + min_overlap: usize, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + &params, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + if let Some(pred) = delete_predicate { + dataset.delete(pred).await.unwrap(); + } + + // Collect surviving (id, vec) pairs and the set of surviving ids.
+ let mut survivors: Vec<(i32, Vec)> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["id", "vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let ids = batch["id"].as_primitive::(); + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + let v = v.as_primitive::().values().to_vec(); + survivors.push((ids.value(i), v)); + } + } + } + assert!(!survivors.is_empty()); + let surviving_ids: std::collections::HashSet = + survivors.iter().map(|(id, _)| *id).collect(); + + // Sample queries from survivors and capture the pre-compaction answer. + let step = (survivors.len() / 16).max(1); + let queries: Vec<(i32, Vec)> = survivors.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for (_, q) in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Deferred compaction materializes the deletions into the frag-reuse index. + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + + // During the FRI window: no deleted rows, and stable vs the baseline. 
+ for (i, (_, q)) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + for id in &after { + assert!( + surviving_ids.contains(id), + "KNN returned id {id} that is not a surviving row (query #{i})" + ); + } + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "KNN top-{k} diverged after deferred compaction: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + + fn small_ivf() -> lance_index::vector::ivf::IvfBuildParams { + lance_index::vector::ivf::IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + } + } + + #[tokio::test] + async fn test_ivf_flat_defer_compaction_with_deletions() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + // Flat storage is scanned linearly; dropping deleted rows is exact. + check_vector_defer_compaction(params, Some("id < 1500"), 10, 10).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_defer_compaction_merge_only() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // No deletions: storage positions are stable, so the graph stays aligned. + check_vector_defer_compaction(params, None, 10, 9).await; + } + + // NOTE: IVF_HNSW_* under materialized deletions is a known gap (lance#3993, + // HNSW auto-remap not implemented) — the HNSW graph isn't realigned after the + // frag-reuse drop. Deferred remap is gated off for HNSW tables, so there is + // no lance-level reproducer here; the gate is tested in the data plane. + // Merge-only HNSW is covered (see the *_remap_and_trim tests). 
+ + #[tokio::test] + async fn test_ivf_pq_defer_compaction_with_deletions() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_sq_defer_compaction_with_deletions() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_rq_defer_compaction_with_deletions() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + /// Merge-only deferred compaction, then a PHYSICAL remap + FRI trim. Asserts + /// the index is rebuilt, the fragment-reuse index trims to zero versions, + /// and KNN stays consistent with the pre-compaction answer through both the + /// FRI window and the physical remap. (HNSW rebuilds its graph on physical + /// remap, so the overlap is recall-tolerant.) 
+ async fn check_vector_remap_and_trim( + params: VectorIndexParams, + k: usize, + window_overlap: usize, + post_remap_overlap: Option, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + &params, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + // Sample queries from stored vectors + capture the pre-compaction answer. + let mut rows: Vec> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + rows.push(v.as_primitive::().values().to_vec()); + } + } + } + let step = (rows.len() / 16).max(1); + let queries: Vec> = rows.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for q in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Merge-only deferred compaction.
+ let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + for (i, q) in queries.iter().enumerate() { + let window = vector_knn_ids(&dataset, q, k).await; + let overlap = window.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= window_overlap, + "FRI-window KNN diverged: overlap {overlap} < {window_overlap} (query #{i})" + ); + } + + // Physical remap + trim the fragment-reuse index. + remapping::remap_column_index(&mut dataset, &["vec"], Some("vec_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + + let remapped_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + assert_ne!( + remapped_uuid, original_uuid, + "index should have been physically remapped" + ); + if let Some(meta) = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + { + let versions = load_frag_reuse_index_details(&dataset, &meta) + .await + .unwrap() + .versions + .len(); + assert_eq!(versions, 0, "frag-reuse index must trim to zero versions"); + } + + for (i, q) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + // No stale/desynced addresses (a bad address fails the take above). + assert!( + !after.is_empty(), + "post-remap KNN returned no rows (query #{i})" + ); + // Physical remap rebuilds the HNSW graph, so recall is only compared + // for the exact (non-HNSW) types. 
+ if let Some(min_overlap) = post_remap_overlap { + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "post-remap KNN diverged: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + } + + #[tokio::test] + async fn test_ivf_flat_remap_and_trim() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + // Regression: PQ storage used to remap its codes through the frag-reuse + // index but keep the pre-remap `row_ids` field, so search returned stale + // (compacted-away) addresses and the take failed with "fragment ... does + // not exist" — even merge-only, and only observable when the query fetches + // row content (the existing `test_read_ivf_pq_index_v3_with_defer_index_remap` + // projects no columns, so it never takes and missed this). + #[tokio::test] + async fn test_ivf_pq_remap_and_trim() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_sq_remap_and_trim() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_rq_remap_and_trim() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, 
sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // Physical remap rebuilds the HNSW graph, so use a recall-tolerant overlap. + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_pq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, pq::PQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + // Scalar index correctness across deferred compaction WITH materialized + // deletions. The existing test_read_*_index_with_defer_index_remap tests are + // merge-only and project no columns (count-only), so they never take and + // never exercise the deletion drop path. These add an `id` column, delete a + // prefix, defer-compact, then run the indexed query *projecting id* (a take) + // and assert no deleted row is returned. Bitmap/BTree have no positional + // internal structure so the drop path is exact; the Inverted (FTS) index + // does (see its test below), and currently desyncs under deletions. 
+ + #[tokio::test] + async fn test_bitmap_index_defer_compaction_with_deletions() { + use arrow_array::cast::AsArray; + use arrow_array::types::Int32Type; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "category", + lance_datagen::array::cycle::(vec![1, 2, 3]), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("category_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + dataset.delete("id < 1500").await.unwrap(); + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + + let mut scanner = dataset.scan(); + scanner.filter("category = 3").unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut returned = 0; + for b in &batches { + for id in b["id"].as_primitive::().values() { + assert!( + *id >= 1500, + "bitmap returned deleted id {id} in the FRI window" + ); + returned += 1; + } + } + assert!(returned > 0, "expected surviving category=3 rows"); + } + + // NOTE: Inverted/FTS under materialized deletions is broken (BM25 scores + // via positional num_tokens[doc_id]; the frag-reuse drop shifts doc_id + // positions -> out-of-bounds). It is gated off defer in the data plane + // until fixed, so there is no lance-level reproducer here. Merge-only FTS + // is covered by test_read_inverted_index_with_defer_index_remap. 
+ #[tokio::test] async fn test_default_compaction_planner() { let test_dir = TempStrDir::default(); @@ -4683,6 +5547,10 @@ mod tests { "lance.compaction.batch_size".to_string(), "4096".to_string(), ), + ( + "lance.compaction.io_buffer_size".to_string(), + "1073741824".to_string(), + ), ( "lance.compaction.compaction_mode".to_string(), "try_binary_copy".to_string(), @@ -4701,6 +5569,7 @@ mod tests { assert!((opts.materialize_deletions_threshold - 0.25).abs() < f32::EPSILON); assert!(opts.defer_index_remap); assert_eq!(opts.batch_size, Some(4096)); + assert_eq!(opts.io_buffer_size, Some(1_073_741_824)); assert_eq!(opts.compaction_mode, Some(CompactionMode::TryBinaryCopy)); assert_eq!(opts.binary_copy_read_batch_bytes, Some(8_388_608)); } diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs index dab62bf6166..266ac977a69 100644 --- a/rust/lance/src/dataset/optimize/remapping.rs +++ b/rust/lance/src/dataset/optimize/remapping.rs @@ -220,25 +220,37 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { return Ok(()); } - // Sequentially apply the row addr maps from oldest to latest - let mut curr_index_id = *index_id; - for (i, row_id_map) in frag_reuse_index.row_id_maps.iter().enumerate() { - let version = &frag_reuse_index.details.versions[i]; - // load on-disk index metadata before auto-remap - let curr_index_meta = read_manifest_indexes( - &dataset.object_store, - &dataset.manifest_location, - &dataset.manifest, - ) - .await? - .into_iter() - .find(|idx| idx.uuid == curr_index_id) - .unwrap(); - - let maybe_index_bitmap = curr_index_meta.fragment_bitmap.clone(); - let (should_remap, bitmap_after_remap) = match maybe_index_bitmap { - Some(mut index_frag_bitmap) => { - let mut should_remap = false; + // Read the index's on-disk metadata once. 
Its stored row addresses are at + // this baseline; we compose all reuse versions into a single remap so the + // index file is rebuilt and committed exactly once, rather than once per + // version (the reuse index can accumulate many versions before remap runs). + let curr_index_meta = read_manifest_indexes( + &dataset.object_store, + &dataset.manifest_location, + &dataset.manifest, + ) + .await? + .into_iter() + .find(|idx| idx.uuid == *index_id) + .ok_or_else(|| { + Error::index(format!( + "index {index_id} not found in manifest; it may have been concurrently dropped" + )) + })?; + + // Compose the coverage (fragment bitmap) remap across every reuse version in + // one pass. Chaining is automatic: a version inserts its new fragments, + // which a later version then sees as its old fragments. `data_predates_version` + // is evaluated against the fixed baseline (there are no intermediate + // commits), and the new-fragment branch handles a bitmap that was already + // coverage-remapped + persisted before the data was remapped (e.g. while + // remapping a *sibling* index). 
+ let baseline_version = curr_index_meta.dataset_version; + let (should_remap, bitmap_after_remap) = match curr_index_meta.fragment_bitmap.clone() { + Some(mut index_frag_bitmap) => { + let mut should_remap = false; + for version in frag_reuse_index.details.versions.iter() { + let data_predates_version = baseline_version < version.dataset_version; for group in version.groups.iter() { let mut old_frag_in_index = 0; for old_frag in group.old_frags.iter() { @@ -258,67 +270,97 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { group.old_frags ))); } - index_frag_bitmap - .extend(group.new_frags.clone().into_iter().map(|f| f.id as u32)); + index_frag_bitmap.extend(group.new_frags.iter().map(|f| f.id as u32)); + should_remap = true; + } else if data_predates_version + && group + .new_frags + .iter() + .any(|new_frag| index_frag_bitmap.contains(new_frag.id as u32)) + { + // The bitmap was already coverage-remapped onto this + // group's new fragments and persisted before the data was + // remapped, so the old fragments are gone from the bitmap + // but the index data still needs remapping. should_remap = true; } } - (should_remap, Some(index_frag_bitmap)) } - // if there is no fragment bitmap for the index, - // we attempt remapping but will not update the fragment bitmap. 
- None => (true, None), - }; - - if should_remap { - let remap_result = index::remap_index(dataset, &curr_index_id, row_id_map).await?; - - let new_index_meta = match remap_result { - RemapResult::Drop => continue, - RemapResult::Keep(new_id) => IndexMetadata { - uuid: new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: curr_index_meta.index_details.clone(), - index_version: curr_index_meta.index_version, - created_at: curr_index_meta.created_at, - base_id: None, - files: curr_index_meta.files.clone(), - }, - RemapResult::Remapped(remapped_index) => IndexMetadata { - uuid: remapped_index.new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: Some(Arc::new(remapped_index.index_details)), - index_version: remapped_index.index_version as i32, - created_at: curr_index_meta.created_at, - base_id: None, - files: remapped_index.files, - }, - }; - - let new_id = new_index_meta.uuid; + (should_remap, Some(index_frag_bitmap)) + } + // if there is no fragment bitmap for the index, + // we attempt remapping but will not update the fragment bitmap. + None => (true, None), + }; - let transaction = Transaction::new( - dataset.manifest.version, - Operation::CreateIndex { - new_indices: vec![new_index_meta], - removed_indices: vec![curr_index_meta.clone()], - }, - None, - ); + if !should_remap { + return Ok(()); + } - dataset - .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + // Compose the row-address remap across all versions. `remap_row_id` already + // chains every version (and passes through addresses a version does not + // touch), so mapping the union of all versions' keys yields a single + // baseline -> final address map applied in one rebuild. 
+ // + // Map every old address; do NOT filter by the current `fragment_bitmap`. In + // the sibling-coverage-remap case the bitmap was already advanced onto the + // new fragments while the index data still holds old addresses, so filtering + // by it would drop exactly the keys this index needs and leave its data + // stale (an empty map makes `index::remap_index` return `Keep`). The map is + // bounded by the rows the reuse index touched; addresses this index does not + // store are simply never looked up. + let composed_row_id_map: HashMap> = frag_reuse_index + .row_id_maps + .iter() + .flat_map(|row_id_map| row_id_map.keys().copied()) + .map(|old_addr| (old_addr, frag_reuse_index.remap_row_id(old_addr))) + .collect(); + + let remap_result = index::remap_index(dataset, index_id, &composed_row_id_map).await?; + + let new_index_meta = match remap_result { + // The composed remap emptied the index (every row deleted). Matching the + // prior per-version behavior, leave the existing index untouched and + // commit nothing -- there is no remap to apply. 
+ RemapResult::Drop => return Ok(()), + RemapResult::Keep(new_id) => IndexMetadata { + uuid: new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: curr_index_meta.index_details.clone(), + index_version: curr_index_meta.index_version, + created_at: curr_index_meta.created_at, + base_id: None, + files: curr_index_meta.files.clone(), + }, + RemapResult::Remapped(remapped_index) => IndexMetadata { + uuid: remapped_index.new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: Some(Arc::new(remapped_index.index_details)), + index_version: remapped_index.index_version as i32, + created_at: curr_index_meta.created_at, + base_id: None, + files: remapped_index.files, + }, + }; - curr_index_id = new_id; - } - } + let transaction = Transaction::new( + dataset.manifest.version, + Operation::CreateIndex { + new_indices: vec![new_index_meta], + removed_indices: vec![curr_index_meta], + }, + None, + ); + + dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await?; Ok(()) } diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 9a5cd94dd09..d4b58e4783f 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -72,7 +72,7 @@ use lance_index::scalar::inverted::query::{ FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, PhraseQuery, fill_fts_query_column, }; use lance_index::scalar::inverted::{SCORE_COL, SCORE_FIELD}; -use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, DIST_COL, Query}; +use lance_index::vector::{ApproxMode, DEFAULT_QUERY_PARALLELISM, DIST_COL, Query}; use lance_index::{metrics::NoOpMetricsCollector, scalar::inverted::FTS_SCHEMA}; use lance_io::stream::RecordBatchStream; use 
lance_linalg::distance::MetricType; @@ -829,6 +829,15 @@ pub struct Scanner { /// Which version of the relational algebra to use when generating the physical plan relational_algebra_version: u32, + /// Target degree of parallelism for the physical optimizer. + /// + /// This is passed as `ConfigOptions::execution::target_partitions` to the + /// physical optimizer (e.g. `EnforceDistribution`), which uses it to decide + /// how many parallel partitions to target when inserting exchange nodes. + /// + /// Defaults to `get_num_compute_intensive_cpus()`. + target_parallelism: Option, + // Legacy fields to help migrate some old projection behavior to new behavior // // There are two behaviors we are moving away from: @@ -1053,6 +1062,7 @@ impl Scanner { explicit_projection: false, autoproject_scoring_columns: true, relational_algebra_version: LANCE_RELATIONAL_ALGEBRA_VERSION, + target_parallelism: None, }; scanner.apply_blob_handling(); scanner @@ -1379,6 +1389,16 @@ impl Scanner { self } + /// Set the target number of partitions for the physical optimizer. + /// + /// Overrides the default (`get_num_compute_intensive_cpus()`). Used by + /// `EnforceDistribution` and similar rules to decide how many parallel + /// partitions to use. Set to 1 in tests that assert specific plan shapes. + pub fn target_parallelism(&mut self, n: usize) -> &mut Self { + self.target_parallelism = Some(n); + self + } + /// Set whether to read data in order (default: true) /// /// A scan will always read from the disk concurrently. If this property @@ -1572,6 +1592,7 @@ impl Scanner { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }); self.nearest_query_count = query_count; self.is_batch_nearest = is_batch_nearest; @@ -1740,6 +1761,19 @@ impl Scanner { self } + /// Configure the speed / accuracy tradeoff for approximate vector search. + /// + /// This setting is currently only used by RQ-quantized indexes, such as + /// IVF_RQ. 
Other index types ignore this setting. + pub fn approx_mode(&mut self, approx_mode: ApproxMode) -> &mut Self { + if let Some(q) = self.nearest.as_mut() { + q.approx_mode = approx_mode; + } else { + log::warn!("approx_mode is not set because nearest has not been called yet"); + } + self + } + /// Configure partition-search concurrency for each vector query. /// /// The default is 0. @@ -2617,7 +2651,10 @@ impl Scanner { plan = self.apply_aggregate(plan, agg).await?; let optimizer = get_physical_optimizer(); - let options = Default::default(); + let mut options = ConfigOptions::default(); + options.execution.target_partitions = self + .target_parallelism + .unwrap_or_else(get_num_compute_intensive_cpus); for rule in optimizer.rules { plan = rule.optimize(plan, &options)?; } @@ -2681,7 +2718,10 @@ impl Scanner { } let optimizer = get_physical_optimizer(); - let options: ConfigOptions = Default::default(); + let mut options = ConfigOptions::default(); + options.execution.target_partitions = self + .target_parallelism + .unwrap_or_else(get_num_compute_intensive_cpus); for rule in optimizer.rules { plan = rule.optimize(plan, &options)?; } @@ -3551,33 +3591,35 @@ impl Scanner { .clone(); let mut columns = vec![column]; - if let Some(expr) = filter_plan.full_expr.as_ref() { - let filter_columns = Planner::column_names_in_expr(expr); - columns.extend(filter_columns); + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + columns.extend(Planner::column_names_in_expr(refine_expr)); } - let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap()); - let mut scan_node = self.scan_fragments( - true, - false, - false, - false, - false, - flat_fts_scan_schema, - Arc::new(fragments), - None, - false, - ); + let scan_projection = self + .dataset + .empty_projection() + .with_row_id() + .union_columns(&columns, OnMissing::Error)?; - if let Some(expr) = filter_plan.full_expr.as_ref() { - // If there is a prefilter we need to manually apply it to the 
new data - scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?); + let PlannedFilteredScan { mut plan, .. } = self + .filtered_read( + filter_plan, + scan_projection, + /*make_deletions_null=*/ false, + Some(Arc::new(fragments)), + None, + /*is_prefilter=*/ true, + ) + .await?; + + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?); } let flat_match_plan = Arc::new(FlatMatchQueryExec::new( self.dataset.clone(), query.clone(), params.clone(), - scan_node, + plan, )); Ok(flat_match_plan) } @@ -3655,7 +3697,7 @@ impl Scanner { .dataset .open_vector_index( q.column.as_str(), - &selected_index_segments[0].uuid.to_string(), + &selected_index_segments[0].uuid, &NoOpMetricsCollector, ) .await?; @@ -3689,11 +3731,7 @@ impl Scanner { // Fall back to opening the index for legacy indices without details let idx = self .dataset - .open_vector_index( - q.column.as_str(), - &index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index(q.column.as_str(), &index.uuid, &NoOpMetricsCollector) .await?; idx.metric_type() }; @@ -4248,6 +4286,7 @@ impl Scanner { with_make_deletions_null, ordered_output: ordered, file_reader_options: self.resolved_file_reader_options(), + parallelism_cap: None, }; Arc::new(LanceScanExec::new( self.dataset.clone(), @@ -4442,6 +4481,14 @@ impl Scanner { } else { input }; + let retain_vector = if self.is_batch_nearest { + let vector_field_id = self.dataset.schema().field_id(q.column.as_str())?; + self.projection_plan + .physical_projection + .contains_field_id(vector_field_id) + } else { + false + }; let flat_dist = Arc::new(KNNVectorDistanceExec::try_new_batch( input, &q.column, @@ -4453,6 +4500,7 @@ impl Scanner { lower_bound: q.lower_bound, upper_bound: q.upper_bound, distance_type: metric_type, + retain_vector, }, )?); @@ -4828,6 +4876,26 @@ impl Scanner { Ok(format!("{}", display.indent(verbose))) } + + /// Run [`Self::count_rows`]'s 
underlying plan and return it formatted with + /// runtime metrics. Equivalent to [`Self::analyze_plan`] but with a + /// `COUNT(*)` aggregate auto-applied first — the only way for callers + /// without a hand-built `AggregateExpr` (e.g. the Python bindings) to + /// inspect the plan that `count_rows` actually executed. + #[instrument(level = "info", skip(self))] + pub async fn analyze_count_plan(&self) -> Result { + let mut scanner = self.clone(); + scanner.aggregate(AggregateExpr::builder().count_star().build())?; + let plan = scanner.create_plan().await?; + analyze_plan( + plan, + LanceExecutionOptions { + batch_size: self.batch_size, + ..Default::default() + }, + ) + .await + } } // Search over all indexed fields including nested ones, collecting columns that have an @@ -5883,6 +5951,114 @@ mod test { (queries, query_values) } + async fn nested_vector_test_dataset(dim: u32) -> (TempStrDir, Dataset) { + let path = TempStrDir::default(); + let vec_field = ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + dim as i32, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field.clone(), + ])); + + let batches: Vec = (0..5) + .map(|batch_idx| { + let vector_values: Float32Array = (0..dim * 80).map(|v| v as f32).collect(); + let vectors = + FixedSizeListArray::try_new_from_values(vector_values, dim as i32).unwrap(); + let payload = StructArray::from(vec![( + Arc::new(vec_field.clone()), + Arc::new(vectors) as ArrayRef, + )]); + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values( + batch_idx * 80..(batch_idx + 1) * 80, + )), + Arc::new(payload), + ], + ) + .unwrap() + }) + .collect(); + + let params = WriteParams { + max_rows_per_group: 10, + max_rows_per_file: 200, + 
data_storage_version: Some(LanceFileVersion::Stable), + enable_stable_row_ids: true, + ..Default::default() + }; + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, &path, Some(params)).await.unwrap(); + (path, dataset) + } + + async fn escaped_nested_vector_test_dataset(dim: u32) -> (TempStrDir, Dataset) { + let path = TempStrDir::default(); + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + dim as i32, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field.clone(), + ])); + + let batches: Vec = (0..5) + .map(|batch_idx| { + let vector_values: Float32Array = (0..dim * 80).map(|v| v as f32).collect(); + let vectors = + FixedSizeListArray::try_new_from_values(vector_values, dim as i32).unwrap(); + let payload = StructArray::from(vec![( + Arc::new(vec_field.clone()), + Arc::new(vectors) as ArrayRef, + )]); + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values( + batch_idx * 80..(batch_idx + 1) * 80, + )), + Arc::new(payload), + ], + ) + .unwrap() + }) + .collect(); + + let params = WriteParams { + max_rows_per_group: 10, + max_rows_per_file: 200, + data_storage_version: Some(LanceFileVersion::Stable), + enable_stable_row_ids: true, + ..Default::default() + }; + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, &path, Some(params)).await.unwrap(); + (path, dataset) + } + fn assert_query_index_field(batch: &RecordBatch) { let schema = batch.schema(); let field = schema.field(0); @@ -5891,6 +6067,14 @@ mod test { assert!(!field.is_nullable()); } + fn assert_batch_knn_output_has_no_vector(batch: &RecordBatch, vector_column: &str) 
{ + assert!( + batch.schema().column_with_name(vector_column).is_none(), + "batch flat KNN output must not include vector column '{vector_column}' when it is not projected; columns: {:?}", + batch.schema().field_names() + ); + } + async fn assert_batch_matches_single_queries( dataset: &Dataset, batch: &RecordBatch, @@ -5965,6 +6149,7 @@ mod test { let batch = scan.try_into_batch().await.unwrap(); assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); assert_eq!( batch.num_rows(), 2 * k, @@ -5987,6 +6172,25 @@ mod test { } assert_batch_matches_single_queries(dataset, &batch, &query_values, k, false, None).await; + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest("vec", &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&["i", "vec"]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec.schema().column_with_name("vec").is_some(), + "batch flat KNN should return vector column when projected" + ); + assert_batch_matches_single_queries( + dataset, + &batch_with_vec, + &query_values, + k, + false, + None, + ) + .await; + let query_values_one = (32..64).map(|v| v as f32).collect::>(); let queries_one = FixedSizeListArray::try_new_from_values( Float32Array::from(query_values_one.clone()), @@ -6012,12 +6216,190 @@ mod test { let batch = scan.try_into_batch().await.unwrap(); assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); assert_eq!( batch[QUERY_INDEX_COL].as_primitive::().values(), &[0, 0] ); } + #[tokio::test] + async fn test_batch_knn_flat_omits_vector_without_projection() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + 
scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert_query_index_field(&batch); + assert!(batch.schema().column_with_name("i").is_some()); + assert!(batch.schema().column_with_name(DIST_COL).is_some()); + assert_batch_matches_single_queries(dataset, &batch, &query_values, k, false, None).await; + + let mut scan_rowid_only = dataset.scan(); + scan_rowid_only.nearest("vec", &queries, k).unwrap(); + scan_rowid_only.use_index(false); + scan_rowid_only.project(&[ROW_ID]).unwrap(); + let batch_rowid_only = scan_rowid_only.try_into_batch().await.unwrap(); + assert_batch_knn_output_has_no_vector(&batch_rowid_only, "vec"); + assert!(batch_rowid_only.schema().column_with_name(ROW_ID).is_some()); + assert!(batch_rowid_only.schema().column_with_name("i").is_none()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest("vec", &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&["vec"]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec.schema().column_with_name("vec").is_some(), + "batch flat KNN must include vector column when vec is projected" + ); + } + + #[tokio::test] + async fn test_batch_knn_flat_filter_keeps_non_vector_columns() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + scan.filter("i >= 0").unwrap(); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert!(batch.schema().column_with_name("i").is_some()); + + let query_indices = batch[QUERY_INDEX_COL].as_primitive::(); + for query_index 
in 0..2 { + let query = + Float32Array::from(query_values[query_index * 32..(query_index + 1) * 32].to_vec()); + let mut single = dataset.scan(); + single.nearest("vec", &query, k).unwrap(); + single.use_index(false); + single.filter("i >= 0").unwrap(); + single.project(&["i"]).unwrap(); + let single_batch = single.try_into_batch().await.unwrap(); + + let mask = BooleanArray::from_iter( + query_indices + .iter() + .map(|value| value.map(|value| value == query_index as i32)), + ); + let batch_slice = arrow::compute::filter_record_batch(&batch, &mask).unwrap(); + assert_eq!( + batch_slice["i"].as_primitive::().values(), + single_batch["i"].as_primitive::().values() + ); + } + } + + #[tokio::test] + async fn test_batch_knn_flat_nested_vector_projection() { + const VECTOR_COLUMN: &str = "payload.vec"; + let (_tmp, dataset) = nested_vector_test_dataset(32).await; + let k = 2; + let (queries, _query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan.use_index(false); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, VECTOR_COLUMN); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name("i").is_some()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&[VECTOR_COLUMN]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec + .schema() + .column_with_name(VECTOR_COLUMN) + .is_some(), + "batch flat KNN must include nested vector column when projected; columns: {:?}", + batch_with_vec.schema().field_names() + ); + } + + #[tokio::test] + async fn test_batch_knn_flat_escaped_nested_vector_projection() { + const VECTOR_COLUMN: &str = "payload.`vec.with.dot`"; + let (_tmp, dataset) = 
escaped_nested_vector_test_dataset(32).await; + let k = 2; + let (queries, _) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan.use_index(false); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, VECTOR_COLUMN); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name("i").is_some()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&[VECTOR_COLUMN]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec + .schema() + .column_with_name(VECTOR_COLUMN) + .is_some(), + "batch flat KNN must include escaped nested vector column when projected; columns: {:?}", + batch_with_vec.schema().field_names() + ); + } + + #[tokio::test] + async fn test_batch_knn_flat_projects_row_id_and_row_addr_without_vector() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, _) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + scan.project(&[ROW_ID]).unwrap(); + scan.with_row_address(); + + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name(ROW_ID).is_some()); + assert!(batch.schema().column_with_name(ROW_ADDR).is_some()); + assert!(batch.schema().column_with_name(DIST_COL).is_some()); + assert_eq!( + batch[ROW_ADDR].as_primitive::().null_count(), + 0, + "row addresses should be materialized for all top-k rows" + ); + } + #[tokio::test] async fn 
test_primitive_query_length_multiple_of_dim_is_rejected() { let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) @@ -8276,6 +8658,9 @@ mod test { expected: &str, ) -> Result<()> { let mut scan = dataset.scan(); + // Pin target_parallelism=1 so EnforceDistribution produces deterministic plans + // regardless of the machine's CPU count. + scan.target_parallelism(1); plan(&mut scan)?; let exec_plan = scan.create_plan().await?; assert_plan_node_equals(exec_plan, expected).await @@ -8352,6 +8737,198 @@ mod test { .unwrap(); } + #[tokio::test] + async fn test_ngram_regex_index_scan() { + use arrow::array::AsArray; + + // A small, fixed corpus written across multiple fragments so the ngram + // index spans fragment boundaries. + let values = [ + "rhino", // 0 + "rhinos nose", // 1 + "cat", // 2 + "dog", // 3 + "cat dog", // 4 + "elephant", // 5 + "catalog", // 6 + "scatter", // 7 + "rhino horn", // 8 + "mouse", // 9 + "category", // 10 + "dogma", // 11 + ]; + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "s", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 4, // 12 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = Dataset::write(reader, "memory://test_ngram_regex", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["s"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + assert!( + dataset.get_fragments().len() > 1, + "expected a multi-fragment dataset" + ); + + // Scan with `filter` and return the matched `s` values, sorted. 
+ async fn matched(dataset: &Dataset, filter: &str) -> Vec { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in batches { + let col = batch.column_by_name("s").unwrap().as_string::(); + out.extend(col.iter().flatten().map(|s| s.to_string())); + } + out.sort(); + out + } + + // `regexp_like`: a plain literal substring. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // `regexp_match` (coerced to `IsNotNull(regexp_match(...))`) accelerates too. + assert_eq!( + matched(&dataset, "regexp_match(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Anchored: recheck must drop trigram false positives -- the `cat` + // trigram also occurs in cat dog / catalog / scatter / category. + assert_eq!(matched(&dataset, "regexp_like(s, 'cat$')").await, ["cat"]); + // AND across `.*`: row 8 ("rhino horn") shares the rhino trigrams but + // lacks the nose trigrams, so only "rhinos nose" survives. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino.*nose')").await, + ["rhinos nose"] + ); + // Alternation. + assert_eq!( + matched(&dataset, "regexp_like(s, '(catalog|elephant)')").await, + ["catalog", "elephant"] + ); + // A non-accelerable pattern (no trigram derivable) still returns correct + // results via a full recheck. + assert_eq!(matched(&dataset, "regexp_like(s, 'o.m')").await, ["dogma"]); + // A case-insensitive flag is not accelerated (the index normalization + // disagrees with Unicode case folding) but must still return correct + // results via a full recheck -- here matching despite the upper-case + // pattern. This exercises the three-argument `regexp_like` flags path. 
+ assert_eq!( + matched(&dataset, "regexp_like(s, 'RHINO', 'i')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + + // Infix LIKE is accelerated through the same machinery (a plain-literal + // `regexp_like` is rewritten to LIKE before it reaches the index). + assert_eq!( + matched(&dataset, "s LIKE '%rhino%'").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Prefix LIKE: recheck drops "scatter", which contains the `cat` trigram + // but does not start with "cat". + assert_eq!( + matched(&dataset, "s LIKE 'cat%'").await, + ["cat", "cat dog", "catalog", "category"] + ); + + // The ngram index is actually engaged for every accelerated form. + for filter in [ + "regexp_like(s, 'rhino')", + "regexp_match(s, 'rhino')", + "s LIKE '%rhino%'", + ] { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let plan = scan.create_plan().await.unwrap(); + let plan_str = format!( + "{}", + datafusion::physical_plan::displayable(plan.as_ref()).indent(true) + ); + assert!( + plan_str.contains("ScalarIndexQuery") && plan_str.contains("NGram"), + "expected ngram index usage for `{filter}`, got plan:\n{plan_str}" + ); + } + } + + #[tokio::test] + async fn test_ngram_regex_non_accelerable_recheck() { + // `a.b` yields no trigram, so the index returns "recheck everything". + // This must still produce ALL correct matches across fragments, not an + // empty set (a regression test for the AtLeast recheck path, which a + // single-match case would not catch). 
+ let unit = ["acb", "dog", "axb", "cat", "qqq", "rhino"]; + let values: Vec<&str> = unit.iter().copied().cycle().take(60).collect(); + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "text", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 20, // 60 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = + Dataset::write(reader, "memory://test_ngram_regex_ne", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + async fn count(dataset: &Dataset, filter: &str) -> usize { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + batches.iter().map(|b| b.num_rows()).sum() + } + + // "acb" and "axb" each appear 10 times in the 60 rows -> 20 matches. + assert_eq!(count(&dataset, "regexp_match(text, 'a.b')").await, 20); + assert_eq!(count(&dataset, "regexp_like(text, 'a.b')").await, 20); + } + #[tokio::test] async fn test_like_prefix_with_btree_index() { // Create dataset with string data that has various prefixes @@ -8783,6 +9360,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ); } + /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column. + /// The dictionary cycles through "a", "b", "c" so each value appears in a + /// predictable, repeated pattern. 
+ async fn dictionary_string_dataset() -> Dataset { + use arrow_array::{Int16Array, Int16DictionaryArray}; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "etld", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])); + + let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let indices = Int16Array::from((0..30).map(|i| i % 3).collect::>()); + let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap(); + + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write(reader, "memory://test_dict_filter", None) + .await + .unwrap() + } + + /// Regression test for filtering a dictionary-encoded string column via the + /// SQL string path (`Scanner::filter`). This used to fail to plan with + /// "could not convert to literal of type 'Dictionary(Int16, Utf8)'". + #[tokio::test] + async fn test_filter_on_dictionary_string_column() { + let dataset = dictionary_string_dataset().await; + + // Equality predicate. + let count = dataset + .scan() + .filter("etld = 'a'") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 10); + + // IN-list predicate. + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + + /// An `IN`/`=` predicate on a dictionary column with a scalar index should be + /// pushed down to the index rather than falling back to a full scan. 
+ #[tokio::test] + async fn test_dictionary_string_column_uses_scalar_index() { + use lance_index::scalar::BuiltinIndexType; + + let mut dataset = dictionary_string_dataset().await; + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index(&["etld"], IndexType::Scalar, None, ¶ms, true) + .await + .unwrap(); + + let mut scanner = dataset.scan(); + scanner.filter("etld IN ('a', 'b')").unwrap(); + let plan = scanner.create_plan().await.unwrap(); + let plan_str = format!("{:?}", plan); + assert!( + plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"), + "IN on a dictionary column should use the scalar index, but got: {}", + plan_str + ); + + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + #[tokio::test] async fn test_like_prefix_with_segmented_zone_map() { use lance_index::scalar::BuiltinIndexType; @@ -9654,7 +10318,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=6), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9686,7 +10350,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=15), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9714,7 +10378,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... 
IS NOT NULL SortExec: TopK(fetch=5), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9736,7 +10400,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=5), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9835,7 +10499,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=8), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9871,7 +10535,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=11), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... 
IS NOT NULL @@ -9964,14 +10628,13 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Combined Scalar/non-scalar filtered read"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[s@1 as s] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - Take: columns=\"_rowid, (s)\" - CoalesceBatchesExec: target_batch_size=8192 - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] - FilterExec: i@0 > 10 - LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None" + UnionExec + Take: columns=\"_rowid, (s)\" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None" } else { "LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \ range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=-- @@ -9987,13 +10650,12 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Combined Scalar/non-scalar filtered read with empty projection"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[_rowaddr@0 as _rowaddr] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - AddRowAddrExec - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid] - FilterExec: i@0 > 10 - LanceScan: uri=..., projection=[i], row_id=true, row_addr=true, ordered=false, range=None" + UnionExec + AddRowAddrExec + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid] + FilterExec: i@0 > 10 + LanceScan: uri=..., 
projection=[i], row_id=true, row_addr=true, ordered=false, range=None" } else { "LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, \ range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=-- @@ -10016,14 +10678,13 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Dynamic projection"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[regexp_match(s@1, .*) as matches] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - Take: columns=\"_rowid, (s)\" - CoalesceBatchesExec: target_batch_size=8192 - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] - FilterExec: i@0 > 10 - LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None" + UnionExec + Take: columns=\"_rowid, (s)\" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None" } else { "ProjectionExec: expr=[regexp_match(s@0, .*) as matches] LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \ @@ -10106,7 +10767,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 MatchQuery: column=s, query=hello - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MaterializeIndex: query=[i > 10]@i_idx(BTree) ProjectionExec: expr=[_rowid@1 as _rowid] @@ -10134,15 +10795,31 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows"); - let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as 
_rowid] + // The flat-FTS path now reads through `FilteredReadExec`, matching the + // brute-force KNN path. With no prefilter the scan still produces no + // pushdown, but the operator differs by storage version: legacy emits + // a `LanceScan`, v2 emits a `LanceRead` with empty filters. + let expected = if data_storage_version == LanceFileVersion::Legacy { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello FlatMatchQuery: column=s, query=hello - LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#; + LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"# + } else { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] + Take: columns="_rowid, _score, (s)" + CoalesceBatchesExec: target_batch_size=8192 + SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + MatchQuery: column=s, query=hello + FlatMatchQuery: column=s, query=hello + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=--, refine_filter=--"# + }; dataset.append_new_data().await?; assert_plan_equals( &dataset.dataset, @@ -10175,36 +10852,46 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows and prefilter"); + // After routing flat FTS through `FilteredReadExec`, the BTree on `i` + // pushes into the unindexed-fragment scan too — no more `FilterExec` on + // top of an unfiltered `LanceScan`. 
Legacy uses the `MaterializeIndex` + // shape, v2 uses `LanceRead` with `full_filter` set. let expected = if data_storage_version == LanceFileVersion::Legacy { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MaterializeIndex: query=[i > 10]@i_idx(BTree) ProjectionExec: expr=[_rowid@1 as _rowid] FilterExec: i@0 > 10 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + CoalescePartitionsExec + UnionExec + Take: columns="_rowid, (s)" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"# } else { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- ScalarIndexQuery: query=[i > 10]@i_idx(BTree) FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., 
projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- + ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"# }; assert_plan_equals( &dataset.dataset, @@ -10274,7 +10961,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@2 IS NOT NULL SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@0 ASC NULLS LAST]... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@2 IS NOT NULL @@ -10327,6 +11014,41 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") assert_query_index_field(&batch); } + #[tokio::test(flavor = "multi_thread")] + async fn test_fts_multiple_unindexed_appends() { + // An FTS query over an indexed dataset plus more than one unindexed append + // must return matches from every unindexed fragment, not just the first. The + // flat search over unindexed data reads only a single input partition, so when + // the default parallelism splits the scan across partitions it used to drop + // every append but the first. + let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) + .await + .unwrap(); + test_ds.make_fts_index().await.unwrap(); + // Two separate appends -> two unindexed fragments, each large enough that the + // physical optimizer parallelizes the flat scan across partitions. + test_ds.append_data_with_range(400, 5400).await.unwrap(); + test_ds.append_data_with_range(5400, 10400).await.unwrap(); + + // Every row's `s` value contains the token "s", so FTS("s") matches all rows. 
+ let total = test_ds.dataset.count_rows(None).await.unwrap(); + let returned = test_ds + .dataset + .scan() + .full_text_search(FullTextSearchQuery::new("s".to_owned())) + .unwrap() + .try_into_stream() + .await + .unwrap() + .try_fold( + 0usize, + |acc, batch| async move { Ok(acc + batch.num_rows()) }, + ) + .await + .unwrap(); + assert_eq!(returned, total); + } + #[rstest] #[tokio::test] async fn test_fast_search_scalar_index_skips_unindexed_fragments( @@ -10807,6 +11529,26 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, -1); } + #[tokio::test] + async fn test_knn_approx_mode_defaults_and_setter() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) + .await + .unwrap(); + let query_vector = Float32Array::from(vec![0.0; 32]); + let mut scanner = test_ds.dataset.scan(); + scanner.nearest("vec", &query_vector, 5).unwrap(); + assert_eq!( + scanner.nearest_mut().unwrap().approx_mode, + ApproxMode::Normal + ); + + scanner.approx_mode(ApproxMode::Accurate); + assert_eq!( + scanner.nearest_mut().unwrap().approx_mode, + ApproxMode::Accurate + ); + } + #[tokio::test] async fn test_ivf_pq_query_parallelism_returns_same_results() { let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index f5d792979df..5ef35a33ab7 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -1,13 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashSet, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use super::fragment::FileFragment; use super::{ Dataset, transaction::{Operation, Transaction}, + write::cleanup_data_fragments, }; +use crate::index::DatasetIndexExt; use crate::{Error, Result, 
io::exec::Planner}; use arrow::compute::CastOptions; use arrow::compute::can_cast_types; @@ -239,7 +244,7 @@ pub(super) async fn add_columns_to_fragments( read_columns: Option>, fragments: &[FileFragment], batch_size: Option, -) -> Result<(Vec, Schema)> { +) -> Result<(Vec, Schema, Vec)> { // Check names early (before calling add_columns_impl) to avoid extra work if // the names are wrong. let version = dataset.manifest.data_storage_format.lance_file_version()?; @@ -261,10 +266,10 @@ pub(super) async fn add_columns_to_fragments( } let transforms = optimizer.optimize(dataset, transforms)?; - let (output_schema, fragments) = match transforms { + let (output_schema, new_fragments, fragments_to_cleanup) = match transforms { NewColumnTransform::BatchUDF(udf) => { check_names(udf.output_schema.as_ref())?; - let fragments = add_columns_impl( + let result = add_columns_impl( fragments, read_columns, udf.mapper, @@ -273,7 +278,11 @@ pub(super) async fn add_columns_to_fragments( None, ) .await?; - Result::Ok((udf.output_schema, fragments)) + Result::Ok(( + udf.output_schema, + result.fragments, + result.fragments_to_cleanup, + )) } NewColumnTransform::SqlExpressions(expressions) => { // We just transform the SQL expression into a UDF backed by DataFusion @@ -336,22 +345,22 @@ pub(super) async fn add_columns_to_fragments( let mapper = Box::new(mapper); let read_columns = Some(read_schema.field_names().into_iter().cloned().collect()); - let fragments = + let result = add_columns_impl(fragments, read_columns, mapper, batch_size, None, None).await?; - Ok((output_schema, fragments)) + Ok((output_schema, result.fragments, result.fragments_to_cleanup)) } NewColumnTransform::Stream(stream) => { let output_schema = stream.schema(); check_names(output_schema.as_ref())?; let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::Reader(reader) => { let 
output_schema = reader.schema(); check_names(output_schema.as_ref())?; let stream = reader.into_stream(); let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::AllNulls(output_schema) => { check_names(output_schema.as_ref())?; @@ -379,14 +388,20 @@ pub(super) async fn add_columns_to_fragments( )); } - Ok((output_schema, fragments)) + Ok((output_schema, fragments, Vec::new())) } }?; - let mut schema = dataset.schema().merge(output_schema.as_ref())?; + let mut schema = match dataset.schema().merge(output_schema.as_ref()) { + Ok(schema) => schema, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; schema.set_field_id(Some(dataset.manifest.max_field_id())); - Ok((fragments, schema)) + Ok((new_fragments, schema, fragments_to_cleanup)) } pub(super) async fn add_columns( @@ -395,7 +410,7 @@ pub(super) async fn add_columns( read_columns: Option>, batch_size: Option, ) -> Result<()> { - let (fragments, schema) = add_columns_to_fragments( + let (fragments, schema, fragments_to_cleanup) = add_columns_to_fragments( dataset, transforms, read_columns, @@ -406,11 +421,75 @@ pub(super) async fn add_columns( let operation = Operation::Merge { fragments, schema }; let transaction = Transaction::new(dataset.manifest.version, operation, None); - dataset + match dataset .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + .await + { + Ok(()) => Ok(()), + Err(e) => { + cleanup_new_column_data_files(&dataset.get_fragments(), &fragments_to_cleanup).await; + Err(e) + } + } +} - Ok(()) +async fn cleanup_new_column_data_files(fragments: &[FileFragment], new_fragments: &[Fragment]) { + let Some(first_fragment) = fragments.first() else { + return; + }; + + // add_columns rewrites fragment metadata in place, so cleanup must delete + // only files created by the current 
attempt and must not touch pre-existing + // files that still belong to the fragment. + let original_files_by_fragment = fragments + .iter() + .map(|fragment| { + let files = fragment + .metadata + .files + .iter() + .map(|file| (file.base_id, file.path.clone())) + .collect::>(); + (fragment.id() as u64, files) + }) + .collect::>(); + + let fragments_to_cleanup = new_fragments + .iter() + .filter_map(|fragment| { + let original_files = original_files_by_fragment.get(&fragment.id)?; + let files = fragment + .files + .iter() + .filter(|file| !original_files.contains(&(file.base_id, file.path.clone()))) + .cloned() + .collect::>(); + + if files.is_empty() { + None + } else { + let mut fragment = fragment.clone(); + fragment.files = files; + Some(fragment) + } + }) + .collect::>(); + + cleanup_data_fragments( + &first_fragment.dataset().object_store, + &first_fragment.dataset().base, + &fragments_to_cleanup, + ) + .await; +} + +struct AddColumnFragments { + /// Fragments produced by the add-columns operation and returned to the + /// caller for the final merge commit. + fragments: Vec, + /// Uncommitted fragments whose newly written data files must be removed if + /// the operation fails before the merge commit completes. 
+ fragments_to_cleanup: Vec, } #[allow(clippy::type_complexity)] @@ -421,63 +500,96 @@ async fn add_columns_impl( batch_size: Option, result_cache: Option>, schemas: Option<(Schema, Schema)>, -) -> Result> { +) -> Result { let read_columns_ref = read_columns.as_deref(); let mapper_ref = mapper.as_ref(); - let fragments = futures::stream::iter(fragments) - .then(|fragment| { - let cache_ref = result_cache.clone(); - let schemas_ref = &schemas; - async move { - if let Some(cache) = &cache_ref { - let fragment_id = fragment.id() as u32; - let fragment = cache.get_fragment(fragment_id)?; - if let Some(fragment) = fragment { - return Ok(fragment); - } + + let mut new_fragments = Vec::with_capacity(fragments.len()); + let mut fragments_to_cleanup = Vec::with_capacity(fragments.len()); + + for fragment in fragments { + if let Some(cache) = &result_cache { + let fragment_id = fragment.id() as u32; + let fragment = match cache.get_fragment(fragment_id) { + Ok(fragment) => fragment, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); } + }; + if let Some(fragment) = fragment { + new_fragments.push(fragment); + continue; + } + } - let mut updater = fragment - .updater(read_columns_ref, schemas_ref.clone(), batch_size) - .await?; - - let mut batch_index = 0; - // TODO: the structure of the updater prevents batch-level parallelism here, - // but there is no reason why we couldn't do this in parallel. - while let Some(batch) = updater.next().await? 
{ - let batch_info = BatchInfo { - fragment_id: fragment.id() as u32, - batch_index, - }; + let mut updater = match fragment + .updater(read_columns_ref, schemas.clone(), batch_size) + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; + let fragment_result = async { + let mut batch_index = 0; + // TODO: the structure of the updater prevents batch-level parallelism here, + // but there is no reason why we couldn't do this in parallel. + while let Some(batch) = updater.next().await? { + let batch_info = BatchInfo { + fragment_id: fragment.id() as u32, + batch_index, + }; - let new_batch = if let Some(cache) = &cache_ref { - if let Some(batch) = cache.get_batch(&batch_info)? { - batch - } else { - let new_batch = mapper_ref(batch)?; - cache.insert_batch(batch_info, new_batch.clone())?; - new_batch - } + let new_batch = if let Some(cache) = &result_cache { + if let Some(batch) = cache.get_batch(&batch_info)? { + batch } else { - mapper_ref(batch)? - }; + let new_batch = mapper_ref(batch)?; + cache.insert_batch(batch_info, new_batch.clone())?; + new_batch + } + } else { + mapper_ref(batch)? + }; - updater.update(new_batch).await?; - batch_index += 1; - } + updater.update(new_batch).await?; + batch_index += 1; + } - let fragment = updater.finish().await?; + let new_fragment = updater.finish().await?; + fragments_to_cleanup.push(new_fragment.clone()); - if let Some(cache) = &cache_ref { - cache.insert_fragment(fragment.clone())?; - } + if let Some(cache) = &result_cache { + // Once the checkpoint store owns this fragment, retries may load + // it back instead of rewriting it. Removing it from the cleanup + // set avoids deleting data that has already been checkpointed. 
+ cache.insert_fragment(new_fragment.clone())?; + fragments_to_cleanup.pop(); + } - Ok::<_, Error>(fragment) + Ok::<_, Error>(new_fragment) + } + .await; + + match fragment_result { + Ok(new_fragment) => { + new_fragments.push(new_fragment); } - }) - .try_collect::>() - .await?; - Ok(fragments) + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + } + } + + Ok(AddColumnFragments { + fragments: new_fragments, + fragments_to_cleanup, + }) } async fn add_columns_from_stream( @@ -489,49 +601,80 @@ async fn add_columns_from_stream( let mut new_fragments = Vec::with_capacity(fragments.len()); let mut last_seen_batch: Option = None; for fragment in fragments { - let mut updater = fragment + let mut updater = match fragment .updater::(Some(&[]), schemas.clone(), batch_size) - .await?; - while let Some(batch) = updater.next().await? { - debug_assert_eq!(batch.num_columns(), 1); - let mut rows_remaining = batch.num_rows(); + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } + }; + let result: Result = async { + while let Some(batch) = updater.next().await? { + debug_assert_eq!(batch.num_columns(), 1); + let mut rows_remaining = batch.num_rows(); + + // The updater yields an empty batch when every row in a read batch + // has been deleted (e.g. a whole batch falls within the deletion + // vector). There is nothing to pull from the stream in that case, so + // feed an empty batch back to keep the updater in sync and continue. 
+ if rows_remaining == 0 { + updater + .update(RecordBatch::new_empty(stream.schema())) + .await?; + continue; + } - let mut batches = Vec::new(); + let mut batches = Vec::new(); - while rows_remaining > 0 { - let next_batch = if let Some(last_seen_batch) = last_seen_batch { - last_seen_batch - } else { - stream.next().await.ok_or_else(|| { - Error::invalid_input( - "Stream ended before producing values for all rows in dataset", - ) - })?? - }; - let num_rows = next_batch.num_rows(); - if num_rows > rows_remaining { - let new_batch = next_batch.slice(0, rows_remaining); - batches.push(new_batch); - last_seen_batch = - Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); - rows_remaining = 0; - } else { - batches.push(next_batch); - rows_remaining -= num_rows; - last_seen_batch = None; + while rows_remaining > 0 { + let next_batch = if let Some(last_seen) = last_seen_batch.take() { + last_seen + } else { + stream.next().await.ok_or_else(|| { + Error::invalid_input( + "Stream ended before producing values for all rows in dataset", + ) + })?? 
+ }; + let num_rows = next_batch.num_rows(); + if num_rows > rows_remaining { + let new_batch = next_batch.slice(0, rows_remaining); + batches.push(new_batch); + last_seen_batch = + Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); + rows_remaining = 0; + } else { + batches.push(next_batch); + rows_remaining -= num_rows; + last_seen_batch = None; + } } - } - let new_batch = - arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; + let new_batch = + arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; - updater.update(new_batch).await?; + updater.update(new_batch).await?; + } + updater.finish().await + } + .await; + + match result { + Ok(new_fragment) => new_fragments.push(new_fragment), + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } } - new_fragments.push(updater.finish().await?); } // Ensure the stream is fully consumed if last_seen_batch.is_some() || stream.next().await.is_some() { + cleanup_new_column_data_files(fragments, &new_fragments).await; return Err(Error::invalid_input_source( "Stream produced more values than expected for dataset".into(), )); @@ -605,6 +748,41 @@ pub(super) async fn alter_columns( new_schema.validate()?; + // If any column being cast has an attached index, fail fast. Cast operations + // rewrite the underlying column data and silently invalidate any index on the + // affected column(s). The current behavior is to drop such indices without + // warning, which has caused production incidents where vector search silently + // regressed to brute-force scan. We require users to explicitly drop the + // index before altering the column type, so the action is never silent. 
+ if !cast_fields.is_empty() { + let indices = dataset.load_indices().await?; + let affected: Vec<&lance_table::format::IndexMetadata> = indices + .iter() + .filter(|idx| { + cast_fields + .iter() + .any(|(old, _)| idx.fields.contains(&old.id)) + }) + .collect(); + if !affected.is_empty() { + let affected_cols: Vec<String> = cast_fields + .iter() + .filter(|(old, _)| affected.iter().any(|i| i.fields.contains(&old.id))) + .map(|(old, _)| old.name.clone()) + .collect(); + let affected_idx_names: Vec<String> = affected.iter().map(|i| i.name.clone()).collect(); + return Err(Error::invalid_input(format!( + "Cannot cast column(s) [{}] to a new type: they have {} index(es) \ + attached: [{}]. Cast rewrites column data and invalidates any index \ + on the affected column(s). Drop the index(es) with drop_index() \ + before altering, then recreate them after the cast completes.", + affected_cols.join(", "), + affected.len(), + affected_idx_names.join(", "), + ))); + } + } + // If we aren't casting a column, we don't need to touch the fragments. let transaction = if cast_fields.is_empty() { Transaction::new( @@ -653,7 +831,7 @@ pub(super) async fn alter_columns( }; let mapper = Box::new(mapper); - let fragments = add_columns_impl( + let result = add_columns_impl( &dataset.get_fragments(), Some(read_columns), mapper, @@ -666,7 +844,8 @@ pub(super) async fn alter_columns( // Some data files may no longer contain any columns in the dataset (e.g. 
if every // remaining column has been altered into a different data file) and so we remove them let schema_field_ids = new_schema.field_ids().into_iter().collect::>(); - let fragments = fragments + let fragments = result + .fragments .into_iter() .map(|mut frag| { frag.files.retain(|f| { @@ -734,56 +913,751 @@ pub(super) async fn drop_columns(dataset: &mut Dataset, columns: &[&str]) -> Res .apply_commit(transaction, &Default::default(), &Default::default()) .await?; - Ok(()) -} + Ok(()) +} + +/// Exclude the fields from `other` Schema, and returns a new Schema. +pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result { + let other: Schema = other.try_into().map_err(|_| { + Error::schema("The other schema is not compatible with this schema".to_string()) + })?; + let mut fields = vec![]; + for field in source.fields.iter() { + if let Some(other_field) = other.field(&field.name) { + if version.support_remove_sub_column(field) + && let Some(f) = field.exclude(other_field) + { + fields.push(f) + } + } else { + fields.push(field.clone()); + } + } + Ok(Schema { + fields, + metadata: source.metadata.clone(), + }) +} + +#[cfg(test)] +mod test { + use std::{collections::HashMap, fs, num::NonZero, path::Path as StdPath, sync::Mutex}; + + use crate::dataset::WriteParams; + use arrow_array::{ + ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray, + }; + + use super::*; + use arrow_schema::Fields as ArrowFields; + use lance_core::utils::tempfile::TempStrDir; + use lance_file::version::LanceFileVersion; + use lance_table::format::{BasePath, DataFile}; + use rstest::rstest; + + // Used to validate that futures returned are Send. + fn require_send(t: T) -> T { + t + } + + fn file_paths_in(dir: impl AsRef) -> Vec { + fn collect_files( + base_dir: &StdPath, + dir: &StdPath, + files: &mut Vec, + ) -> std::io::Result<()> { + if !dir.exists() { + return Ok(()); + } + for entry in std::fs::read_dir(dir)? 
{ + let path = entry?.path(); + if path.is_dir() { + collect_files(base_dir, &path, files)?; + } else if path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|file_name| !file_name.starts_with('.')) + { + files.push( + path.strip_prefix(base_dir) + .unwrap() + .to_string_lossy() + .to_string(), + ); + } + } + Ok(()) + } + + let base_dir = dir.as_ref(); + let mut files = Vec::new(); + collect_files(base_dir, base_dir, &mut files).unwrap(); + files.sort(); + files + } + + fn data_file_paths_in(base_dir: &str) -> Vec { + file_paths_in(StdPath::new(base_dir).join("data")) + } + + #[tokio::test] + async fn test_append_columns_exprs() -> Result<()> { + let num_rows = 5; + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::Legacy), + ..Default::default() + }), + ) + .await?; + dataset.validate().await?; + + // Adding a duplicate column name will break + let fut = dataset.add_columns( + NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]), + None, + None, + ); + // (Quick validation that the future is Send) + let res = require_send(fut).await; + assert!(matches!(res, Err(Error::InvalidInput { .. }))); + + // Can add a column that is independent of any existing ones + dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]), + None, + None, + ) + .await?; + + // Can add a column derived from an existing one. 
+ dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]), + None, + None, + ) + .await?; + + // Can derive a column from existing ones across multiple data files. + dataset + .add_columns( + NewColumnTransform::SqlExpressions(vec![( + "triple_id".into(), + "id + double_id".into(), + )]), + None, + None, + ) + .await?; + + // These can be read back, the dataset is valid + dataset.validate().await?; + + let data = dataset.scan().try_into_batch().await?; + let expected_schema = ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("value", DataType::Float64, true), + ArrowField::new("double_id", DataType::Int32, false), + ArrowField::new("triple_id", DataType::Int32, false), + ]); + assert_eq!(data.schema().as_ref(), &expected_schema); + assert_eq!(data.num_rows(), num_rows); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_with_fully_deleted_batch() -> Result<()> { + // Regression test: when an entire read batch has been deleted, the + // updater yields a 0-row batch. The inner loop then never runs and + // `batches` stays empty, so `concat_batches(&batches[0]..)` used to + // panic with "index out of bounds: the len is 0 but the index is 0". + // + // A single fragment holds 105 rows; deleting the trailing 5 rows means + // that, when read with batch_size=50, the third batch [100..105) is + // fully filtered out and produces an empty batch. 
+ let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..105))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 200, // keep all rows in a single fragment + ..Default::default() + }), + ) + .await?; + + // Delete the entire trailing batch [100..105). + dataset.delete("i >= 100").await?; + assert_eq!(dataset.count_rows(None).await?, 100); + + let new_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "j", + DataType::Int32, + false, + )])); + let new_batch = RecordBatch::try_new( + new_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..100))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(new_batch)], new_schema.clone()); + + // Read with batch_size=50 so the deleted trailing rows form a full empty batch. 
+ dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, Some(50)) + .await?; + + let data = dataset.scan().try_into_batch().await?; + assert_eq!(data.num_rows(), 100); + assert_eq!( + data.column_by_name("j").unwrap().as_ref(), + &Int32Array::from_iter_values(0..100) + ); + + Ok(()) + } + + #[rstest] + #[tokio::test] + async fn test_add_columns_cleans_up_blob_v2_data_on_stream_error( + #[values( + ("inline", b"inline".to_vec()), + ("packed", vec![1u8; 128 * 1024]), + ("dedicated", vec![2u8; 5 * 1024 * 1024]), + ("external", b"external".to_vec()) + )] + blob_case: (&str, Vec), + ) -> Result<()> { + let (blob_kind, payload) = blob_case; + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let external_dir = tempfile::tempdir()?; + let external_path = external_dir.path().join("blob.bin"); + fs::write(&external_path, &payload)?; + let external_baseline_files = file_paths_in(external_dir.path()); + let external_baseline_payload = fs::read(&external_path)?; + + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + initial_bases: Some(vec![BasePath::new( + 1, + external_dir.path().to_string_lossy().to_string(), + Some("external".to_string()), + false, + )]), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let mut blob_builder = crate::BlobArrayBuilder::new(2); + if blob_kind == "external" { + blob_builder.push_uri(external_path.to_string_lossy())?; + } else { + blob_builder.push_bytes(payload)?; + } + blob_builder.push_bytes(b"extra")?; + let blob_array = blob_builder.finish()?; + let blob_schema = 
Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let blob_batch = RecordBatch::try_new(blob_schema.clone(), vec![blob_array])?; + let reader = RecordBatchIterator::new(vec![Ok(blob_batch)], blob_schema); + + let err = dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("Stream produced more values than expected for dataset") + ); + + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean up new data files and blob v2 sidecars on failure" + ); + assert_eq!( + file_paths_in(external_dir.path()), + external_baseline_files, + "cleanup must not delete external files" + ); + assert_eq!( + fs::read(&external_path)?, + external_baseline_payload, + "cleanup must not modify external files" + ); + dataset.validate().await?; + + Ok(()) + } + + #[tokio::test] + async fn test_cleanup_preserves_checkpointed_fragment_files() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let original_fragments = dataset.get_fragments(); + assert_eq!(original_fragments.len(), 2); + + let data_dir = StdPath::new(test_uri).join("data"); + let cached_file = data_dir.join("checkpointed.lance"); + let cached_blob_dir = data_dir.join("checkpointed"); + fs::write(&cached_file, b"checkpointed data")?; + fs::create_dir_all(&cached_blob_dir)?; + fs::write( + cached_blob_dir.join("00000000000000000000000000000001.blob"), + b"blob", + 
)?; + + let mut checkpointed_fragment = original_fragments[0].metadata().clone(); + checkpointed_fragment.files.push(DataFile::new( + "checkpointed.lance", + vec![dataset.manifest.max_field_id() + 1], + vec![0], + 2, + 2, + NonZero::new(17), + None, + )); + + #[derive(Default)] + struct CheckpointedFragmentStore { + fragment: Mutex>, + } + + impl UDFCheckpointStore for CheckpointedFragmentStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 0 { + Ok(self.fragment.lock().unwrap().clone()) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Ok(()) + } + } + + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(|_| Err(Error::invalid_input("injected UDF failure"))), + output_schema: Arc::new(ArrowSchema::new(vec![ArrowField::new( + "checkpointed", + DataType::Int32, + true, + )])), + result_checkpoint: Some(Arc::new(CheckpointedFragmentStore { + fragment: Mutex::new(Some(checkpointed_fragment)), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + + assert!( + cached_file.exists(), + "cleanup must not delete fragment files restored from a checkpoint" + ); + assert!( + cached_blob_dir.exists(), + "cleanup must not delete blob sidecars restored from a checkpoint" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_current_blob_v2_writer_on_udf_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = 
TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let call_count = Arc::new(Mutex::new(0usize)); + let mapper_call_count = call_count.clone(); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut call_count = mapper_call_count.lock().unwrap(); + *call_count += 1; + if *call_count == 2 { + return Err(Error::invalid_input("injected UDF failure")); + } + + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: None, + }); + + let err = dataset + .add_columns(transforms, None, Some(1)) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written by the current unfinished writer" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_on_checkpoint_lookup_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + 
Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + + struct FailingLookupStore { + inserted: Arc>>, + } + + impl UDFCheckpointStore for FailingLookupStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 1 { + Err(Error::invalid_input("injected checkpoint lookup failure")) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } + + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) 
+ }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingLookupStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint lookup failure") + ); + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_finished_blob_v2_writer_on_checkpoint_insert_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + struct FailingInsertStore; -/// Exclude the fields from `other` Schema, and returns a new Schema. 
-pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> Result { - let other: Schema = other.try_into().map_err(|_| { - Error::schema("The other schema is not compatible with this schema".to_string()) - })?; - let mut fields = vec![]; - for field in source.fields.iter() { - if let Some(other_field) = other.field(&field.name) { - if version.support_remove_sub_column(field) - && let Some(f) = field.exclude(other_field) - { - fields.push(f) + impl UDFCheckpointStore for FailingInsertStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Err(Error::invalid_input("injected checkpoint insert failure")) } - } else { - fields.push(field.clone()); } + + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) 
+ }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingInsertStore)), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint insert failure") + ); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean finished writer files when checkpoint insert fails" + ); + + Ok(()) } - Ok(Schema { - fields, - metadata: source.metadata.clone(), - }) -} -#[cfg(test)] -mod test { - use std::collections::HashMap; - use std::sync::Mutex; + #[tokio::test] + async fn test_add_columns_cleans_blob_v2_files_on_declared_schema_merge_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); - use crate::dataset::WriteParams; - use arrow_array::{ - ArrayRef, Int32Array, ListArray, RecordBatchIterator, StringArray, StructArray, - }; + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); - use super::*; - use arrow_schema::Fields as ArrowFields; - use lance_core::utils::tempfile::TempStrDir; - use lance_file::version::LanceFileVersion; - use rstest::rstest; + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + 
vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema: Arc::new(ArrowSchema::new(vec![ + ArrowField::new("declared", DataType::Int32, true), + ArrowField::new("declared", DataType::Int32, true), + ])), + result_checkpoint: None, + }); - // Used to validate that futures returned are Send. - fn require_send(t: T) -> T { - t + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(matches!(err, Error::Schema { .. })); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written before declared schema merge fails" + ); + + Ok(()) } #[tokio::test] - async fn test_append_columns_exprs() -> Result<()> { - let num_rows = 5; + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_after_later_failure() + -> Result<()> { let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( "id", DataType::Int32, @@ -791,75 +1665,101 @@ mod test { )])); let batch = RecordBatch::try_new( schema.clone(), - vec![Arc::new(Int32Array::from_iter_values(0..num_rows as i32))], + vec![Arc::new(Int32Array::from_iter_values(0..2))], )?; - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); let test_dir = TempStrDir::default(); - let test_uri = &test_dir; + let test_uri = test_dir.as_str(); let mut dataset = Dataset::write( reader, test_uri, Some(WriteParams { - data_storage_version: Some(LanceFileVersion::Legacy), + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), ..Default::default() }), ) .await?; - dataset.validate().await?; - // Adding a duplicate column name will break - let fut = dataset.add_columns( - NewColumnTransform::SqlExpressions(vec![("id".into(), "id + 1".into())]), - None, - None, - ); - // (Quick validation that the future is Send) - let res = require_send(fut).await; - 
assert!(matches!(res, Err(Error::InvalidInput { .. }))); + struct InsertThenFailStore { + inserted: Arc>>, + } - // Can add a column that is independent of any existing ones - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![("value".into(), "2 * random()".into())]), - None, - None, - ) - .await?; + impl UDFCheckpointStore for InsertThenFailStore { + fn get_batch(&self, info: &BatchInfo) -> Result> { + if info.fragment_id == 1 { + Err(Error::invalid_input("injected later checkpoint failure")) + } else { + Ok(None) + } + } - // Can add a column derived from an existing one. - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![("double_id".into(), "2 * id".into())]), - None, - None, - ) - .await?; + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } - // Can derive a column from existing ones across multiple data files. - dataset - .add_columns( - NewColumnTransform::SqlExpressions(vec![( - "triple_id".into(), - "id + double_id".into(), - )]), - None, - None, - ) - .await?; + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } - // These can be read back, the dataset is valid - dataset.validate().await?; + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } - let data = dataset.scan().try_into_batch().await?; - let expected_schema = ArrowSchema::new(vec![ - ArrowField::new("id", DataType::Int32, false), - ArrowField::new("value", DataType::Float64, true), - ArrowField::new("double_id", DataType::Int32, false), - ArrowField::new("triple_id", DataType::Int32, false), - ]); - assert_eq!(data.schema().as_ref(), &expected_schema); - assert_eq!(data.num_rows(), num_rows); + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = 
crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(InsertThenFailStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected later checkpoint failure") + ); + + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); Ok(()) } @@ -1784,7 +2684,6 @@ mod test { ) -> Result<()> { // Create a table with 2 scalar columns, 1 vector column - use crate::index::DatasetIndexExt; use arrow::datatypes::{Int32Type, Int64Type}; use arrow_array::{Float16Array, Float32Array, Int64Array, ListArray}; use half::f16; @@ -1885,7 +2784,10 @@ mod test { assert_eq!(f.files.len(), 2); }); - // Cast scalar column with index, should not keep index (TODO: keep it) + // Cast scalar column with index. The index must be dropped first; cast + // is now a fail-fast operation when an index is attached, see + // test_alter_columns_cast_fails_with_attached_index for that path. 
+ dataset.drop_index("i_idx").await?; dataset .alter_columns(&[ColumnAlteration::new("i".into()).cast_to(DataType::Int64)]) .await?; @@ -1906,7 +2808,8 @@ mod test { ]); assert_eq!(&ArrowSchema::from(dataset.schema()), &expected_schema); - // We currently lose the index when casting a column + // The scalar index on `i` is gone (we dropped it); the vector index on + // `vec` is still present. let indices = dataset.load_indices().await?; assert_eq!(indices.len(), 1); @@ -1915,7 +2818,8 @@ mod test { assert_eq!(f.files.len(), 3); }); - // Cast vector column, should not keep index (TODO: keep it) + // Cast vector column. Drop its index first (same reason as above). + dataset.drop_index("vec_idx").await?; dataset .alter_columns(&[ ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( @@ -1983,6 +2887,120 @@ mod test { Ok(()) } + /// Cast on a column with an attached index must fail fast rather than + /// silently dropping the index. This guards against the historical behavior + /// where cast would rewrite column data and the index would vanish without + /// any error or warning, causing vector search to silently regress to a + /// brute-force scan. + #[rstest] + #[tokio::test] + async fn test_alter_columns_cast_fails_with_attached_index( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) -> Result<()> { + use lance_arrow::FixedSizeListArrayExt; + use lance_index::IndexType; + use lance_linalg::distance::MetricType; + use lance_testing::datagen::generate_random_array; + + use crate::index::vector::VectorIndexParams; + + // Build a small dataset with one indexed vector column. 
+ let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + false, + )])); + let nrows = 256; + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new( + <arrow_array::FixedSizeListArray>::try_new_from_values( + generate_random_array(64 * nrows as usize), + 64, + ) + .unwrap(), + )], + )?; + + let test_dir = TempStrDir::default(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![Ok(batch)], schema.clone()), + &test_dir, + Some(WriteParams { + data_storage_version: Some(data_storage_version), + ..Default::default() + }), + ) + .await?; + + // Build an IVF_PQ index on the vector column. + let params = VectorIndexParams::ivf_pq(4, 8, 8, MetricType::L2, 50); + dataset + .create_index(&["vec"], IndexType::Vector, None, &params, false) + .await?; + + let indices_before = dataset.load_indices().await?; + assert_eq!(indices_before.len(), 1, "precondition: index exists"); + let index_name = indices_before[0].name.clone(); + + // Attempting to cast the indexed column must fail with a clear message + // that names the offending index(es). + let result = dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await; + let err = result.expect_err("cast on indexed column should fail"); + let msg = err.to_string(); + assert!( + msg.contains("vec") && msg.contains(&index_name), + "error should mention column and index name, got: {msg}" + ); + assert!( + msg.contains("drop_index"), + "error should suggest the remediation, got: {msg}" + ); + + // The dataset must be unchanged: schema is still float32, index still present. 
+ assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + ); + let indices_after = dataset.load_indices().await?; + assert_eq!(indices_after.len(), 1, "index should still exist"); + assert_eq!(indices_after[0].name, index_name); + + // Sanity check: after dropping the index, the same cast should succeed. + dataset.drop_index(&index_name).await?; + dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await?; + assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + ), + ); + + Ok(()) + } + #[rstest] #[tokio::test] async fn test_drop_columns( diff --git a/rust/lance/src/dataset/tests/dataset_aggregate.rs b/rust/lance/src/dataset/tests/dataset_aggregate.rs index ef2a90e6315..8d45cda98e2 100644 --- a/rust/lance/src/dataset/tests/dataset_aggregate.rs +++ b/rust/lance/src/dataset/tests/dataset_aggregate.rs @@ -44,6 +44,7 @@ use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount, assert_pla use lance_arrow::FixedSizeListArrayExt; use lance_index::IndexType; use lance_index::scalar::FullTextSearchQuery; +use lance_index::scalar::ScalarIndexParams; use lance_index::scalar::inverted::InvertedIndexParams; use lance_linalg::distance::MetricType; @@ -268,7 +269,9 @@ async fn test_count_star_single_fragment() { vec![], ); - // Verify COUNT(*) has empty projection optimization + // COUNT(*) is rewritten by CountPushdown into a Final aggregate + // over CountFromMaskExec, which answers from manifest metadata + the + // deletion mask instead of scanning column data. 
let mut scanner = ds.scan(); scanner .aggregate(AggregateExpr::substrait(agg_bytes.clone())) @@ -276,8 +279,8 @@ async fn test_count_star_single_fragment() { let plan = scanner.create_plan().await.unwrap(); assert_plan_node_equals( plan, - "AggregateExec: mode=Single, gby=[], aggr=[count(...)] - LanceRead: uri=..., projection=[], num_fragments=1, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=--, refine_filter=--", + "AggregateExec: mode=Final, gby=[], aggr=[count(...)] + CountFromMask", ) .await .unwrap(); @@ -1204,11 +1207,12 @@ async fn test_scanner_count_rows() { .unwrap(); let plan = scanner.create_plan().await.unwrap(); - // COUNT(*) should have empty projection (optimized to not read any columns) + // COUNT(*) is rewritten by CountPushdown into a Final aggregate + // over CountFromMaskExec. assert_plan_node_equals( plan.clone(), - "AggregateExec: mode=Single, gby=[], aggr=[count(Int32(1))] - LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=--, refine_filter=--", + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask", ) .await .unwrap(); @@ -1255,6 +1259,185 @@ async fn test_scanner_count_rows_with_filter() { ); } +#[tokio::test] +async fn test_scanner_count_rows_with_indexed_filter() { + // When the filter is fully evaluable by a scalar index that covers + // every dataset fragment, the rule rewrites COUNT(*) into a Final + // aggregate over CountFromMaskExec, with the ScalarIndexExec + // wired in as the prefilter — no LanceRead, no column scan. 
+ let mut ds = create_numeric_dataset("memory://test_count_indexed", 2, 50).await; + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let mut scanner = ds.scan(); + scanner.filter("x < 50").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask + ScalarIndexQuery: query=[x < 50]@x_idx(BTree)", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 50, + ); +} + +#[tokio::test] +async fn test_scanner_count_rows_with_indexed_filter_stable_row_ids() { + // Indexed-filter count under stable row ids, with deletions in both + // fragments. The rule fires and the cross-fragment count stays correct. + let tmp = tempdir().unwrap(); + let uri = tmp.path().to_str().unwrap(); + let mut ds = gen_batch() + .col("x", array::step::()) + .col("y", array::step_custom::(0, 2)) + .col("category", array::cycle::(vec![1, 2, 3])) + .into_dataset_with_params( + uri, + FragmentCount::from(2), + FragmentRowCount::from(50), + Some(crate::dataset::WriteParams { + max_rows_per_file: 50, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + // Delete one row from each fragment (x=10 in frag 0, x=70 in frag 1). 
+ ds.delete("x = 10 OR x = 70").await.unwrap(); + + let mut scanner = ds.scan(); + scanner.filter("x < 100").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask + ScalarIndexQuery: query=[x < 100]@x_idx(BTree)", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + // 100 rows match `x < 100`, minus the two deletions. + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 98, + ); +} + +#[tokio::test] +async fn test_scanner_count_rows_with_partial_index_coverage() { + // Index covers the first two fragments, then a third fragment is + // appended. The rule cannot answer the count from the index alone for + // the appended fragment, so it emits a split plan: CountFromMaskExec + // over the indexed fragments + AggregateExec(Partial)/FilteredReadExec + // over the rest, both unioned and summed by AggregateExec(Final). + let tmp = tempdir().unwrap(); + let uri = tmp.path().to_str().unwrap(); + let mut ds = create_numeric_dataset(uri, 2, 50).await; + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Append a third fragment that the index does not cover. 
+ let reader = gen_batch() + .col("x", array::step_custom::(100, 1)) + .col("y", array::step_custom::(0, 2)) + .col("category", array::cycle::(vec![1, 2, 3])) + .into_reader_rows( + lance_datagen::RowCount::from(50), + lance_datagen::BatchCount::from(1), + ); + let ds = Dataset::write( + reader, + uri, + Some(crate::dataset::WriteParams { + mode: crate::dataset::WriteMode::Append, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(ds.get_fragments().len(), 3); + + let mut scanner = ds.scan(); + // `x < 1000` matches every row (values are 0..100 + 100..150). The + // pushdown branch contributes the first 100 from the indexed fragments; + // the scan branch contributes the 50 rows from the appended fragment. + scanner.filter("x < 1000").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + // Pin target_parallelism=1 so EnforceDistribution produces a deterministic + // plan snapshot regardless of the machine's CPU count. 
+ scanner.target_parallelism(1); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CoalescePartitionsExec + UnionExec + CountFromMask + ScalarIndexQuery: query=[x < 1000]@x_idx(BTree) + AggregateExec: mode=Partial, gby=[], aggr=[count(Int32(1))] + LanceRead: uri=..., projection=[], num_fragments=1, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=x < Int64(1000), refine_filter=--", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 150, + ); +} + #[tokio::test] async fn test_scanner_count_rows_empty_result() { let ds = create_numeric_dataset("memory://test_count_rows_empty", 1, 100).await; diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index e785de7bee4..267296c984b 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -1137,6 +1137,78 @@ async fn test_fts_without_index() { assert_eq!(results.num_rows(), 1); } +#[tokio::test] +async fn test_fts_without_index_uses_scalar_index_for_prefilter() { + // Verify that flat FTS (no inverted index on text) routes its prefilter + // through `FilteredReadExec` so a scalar index on the filter column is + // actually used. Six rows with two distinct ids: a prefilter of `id = 1` + // must match exactly the three text rows tagged with id=1. 
+ let text = StringArray::from(vec![ + "alpha bravo", + "charlie delta", + "alpha echo", + "foxtrot", + "alpha golf", + "hotel india", + ]); + let ids = Int32Array::from(vec![1, 1, 1, 2, 2, 2]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![ + Field::new("text", text.data_type().to_owned(), false), + Field::new("id", ids.data_type().to_owned(), false), + ]) + .into(), + vec![Arc::new(text) as ArrayRef, Arc::new(ids) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + let test_uri = TempStrDir::default(); + let mut dataset = Dataset::write(batches, &test_uri, None).await.unwrap(); + + // Scalar index on `id` only — no FTS index on `text`. + dataset + .create_index( + &["id"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let mut scan = dataset.scan(); + scan.prefilter(true) + .full_text_search( + FullTextSearchQuery::new("alpha".to_owned()) + .with_columns(&["text".to_string()]) + .unwrap(), + ) + .unwrap() + .filter("id = 1") + .unwrap(); + + let plan = scan.analyze_plan().await.unwrap(); + // The flat-FTS path now reads via `FilteredReadExec` (prints as `LanceRead`) + // with the prefilter plumbed into it, so the scalar index on `id` is used. + assert_contains!(&plan, "FlatMatchQuery"); + assert_contains!(&plan, "LanceRead"); + assert_contains!(&plan, "full_filter=id = Int32(1)"); + // The legacy plan ran a `LanceScan` wrapped in a manual `LanceFilterExec`; + // make sure we did not regress to that shape. + assert_not_contains!(&plan, "LanceScan:"); + + let results = scan.try_into_batch().await.unwrap(); + // Only rows with id=1 AND text matching "alpha": rows 0 ("alpha bravo") + // and 2 ("alpha echo"). Row 4 ("alpha golf") has id=2 and must be excluded. 
+ assert_eq!( + results.num_rows(), + 2, + "expected the two id=1 rows that match `alpha`, got plan:\n{plan}" + ); +} + #[tokio::test] async fn test_fts_rank() { let params = InvertedIndexParams::default(); @@ -1204,6 +1276,83 @@ async fn test_fts_rank() { assert_eq!(row_ids, &[0]); } +#[tokio::test] +async fn test_fts_unfiltered_after_filtered_returns_real_row_ids() { + // After a filtered FTS scan populates the per-partition cache, + // the next unfiltered scan must still return real row_ids, not + // partition-local doc_ids. Needs >1 fragment so the two differ + // (fragment N's row_ids start at N << 32). + let text_col = GenericStringArray::::from(vec![ + "alpha first", + "alpha second", + "alpha third", + "alpha fourth", + ]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![arrow_schema::Field::new( + "text", + text_col.data_type().to_owned(), + false, + )]) + .into(), + vec![Arc::new(text_col) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let test_uri = TempStrDir::default(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema), + &test_uri, + Some(WriteParams { + max_rows_per_file: 1, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + + let fts = |ds: &Dataset, filter: Option<&str>| { + let mut s = ds.scan(); + s.with_row_id() + .full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + if let Some(f) = filter { + s.prefilter(true).filter(f).unwrap(); + } + s + }; + let sorted_row_ids = |b: &RecordBatch| { + let mut v: Vec = b[ROW_ID].as_primitive::().values().to_vec(); + v.sort(); + v + }; + + let fresh = sorted_row_ids(&fts(&dataset, None).try_into_batch().await.unwrap()); + assert_eq!(fresh.len(), 4); + + // Reopen so the baseline scan's cached LazyDocSet doesn't mask + // the regression -- the filtered 
scan needs to be the first + // thing that touches the DocSet. + let dataset = Dataset::open(test_uri.as_str()).await.unwrap(); + fts(&dataset, Some("text LIKE 'alpha first%'")) + .try_into_batch() + .await + .unwrap(); + + let after = sorted_row_ids(&fts(&dataset, None).try_into_batch().await.unwrap()); + assert_eq!(after, fresh); +} + async fn create_fts_dataset< Offset: arrow::array::OffsetSizeTrait, ListOffset: arrow::array::OffsetSizeTrait, @@ -2001,11 +2150,7 @@ mod fts_serializing_backend { ) -> Option { let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - return Some( - stored_codec - .deserialize(&bytes.clone()) - .expect("deserialization should succeed"), - ); + return stored_codec.deserialize(&bytes.clone()).hit(); } drop(guard); self.passthrough.get(key, codec).await diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index 6374a0d2867..c96a3db915f 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -8,13 +8,15 @@ use crate::dataset::ROW_ID; use crate::dataset::WriteDestination; use crate::dataset::optimize::{CompactionOptions, compact_files}; use crate::dataset::transaction::{DataReplacementGroup, Operation}; -use crate::dataset::{AutoCleanupParams, MergeInsertBuilder, ProjectionRequest}; +use crate::dataset::{AutoCleanupParams, MergeInsertBuilder, ProjectionRequest, UpdateBuilder}; use crate::index::DatasetIndexExt; use crate::{Dataset, Error}; use lance_core::ROW_ADDR; use lance_index::IndexType; use lance_index::optimize::OptimizeOptions; +use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::ScalarIndexParams; +use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; use mock_instant::thread_local::MockClock; use crate::dataset::write::{InsertBuilder, WriteMode, WriteParams}; @@ -26,7 +28,7 @@ use arrow_array::{ ArrayRef, 
Float32Array, Int32Array, ListArray, RecordBatchIterator, StringArray, types::Int32Type, }; -use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, Field as ArrowField, Fields, Schema as ArrowSchema}; use lance_arrow::BLOB_META_KEY; use lance_core::utils::tempfile::{TempDir, TempStrDir}; use lance_datafusion::utils::reader_to_stream; @@ -1045,7 +1047,8 @@ async fn test_datafile_replacement_error() { Operation::DataReplacement { replacements: vec![DataReplacementGroup(0, new_data_file)], }, - Some(2), + // read at the current version (after the Merge above) + Some(dataset.manifest.version), None, None, Arc::new(Default::default()), @@ -1624,6 +1627,121 @@ async fn test_merge_insert_with_reordered_columns_and_index() { final_dataset.validate().await.unwrap(); } +/// With stable row ids, updating a top-level struct column keeps a scalar index on a +/// nested child field correct. The update API rejects nested column references, so a +/// nested field can only be changed by setting its whole struct column; that update must +/// not wrongly extend the child-field index over the rewritten fragment (which would +/// leave the updated value unscanned and silently dropped). 
+#[tokio::test] +async fn test_update_struct_column_keeps_nested_index() { + let struct_fields = Fields::from(vec![ArrowField::new("x", DataType::Int32, true)]); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("s", DataType::Struct(struct_fields.clone()), true), + ])); + let s_arr = StructArray::new( + struct_fields.clone(), + vec![Arc::new(Int32Array::from(vec![10, 20, 30])) as ArrayRef], + None, + ); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + Arc::new(s_arr) as ArrayRef, + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + "memory://test_update_nested_index", + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // BTree index on the NESTED field `s.x`. + dataset + .create_index( + &["s.x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + let pre = dataset + .scan() + .filter("s.x = 20") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(pre.num_rows(), 1, "precondition: s.x=20 should match id=2"); + + // Nested column references are rejected by `set`, so update the whole struct column + // `s` for id=2, changing s.x 20 -> 999. + let update_result = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id = 2") + .unwrap() + .set("s", "named_struct('x', cast(999 as int))") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + let dataset = update_result.new_dataset; + + // The nested `s.x` index must NOT be extended to the rewritten fragment: its + // effective coverage stays {0}, so the rewritten fragment is left unindexed and + // fully scanned. 
+ let sx_idx = dataset + .load_indices() + .await + .unwrap() + .iter() + .find(|i| i.fields.len() == 1) + .expect("nested s.x index") + .clone(); + let effective = sx_idx + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .expect("index has a fragment bitmap"); + assert_eq!( + effective.iter().collect::>(), + vec![0], + "nested-field index must not be extended to the rewritten fragment" + ); + + // The updated value must be found, and the stale value gone. + let new = dataset + .scan() + .filter("s.x = 999") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + new.num_rows(), + 1, + "updated value s.x=999 must be found after the struct-column update" + ); + let old = dataset + .scan() + .filter("s.x = 20") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(old.num_rows(), 0, "s.x=20 should no longer match any row"); +} + /// DataReplacement should invalidate index fragment bitmaps for replaced fields. #[tokio::test] async fn test_data_replacement_invalidates_index_bitmap() { @@ -2631,3 +2749,82 @@ async fn test_sub_schema_merge_insert_binary_v2_2() { assert_eq!(binary_arr.value(0), data_a.as_slice()); assert_eq!(binary_arr.value(1), data_b.as_slice()); } + +#[tokio::test] +async fn test_fts_unfiltered_after_compaction_returns_remapped_row_ids() { + // After `compact_files` with `defer_index_remap = true`, queries + // read the old FTS index but must apply the dataset's + // FragReuseIndex remap. Otherwise the deferred-row_id path + // returns pre-compaction row_ids that no longer exist. 
+ use arrow::datatypes::UInt64Type; + + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("text", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![0, 1, 2, 3])), + Arc::new(StringArray::from(vec![ + "alpha first", + "alpha second", + "alpha third", + "alpha fourth", + ])), + ], + ) + .unwrap(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![Ok(batch)], schema), + "memory://test_fts_frag_reuse", + Some(WriteParams { + max_rows_per_file: 1, // 4 fragments -> 4 partitions + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 1000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let after = dataset + .scan() + .with_row_id() + .full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(after.num_rows(), 4); + let returned: Vec = after[ROW_ID].as_primitive::().values().to_vec(); + let live: std::collections::HashSet = + dataset.scan().with_row_id().try_into_batch().await.unwrap()[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + .collect(); + for id in &returned { + assert!(live.contains(id), "stale row_id {id}"); + } +} diff --git a/rust/lance/src/dataset/tests/dataset_scanner.rs b/rust/lance/src/dataset/tests/dataset_scanner.rs index dcc64aa0632..4c44cb0795b 100644 --- a/rust/lance/src/dataset/tests/dataset_scanner.rs +++ b/rust/lance/src/dataset/tests/dataset_scanner.rs @@ -62,6 +62,7 @@ async fn test_vector_filter_fts_search() { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; // Case 1: search 
with prefilter=true, query_filter=vector([300,300,300,300]) diff --git a/rust/lance/src/dataset/tests/dataset_versioning.rs b/rust/lance/src/dataset/tests/dataset_versioning.rs index 5ac01c498b2..c04dd0f3183 100644 --- a/rust/lance/src/dataset/tests/dataset_versioning.rs +++ b/rust/lance/src/dataset/tests/dataset_versioning.rs @@ -211,6 +211,77 @@ async fn test_version_id_fast_path() { assert_eq!(historical.latest_version_id().await.unwrap(), 2); } +#[rstest] +#[tokio::test] +async fn test_stale_checks_cover_fast_successor_and_latest_version( + #[values(false, true)] enable_v2_manifest_paths: bool, +) { + let expected_scheme = if enable_v2_manifest_paths { + ManifestNamingScheme::V2 + } else { + ManifestNamingScheme::V1 + }; + let test_uri = TempStrDir::default(); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::UInt32, + false, + )])); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(0..5))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema.clone()); + + let original = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(original.manifest_location().naming_scheme, expected_scheme); + assert!(!original.is_stale().await.unwrap()); + assert!(!original.has_successor_version().await.unwrap()); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(5..10))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema); + let updated = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + mode: WriteMode::Append, + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + + assert!(original.is_stale().await.unwrap()); + assert!(original.has_successor_version().await.unwrap()); + 
assert_eq!(updated.manifest_location().naming_scheme, expected_scheme); + assert!(!updated.is_stale().await.unwrap()); + assert!(!updated.has_successor_version().await.unwrap()); + + let historical = updated.checkout_version(1).await.unwrap(); + assert_eq!( + historical.manifest_location().naming_scheme, + expected_scheme + ); + assert!(historical.is_stale().await.unwrap()); + assert!(historical.has_successor_version().await.unwrap()); +} + #[rstest] #[tokio::test] async fn test_restore( @@ -565,6 +636,65 @@ async fn test_fragment_id_never_reset() { assert_eq!(dataset.manifest.max_fragment_id(), Some(4)); } +/// create_branch and shallow_clone must read the SOURCE ref's chain, not the +/// receiver's. Both chains get a version 2 with diverged row counts so a clone +/// that wrongly resolves the version under the receiver succeeds silently with +/// the wrong data. +#[tokio::test] +async fn test_create_branch_and_shallow_clone_from_other_branch() { + let tempdir = TempDir::default(); + let test_uri = tempdir.path_str(); + + let gen_rows = |start: i32, rows: u64| { + gen_batch() + .col("id", array::step_custom::(start, 1)) + .into_reader_rows(RowCount::from(rows), BatchCount::from(1)) + }; + let write = |uri: String, start: i32, rows: u64, mode: WriteMode| async move { + Dataset::write( + gen_rows(start, rows), + uri.as_str(), + Some(WriteParams { + mode, + ..Default::default() + }), + ) + .await + .unwrap() + }; + + // main v1: 50 rows. + let mut main_ds = write(test_uri.clone(), 0, 50, WriteMode::Create).await; + // dev: forked at v1, appended 30 rows -> dev v2 has 80 rows. + let dev_ds = main_ds.create_branch("dev", 1, None).await.unwrap(); + write(dev_ds.uri().to_string(), 1000, 30, WriteMode::Append).await; + // Diverge main to the same version number with a different row count. + let mut main_ds = write(test_uri.clone(), 5000, 10, WriteMode::Append).await; // main v2: 60 rows + + // Cross-source create_branch: receiver is main, source is dev. 
+ let child_ds = main_ds + .create_branch("child", ("dev", 2), None) + .await + .unwrap(); + assert_eq!( + child_ds.count_rows(None).await.unwrap(), + 80, + "child must clone dev@2, not main@2" + ); + + // Cross-source shallow_clone: same rule. + let clone_uri = format!("{}_clone", test_uri); + let cloned_ds = main_ds + .shallow_clone(&clone_uri, ("dev", 2), None) + .await + .unwrap(); + assert_eq!( + cloned_ds.count_rows(None).await.unwrap(), + 80, + "shallow clone must read dev@2, not main@2" + ); +} + #[tokio::test] async fn test_branch() { let tempdir = TempDir::default(); @@ -797,6 +927,86 @@ async fn test_branch() { "branch1" ); + // Opening at a branch-pointing tag through the builder must check out the + // tag's branch chain, not main's chain at the tag's version number. + let tag_open = DatasetBuilder::from_uri(&test_uri) + .with_tag("tag1") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest.branch.as_deref(), Some("dev/branch2")); + assert_eq!(tag_open.version().version, 3); + assert_eq!(tag_open.count_rows(None).await.unwrap(), 100); + + // Malformed branch names are rejected at the boundary + for bad_name in ["", "branch1/"] { + let err = main_dataset + .checkout_version((Some(bad_name), None::)) + .await + .unwrap_err(); + assert!( + matches!(err, Error::InvalidRef { .. }), + "checkout of {:?} must be rejected as InvalidRef, got: {}", + bad_name, + err + ); + let err = DatasetBuilder::from_uri(&test_uri) + .with_branch(bad_name, None) + .load() + .await + .unwrap_err(); + assert!( + matches!(err, Error::InvalidRef { .. }), + "open of {:?} must be rejected as InvalidRef, got: {}", + bad_name, + err + ); + } + + // "main" stays a valid spelling of the main branch on checkout; the JNI + // bindings construct Ref::Version(Some("main"), _) directly. 
+ let main_by_name = checkout_branch1.checkout_branch("main").await.unwrap(); + assert_eq!(main_by_name.manifest.branch, None); + assert_eq!(main_by_name.version().version, 1); + let main_by_ref = checkout_branch1 + .checkout_version(crate::dataset::refs::Ref::Version( + Some("main".to_string()), + None, + )) + .await + .unwrap(); + assert_eq!(main_by_ref.manifest.branch, None); + + // A checkout whose resolved manifest is not on the requested branch must + // error loudly instead of handing back another branch's data: stage main's + // manifest under a branch path that was never created, so resolution finds + // a manifest belonging to main. + use object_store::ObjectStoreExt as _; + let staged_manifest = main_dataset.manifest_location().path.clone(); + let staged_copy = Path::parse(format!( + "{}/tree/ghost/_versions/{}", + test_uri, + staged_manifest.filename().unwrap() + )) + .unwrap(); + main_dataset + .object_store + .inner + .copy(&staged_manifest, &staged_copy) + .await + .unwrap(); + let err = main_dataset.checkout_branch("ghost").await.unwrap_err(); + assert!( + err.to_string().contains("resolved a manifest belonging to"), + "expected the branch-mismatch guardrail, got: {}", + err + ); + main_dataset + .object_store + .remove_dir_all(Path::parse(format!("{}/tree/ghost", test_uri)).unwrap()) + .await + .unwrap(); + let mut dataset = main_dataset; // Finally delete all branches assert!(matches!( diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 3f96b9964d5..4555cd7ee6c 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -17,11 +17,11 @@ use super::write::merge_insert::inserted_rows::KeyExistenceFilter; use crate::dataset::transaction::UpdateMode::{RewriteColumns, RewriteRows}; use crate::index::mem_wal::update_mem_wal_index_merged_generations; use crate::utils::temporal::timestamp_to_nanos; -use deepsize::DeepSizeOf; use lance_core::datatypes::{ 
LANCE_UNENFORCED_CLUSTERING_KEY_POSITION, LANCE_UNENFORCED_PRIMARY_KEY, LANCE_UNENFORCED_PRIMARY_KEY_POSITION, }; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, datatypes::Schema}; use lance_file::{datatypes::Fields, version::LanceFileVersion}; use lance_index::mem_wal::MergedGeneration; @@ -476,7 +476,7 @@ pub enum UpdateMode { pub struct UpdatedFragmentOffsets(pub HashMap); impl DeepSizeOf for UpdatedFragmentOffsets { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.iter().fold(0_usize, |acc, (frag_id, bitmap)| { acc + frag_id.deep_size_of_children(context) + (bitmap.len() as usize).saturating_mul(std::mem::size_of::()) @@ -1361,7 +1361,7 @@ pub struct RewrittenIndex { } impl DeepSizeOf for RewrittenIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.new_index_details .type_url .deep_size_of_children(context) diff --git a/rust/lance/src/dataset/updater.rs b/rust/lance/src/dataset/updater.rs index b9bc34f8706..90ef8df914b 100644 --- a/rust/lance/src/dataset/updater.rs +++ b/rust/lance/src/dataset/updater.rs @@ -6,13 +6,13 @@ use futures::StreamExt; use lance_core::datatypes::{OnMissing, OnTypeMismatch}; use lance_core::utils::deletion::DeletionVector; use lance_core::{Error, Result, datatypes::Schema}; -use lance_table::format::Fragment; +use lance_table::format::{DataFile, Fragment}; use lance_table::utils::stream::ReadBatchFutStream; use super::Dataset; use super::fragment::FragmentReader; use super::scanner::get_default_batch_size; -use super::write::{GenericWriter, open_writer}; +use super::write::{GenericWriter, cleanup_data_fragments, open_update_writer}; use crate::dataset::FileFragment; use crate::dataset::utils::SchemaAdapter; @@ -146,13 +146,7 @@ impl Updater { 
.data_storage_format .lance_file_version()?; - open_writer( - &self.fragment.dataset().object_store, - &schema, - &self.fragment.dataset().base, - data_storage_version, - ) - .await + open_update_writer(self.dataset(), &schema, data_storage_version).await } /// Update one batch. @@ -221,6 +215,34 @@ impl Updater { Ok(self.fragment.metadata().clone()) } + /// Clean up any data file and blob sidecars created by the current unfinished writer. + pub(super) async fn cleanup_unfinished_writer(&mut self) { + let Some(writer) = self.writer.take() else { + return; + }; + let (path, base_id) = writer.data_file_path(); + let path = path.to_string(); + drop(writer); + + if path.is_empty() { + return; + } + + let mut fragment = Fragment::new(self.fragment.id() as u64); + // cleanup_data_fragments only needs path/base_id to remove the unfinished + // data file and any blob sidecars. Build a minimal synthetic fragment so + // we can reuse the shared cleanup path without fabricating full metadata. + fragment + .files + .push(DataFile::new(path, vec![], vec![], 0, 0, None, base_id)); + cleanup_data_fragments( + &self.dataset().object_store, + &self.dataset().base, + &[fragment], + ) + .await; + } + /// Get the final schema of the fragment after the update. /// /// This may be None if the schema is not known. 
This can happen if it was diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 8be2753cb96..ff0a119158c 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -6,7 +6,10 @@ use chrono::TimeDelta; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use futures::{Stream, StreamExt, TryStreamExt}; -use lance_arrow::BLOB_META_KEY; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME, +}; use lance_core::datatypes::{ NullabilityComparison, OnMissing, OnTypeMismatch, SchemaCompareOptions, }; @@ -35,7 +38,9 @@ use tracing::{info, instrument}; use crate::Dataset; use crate::dataset::blob::{ - BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, preprocess_blob_batches, + BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, + blob_dedicated_threshold_from_metadata, blob_inline_threshold_from_metadata, + preprocess_blob_batches, }; use crate::session::Session; @@ -170,6 +175,77 @@ fn validate_external_blob_write_params(params: &WriteParams) -> Result<()> { Ok(()) } +fn validate_blob_threshold_metadata_for_append( + input_schema: &Schema, + dataset_schema: &Schema, +) -> Result<()> { + for input_field in &input_schema.fields { + let Some(dataset_field) = dataset_schema.field(&input_field.name) else { + continue; + }; + let input_is_blob_v2 = input_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + let dataset_is_blob_v2 = dataset_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + if !input_is_blob_v2 && !dataset_is_blob_v2 { + continue; + } + + let has_inline_threshold = input_field + .metadata + .contains_key(BLOB_INLINE_SIZE_THRESHOLD_META_KEY); + let has_dedicated_threshold = input_field + 
.metadata + .contains_key(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY); + if !has_inline_threshold && !has_dedicated_threshold { + continue; + } + + if has_inline_threshold { + let input_inline_threshold = + blob_inline_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_inline_threshold = + blob_inline_threshold_from_metadata(&dataset_field.metadata, &dataset_field.name)?; + if input_inline_threshold != dataset_inline_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + input_inline_threshold, + input_field.name, + dataset_inline_threshold, + ))); + } + } + if has_dedicated_threshold { + let input_dedicated_threshold = + blob_dedicated_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_dedicated_threshold = blob_dedicated_threshold_from_metadata( + &dataset_field.metadata, + &dataset_field.name, + )?; + if input_dedicated_threshold != dataset_dedicated_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. 
Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + input_dedicated_threshold, + input_field.name, + dataset_dedicated_threshold, + ))); + } + } + } + + Ok(()) +} + /// Auto cleanup parameters #[derive(Debug, Clone)] pub struct AutoCleanupParams { @@ -507,7 +583,7 @@ pub async fn do_write_fragments( }; let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 - && schema.fields.iter().any(|field| field.is_blob_v2()) + && schema.fields_pre_order().any(|field| field.is_blob_v2()) { Some(Arc::new( build_external_base_resolver(dataset, ¶ms).await?, @@ -953,6 +1029,7 @@ pub async fn write_fragments_internal( ..Default::default() }, )?; + validate_blob_threshold_metadata_for_append(&converted_schema, dataset.schema())?; let write_schema = dataset.schema().project_by_schema( &converted_schema, OnMissing::Error, @@ -984,7 +1061,8 @@ pub async fn write_fragments_internal( (converted_schema, params.storage_version_or_default()) }; - if storage_version < LanceFileVersion::V2_2 && schema.fields.iter().any(|f| f.is_blob_v2()) { + if storage_version < LanceFileVersion::V2_2 && schema.fields_pre_order().any(|f| f.is_blob_v2()) + { return Err(Error::invalid_input(format!( "Blob v2 requires file version >= 2.2 (got {:?})", storage_version @@ -992,13 +1070,10 @@ pub async fn write_fragments_internal( } if storage_version >= LanceFileVersion::V2_2 - && schema - .fields - .iter() - .any(|f| f.metadata.contains_key(BLOB_META_KEY)) + && let Some(blob_field_path) = legacy_blob_field_path(&schema) { return Err(Error::invalid_input(format!( - "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." + "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. 
Found legacy blob field: {blob_field_path}. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." ))); } @@ -1017,10 +1092,23 @@ pub async fn write_fragments_internal( Ok((fragments, schema)) } +fn legacy_blob_field_path(schema: &Schema) -> Option { + schema + .fields_pre_order() + .find(|field| field.metadata.contains_key(BLOB_META_KEY)) + .map(|field| { + schema + .field_path(field.id) + .unwrap_or_else(|_| field.name.clone()) + }) +} + #[async_trait::async_trait] pub trait GenericWriter: Send { /// Write the given batches to the file async fn write(&mut self, batches: &[RecordBatch]) -> Result<()>; + /// Get the file path and base ID for the data file being written. + fn data_file_path(&self) -> (&str, Option); /// Get the current position in the file /// /// We use this to know when the file is too large and we need to start @@ -1047,13 +1135,17 @@ where async fn write(&mut self, batches: &[RecordBatch]) -> Result<()> { self.writer.write(batches).await } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await? as u64) } async fn finish(&mut self) -> Result<(u32, DataFile)> { + let num_rows = self.writer.finish().await? as u32; let size_bytes = self.writer.tell().await?; Ok(( - self.writer.finish().await? as u32, + num_rows, DataFile::new_legacy( self.path.clone(), self.writer.schema(), @@ -1086,6 +1178,9 @@ impl GenericWriter for V2WriterAdapter { } Ok(()) } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await?) } @@ -1106,17 +1201,17 @@ impl GenericWriter for V2WriterAdapter { .map(|(_, column_index)| *column_index as i32) .collect::>(); let (major, minor) = self.writer.version().to_numbers(); - let num_rows = self.writer.finish().await? 
as u32; + let write_summary = self.writer.finish().await?; let data_file = DataFile::new( std::mem::take(&mut self.path), field_ids, column_indices, major, minor, - NonZero::new(self.writer.tell().await?), + NonZero::new(write_summary.size_bytes), self.base_id, ); - Ok((num_rows, data_file)) + Ok((write_summary.num_rows as u32, data_file)) } } @@ -1139,6 +1234,39 @@ pub async fn open_writer( .await } +pub(super) async fn open_update_writer( + dataset: &Dataset, + schema: &Schema, + storage_version: LanceFileVersion, +) -> Result> { + // add_columns / alter_columns reuse the normal writer stack, but they do not + // flow through WriteParams. Rebuild the external base resolver here so blob + // v2 reference columns can resolve dataset-registered external URIs. + let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 + && schema.fields_pre_order().any(|f| f.is_blob_v2()) + { + Some(Arc::new( + build_external_base_resolver(Some(dataset), &WriteParams::default()).await?, + )) + } else { + None + }; + + open_writer_with_options( + &dataset.object_store, + schema, + &dataset.base, + storage_version, + WriterOptions { + add_data_dir: true, + external_base_resolver, + source_store_registry: dataset.session.store_registry(), + ..Default::default() + }, + ) + .await +} + #[derive(Default)] struct WriterOptions { add_data_dir: bool, @@ -1215,7 +1343,7 @@ async fn open_writer_with_options( source_store_registry, source_store_params, blob_pack_file_size_threshold, - )) + )?) } else { None }; diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 2ab34441997..baad71b3e39 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -104,6 +104,10 @@ impl<'a> CommitBuilder<'a> { } /// Pass a commit handler to use for the dataset. + /// + /// Takes precedence over the destination dataset's own handler. 
If not + /// set, a `Dataset` destination commits through its own handler and a + /// `Uri` destination resolves one from the uri. pub fn with_commit_handler(mut self, commit_handler: Arc) -> Self { self.commit_handler = Some(commit_handler); self @@ -241,7 +245,9 @@ impl<'a> CommitBuilder<'a> { WriteDestination::Dataset(dataset) => ( dataset.object_store.clone(), dataset.base.clone(), - dataset.commit_handler.clone(), + self.commit_handler + .clone() + .unwrap_or_else(|| dataset.commit_handler.clone()), ), WriteDestination::Uri(uri) => { let commit_handler = if let (Some(_), Some(commit_handler)) = diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index 20209ed7f30..bfd702c9c3b 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -442,7 +442,7 @@ struct WriteContext<'a> { mod test { use std::collections::HashMap; - use arrow_array::{BinaryArray, Int32Array, RecordBatchReader, StructArray}; + use arrow_array::{ArrayRef, BinaryArray, Int32Array, RecordBatchReader, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema}; use lance_arrow::BLOB_META_KEY; @@ -559,6 +559,41 @@ mod test { } } + #[tokio::test] + async fn create_v2_2_dataset_rejects_nested_legacy_blob_schema() { + let image_field = Field::new("image_bytes", DataType::Binary, true).with_metadata( + HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]), + ); + let schema = Arc::new(Schema::new(vec![Field::new( + "summary_image_nested", + DataType::Struct(vec![image_field.clone()].into()), + true, + )])); + let image_values: ArrayRef = Arc::new(BinaryArray::from(vec![Some(b"abc".as_slice())])); + let nested_values = StructArray::from(vec![(Arc::new(image_field), image_values)]); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(nested_values)]).unwrap(); + + let dataset = InsertBuilder::new("memory://forced-nested-blob-v2") + .with_params(&WriteParams { + mode: WriteMode::Create, + 
data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await; + + let err = dataset.unwrap_err(); + match err { + Error::InvalidInput { source, .. } => { + let message = source.to_string(); + assert!(message.contains("Legacy blob columns")); + assert!(message.contains("summary_image_nested.image_bytes")); + assert!(message.contains("lance.blob.v2")); + } + other => panic!("unexpected error: {other:?}"), + } + } + mod external_error { use super::*; use std::fmt; diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index f6776852db3..b14421c963f 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -58,7 +58,9 @@ use crate::{ }, index::DatasetIndexInternalExt, io::exec::{ - AddRowAddrExec, Planner, TakeExec, project, scalar_index::MapIndexExec, utils::ReplayExec, + AddRowAddrExec, Planner, TakeExec, project, + scalar_index::{IndexLookup, MapIndexExec}, + utils::ReplayExec, }, }; use arrow_array::{ @@ -121,6 +123,7 @@ use lance_table::format::{Fragment, IndexMetadata, RowIdMeta}; use log::info; use roaring::RoaringTreemap; use snafu::ResultExt; +use std::collections::HashMap; use std::{ collections::{BTreeMap, HashSet}, sync::{ @@ -624,30 +627,67 @@ impl MergeInsertJob { .map(|_| SchemaComparison::Subschema) } - async fn join_key_as_scalar_index(&self) -> Result> { - if self.params.on.len() != 1 { - // joining on more than one column - Ok(None) - } else { - let col = &self.params.on[0]; - self.dataset + /// Collect every join column that has a scalar index supporting exact + /// equality. + /// + /// For a single-column join this matches the previous behavior. 
For a + /// multi-column (composite key) join, every indexed column contributes + /// an additional `IsIn` probe inside one [`MapIndexExec`]: their AND + /// yields the row addresses where every indexed column matches some + /// source value. The downstream hash join still filters by the full + /// composite key, so unindexed columns simply do not prune the + /// candidate set — they are checked by the post-filter. + /// + /// Returns an empty vec when no join column has a usable scalar index; + /// callers should then fall through to the full-scan path. + async fn indexed_join_keys(&self) -> Result> { + let mut indexed = Vec::with_capacity(self.params.on.len()); + for col in &self.params.on { + if let Some(idx) = self + .dataset .load_scalar_index( IndexCriteria::default() .for_column(col) // Unclear if this would work if the index does not support exact equality .supports_exact_equality(), ) - .await + .await? + { + indexed.push((col.clone(), idx)); + } } + Ok(indexed) + } + + /// Fragments that cannot be reached by every index in `indexed_keys` + /// and therefore must be scanned separately and unioned in alongside + /// the indexed take. A fragment is "reachable" by the composite probe + /// only if it is in the intersection of all the indices' fragment + /// bitmaps; everything else falls into the unindexed set. + async fn unindexed_fragments_for_keys( + &self, + indexed_keys: &[(String, IndexMetadata)], + ) -> Result> { + let mut unindexed: HashMap = HashMap::new(); + for (_, index) in indexed_keys { + for frag in self.dataset.unindexed_fragments(&index.name).await? 
{ + unindexed.entry(frag.id).or_insert(frag); + } + } + Ok(unindexed.into_values().collect()) } async fn create_indexed_scan_joined_stream( &self, source: SendableRecordBatchStream, - index: IndexMetadata, + indexed_keys: Vec<(String, IndexMetadata)>, ) -> Result { // This relies on a few non-standard physical operators and so we cannot use the // datafusion dataframe API and need to construct the plan manually :'( + debug_assert!( + !indexed_keys.is_empty(), + "create_indexed_scan_joined_stream requires at least one indexed key" + ); let schema = source.schema(); let add_row_addr = match self.check_compatible_schema(&schema)? { SchemaComparison::FullCompatible => false, @@ -662,22 +702,25 @@ impl MergeInsertJob { // the new data into memory. In the future, we can do better let shared_input = Arc::new(ReplayExec::new(Capacity::Unbounded, input)); - // 3 - Use the index to map input to row addresses - // First, we need to project to the key column - let field = schema.field_with_name(&self.params.on[0])?; - let index_mapper_input = Arc::new(project( - shared_input.clone(), - // schema for only the key join column - &Schema::new(vec![field.clone()]), - )?); - - // Then we pass the key column into the index mapper - let index_column = self.params.on[0].clone(); - let mut index_mapper: Arc = Arc::new(MapIndexExec::new( - // create index from original data and key column + // 3 - Probe every indexed join column. For composite keys this is + // the AND of one `IsIn` query per indexed column, which yields + // a tighter candidate set than probing a single column. The + // downstream hash join still filters by the full composite key, + // so unindexed `on` columns simply do not prune the candidates. 
+ let lookup_fields = indexed_keys + .iter() + .map(|(col, _)| Ok(schema.field_with_name(col)?.clone())) + .collect::>>()?; + let index_mapper_input = + Arc::new(project(shared_input.clone(), &Schema::new(lookup_fields))?); + + let lookups = indexed_keys + .iter() + .map(|(col, idx)| IndexLookup::new(col.clone(), idx.name.clone())) + .collect::>(); + let mut index_mapper: Arc = Arc::new(MapIndexExec::new_multi( self.dataset.clone(), - index_column.clone(), - index.name.clone(), + lookups, index_mapper_input, )); @@ -722,8 +765,12 @@ impl MergeInsertJob { .filter(|name| name.as_str() != ROW_ID && name.as_str() != ROW_ADDR) .collect::>(); - // 5a - We also need to scan any new unindexed data and union it in - let unindexed_fragments = self.dataset.unindexed_fragments(&index.name).await?; + // 5a - We also need to scan any new unindexed data and union it in. + // A row can be reached by the composite index probe only if it + // lives in a fragment covered by *every* chosen index, so the + // "unindexed" set is the union of fragments missing from any + // one of them. + let unindexed_fragments = self.unindexed_fragments_for_keys(&indexed_keys).await?; if !unindexed_fragments.is_empty() { let mut builder = self.dataset.scan(); if add_row_addr { @@ -748,22 +795,47 @@ impl MergeInsertJob { // field names (DF doesn't support this as of version 44) target = Self::prefix_columns_phys(target, "target_"); - // 6 - Finally, join the input (source table) with the taken data (target table) - let source_key = Column::new_with_schema(&index_column, shared_input.schema().as_ref())?; - let target_key = Column::new_with_schema( - &format!("target_{}", index_column), - target.schema().as_ref(), - )?; + // 6 - Join the source against the taken target rows on the full + // composite key. 
Probing the index produces a super-set of the + // actual matches (when not every key column has an index, or + // even when they do — the per-column `IsIn` lists do not + // correlate values across the tuple), so this join is what + // trims candidates down to the exact composite-key matches. + let on_keys = self + .params + .on + .iter() + .map(|col| { + let source_key = Column::new_with_schema(col, shared_input.schema().as_ref())?; + let target_key = + Column::new_with_schema(&format!("target_{}", col), target.schema().as_ref())?; + Ok::<_, Error>(( + Arc::new(source_key) as Arc, + Arc::new(target_key) as Arc, + )) + }) + .collect::>>()?; + + // Use standard SQL NULL semantics for composite keys so this path + // produces the same result as the full-scan path. The + // single-column case keeps its historical `NullEqualsNull` behavior + // to avoid changing semantics for existing callers. + let null_equality = if self.params.on.len() == 1 { + NullEquality::NullEqualsNull + } else { + NullEquality::NullEqualsNothing + }; + let joined = Arc::new( HashJoinExec::try_new( shared_input, target, - vec![(Arc::new(source_key), Arc::new(target_key))], + on_keys, None, &JoinType::Full, None, PartitionMode::CollectLeft, - NullEquality::NullEqualsNull, + null_equality, false, ) .unwrap(), @@ -882,8 +954,11 @@ impl MergeInsertJob { ) { // keeping unmatched rows, no deletion - if let Some(index) = self.join_key_as_scalar_index().await? 
{ - return self.create_indexed_scan_joined_stream(source, index).await; + let indexed_keys = self.indexed_join_keys().await?; + if !indexed_keys.is_empty() { + return self + .create_indexed_scan_joined_stream(source, indexed_keys) + .await; } } @@ -1618,7 +1693,7 @@ impl MergeInsertJob { self.params.delete_not_matched_by_source, WhenNotMatchedBySource::Keep ) { - self.join_key_as_scalar_index().await?.is_some() + !self.indexed_join_keys().await?.is_empty() } else { false }; @@ -2149,18 +2224,13 @@ impl Merger { &self.output_schema } - // Retrieves a bitmap of rows where at least one of the columns in the range - // col_offset..coll_offset+num_cols is not null. - // - fn not_all_null( - batch: &RecordBatch, - col_offset: usize, - num_cols: usize, - ) -> Result { + // Retrieves a bitmap of rows where at least one of the given columns is + // not null. + fn not_all_null(batch: &RecordBatch, cols: &[usize]) -> Result { // For our purposes we know there is always at least 1 on key - debug_assert_ne!(num_cols, 0); - let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(col_offset))?; - for idx in col_offset + 1..col_offset + num_cols { + debug_assert!(!cols.is_empty()); + let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(cols[0]))?; + for &idx in &cols[1..] { let is_valid = arrow::compute::is_not_null(batch.column(idx))?; at_least_one_valid = arrow::compute::or(&at_least_one_valid, &is_valid)?; } @@ -2188,8 +2258,37 @@ impl Merger { right_offset: usize, num_keys: usize, ) -> Result<(BooleanArray, BooleanArray, BooleanArray)> { - let in_left = Self::not_all_null(combined_batch, 0, num_keys)?; - let in_right = Self::not_all_null(combined_batch, right_offset, num_keys)?; + // The outer join distinguishes its three cases by which side's join + // keys were NULL-padded: a present row always has non-null keys, while + // the absent side is filled with NULLs. We therefore test the *key* + // columns, located by name. 
They are NOT necessarily the first + // `num_keys` columns — a partial-schema source can place a payload + // column (e.g. an all-null vector) at position 0, and checking + // positions [0, num_keys) there misreads an all-null leading payload + // column as an absent join side, silently dropping every matched row + // (https://github.com/lancedb/lancedb/issues/3515). The target half + // carries the same columns in the same order, offset by `right_offset`. + let source_key_cols = self + .params + .on + .iter() + .map(|key| { + combined_batch.schema().index_of(key).map_err(|_| { + Error::internal(format!( + "merge insert key column '{}' not found in joined batch", + key + )) + }) + }) + .collect::>>()?; + debug_assert_eq!(source_key_cols.len(), num_keys); + let target_key_cols = source_key_cols + .iter() + .map(|c| c + right_offset) + .collect::>(); + + let in_left = Self::not_all_null(combined_batch, &source_key_cols)?; + let in_right = Self::not_all_null(combined_batch, &target_key_cols)?; let in_both = arrow::compute::and(&in_left, &in_right)?; let left_only = arrow::compute::and(&in_left, &arrow::compute::not(&in_right)?)?; let right_only = arrow::compute::and(&arrow::compute::not(&in_left)?, &in_right)?; @@ -3442,6 +3541,116 @@ mod tests { } } + /// Reproduces https://github.com/lancedb/lancedb/issues/3515: + /// a partial-schema `merge_insert` with a scalar index on the join key, + /// where every fragment is covered by the index (no unindexed data), + /// silently updates 0 rows instead of the expected matches. 
+ #[rstest::rstest] + #[tokio::test] + async fn test_repro_3515_partial_schema_fully_indexed( + #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1, LanceFileVersion::V2_2)] + version: LanceFileVersion, + ) { + const N: usize = 1000; + const UPD: usize = 128; + let vec_field = Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4), + true, + ); + let full_schema = Arc::new(Schema::new(vec![ + vec_field.clone(), + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("file_size", DataType::Int64, true), + ])); + + // 1000 rows: vector all-null, path "/img/{i}.jpg", status "pending". + let paths = StringArray::from((0..N).map(|i| format!("/img/{i}.jpg")).collect::>()); + let statuses = StringArray::from(vec!["pending"; N]); + let file_sizes = Int64Array::from((0..N as i64).map(|i| 1000 + i).collect::>()); + let null_vectors = arrow_array::new_null_array(vec_field.data_type(), N); + let batch = RecordBatch::try_new( + full_schema.clone(), + vec![ + null_vectors, + Arc::new(paths), + Arc::new(statuses), + Arc::new(file_sizes), + ], + ) + .unwrap(); + + let mut ds = Dataset::write( + RecordBatchIterator::new([Ok(batch)], full_schema.clone()), + "memory://", + Some(WriteParams { + data_storage_version: Some(version), + ..Default::default() + }), + ) + .await + .unwrap(); + + // Scalar index on the merge key, covering every fragment. + ds.create_index( + &["path"], + IndexType::Scalar, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let ds = Arc::new(ds); + + // Partial-schema source (no `file_size`): update the first 128 rows. 
+ let upd_schema = Arc::new(Schema::new(vec![ + vec_field, + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + ])); + let upd_paths = StringArray::from( + (0..UPD) + .map(|i| format!("/img/{i}.jpg")) + .collect::>(), + ); + let upd_vectors = + FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.1f32; 4 * UPD]), 4) + .unwrap(); + let upd_statuses = StringArray::from(vec!["indexed"; UPD]); + let updates = RecordBatch::try_new( + upd_schema.clone(), + vec![ + Arc::new(upd_vectors), + Arc::new(upd_paths), + Arc::new(upd_statuses), + ], + ) + .unwrap(); + + let (ds, stats) = MergeInsertBuilder::try_new(ds.clone(), vec!["path".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::DoNothing) + .try_build() + .unwrap() + .execute_reader(RecordBatchIterator::new([Ok(updates)], upd_schema)) + .await + .unwrap(); + + assert_eq!( + stats.num_updated_rows, UPD as u64, + "expected {UPD} updated rows on {version:?}, got {}", + stats.num_updated_rows + ); + let n_indexed = ds + .count_rows(Some("status = 'indexed'".to_string())) + .await + .unwrap(); + assert_eq!(n_indexed, UPD, "expected {UPD} rows flipped to 'indexed'"); + } + #[tokio::test] async fn test_indexed_merge_insert() { let test_dir = TempStrDir::default(); @@ -3564,6 +3773,368 @@ mod tests { assert_eq!(ds.count_rows(None).await.unwrap(), 2048); } + /// Multi-column (composite key) merge_insert when one or more join + /// columns have a scalar index. Before this change the indexed path + /// was hard-gated to single-column joins; composite-key merges fell + /// through to a full target scan even when every key column was + /// indexed. Now each indexed column contributes an AND-ed `IsIn` + /// probe inside one `MapIndexExec`, and the downstream hash join trims + /// the candidates to the exact composite-key matches — rows that + /// happen to match one key column but not the other must NOT be + /// touched. 
+ #[rstest::rstest] + #[case::index_on_first(true, false)] + #[case::index_on_second(false, true)] + #[case::index_on_both(true, true)] + #[tokio::test] + async fn test_indexed_merge_insert_composite_key( + #[case] index_on_a: bool, + #[case] index_on_b: bool, + ) { + // Target rows: every (a, b) combination from {1,2} x {10,20}. + let initial = record_batch!( + ("a", Int32, [1, 1, 2, 2]), + ("b", Int32, [10, 20, 10, 20]), + ("value", Int32, [100, 200, 300, 400]) + ) + .unwrap(); + let schema = initial.schema(); + + let mut ds = Dataset::write( + RecordBatchIterator::new(vec![Ok(initial.clone())], schema.clone()), + "memory://", + None, + ) + .await + .unwrap(); + + let params = ScalarIndexParams::default(); + if index_on_a { + ds.create_index(&["a"], IndexType::Scalar, None, ¶ms, false) + .await + .unwrap(); + } + if index_on_b { + ds.create_index(&["b"], IndexType::Scalar, None, ¶ms, false) + .await + .unwrap(); + } + + // Update (1, 10) and insert (3, 30). A naive single-column probe + // on `a` would also pull (1, 20) into the candidate set; the + // composite hash join must keep (1, 20) untouched. 
+ let source = record_batch!( + ("a", Int32, [1, 3]), + ("b", Int32, [10, 30]), + ("value", Int32, [999, 333]) + ) + .unwrap(); + + let (updated_ds, stats) = + MergeInsertBuilder::try_new(Arc::new(ds), vec!["a".to_string(), "b".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::InsertAll) + .try_build() + .unwrap() + .execute_reader(Box::new(RecordBatchIterator::new( + vec![Ok(source.clone())], + source.schema(), + ))) + .await + .unwrap(); + + assert_eq!(stats.num_updated_rows, 1); + assert_eq!(stats.num_inserted_rows, 1); + assert_eq!(updated_ds.count_rows(None).await.unwrap(), 5); + + let untouched = updated_ds + .count_rows(Some("a = 1 AND b = 20 AND value = 200".to_string())) + .await + .unwrap(); + assert_eq!( + untouched, 1, + "(1, 20) must not be clobbered by an `a`-only probe" + ); + + let updated = updated_ds + .count_rows(Some("a = 1 AND b = 10 AND value = 999".to_string())) + .await + .unwrap(); + assert_eq!(updated, 1); + + let inserted = updated_ds + .count_rows(Some("a = 3 AND b = 30 AND value = 333".to_string())) + .await + .unwrap(); + assert_eq!(inserted, 1); + } + + /// Composite key merge_insert with no scalar index on any join column + /// must keep working via the full-scan fallback. Guards against the + /// indexed path becoming a hard requirement after this optimization. 
+ #[tokio::test] + async fn test_indexed_merge_insert_composite_key_no_index() { + let initial = record_batch!( + ("a", Int32, [1, 1, 2]), + ("b", Int32, [10, 20, 10]), + ("value", Int32, [100, 200, 300]) + ) + .unwrap(); + let schema = initial.schema(); + + let ds = Dataset::write( + RecordBatchIterator::new(vec![Ok(initial.clone())], schema.clone()), + "memory://", + None, + ) + .await + .unwrap(); + + let source = record_batch!( + ("a", Int32, [1]), + ("b", Int32, [20]), + ("value", Int32, [999]) + ) + .unwrap(); + + let (updated_ds, stats) = + MergeInsertBuilder::try_new(Arc::new(ds), vec!["a".to_string(), "b".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::InsertAll) + .try_build() + .unwrap() + .execute_reader(Box::new(RecordBatchIterator::new( + vec![Ok(source.clone())], + source.schema(), + ))) + .await + .unwrap(); + + assert_eq!(stats.num_updated_rows, 1); + assert_eq!(stats.num_inserted_rows, 0); + let count = updated_ds + .count_rows(Some("a = 1 AND b = 20 AND value = 999".to_string())) + .await + .unwrap(); + assert_eq!(count, 1); + } + + /// Composite-key merge_insert must use standard SQL NULL semantics + /// (NULL != NULL) on the post-filter hash join so its behavior is + /// identical to the full-scan path; otherwise enabling the indexed + /// path for multi-column joins would silently change semantics. 
+ #[tokio::test] + async fn test_indexed_merge_insert_composite_key_null_semantics() { + let initial = record_batch!( + ("a", Int32, [Some(1)]), + ("b", Utf8, [Option::<&str>::None]), + ("value", Int32, [Some(10)]) + ) + .unwrap(); + let schema = initial.schema(); + + let mut ds = Dataset::write( + RecordBatchIterator::new(vec![Ok(initial.clone())], schema.clone()), + "memory://", + None, + ) + .await + .unwrap(); + + ds.create_index( + &["a"], + IndexType::Scalar, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + let source = record_batch!( + ("a", Int32, [Some(1)]), + ("b", Utf8, [Option::<&str>::None]), + ("value", Int32, [Some(99)]) + ) + .unwrap(); + + let (updated_ds, stats) = + MergeInsertBuilder::try_new(Arc::new(ds), vec!["a".to_string(), "b".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::InsertAll) + .try_build() + .unwrap() + .execute_reader(Box::new(RecordBatchIterator::new( + vec![Ok(source.clone())], + source.schema(), + ))) + .await + .unwrap(); + + assert_eq!(stats.num_inserted_rows, 1); + assert_eq!(stats.num_updated_rows, 0); + assert_eq!(updated_ds.count_rows(None).await.unwrap(), 2); + } + + /// Composite-key merge_insert where new (unindexed) fragments are + /// appended after the indices were built. The indexed take only sees + /// fragments covered by every chosen index, so the unindexed remainder + /// must be unioned in via a full scan — otherwise updates to rows + /// that live in those fragments are silently dropped. 
+ #[tokio::test] + async fn test_indexed_merge_insert_composite_key_unindexed_fragments() { + let first = record_batch!( + ("a", Int32, [1, 2]), + ("b", Int32, [10, 20]), + ("value", Int32, [100, 200]) + ) + .unwrap(); + let schema = first.schema(); + + let mut ds = Dataset::write( + RecordBatchIterator::new(vec![Ok(first.clone())], schema.clone()), + "memory://", + Some(WriteParams { + max_rows_per_file: 64, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::default(); + ds.create_index(&["a"], IndexType::Scalar, None, ¶ms, false) + .await + .unwrap(); + ds.create_index(&["b"], IndexType::Scalar, None, ¶ms, false) + .await + .unwrap(); + + // Append a fragment AFTER both indices are built. The new (3, 30) + // row lives in a fragment neither index covers, so the indexed + // take alone would miss it. + let appended = record_batch!( + ("a", Int32, [3]), + ("b", Int32, [30]), + ("value", Int32, [300]) + ) + .unwrap(); + ds.append( + RecordBatchIterator::new(vec![Ok(appended.clone())], appended.schema()), + None, + ) + .await + .unwrap(); + + // Source updates one row in the indexed fragment AND one row in + // the appended (unindexed) fragment. 
+ let source = record_batch!( + ("a", Int32, [1, 3]), + ("b", Int32, [10, 30]), + ("value", Int32, [999, 333]) + ) + .unwrap(); + + let (updated_ds, stats) = + MergeInsertBuilder::try_new(Arc::new(ds), vec!["a".to_string(), "b".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::InsertAll) + .try_build() + .unwrap() + .execute_reader(Box::new(RecordBatchIterator::new( + vec![Ok(source.clone())], + source.schema(), + ))) + .await + .unwrap(); + + assert_eq!( + stats.num_updated_rows, 2, + "row in the unindexed fragment must also be updated" + ); + assert_eq!(stats.num_inserted_rows, 0); + assert_eq!(updated_ds.count_rows(None).await.unwrap(), 3); + } + + /// Composite-key `MapIndexExec` formats its Display so plans expose + /// every probed column, and `with_new_children` round-trips the full + /// lookup list rather than collapsing back to a single-column form. + /// Lives here (not in scalar_index.rs's tests) so the new lines don't + /// pile up in a file with pending upstream conflicts. 
+ #[test] + fn map_index_exec_multi_lookup_plan_shape() { + use crate::io::exec::scalar_index::{IndexLookup, MapIndexExec, ScalarIndexExec}; + use crate::utils::test::NoContextTestFixture; + use datafusion::physical_plan::{ExecutionPlan, displayable}; + use datafusion::scalar::ScalarValue; + use lance_index::scalar::{ + SargableQuery, + expression::{ScalarIndexExpr, ScalarIndexSearch}, + }; + use lance_select::result::IndexExprResultWireFormat; + + let fixture = NoContextTestFixture::new(); + let dataset = Arc::new(fixture.dataset); + + let dummy_input: Arc<dyn ExecutionPlan> = Arc::new(ScalarIndexExec::new( + dataset.clone(), + ScalarIndexExpr::Query(ScalarIndexSearch { + column: "ordered".to_string(), + index_name: "ordered_idx".to_string(), + index_type: "BTree".to_string(), + query: Arc::new(SargableQuery::Equals(ScalarValue::UInt64(Some(1)))), + needs_recheck: false, + fragment_bitmap: None, + }), + IndexExprResultWireFormat::default(), + )); + + let lookups = vec![ + IndexLookup::new("a", "a_idx"), + IndexLookup::new("b", "b_idx"), + ]; + let plan: Arc<dyn ExecutionPlan> = Arc::new(MapIndexExec::new_multi( + dataset.clone(), + lookups, + dummy_input.clone(), + )); + + let rendered = format!("{}", displayable(plan.as_ref()).indent(false)); + assert!( + rendered.contains("IndexedLookup [a, b]"), + "multi-lookup Display must list every probed column, got: {rendered}", + ); + + let rebuilt = plan + .with_new_children(vec![dummy_input.clone()]) + .expect("with_new_children must accept exactly one child"); + let rebuilt_rendered = format!("{}", displayable(rebuilt.as_ref()).indent(false)); + assert!( + rebuilt_rendered.contains("IndexedLookup [a, b]"), + "with_new_children must preserve every lookup, got: {rebuilt_rendered}", + ); + + // The single-lookup convenience constructor still renders without + // the column list, so existing EXPLAIN output is unchanged for + // single-column joins.
+ let single: Arc<dyn ExecutionPlan> = Arc::new(MapIndexExec::new( + dataset, + "ordered".to_string(), + "ordered_idx".to_string(), + dummy_input, + )); + let single_rendered = format!("{}", displayable(single.as_ref()).indent(false)); + assert!( + single_rendered.contains("IndexedLookup") + && !single_rendered.contains("IndexedLookup ["), + "single-lookup Display must not include the column list, got: {single_rendered}", + ); + } + mod subcols { use super::*; use rstest::rstest; diff --git a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs index 89ec893705e..805073e75e2 100644 --- a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs +++ b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs @@ -13,9 +13,9 @@ use arrow_array::{ StringArray, StructArray, }; use arrow_schema::DataType; -use deepsize::DeepSizeOf; use lance_core::Result; -use lance_index::scalar::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; +use lance_core::deepsize::DeepSizeOf; +use lance_core::utils::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use lance_table::format::pb; // Default bloom filter config: 8192 items @ 0.00057 fpp -> 16KiB filter diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs index 6672c58db1f..f8e71b834a7 100644 --- a/rust/lance/src/dataset/write/update.rs +++ b/rust/lance/src/dataset/write/update.rs @@ -34,6 +34,16 @@ use lance_table::format::{Fragment, RowIdMeta}; use roaring::RoaringTreemap; use snafu::ResultExt; +/// Collect a field id and all of its descendant field ids (pre-order). A struct +/// column update rewrites the whole subtree, so an index on any descendant must be +/// treated as modified. +fn collect_subtree_field_ids(field: &lance_core::datatypes::Field, out: &mut Vec<u32>) { + out.push(field.id as u32); + for child in &field.children { + collect_subtree_field_ids(child, out); + } +} + /// Build an update operation.
/// /// This operation is similar to SQL's UPDATE statement. It allows you to change @@ -386,10 +396,14 @@ impl UpdateJob { dataset: Arc, update_data: UpdateData, ) -> Result { + // Updated columns are top-level (nested references are rejected by `set`), but a + // struct-column update rewrites all of its descendants. Collect the full field + // subtree so an index on a nested child field is recognized as modified and not + // wrongly extended over the rewritten fragment. let mut fields_for_preserving_frag_bitmap = Vec::new(); for column_name in self.updates.keys() { - if let Ok(field_id) = dataset.schema().field_id(column_name) { - fields_for_preserving_frag_bitmap.push(field_id as u32); + if let Some(field) = dataset.schema().field(column_name) { + collect_subtree_field_ids(field, &mut fields_for_preserving_frag_bitmap); } } diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index eaa3dc6119d..45aa96a3fd0 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -29,9 +29,7 @@ use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndex}; use lance_index::optimize::OptimizeOptions; use lance_index::pb::index::Implementation; pub use lance_index::progress::{IndexBuildProgress, NoopIndexBuildProgress}; -use lance_index::scalar::expression::{ - IndexInformationProvider, MultiQueryParser, ScalarQueryParser, -}; +use lance_index::scalar::expression::{IndexInformationProvider, MultiQueryParser}; use lance_index::scalar::inverted::{InvertedIndex, InvertedIndexPlugin}; use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::registry::{TrainingCriteria, TrainingOrdering}; @@ -47,6 +45,8 @@ use lance_index::{INDEX_FILE_NAME, Index, IndexType, PrewarmOptions, pb, vector: use lance_index::{ IndexCriteria, is_system_index, metrics::{MetricsCollector, NoOpMetricsCollector}, + registry::display_type_from_url, + scalar::btree::BTREE_LOOKUP_NAME, }; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use 
lance_io::traits::Reader; @@ -55,7 +55,7 @@ use lance_io::utils::{ read_version, }; use lance_table::format::{Fragment, SelfDescribingFileReader}; -use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata, list_index_files_with_sizes}; use lance_table::io::manifest::read_manifest_indexes; use roaring::RoaringBitmap; use scalar::index_matches_criteria; @@ -164,7 +164,8 @@ pub(crate) async fn build_index_metadata_from_segments( let mut new_indices = Vec::with_capacity(segments.len()); for segment in segments { let (uuid, fragment_bitmap, index_details, index_version) = segment.into_parts(); - if index_details.type_url.ends_with("InvertedIndexDetails") { + let is_inverted_index = index_details.type_url.ends_with("InvertedIndexDetails"); + if is_inverted_index { let metadata = IndexMetadata { uuid, name: index_name.to_string(), @@ -181,7 +182,10 @@ pub(crate) async fn build_index_metadata_from_segments( .await?; } let index_dir = dataset.indices_dir().clone().join(uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + let mut files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + if is_inverted_index { + retain_committed_inverted_files(&mut files); + } new_indices.push(IndexMetadata { uuid, name: index_name.to_string(), @@ -199,6 +203,10 @@ pub(crate) async fn build_index_metadata_from_segments( Ok(new_indices) } +fn retain_committed_inverted_files(files: &mut Vec<IndexFile>) { + files.retain(|file| !file.path.starts_with("staging/")); +} + fn validate_segment_index_details(index_name: &str, segments: &[IndexMetadata]) -> Result<()> { let mut type_url = None::<&str>; for segment in segments { @@ -250,15 +258,49 @@ fn segment_has_inverted_details(segment: &IndexMetadata) -> bool { .is_some_and(|details| details.type_url.ends_with("InvertedIndexDetails")) } +fn segment_has_bitmap_details(segment: &IndexMetadata) -> bool { + segment +
.index_details + .as_ref() + .is_some_and(|details| details.type_url.ends_with("BitmapIndexDetails")) +} + +/// Detect BTree segments, preserving a legacy pre-details fallback. +fn segment_has_btree_details(segment: &IndexMetadata) -> bool { + segment.index_details.as_ref().map_or_else( + || { + segment + .files + .as_ref() + .is_some_and(|files| files.iter().any(|file| file.path == BTREE_LOOKUP_NAME)) + }, + |details| details.type_url.ends_with("BTreeIndexDetails"), + ) +} + +fn segment_has_zonemap_details(segment: &IndexMetadata) -> bool { + segment + .index_details + .as_ref() + .is_some_and(|details| details.type_url.ends_with("ZoneMapIndexDetails")) +} + +fn segment_has_fmindex_details(segment: &IndexMetadata) -> bool { + segment + .index_details + .as_ref() + .is_some_and(|details| details.type_url.ends_with("FMIndexIndexDetails")) +} + // Cache keys for different index types #[derive(Debug, Clone)] pub(crate) struct LegacyVectorIndexCacheKey<'a> { - uuid: &'a str, + uuid: &'a Uuid, fri_uuid: Option<&'a Uuid>, } impl<'a> LegacyVectorIndexCacheKey<'a> { - fn new(uuid: &'a str, fri_uuid: Option<&'a Uuid>) -> Self { + fn new(uuid: &'a Uuid, fri_uuid: Option<&'a Uuid>) -> Self { Self { uuid, fri_uuid } } } @@ -270,7 +312,7 @@ impl CacheKey for LegacyVectorIndexCacheKey<'_> { if let Some(fri_uuid) = self.fri_uuid { format!("{}-{}", self.uuid, fri_uuid).into() } else { - self.uuid.into() + self.uuid.to_string().into() } } @@ -286,12 +328,12 @@ impl CacheKey for LegacyVectorIndexCacheKey<'_> { /// Legacy indices use `LegacyVectorIndexCacheKey` instead (in-memory only). 
#[derive(Debug, Clone)] pub(crate) struct IvfIndexStateCacheKey<'a> { - uuid: &'a str, + uuid: &'a Uuid, fri_uuid: Option<&'a Uuid>, } impl<'a> IvfIndexStateCacheKey<'a> { - fn new(uuid: &'a str, fri_uuid: Option<&'a Uuid>) -> Self { + fn new(uuid: &'a Uuid, fri_uuid: Option<&'a Uuid>) -> Self { Self { uuid, fri_uuid } } } @@ -307,7 +349,7 @@ impl CacheKey for IvfIndexStateCacheKey<'_> { if let Some(fri_uuid) = self.fri_uuid { format!("{}-{}", self.uuid, fri_uuid).into() } else { - self.uuid.into() + self.uuid.to_string().into() } } @@ -318,17 +360,17 @@ impl CacheKey for IvfIndexStateCacheKey<'_> { /// Wrapper that stores a live VectorIndex in the cache. /// Used for v0.1/v0.2 indices that don't support serializable caching. -#[derive(Debug, deepsize::DeepSizeOf)] +#[derive(Debug, lance_core::deepsize::DeepSizeOf)] pub(crate) struct CachedLegacyVectorIndex(Arc); #[derive(Debug, Clone)] pub struct FragReuseIndexCacheKey<'a> { - pub uuid: &'a str, + pub uuid: &'a Uuid, pub fri_uuid: Option<&'a Uuid>, } impl<'a> FragReuseIndexCacheKey<'a> { - pub fn new(uuid: &'a str, fri_uuid: Option<&'a Uuid>) -> Self { + pub fn new(uuid: &'a Uuid, fri_uuid: Option<&'a Uuid>) -> Self { Self { uuid, fri_uuid } } } @@ -340,7 +382,7 @@ impl CacheKey for FragReuseIndexCacheKey<'_> { if let Some(fri_uuid) = self.fri_uuid { format!("{}-{}", self.uuid, fri_uuid).into() } else { - self.uuid.into() + self.uuid.to_string().into() } } @@ -416,6 +458,7 @@ fn legacy_type_name(index_uri: &str, index_type_hint: Option<&str>) -> String { "BloomFilter" => IndexType::BloomFilter.to_string(), "RTree" => IndexType::RTree.to_string(), "Inverted" => IndexType::Inverted.to_string(), + "FMIndex" => IndexType::Fm.to_string(), "Json" => IndexType::Scalar.to_string(), "Flat" | "Vector" => IndexType::Vector.to_string(), other if other.contains("Vector") => IndexType::Vector.to_string(), @@ -474,7 +517,7 @@ pub(crate) async fn remap_index( let new_id = Uuid::new_v4(); let generic = match dataset - 
.open_generic_index(&field_path, &index_id.to_string(), &NoOpMetricsCollector) + .open_generic_index(&field_path, index_id, &NoOpMetricsCollector) .await { Ok(g) => g, @@ -492,10 +535,10 @@ pub(crate) async fn remap_index( let created_index = match generic.index_type() { it if it.is_scalar() => { - let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_id.to_string())?; + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_id)?; let scalar_index = dataset - .open_scalar_index(&field_path, &index_id.to_string(), &NoOpMetricsCollector) + .open_scalar_index(&field_path, index_id, &NoOpMetricsCollector) .await?; if !scalar_index.can_remap() { return Ok(RemapResult::Drop); @@ -545,7 +588,7 @@ pub(crate) async fn remap_index( matched.index_version, matched.name )) })?; - remap_vector_index( + let files = remap_vector_index( Arc::new(dataset.clone()), &field_path, index_id, @@ -555,17 +598,13 @@ pub(crate) async fn remap_index( ) .await?; - // Capture file sizes for the vector index - let index_dir = dataset.indices_dir().join(new_id.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; - CreatedIndex { index_details: prost_types::Any::from_msg( &lance_index::pb::VectorIndexDetails::default(), ) .unwrap(), index_version, - files: Some(files), + files, } } _ => { @@ -581,20 +620,54 @@ pub(crate) async fn remap_index( new_id, index_details: created_index.index_details, index_version: created_index.index_version, - files: created_index.files, + files: Some(created_index.files), })) } +/// Snapshot of every scalar index on a dataset, captured at planning time +/// and consumed by the scalar/aggregate pushdown machinery. +/// +/// Built once per planner invocation by walking the manifest's `IndexMetadata` +/// entries; thereafter all lookups are synchronous, so optimizer rules and the +/// filter parser can interrogate it without needing an async context. 
#[derive(Debug)] pub struct ScalarIndexInfo { + /// Per-column dispatch table for [`apply_scalar_indices`]: keyed by the + /// full dotted field path (e.g. `"x"`, `"metadata.status.code"`), the same + /// string callers use when referring to columns in filter expressions. + /// + /// The value pairs the column's data type with a [`MultiQueryParser`] + /// that fans out to every per-index parser registered for that column. + /// When a column carries more than one index (e.g. BTree + bitmap), the + /// `MultiQueryParser` tries each in order and the first match wins; the + /// resulting [`crate::scalar::expression::ScalarIndexSearch`] records + /// which specific index was chosen. So *which* index served the query is + /// an output of parsing, not an input — that's why this map is keyed only + /// by column. + /// + /// `fragment_bitmaps`, by contrast, *is* keyed by `(column, index_name)`, + /// because by the time the optimizer needs the bitmap the index name is + /// already pinned in the parsed leaf. indexed_columns: HashMap)>, + /// `(column, index_name) → fragment_bitmap` taken straight off each + /// [`IndexMetadata`] at construction time. Used by the optimizer rule for + /// aggregate pushdown to reason about index coverage synchronously. + /// Indices that omit `fragment_bitmap` (legacy or unsupported) simply + /// don't appear here and so report coverage as unknown. 
+ fragment_bitmaps: HashMap<(String, String), RoaringBitmap>, } impl IndexInformationProvider for ScalarIndexInfo { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) - .map(|(ty, parser)| (ty, parser.as_ref() as &dyn ScalarQueryParser)) + .map(|(ty, parser)| (ty, parser.as_ref())) + } + + fn fragment_bitmap(&self, column: &str, index_name: &str) -> Option { + self.fragment_bitmaps + .get(&(column.to_string(), index_name.to_string())) + .cloned() } } @@ -617,7 +690,10 @@ struct IndexDescriptionImpl { field_ids: Vec, segments: Vec, index_type: String, - details: IndexDetails, + /// Index details, or `None` for indices created before details were + /// persisted in the manifest. Such indices are still described on a + /// best-effort basis rather than rejected. + details: Option, rows_indexed: u64, } @@ -645,57 +721,52 @@ impl IndexDescriptionImpl { } let field_ids_vec: Vec = field_ids.iter().map(|id| *id as u32).collect(); - // This should not fail as we have already filtered out indexes without index details. - let index_details = example_metadata.index_details.as_ref().ok_or_else(|| { - let fields = field_ids - .iter() - .map(|id| { - dataset - .schema() - .field_by_id(*id) - .map(|f| format!("{}({})", f.name, id)) - .unwrap_or_else(|| format!("({})", id)) - }) - .collect::>() - .join(", "); - - Error::index(format!( - "Index details are required for index description. This index must be retrained to support this method. 
(index_name={}, uuid={}, fields=[{}])", - name, - example_metadata.uuid, - fields - )) - })?; - let type_url = &index_details.type_url; - if !segments.iter().all(|shard| { - shard - .index_details - .as_ref() - .map(|d| d.type_url == *type_url) - .unwrap_or(false) - }) { - return Err(Error::index( - "Index type URL should be present and identical across all segments".to_string(), - )); + // Index details may be absent on indices created before details were + // persisted in the manifest. We describe such indices on a best-effort + // basis rather than erroring, so callers can still see they exist. + let details = example_metadata.index_details.clone().map(IndexDetails); + if let Some(details) = details.as_ref() { + let type_url = &details.0.type_url; + if !segments.iter().all(|shard| { + shard + .index_details + .as_ref() + .map(|d| d.type_url == *type_url) + .unwrap_or(false) + }) { + return Err(Error::index( + "Index type URL should be present and identical across all segments" + .to_string(), + )); + } } - let details = IndexDetails(index_details.clone()); - - let index_type = if details.is_vector() { - derive_vector_index_type(index_details) - } else if let Some(system_type) = lance_index::infer_system_index_type(example_metadata) { - // System indices (frag-reuse, mem-wal) are identified by name, not - // by a plugin entry, so the plugin lookup below would return - // "Unknown" otherwise. - system_type.to_string() - } else { - // We attempted to infer the index type when we loaded the indices, - // so if we hit this branch the index type is truly unknown. - details - .get_plugin() - .map(|p| p.name().to_string()) - .unwrap_or_else(|_| "Unknown".to_string()) - }; + let index_type = + if let Some(system_type) = lance_index::infer_system_index_type(example_metadata) { + // System indices (frag-reuse, mem-wal) are identified by name, not + // by index details, so this must be checked before the plugin lookup. 
+ system_type.to_string() + } else if let Some(details) = details.as_ref() { + if details.is_vector() { + derive_vector_index_type(&details.0) + } else { + // Fall back to a name derived from the type URL when no plugin + // is registered, so a known type URL is never reported as the + // opaque "Unknown". + details + .get_plugin() + .map(|p| p.name().to_string()) + .unwrap_or_else(|_| { + display_type_from_url(details.0.type_url.as_str()).to_string() + }) + } + } else if segment_has_vector_details(example_metadata) { + // Legacy vector indices predate VectorIndexDetails and are + // recognized by their monolithic index file name. + "Vector".to_string() + } else { + "Unknown".to_string() + }; let mut fragment_rows = HashMap::with_capacity(dataset.manifest.fragments.len()); for fragment in dataset.iter_fragments() { @@ -772,7 +843,10 @@ impl IndexDescription for IndexDescriptionImpl { } fn type_url(&self) -> &str { - self.details.0.type_url.as_str() + self.details + .as_ref() + .map(|d| d.0.type_url.as_str()) + .unwrap_or("") } fn rows_indexed(&self) -> u64 { @@ -780,13 +854,14 @@ impl IndexDescription for IndexDescriptionImpl { } fn details(&self) -> Result { - if self.details.is_vector() { - vector_details_as_json(&self.details.0) + let Some(details) = self.details.as_ref() else { + return Ok("{}".to_string()); + }; + if details.is_vector() { + vector_details_as_json(&details.0) } else { - let plugin = self.details.get_plugin()?; - plugin - .details_as_json(&self.details.0) - .map(|v| v.to_string()) + let plugin = details.get_plugin()?; + plugin.details_as_json(&details.0).map(|v| v.to_string()) } } @@ -803,6 +878,155 @@ impl IndexDescription for IndexDescriptionImpl { } } +impl Dataset { + /// Build, without committing, the transaction that publishes one or more + /// existing physical index segments as a logical index. 
+ /// + /// This stages the same manifest update as + /// [`commit_existing_index_segments`](DatasetIndexExt::commit_existing_index_segments) + /// but does not advance the dataset version. Use + /// [`CommitBuilder`](crate::dataset::CommitBuilder) to commit the returned + /// [`Transaction`]. + /// + /// The transaction is a snapshot built against the current dataset version, + /// so commit it promptly. A concurrent index creation with the same name is + /// rejected at commit time with a retryable conflict, but other concurrent + /// changes to the same index between staging and commit — a compaction/rewrite + /// that remaps it, or dropping/renaming the indexed column — are not + /// conflict-checked and may leave a duplicate or stale index entry. + /// + /// # Side effects + /// + /// For most index types this only reads the segment directories. For inverted + /// (full-text) segments it also finalizes the segment's on-disk files within + /// the segment's UUID directory before returning. Finalization is idempotent, + /// and any files left behind if the returned transaction is never committed + /// are reclaimed by `cleanup_old_versions` like other unreferenced index + /// files. 
+ /// + /// # Example + /// + /// ``` + /// # use std::sync::Arc; + /// # use lance::Result; + /// # use lance::dataset::{CommitBuilder, Dataset}; + /// # use lance::index::IndexSegment; + /// # async fn example(dataset: Arc, segments: Vec) -> Result<()> { + /// let transaction = dataset + /// .build_existing_index_segments_transaction("vector_idx", "vector", segments) + /// .await?; + /// CommitBuilder::new(dataset).execute(transaction).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn build_existing_index_segments_transaction( + &self, + index_name: &str, + column: &str, + segments: Vec, + ) -> Result { + let Some(field) = self.schema().field(column) else { + return Err(Error::index(format!( + "CreateIndex: column '{column}' does not exist" + ))); + }; + + let segments = segments + .into_iter() + .map(IntoIndexSegment::into_index_segment) + .collect::>>()?; + let new_indices = + build_index_metadata_from_segments(self, index_name, field.id, segments).await?; + validate_segment_metadata(index_name, &new_indices)?; + validate_segment_index_details(index_name, &new_indices)?; + + let incoming_type_url = new_indices[0] + .index_details + .as_ref() + .map(|details| details.type_url.clone()); + let dataset_fragments = self.fragment_bitmap.as_ref().clone(); + let mut incoming_fragments = RoaringBitmap::new(); + for segment in &new_indices { + if segment.fields != [field.id] { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} was built for fields {:?}, expected [{}]", + segment.uuid, segment.fields, field.id + ))); + } + if let Some(fragment_bitmap) = &segment.fragment_bitmap { + incoming_fragments |= fragment_bitmap.clone(); + } + } + + let existing_named_indices = self.load_indices_by_name(index_name).await?; + if existing_named_indices + .iter() + .any(|idx| idx.fields != [field.id]) + { + return Err(Error::index(format!( + "Index name '{index_name}' already exists with different fields, \ + please specify a different name" + ))); + } + let 
removed_indices = existing_named_indices + .into_iter() + .filter(|idx| { + idx.index_details + .as_ref() + .zip(incoming_type_url.as_deref()) + .is_none_or(|(details, expected)| details.type_url == expected) + }) + .map(|idx| -> Result> { + let Some(existing_fragments) = idx.effective_fragment_bitmap(&dataset_fragments) + else { + if incoming_fragments != dataset_fragments { + return Err(Error::invalid_input(format!( + "CreateIndex: cannot replace legacy index segment {} for '{}' with partial fragment coverage; rebuild all fragments in one commit", + idx.uuid, index_name + ))); + } + return Ok(Some(idx)); + }; + + // A zero-fragment segment can be used to create an index while + // deferring the actual build. Such a segment is disjoint from every + // other segment but should still be removed. + if existing_fragments.is_empty() { + return Ok(Some(idx)); + } + + if existing_fragments.is_disjoint(&incoming_fragments) { + return Ok(None); + } + + let uncovered = existing_fragments - &incoming_fragments; + if !uncovered.is_empty() { + return Err(Error::invalid_input(format!( + "CreateIndex: incoming segments for '{}' would orphan fragments {:?} from existing segment {}", + index_name, + uncovered.iter().collect::>(), + idx.uuid + ))); + } + + Ok(Some(idx)) + }) + .collect::>>()? 
+ .into_iter() + .flatten() + .collect::>(); + + Ok(Transaction::new( + self.manifest.version, + Operation::CreateIndex { + new_indices, + removed_indices, + }, + None, + )) + } +} + #[async_trait] impl DatasetIndexExt for Dataset { type IndexBuilder<'a> = CreateIndexBuilder<'a>; @@ -900,7 +1124,7 @@ impl DatasetIndexExt for Dataset { for index_meta in indices { let index = self - .open_generic_index(name, &index_meta.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index(name, &index_meta.uuid, &NoOpMetricsCollector) .await?; index.prewarm().await?; } @@ -916,7 +1140,7 @@ impl DatasetIndexExt for Dataset { for index_meta in indices { let index = self - .open_generic_index(name, &index_meta.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index(name, &index_meta.uuid, &NoOpMetricsCollector) .await?; match options { @@ -1026,8 +1250,9 @@ impl DatasetIndexExt for Dataset { if let Some(frag_reuse_index_meta) = indices.iter().find(|idx| idx.name == FRAG_REUSE_INDEX_NAME) { - let uuid = frag_reuse_index_meta.uuid.to_string(); - let fri_key = FragReuseIndexKey { uuid: &uuid }; + let fri_key = FragReuseIndexKey { + uuid: &frag_reuse_index_meta.uuid, + }; let frag_reuse_index = self .index_cache .get_or_insert_with_key(fri_key, || async move { @@ -1069,7 +1294,12 @@ impl DatasetIndexExt for Dataset { } let all_vector = source_segments.iter().all(segment_has_vector_details); let all_inverted = source_segments.iter().all(segment_has_inverted_details); - if !all_vector && !all_inverted { + let all_bitmap = source_segments.iter().all(segment_has_bitmap_details); + let all_btree = source_segments.iter().all(segment_has_btree_details); + let all_fmindex = source_segments.iter().all(segment_has_fmindex_details); + let all_zonemap = source_segments.iter().all(segment_has_zonemap_details); + if !all_vector && !all_inverted && !all_bitmap && !all_btree && !all_fmindex && !all_zonemap + { return Err(Error::invalid_input( "merge_existing_index_segments requires all 
segments to have the same supported index type" .to_string(), @@ -1083,8 +1313,16 @@ impl DatasetIndexExt for Dataset { source_segments, ) .await? - } else { + } else if all_inverted { crate::index::scalar::inverted::merge_segments(self, source_segments).await? + } else if all_fmindex { + crate::index::scalar::fmindex::merge_segments(self, source_segments).await? + } else if all_bitmap { + crate::index::scalar::bitmap::merge_segments(self, source_segments).await? + } else if all_zonemap { + crate::index::scalar::zonemap::merge_segments(self, source_segments).await? + } else { + crate::index::scalar::btree::merge_segments(self, source_segments).await? }; merged_segment.dataset_version = self.manifest.version; merged_segment.fields = vec![field_id]; @@ -1097,122 +1335,34 @@ impl DatasetIndexExt for Dataset { column: &str, segments: Vec, ) -> Result<()> { - let Some(field) = self.schema().field(column) else { - return Err(Error::index(format!( - "CreateIndex: column '{column}' does not exist" - ))); - }; + let transaction = self + .build_existing_index_segments_transaction(index_name, column, segments) + .await?; - let segments = segments - .into_iter() - .map(IntoIndexSegment::into_index_segment) - .collect::>>()?; - let new_indices = - build_index_metadata_from_segments(self, index_name, field.id, segments).await?; - validate_segment_metadata(index_name, &new_indices)?; - validate_segment_index_details(index_name, &new_indices)?; + self.apply_commit(transaction, &Default::default(), &Default::default()) + .await?; - let incoming_type_url = new_indices[0] - .index_details - .as_ref() - .map(|details| details.type_url.clone()); - let dataset_fragments = self.fragment_bitmap.as_ref().clone(); - let mut incoming_fragments = RoaringBitmap::new(); - for segment in &new_indices { - if segment.fields != [field.id] { - return Err(Error::invalid_input(format!( - "CreateIndex: segment {} was built for fields {:?}, expected [{}]", - segment.uuid, segment.fields, field.id - ))); 
- } - if let Some(fragment_bitmap) = &segment.fragment_bitmap { - incoming_fragments |= fragment_bitmap.clone(); - } - } + Ok(()) + } - let existing_named_indices = self.load_indices_by_name(index_name).await?; - if existing_named_indices + async fn load_scalar_index<'a, 'b>( + &'a self, + criteria: IndexCriteria<'b>, + ) -> Result> { + let indices = self.load_indices().await?; + + let mut indices = indices .iter() - .any(|idx| idx.fields != [field.id]) - { - return Err(Error::index(format!( - "Index name '{index_name}' already exists with different fields, \ - please specify a different name" - ))); - } - let removed_indices = existing_named_indices - .into_iter() .filter(|idx| { - idx.index_details - .as_ref() - .zip(incoming_type_url.as_deref()) - .is_none_or(|(details, expected)| details.type_url == expected) - }) - .map(|idx| -> Result> { - let Some(existing_fragments) = idx.fragment_bitmap.as_ref() else { - if incoming_fragments != dataset_fragments { - return Err(Error::invalid_input(format!( - "CreateIndex: cannot replace legacy index segment {} for '{}' with partial fragment coverage; rebuild all fragments in one commit", - idx.uuid, index_name - ))); + // We shouldn't have any indices with empty fields, but just in case, log an error + // but don't fail the operation (we might not be using that index) + if idx.fields.is_empty() { + if idx.name != FRAG_REUSE_INDEX_NAME { + log::error!("Index {} has no fields", idx.name); } - return Ok(Some(idx)); - }; - - if existing_fragments.is_disjoint(&incoming_fragments) { - return Ok(None); - } - - let uncovered = existing_fragments - &incoming_fragments; - if !uncovered.is_empty() { - return Err(Error::invalid_input(format!( - "CreateIndex: incoming segments for '{}' would orphan fragments {:?} from existing segment {}", - index_name, - uncovered.iter().collect::>(), - idx.uuid - ))); - } - - Ok(Some(idx)) - }) - .collect::>>()? 
- .into_iter() - .flatten() - .collect::>(); - - let transaction = Transaction::new( - self.manifest.version, - Operation::CreateIndex { - new_indices, - removed_indices, - }, - None, - ); - - self.apply_commit(transaction, &Default::default(), &Default::default()) - .await?; - - Ok(()) - } - - async fn load_scalar_index<'a, 'b>( - &'a self, - criteria: IndexCriteria<'b>, - ) -> Result> { - let indices = self.load_indices().await?; - - let mut indices = indices - .iter() - .filter(|idx| { - // We shouldn't have any indices with empty fields, but just in case, log an error - // but don't fail the operation (we might not be using that index) - if idx.fields.is_empty() { - if idx.name != FRAG_REUSE_INDEX_NAME { - log::error!("Index {} has no fields", idx.name); - } - false - } else { - true + false + } else { + true } }) .collect::>(); @@ -1311,7 +1461,7 @@ impl DatasetIndexExt for Dataset { index_version: res.new_index_version, created_at: Some(chrono::Utc::now()), base_id: None, // New merged index file locates in the cloned dataset. 
- files: res.files, + files: Some(res.files), }; removed_indices.extend(res.removed_indices.iter().map(|&idx| idx.clone())); new_indices.push(new_idx); @@ -1549,7 +1699,7 @@ async fn collect_regular_indices_statistics( } let index = ds - .open_generic_index(field_path, &meta.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index(field_path, &meta.uuid, &NoOpMetricsCollector) .await?; indices_stats.push(index.statistics()?); @@ -1642,21 +1792,21 @@ pub trait DatasetIndexInternalExt: DatasetIndexExt { async fn open_generic_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result>; /// Opens the requested scalar index async fn open_scalar_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result>; /// Opens the requested vector index async fn open_vector_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result>; /// Opens all segments for one logical vector index and returns a materialized snapshot. @@ -1703,7 +1853,7 @@ impl DatasetIndexInternalExt for Dataset { async fn open_generic_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result> { // Checking for cache existence is cheap so we just check the vector caches. @@ -1748,7 +1898,10 @@ impl DatasetIndexInternalExt for Dataset { } else { // Fall back to file existence check for older indices without file metadata let index_dir = self.indice_files_dir(&index_meta)?; - let index_file = index_dir.clone().join(uuid).join(INDEX_FILE_NAME); + let index_file = index_dir + .clone() + .join(uuid.to_string()) + .join(INDEX_FILE_NAME); let object_store = self.object_store_for_index(&index_meta).await?; object_store.exists(&index_file).await? 
}; @@ -1766,7 +1919,7 @@ impl DatasetIndexInternalExt for Dataset { async fn open_scalar_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result> { // Caching (including the choice of in-memory vs. serializable state) is @@ -1782,7 +1935,7 @@ impl DatasetIndexInternalExt for Dataset { async fn open_vector_index( &self, column: &str, - uuid: &str, + uuid: &Uuid, metrics: &dyn MetricsCollector, ) -> Result> { let frag_reuse_uuid = self.frag_reuse_index_uuid().await; @@ -1797,9 +1950,15 @@ impl DatasetIndexInternalExt for Dataset { if let Some(entry) = self.index_cache.get_with_key(&state_key).await { log::debug!("Found IvfIndexState in cache uuid: {}", uuid); let partition_cache = self.index_cache.with_key_prefix(&state_key.key()); + let frag_reuse_index = self.open_frag_reuse_index(metrics).await?; return entry .0 - .reconstruct(object_store, self.metadata_cache.as_ref(), partition_cache) + .reconstruct( + object_store, + self.metadata_cache.as_ref(), + partition_cache, + frag_reuse_index, + ) .await; } @@ -1811,8 +1970,19 @@ impl DatasetIndexInternalExt for Dataset { let frag_reuse_index = self.open_frag_reuse_index(metrics).await?; let index_dir = self.indice_files_dir(&index_meta)?; - let index_file = index_dir.clone().join(uuid).join(INDEX_FILE_NAME); - let reader: Arc = object_store.open(&index_file).await?.into(); + let index_file = index_dir + .clone() + .join(uuid.to_string()) + .join(INDEX_FILE_NAME); + let file_sizes = index_meta.file_size_map(); + let reader: Arc = vector::open_index_file( + object_store.as_ref(), + &index_file, + INDEX_FILE_NAME, + &file_sizes, + ) + .await? 
+ .into(); let tailing_bytes = read_last_block(reader.as_ref()).await?; let (major_version, minor_version) = read_version(&tailing_bytes)?; @@ -1835,7 +2005,7 @@ impl DatasetIndexInternalExt for Dataset { minor_version, ) { (0, 1) | (0, 0) => { - info!(target: TRACE_IO_EVENTS, index_uuid=uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.1", index_type="IVF_PQ"); + info!(target: TRACE_IO_EVENTS, index_uuid=%uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.1", index_type="IVF_PQ"); let proto = open_index_proto(reader.as_ref()).await?; match &proto.implementation { Some(Implementation::VectorIndex(vector_index)) => { @@ -1857,7 +2027,7 @@ impl DatasetIndexInternalExt for Dataset { } (0, 2) => { - info!(target: TRACE_IO_EVENTS, index_uuid=uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.2", index_type="IVF_PQ"); + info!(target: TRACE_IO_EVENTS, index_uuid=%uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.2", index_type="IVF_PQ"); let reader = PreviousFileReader::try_new_self_described_from_reader( reader.clone(), Some(&self.metadata_cache.file_metadata_cache(&index_file)), @@ -1879,7 +2049,6 @@ impl DatasetIndexInternalExt for Dataset { self.object_store.clone(), SchedulerConfig::max_bandwidth(&self.object_store), ); - let file_sizes = index_meta.file_size_map(); let cached_size = file_sizes .get(INDEX_FILE_NAME) .map(|&size| CachedFileSize::new(size)) @@ -1906,7 +2075,7 @@ impl DatasetIndexInternalExt for Dataset { let (_, element_type) = get_vector_type(self.schema(), &field_path)?; - info!(target: TRACE_IO_EVENTS, index_uuid=uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.3", index_type=index_metadata.index_type); + info!(target: TRACE_IO_EVENTS, index_uuid=%uuid, r#type=IO_TYPE_OPEN_VECTOR, version="0.3", index_type=index_metadata.index_type); match index_metadata.index_type.as_str() { "IVF_FLAT" => match element_type { @@ -2054,6 +2223,15 @@ impl DatasetIndexInternalExt for Dataset { }; let (index, ivf_entry) = result?; metrics.record_index_load(); + // Attribute the one-time 
index-open I/O (file footers, IVF centroids, + // quantization metadata) to this query's metrics. This runs only on a + // real open; cache hits return earlier, so a warm query reports zero + // index-open I/O. + if let Some(io_stats) = metrics.io_stats() + && let Some(open_stats) = index.open_io_stats() + { + io_stats.add_scan_stats(&open_stats); + } if let Some(ivf_entry) = ivf_entry { let state_key = IvfIndexStateCacheKey::new(uuid, frag_reuse_uuid.as_ref()); self.index_cache @@ -2091,7 +2269,7 @@ impl DatasetIndexInternalExt for Dataset { let mut segments = Vec::with_capacity(metadatas.len()); for metadata in metadatas { let index = self - .open_vector_index(column, &metadata.uuid.to_string(), &NoOpMetricsCollector) + .open_vector_index(column, &metadata.uuid, &NoOpMetricsCollector) .await?; segments.push((metadata, index)); } @@ -2104,19 +2282,27 @@ impl DatasetIndexInternalExt for Dataset { metrics: &dyn MetricsCollector, ) -> Result>> { if let Some(frag_reuse_index_meta) = self.load_index_by_name(FRAG_REUSE_INDEX_NAME).await? 
{ - let uuid = frag_reuse_index_meta.uuid.to_string(); - let frag_reuse_key = FragReuseIndexKey { uuid: &uuid }; - let uuid_clone = uuid.clone(); + let frag_reuse_uuid = frag_reuse_index_meta.uuid; + let frag_reuse_key = FragReuseIndexKey { + uuid: &frag_reuse_uuid, + }; let index = self .index_cache .get_or_insert_with_key(frag_reuse_key, || async move { - let index_meta = self.load_index(&uuid_clone).await?.ok_or_else(|| Error::index(format!("Index with id {} does not exist", uuid_clone)))?; + let index_meta = + self.load_index(&frag_reuse_uuid).await?.ok_or_else(|| { + Error::index(format!( + "Index with id {} does not exist", + frag_reuse_uuid + )) + })?; let index_details = load_frag_reuse_index_details(self, &index_meta).await?; let index = - open_frag_reuse_index(frag_reuse_index_meta.uuid, index_details.as_ref()).await?; + open_frag_reuse_index(frag_reuse_index_meta.uuid, index_details.as_ref()) + .await?; - info!(target: TRACE_IO_EVENTS, index_uuid=uuid_clone, r#type=IO_TYPE_OPEN_FRAG_REUSE); + info!(target: TRACE_IO_EVENTS, index_uuid=%frag_reuse_uuid, r#type=IO_TYPE_OPEN_FRAG_REUSE); metrics.record_index_load(); Ok(index) @@ -2144,7 +2330,7 @@ impl DatasetIndexInternalExt for Dataset { return Ok(Some(index)); } - let uuid = mem_wal_meta.uuid.to_string(); + let uuid = mem_wal_meta.uuid; let index_meta = self .load_index(&uuid) @@ -2152,7 +2338,7 @@ impl DatasetIndexInternalExt for Dataset { .ok_or_else(|| Error::index(format!("Index with id {} does not exist", uuid)))?; let index = open_mem_wal_index(index_meta)?; - info!(target: TRACE_IO_EVENTS, index_uuid=uuid, r#type=IO_TYPE_OPEN_MEM_WAL); + info!(target: TRACE_IO_EVENTS, index_uuid=%uuid, r#type=IO_TYPE_OPEN_MEM_WAL); metrics.record_index_load(); self.index_cache @@ -2177,6 +2363,12 @@ impl DatasetIndexInternalExt for Dataset { let indices = self.load_indices().await?; let schema = self.schema(); let mut indexed_fields = Vec::new(); + // (column, index_name) → union of every contributing 
IndexMetadata's + // fragment_bitmap. Multiple entries can land here for delta-merged + // indices that share a name. We only insert when every contributing + // entry has a bitmap; if any are missing, we leave the entry absent + // so the optimizer treats coverage as unknown. + let mut fragment_bitmaps: HashMap<(String, String), Option> = HashMap::new(); for index in indices.iter().filter(|idx| { let idx_schema = schema.project_by_ids(idx.fields.as_slice(), true); let is_vector_index = idx_schema @@ -2235,6 +2427,23 @@ impl DatasetIndexInternalExt for Dataset { let query_parser = plugin.new_query_parser(index.name.clone(), &index_details.0); if let Some(query_parser) = query_parser { + // Union the per-segment fragment bitmap into this + // (column, index_name) entry. If any segment is missing a + // bitmap, downgrade the entry to None so callers know + // coverage is partial/unknown. + let key = (field_path.clone(), index.name.clone()); + fragment_bitmaps + .entry(key) + .and_modify(|entry| { + if let (Some(acc), Some(seg)) = + (entry.as_mut(), index.fragment_bitmap.as_ref()) + { + *acc |= seg; + } else { + *entry = None; + } + }) + .or_insert_with(|| index.fragment_bitmap.clone()); indexed_fields.push((field_path, (field.data_type(), query_parser))); } } @@ -2259,8 +2468,14 @@ impl DatasetIndexInternalExt for Dataset { ) }); } + // Drop entries we couldn't pin to a known bitmap. 
+ let fragment_bitmaps = fragment_bitmaps + .into_iter() + .filter_map(|(k, v)| v.map(|bm| (k, bm))) + .collect(); Ok(ScalarIndexInfo { indexed_columns: index_info_map, + fragment_bitmaps, }) } @@ -2486,7 +2701,7 @@ mod tests { kmeans::{KMeansParams, train_kmeans}, sq::builder::SQBuildParams, }; - use lance_io::{assert_io_eq, assert_io_lt}; + use lance_io::{assert_io_eq, assert_io_lt, utils::tracking_store::IoStats}; use lance_linalg::distance::{DistanceType, MetricType}; use lance_testing::datagen::generate_random_array; use object_store::ObjectStoreExt; @@ -2528,6 +2743,20 @@ mod tests { } } + fn list_io_stats(stats: &IoStats) -> IoStats { + let requests = stats + .requests + .iter() + .filter(|request| request.method == "list") + .cloned() + .collect::>(); + IoStats { + read_iops: requests.len() as u64, + requests, + ..Default::default() + } + } + fn segment_from_metadata(metadata: &IndexMetadata) -> IndexSegment { IndexSegment::new( metadata.uuid, @@ -3847,7 +4076,7 @@ mod tests { .unwrap(); let indices = dataset.load_indices().await.unwrap(); let index = dataset - .open_generic_index("tag", &indices[0].uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index("tag", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); assert_eq!(index.index_type(), IndexType::Bitmap); @@ -4372,6 +4601,81 @@ mod tests { assert_eq!(descriptions[0].index_type(), inferred_type); } + #[tokio::test] + async fn test_describe_indices_tolerates_missing_index_details() { + // An index whose manifest entry has no index details (e.g. created + // before details were persisted) is still described on a best-effort + // basis rather than causing describe_indices to error. 
+ use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let field_id = dataset.schema().field("id").unwrap().id; + + let metadata = IndexMetadata { + uuid: Uuid::new_v4(), + name: "mystery_idx".to_string(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(std::iter::once(0_u32).collect()), + index_details: None, + index_version: 0, + created_at: None, + base_id: None, + files: None, + }; + + let desc = IndexDescriptionImpl::try_new(vec![metadata], &dataset) + .await + .unwrap(); + assert_eq!(desc.index_type(), "Unknown"); + assert_eq!(desc.type_url(), ""); + assert_eq!(desc.details().unwrap(), "{}"); + assert_eq!(desc.rows_indexed(), 10); + } + + #[tokio::test] + async fn test_describe_indices_derives_type_from_url_without_plugin() { + // When index details exist but no plugin is registered for the type + // URL, the index type is derived from the type URL rather than being + // reported as the opaque "Unknown". 
+ use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let field_id = dataset.schema().field("id").unwrap().id; + + let metadata = IndexMetadata { + uuid: Uuid::new_v4(), + name: "mystery_idx".to_string(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(std::iter::once(0_u32).collect()), + index_details: Some(Arc::new(prost_types::Any { + type_url: "/lance.index.pb.MysteryIndexDetails".to_string(), + value: Vec::new(), + })), + index_version: 0, + created_at: None, + base_id: None, + files: None, + }; + + let desc = IndexDescriptionImpl::try_new(vec![metadata], &dataset) + .await + .unwrap(); + assert_eq!(desc.index_type(), "Mystery"); + assert_eq!(desc.type_url(), "/lance.index.pb.MysteryIndexDetails"); + } + #[rstest] #[case::btree("i", IndexType::BTree, Box::new(ScalarIndexParams::default()))] #[case::bitmap("i", IndexType::Bitmap, Box::new(ScalarIndexParams::default()))] @@ -6588,7 +6892,18 @@ mod tests { ) .into_reader_rows(RowCount::from(20), BatchCount::from(2)); - let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 20, + max_rows_per_group: 20, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); let field_id = dataset.schema().field("vector").unwrap().id; let original = write_vector_segment_metadata( @@ -6622,6 +6937,331 @@ mod tests { assert!(err.to_string().contains("would orphan fragments")); } + #[tokio::test] + async fn test_commit_existing_index_segments_removes_empty_segment() { + use lance_datagen::{BatchCount, RowCount, array}; + + 
let test_dir = tempfile::tempdir().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, test_dir.path().to_str().unwrap(), None) + .await + .unwrap(); + let field_id = dataset.schema().field("vector").unwrap().id; + let uuid = Uuid::new_v4(); + + // Commit a 0-fragment segment, then a real segment covering the dataset. + let empty = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + uuid, + std::iter::empty::(), + b"empty", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&empty)], + ) + .await + .unwrap(); + let seg = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg)], + ) + .await + .unwrap(); + + // The real segment covers the dataset, so the redundant empty one is removed. 
+ let committed = dataset.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + committed.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg.uuid]), + "empty segment should be removed once a real segment covers the dataset", + ); + } + + #[tokio::test] + async fn test_build_existing_index_segments_transaction_does_not_commit() { + use crate::dataset::CommitBuilder; + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(2)); + + let dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 10, + max_rows_per_group: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + // 20 rows with max_rows_per_file=10 yields two single-fragment files. + assert_eq!(dataset.get_fragments().len(), 2); + + let read_version = dataset.manifest.version; + let field_id = dataset.schema().field("vector").unwrap().id; + let seg0 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg0", + ) + .await; + let seg1 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [1_u32], + b"seg1", + ) + .await; + + let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg0), segment_from_metadata(&seg1)], + ) + .await + .unwrap(); + + // Building the transaction must not publish the index. 
+ assert!( + dataset + .load_indices_by_name("vector_idx") + .await + .unwrap() + .is_empty(), + "building a transaction must not publish the index" + ); + assert_eq!(transaction.read_version, read_version); + let Operation::CreateIndex { + new_indices, + removed_indices, + } = &transaction.operation + else { + panic!("expected index creation transaction"); + }; + assert_eq!( + new_indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + assert!(removed_indices.is_empty()); + + // The returned transaction can be committed via CommitBuilder. + let committed = CommitBuilder::new(Arc::new(dataset)) + .execute(transaction) + .await + .unwrap(); + let indices = committed.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + } + + #[tokio::test] + async fn test_build_existing_index_segments_transaction_removes_empty_segment() { + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, test_dir.path().to_str().unwrap(), None) + .await + .unwrap(); + let field_id = dataset.schema().field("vector").unwrap().id; + + // Commit a 0-fragment placeholder segment. + let empty = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + std::iter::empty::(), + b"empty", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&empty)], + ) + .await + .unwrap(); + + // Staging a real segment that covers the dataset must mark the placeholder + // for removal at build time, exercising the zero-fragment guard on the + // staged path (not just the committed path). 
+ let seg = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg", + ) + .await; + let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg)], + ) + .await + .unwrap(); + let Operation::CreateIndex { + new_indices, + removed_indices, + } = &transaction.operation + else { + panic!("expected index creation transaction"); + }; + assert_eq!( + new_indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg.uuid]), + ); + assert_eq!( + removed_indices + .iter() + .map(|i| i.uuid) + .collect::>(), + HashSet::from([empty.uuid]), + "the zero-fragment placeholder must be staged for removal", + ); + } + + #[tokio::test] + async fn test_build_existing_index_segments_transaction_commits_after_version_advances() { + use crate::dataset::CommitBuilder; + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(2)); + let dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 10, + max_rows_per_group: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + let read_version = dataset.manifest.version; + let field_id = dataset.schema().field("vector").unwrap().id; + let seg0 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg0", + ) + .await; + let seg1 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [1_u32], + b"seg1", + ) + .await; + + // Stage the transaction at `read_version`. 
+ let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg0), segment_from_metadata(&seg1)], + ) + .await + .unwrap(); + assert_eq!(transaction.read_version, read_version); + + // Advance the dataset with an unrelated append, moving HEAD past read_version. + let more = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write( + more, + test_uri, + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert!(dataset.manifest.version > read_version); + + // The staged transaction still commits cleanly against the advanced HEAD. + let committed = CommitBuilder::new(Arc::new(dataset)) + .execute(transaction) + .await + .unwrap(); + let indices = committed.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + } + #[tokio::test] async fn test_resolve_index_column_error_cases() { use lance_datagen::{BatchCount, RowCount, array}; @@ -7094,6 +7734,103 @@ mod tests { } } + #[tokio::test] + async fn test_scalar_index_create_does_not_list_files() { + let test_dir = TempStrDir::default(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("category", DataType::Int32, false), + ])); + let ids = Int32Array::from_iter_values(0..128); + let categories = Int32Array::from_iter_values((0..128).map(|value| value % 8)); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(categories)]) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + .await + .unwrap(); + let io_tracker = dataset.object_store.as_ref().io_tracker().clone(); + + 
io_tracker.incremental_stats(); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("category_bitmap".to_string()), + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let stats = io_tracker.incremental_stats(); + let list_stats = list_io_stats(&stats); + assert_io_eq!( + list_stats, + read_iops, + 0, + "new scalar index files should be reported by writer return values" + ); + } + + #[tokio::test] + async fn test_vector_index_create_does_not_list_files() { + let test_dir = TempStrDir::default(); + let dimension = 8; + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + dimension, + ), + false, + ), + ])); + let ids = Int32Array::from_iter_values(0..256); + let vectors = (0..256) + .map(|row| { + Some( + (0..dimension) + .map(|dim| Some((row * dimension + dim) as f32)) + .collect::>(), + ) + }) + .collect::>(); + let vector_array = + FixedSizeListArray::from_iter_primitive::(vectors, dimension); + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(vector_array)]) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + .await + .unwrap(); + let io_tracker = dataset.object_store.as_ref().io_tracker().clone(); + + io_tracker.incremental_stats(); + dataset + .create_index( + &["vector"], + IndexType::Vector, + Some("vector_ivf_flat".to_string()), + &VectorIndexParams::ivf_flat(4, MetricType::L2), + true, + ) + .await + .unwrap(); + + let stats = io_tracker.incremental_stats(); + let list_stats = list_io_stats(&stats); + assert_io_eq!( + list_stats, + read_iops, + 0, + "new V3 vector index files should be reported by builder return values" + ); + } + #[tokio::test] async fn test_index_file_sizes_through_lifecycle() { use crate::dataset::WriteDestination; @@ -7439,12 
+8176,16 @@ mod tests { .unwrap(); assert!(results.num_rows() > 0); - // Verify IOPs + // Verify IOPs. The deferred DocSet loads per-doc num_tokens/row_ids on + // first use rather than eagerly at index open, so a cold (un-prewarmed) + // query opens the docs file on demand — a couple more IOPs than the + // eager path, but constant and only on the first query (prewarm or a + // warm cache serve it with zero IO). let stats = dataset.object_store.as_ref().io_stats_incremental(); assert_io_lt!( stats, read_iops, - 15, + 18, "Inverted index query should use minimal IOPs" ); } diff --git a/rust/lance/src/index/api.rs b/rust/lance/src/index/api.rs index 0119db7ac01..f856e9004f3 100644 --- a/rust/lance/src/index/api.rs +++ b/rust/lance/src/index/api.rs @@ -177,13 +177,10 @@ pub trait DatasetIndexExt { /// /// Note that it is possible to have multiple indices with the same UUID, /// as they are the deltas of the same index. - async fn load_index(&self, uuid: &str) -> Result> { - self.load_indices().await.map(|indices| { - indices - .iter() - .find(|idx| idx.uuid.to_string() == uuid) - .cloned() - }) + async fn load_index(&self, uuid: &Uuid) -> Result> { + self.load_indices() + .await + .map(|indices| indices.iter().find(|idx| idx.uuid == *uuid).cloned()) } /// Loads a specific index with the given index name. 
diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 4398928d3e2..d3ecde030c1 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -11,11 +11,12 @@ use lance_index::{ optimize::OptimizeOptions, progress::NoopIndexBuildProgress, scalar::{ - CreatedIndex, OldIndexDataFilter, inverted::InvertedIndex, lance_format::LanceIndexStore, + CreatedIndex, OldIndexDataFilter, ScalarIndex, inverted::InvertedIndex, + lance_format::LanceIndexStore, }, }; use lance_select::{RowAddrTreeMap, RowSetOps}; -use lance_table::format::{Fragment, IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{Fragment, IndexMetadata}; use roaring::RoaringBitmap; use uuid::Uuid; @@ -36,7 +37,7 @@ pub struct IndexMergeResults<'a> { pub new_index_version: i32, pub new_index_details: prost_types::Any, /// List of files and their sizes for the merged index - pub files: Option>, + pub files: Vec, } async fn build_stable_row_id_filter( @@ -64,16 +65,341 @@ async fn build_stable_row_id_filter( .try_collect::>() .await?; - let row_id_maps = row_id_sequences - .iter() - .map(|(_, seq)| RowAddrTreeMap::from(seq.as_ref())) - .collect::>(); + let frag_by_id: std::collections::HashMap = dataset + .get_fragments() + .into_iter() + .map(|f| (f.id() as u32, f)) + .collect(); + + let mut row_id_maps = Vec::with_capacity(row_id_sequences.len()); + for (frag_id, seq) in &row_id_sequences { + row_id_maps.push(live_row_ids(frag_by_id.get(frag_id), seq).await?); + } let row_id_map_refs = row_id_maps.iter().collect::>(); // Merge all fragment-local row-id sets into one exact membership structure. Ok(::union_all(&row_id_map_refs)) } +/// The fragment's live row ids: its persisted row-id sequence minus the rows +/// its deletion vector marks gone. A persisted sequence covers every row the +/// fragment ever held, so a row whose old copy was deleted (e.g. 
rewritten by an +/// update under the same stable row id) would otherwise be retained as a stale +/// old-index entry. +async fn live_row_ids( + fragment: Option<&crate::dataset::fragment::FileFragment>, + seq: &lance_table::rowids::RowIdSequence, +) -> Result { + // Propagate a deletion-vector read failure rather than swallowing it: a + // swallowed error would fall through to the "no deletions" branch below, + // putting the deleted rows back into the allow-list as stale entries. + let deletion_vector = match fragment { + Some(f) if f.metadata().deletion_file.is_some() => f.get_deletion_vector().await?, + _ => None, + }; + Ok(match deletion_vector { + Some(dv) => seq + .iter() + .enumerate() + .filter(|(offset, _)| !dv.contains(*offset as u32)) + .map(|(_, row_id)| row_id) + .collect(), + None => RowAddrTreeMap::from(seq), + }) +} + +/// Build the [`OldIndexDataFilter`] that must be applied to existing index +/// rows when their owning fragments have been pruned by compaction or +/// deletions. +pub async fn build_old_data_filter( + dataset: &Dataset, + effective_old_frags: &RoaringBitmap, + deleted_old_frags: &RoaringBitmap, +) -> Result> { + if dataset.manifest.uses_stable_row_ids() { + let valid_old_row_ids = build_stable_row_id_filter(dataset, effective_old_frags).await?; + Ok(Some(OldIndexDataFilter::RowIds(valid_old_row_ids))) + } else { + Ok(Some(OldIndexDataFilter::Fragments { + to_keep: effective_old_frags.clone(), + to_remove: deleted_old_frags.clone(), + })) + } +} + +/// Split the stored fragment coverage of `segments` into fragments still live in +/// `dataset` (`effective`) and fragments that compaction or deletion has already +/// retired (`deleted`). 
+pub fn split_segment_coverage<'a>( + dataset: &Dataset, + segments: impl IntoIterator, +) -> (RoaringBitmap, RoaringBitmap) { + let mut effective = RoaringBitmap::new(); + let mut deleted = RoaringBitmap::new(); + for segment in segments { + if let Some(eff) = segment.effective_fragment_bitmap(&dataset.fragment_bitmap) { + effective |= eff; + } + if let Some(del) = segment.deleted_fragment_bitmap(&dataset.fragment_bitmap) { + deleted |= del; + } + } + (effective, deleted) +} + +/// Build one [`OldIndexDataFilter`] per segment, each derived from that segment's +/// *own* effective (still-live) and retired fragment coverage, plus the union of +/// every segment's still-live coverage. +pub async fn build_per_segment_filters( + dataset: &Dataset, + segments: &[&IndexMetadata], +) -> Result<(RoaringBitmap, Vec>)> { + let mut effective_union = RoaringBitmap::new(); + let mut filters = Vec::with_capacity(segments.len()); + for segment in segments { + if segment.fragment_bitmap.is_none() { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + ))); + } + let effective = segment + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + let deleted = segment + .deleted_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + effective_union |= &effective; + filters.push(build_old_data_filter(dataset, &effective, &deleted).await?); + } + Ok((effective_union, filters)) +} + +async fn load_unindexed_training_data( + dataset: &Dataset, + field_path: &str, + update_criteria: &lance_index::scalar::UpdateCriteria, + unindexed: &[Fragment], +) -> Result { + let fragments = if update_criteria.requires_old_data { + None + } else { + Some(unindexed.to_vec()) + }; + load_training_data( + dataset, + field_path, + &update_criteria.data_criteria, + fragments, + true, + None, + ) + .await +} + +/// Build a fresh, canonical (non-sharded) scalar index over `fragment_ids`, +/// reusing 
`reference_index`'s params and training criteria. +async fn rebuild_scalar_segment( + dataset: &Dataset, + reference_index: &Arc, + field_path: &str, + column_name: &str, + uuid: Uuid, + fragment_ids: Vec, +) -> Result { + let params = reference_index.derive_index_params()?; + let update_criteria = reference_index.update_criteria(); + let training_data = load_training_data( + dataset, + field_path, + &update_criteria.data_criteria, + None, + true, + Some(fragment_ids), + ) + .await?; + super::scalar::build_scalar_index( + dataset, + column_name, + uuid, + ¶ms, + true, + None, + Some(training_data), + Arc::new(NoopIndexBuildProgress), + ) + .await +} + +/// The index segments to rewrite in this optimize pass. +/// +/// Normally the trailing `num_indices_to_merge` segments. Under stable row ids, +/// any *older* segment that still covers a fragment carrying deletions is added +/// too: an update deletes a row's old copy (leaving a deletion vector) and +/// rewrites it under the same row id, so its stale old-value postings survive +/// until that segment is rewritten and filtered. Only the segments that actually +/// cover a deleted-from fragment are pulled in -- clean segments in between are +/// left untouched -- so an edit to old data does not force a full reindex. +/// +/// The deletion check is conservative (any current deletion vector on a covered +/// fragment), so a segment built after those deletions may be rewritten as a +/// harmless no-op; it never leaves a stale segment behind (PR #7359). +fn select_segments_to_merge<'a>( + dataset: &Dataset, + old_indices: &[&'a IndexMetadata], + options: &OptimizeOptions, +) -> Vec<&'a IndexMetadata> { + let num_to_merge = options + .num_indices_to_merge + .unwrap_or(1) + .min(old_indices.len()); + let tail_start = old_indices.len() - num_to_merge; + + // Address-style row ids mask stale postings at search time, and append mode + // (num_to_merge == 0) defers cleanup to a real merge; both keep the plain tail. 
+ if num_to_merge == 0 || !dataset.manifest.uses_stable_row_ids() { + return old_indices[tail_start..].to_vec(); + } + + let deleted_frags: RoaringBitmap = dataset + .get_fragments() + .iter() + .filter(|f| f.metadata().deletion_file.is_some()) + .map(|f| f.id() as u32) + .collect(); + if deleted_frags.is_empty() { + return old_indices[tail_start..].to_vec(); + } + + let mut selected = Vec::new(); + for (i, idx) in old_indices.iter().enumerate() { + let covers_deleted = idx + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .is_some_and(|eff| !eff.is_disjoint(&deleted_frags)); + if i >= tail_start || covers_deleted { + selected.push(*idx); + } + } + selected +} + +#[allow(clippy::too_many_arguments)] +async fn merge_scalar_indices<'a>( + dataset: Arc, + old_indices: &[&'a IndexMetadata], + unindexed: &[Fragment], + options: &OptimizeOptions, + index_type: IndexType, + field_path: &str, + column_name: &str, + base_unindexed_bitmap: RoaringBitmap, +) -> Result, RoaringBitmap, CreatedIndex)>> { + if old_indices.is_empty() { + return Err(Error::index( + "merge_scalar_indices: no previous index found".to_string(), + )); + } + + let selected_old_indices = select_segments_to_merge(dataset.as_ref(), old_indices, options); + + // No new data + ≤1 old selected = rewriting one segment to itself. + if unindexed.is_empty() && selected_old_indices.len() <= 1 { + return Ok(None); + } + + // For the delta case (`selected` empty) the reference is purely + // for reading params; fall back to the last old index then. + let reference_idx = selected_old_indices + .first() + .copied() + .unwrap_or(old_indices[old_indices.len() - 1]); + let reference_index = dataset + .open_scalar_index(field_path, &reference_idx.uuid, &NoOpMetricsCollector) + .await?; + let update_criteria = reference_index.update_criteria(); + + // Effective = bitmap ∩ live fragments; deleted = bitmap \ live fragments. 
+ let (effective_old_frags, deleted_old_frags) = + split_segment_coverage(dataset.as_ref(), selected_old_indices.iter().copied()); + + let mut frag_bitmap = base_unindexed_bitmap.clone(); + frag_bitmap |= &effective_old_frags; + let new_uuid = Uuid::new_v4(); + + // Scalar Index that expos an N:1 segment-merge primitive reachable without + // rescanning the dataset + let has_segment_merge_primitive = matches!(index_type, IndexType::BTree); + + // Merge new data into the existing segment(s) without rebuilding from + // scratch, when all hold: + // - `effective_old_frags`: the selected segments' coverage intersected + // with live fragments is non-empty, i.e. there is old data worth keeping. + // - `update_criteria` only requires the newly appended data. Indexes that + // need old data must rebuild over `frag_bitmap` so the scanned rows + // exactly match the segment coverage being committed. + // - `has_segment_merge_primitive` (Indices supports N:1 segments merge) OR + // `selected_old_indices.len() == 1` (any scalar type can `update` one). + // Otherwise (e.g. ≥2 selected segments of a type without an N:1 merge + // primitive) the index is rebuilt from scratch over `frag_bitmap`. + let can_merge_segments = !effective_old_frags.is_empty() + && !update_criteria.requires_old_data + && (has_segment_merge_primitive || selected_old_indices.len() == 1); + + let created_index = if !can_merge_segments { + rebuild_scalar_segment( + dataset.as_ref(), + &reference_index, + field_path, + column_name, + new_uuid, + frag_bitmap.iter().collect(), + ) + .await? 
+ } else { + let new_data_stream = + load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed) + .await?; + let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?; + + match index_type { + IndexType::BTree => { + let (_, old_data_filters) = + build_per_segment_filters(dataset.as_ref(), &selected_old_indices).await?; + crate::index::scalar::btree::open_and_merge_segments( + dataset.as_ref(), + field_path, + &selected_old_indices, + new_data_stream, + &new_store, + &old_data_filters, + ) + .await? + } + // NOTE: IndexType::Inverted never reaches here -- it is handled by the + // dedicated arm in merge_indices_with_unindexed_frags before this + // function is called. + _ => { + let old_data_filter = build_old_data_filter( + dataset.as_ref(), + &effective_old_frags, + &deleted_old_frags, + ) + .await?; + reference_index + .update(new_data_stream, &new_store, old_data_filter) + .await? + } + } + }; + + Ok(Some(( + new_uuid, + selected_old_indices.to_vec(), + frag_bitmap, + created_index, + ))) +} + async fn metadata_is_vector_index(dataset: &Dataset, index: &IndexMetadata) -> Result { if let Some(files) = &index.files { return Ok(files.iter().any(|file| file.path == INDEX_FILE_NAME)); @@ -238,7 +564,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( vec![(selected_metadata, selected_index)], )?; let selected_ivf_view = selected_logical_index.as_ivf()?; - let (new_uuid, indices_merged) = Box::pin(optimize_vector_indices( + let (new_uuid, indices_merged, files) = Box::pin(optimize_vector_indices( dataset.as_ref().clone(), Option::< lance_io::stream::RecordBatchStreamAdapter< @@ -254,8 +580,6 @@ pub async fn merge_indices_with_unindexed_frags<'a>( return Ok(None); } - let index_dir = dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; let new_fragment_bitmap = removed_segment .effective_fragment_bitmap(&dataset.fragment_bitmap) 
.or_else(|| removed_segment.fragment_bitmap.clone()) @@ -268,7 +592,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( CreatedIndex { index_details: vector_index_details_default(), index_version: lance_index::IndexType::Vector.version() as u32, - files: Some(files), + files, }, )) } else { @@ -290,7 +614,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( Some(scanner.try_into_stream().await?) }; - let (new_uuid, indices_merged) = optimize_vector_indices( + let (new_uuid, indices_merged, files) = optimize_vector_indices( dataset.as_ref().clone(), new_data_stream, &field_path, @@ -321,9 +645,6 @@ pub async fn merge_indices_with_unindexed_frags<'a>( .map(|d| d.as_ref().clone()) .unwrap_or_else(vector_index_details_default); - let index_dir = dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; - Ok(( new_uuid, removed_indices, @@ -334,16 +655,15 @@ pub async fn merge_indices_with_unindexed_frags<'a>( // index_version <= our max supported version, so we can safely // write the current library's version for this index type. 
index_version: lance_index::IndexType::Vector.version() as u32, - files: Some(files), + files, }, )) } } else { - let mut frag_bitmap = base_unindexed_bitmap.clone(); let mut indices = Vec::with_capacity(old_indices.len()); for idx in old_indices { match dataset - .open_generic_index(&field_path, &idx.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index(&field_path, &idx.uuid, &NoOpMetricsCollector) .await { Ok(index) => indices.push(index), @@ -372,26 +692,17 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let index_type = indices[0].index_type(); match index_type { IndexType::Inverted => { - let num_to_merge = options - .num_indices_to_merge - .unwrap_or(1) - .min(old_indices.len()); - if unindexed.is_empty() && num_to_merge <= 1 { + let selected_old_indices = + select_segments_to_merge(dataset.as_ref(), old_indices, options); + if unindexed.is_empty() && selected_old_indices.len() <= 1 { return Ok(None); } - - let selected_start = old_indices.len().saturating_sub(num_to_merge); - let selected_old_indices = &old_indices[selected_start..]; let reference_idx = selected_old_indices .first() .copied() .unwrap_or(old_indices[old_indices.len() - 1]); let reference_index = dataset - .open_scalar_index( - &field_path, - &reference_idx.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index(&field_path, &reference_idx.uuid, &NoOpMetricsCollector) .await?; let update_criteria = reference_index.update_criteria(); if update_criteria.requires_old_data { @@ -409,7 +720,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let created_index = super::scalar::build_scalar_index( dataset.as_ref(), column.name.as_str(), - &new_uuid.to_string(), + new_uuid, ¶ms, true, None, @@ -441,18 +752,14 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let mut frag_bitmap = base_unindexed_bitmap; let mut effective_old_frags = RoaringBitmap::new(); let mut selected_indices = Vec::with_capacity(selected_old_indices.len()); - for idx in selected_old_indices 
{ + for idx in &selected_old_indices { if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) { frag_bitmap |= &effective; effective_old_frags |= &effective; } let scalar_index = dataset - .open_scalar_index( - &field_path, - &idx.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index(&field_path, &idx.uuid, &NoOpMetricsCollector) .await?; let inverted_index = scalar_index .as_any() @@ -481,14 +788,13 @@ pub async fn merge_indices_with_unindexed_frags<'a>( }; let new_uuid = Uuid::new_v4(); - let new_store = - LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid.to_string())?; + let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?; let created_index = if selected_indices.is_empty() { let params = reference_index.derive_index_params()?; super::scalar::build_scalar_index( dataset.as_ref(), column.name.as_str(), - &new_uuid.to_string(), + new_uuid, ¶ms, true, None, @@ -515,105 +821,21 @@ pub async fn merge_indices_with_unindexed_frags<'a>( )) } it if it.is_scalar() => { - let num_to_merge = options - .num_indices_to_merge - .unwrap_or(1) - .min(old_indices.len()); - if unindexed.is_empty() && num_to_merge <= 1 { - return Ok(None); - } - - // Use effective bitmap (intersected with existing dataset fragments) - // to avoid carrying stale data from pruned indices. 
- let effective_old_frags: RoaringBitmap = old_indices - .iter() - .filter_map(|idx| idx.effective_fragment_bitmap(&dataset.fragment_bitmap)) - .fold(RoaringBitmap::new(), |mut acc, b| { - acc |= &b; - acc - }); - let deleted_old_frags: RoaringBitmap = old_indices - .iter() - .filter_map(|idx| idx.deleted_fragment_bitmap(&dataset.fragment_bitmap)) - .fold(RoaringBitmap::new(), |mut acc, b| { - acc |= &b; - acc - }); - frag_bitmap |= &effective_old_frags; - - let index = dataset - .open_scalar_index( - &field_path, - &old_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) - .await?; - - let update_criteria = index.update_criteria(); - - let fragments = if update_criteria.requires_old_data { - None - } else { - Some(unindexed.to_vec()) - }; - let new_data_stream = load_training_data( - dataset.as_ref(), + let Some(result) = merge_scalar_indices( + dataset.clone(), + old_indices, + unindexed, + options, + it, &field_path, - &update_criteria.data_criteria, - fragments, - true, - None, + column.name.as_str(), + base_unindexed_bitmap, ) - .await?; - - let new_uuid = Uuid::new_v4(); - - let created_index = if effective_old_frags.is_empty() { - // Old data is fully stale (bitmap pruned to empty). Rebuild - // from scratch instead of merging stale entries. - let params = index.derive_index_params()?; - super::scalar::build_scalar_index( - dataset.as_ref(), - column.name.as_str(), - &new_uuid.to_string(), - ¶ms, - true, - None, - Some(new_data_stream), - Arc::new(NoopIndexBuildProgress), - ) - .await? - } else { - let new_store = - LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid.to_string())?; - let old_data_filter = if dataset.manifest.uses_stable_row_ids() { - // Stable row IDs are opaque IDs, so fragment-bit filtering on - // (row_id >> 32) is invalid. Build an exact allow-list from retained - // fragments' row-id sequences and use precise filtering. 
- let valid_old_row_ids = - build_stable_row_id_filter(dataset.as_ref(), &effective_old_frags) - .await?; - Some(OldIndexDataFilter::RowIds(valid_old_row_ids)) - } else { - // Address-style row IDs encode fragment_id in high 32 bits. - // Fragment bitmap filtering is valid and cheaper in this mode. - Some(OldIndexDataFilter::Fragments { - to_keep: effective_old_frags, - to_remove: deleted_old_frags, - }) - }; - index - .update(new_data_stream, &new_store, old_data_filter) - .await? + .await? + else { + return Ok(None); }; - - // TODO: don't hard-code index version - Ok(( - new_uuid, - vec![old_indices[old_indices.len() - 1]], - frag_bitmap, - created_index, - )) + Ok(result) } _ => Err(Error::index(format!( "Append index: invalid index type: {:?}", @@ -641,7 +863,7 @@ mod tests { use arrow::datatypes::{Float32Type, UInt32Type}; use arrow_array::cast::AsArray; use arrow_array::{ - FixedSizeListArray, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, + FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, }; use arrow_schema::{DataType, Field, Schema}; use futures::TryStreamExt; @@ -653,7 +875,7 @@ mod tests { use lance_index::vector::sq::builder::SQBuildParams; use lance_index::{ IndexType, - scalar::ScalarIndexParams, + scalar::{BuiltinIndexType, ScalarIndexParams, SearchResult, TextQuery}, vector::{ivf::IvfBuildParams, pq::PQBuildParams}, }; use lance_linalg::distance::MetricType; @@ -661,8 +883,8 @@ mod tests { use rstest::rstest; use crate::dataset::builder::DatasetBuilder; - use crate::dataset::optimize::compact_files; - use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteParams}; + use crate::dataset::optimize::{CompactionOptions, compact_files}; + use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; @@ -775,11 +997,7 @@ mod tests { 
let mut num_rows = 0; for index in indices.iter() { let index = dataset - .open_vector_index( - "vector", - index.uuid.to_string().as_str(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &index.uuid, &NoOpMetricsCollector) .await .unwrap(); num_rows += index.num_rows(); @@ -1220,7 +1438,7 @@ mod tests { } #[tokio::test] - async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { + async fn test_optimize_btree_multi_segment_optimize_default() { async fn query_id_count(dataset: &Dataset, id: &str) -> usize { dataset .scan() @@ -1238,119 +1456,1216 @@ mod tests { let test_uri = test_dir.as_str(); let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); - let ids = StringArray::from_iter_values((0..256).map(|i| format!("song-{i}"))); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap(); - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let make_batch = |start: i32, end: i32| { + let ids = StringArray::from_iter_values((start..end).map(|i| format!("song-{i}"))); + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap() + }; + + // Three fragments of 64 rows each; each commits as its own BTree + // segment so optimize sees a multi-segment scalar logical index. 
+ let reader = RecordBatchIterator::new( + vec![ + Ok(make_batch(0, 64)), + Ok(make_batch(64, 128)), + Ok(make_batch(128, 192)), + ], + schema.clone(), + ); let mut dataset = Dataset::write( reader, test_uri, Some(WriteParams { max_rows_per_file: 64, - enable_stable_row_ids: true, ..Default::default() }), ) .await .unwrap(); - dataset - .create_index( + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 3); + + let mut staged_segments = Vec::new(); + for fragment in &fragments { + let segment = crate::index::create::CreateIndexBuilder::new( + &mut dataset, &["id"], IndexType::BTree, - Some("id_idx".into()), - &ScalarIndexParams::default(), - true, + ¶ms, ) + .name("id_idx".into()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() .await .unwrap(); + staged_segments.push(segment); + } + dataset + .commit_existing_index_segments("id_idx", "id", staged_segments) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 3 + ); - assert_eq!(query_id_count(&dataset, "song-42").await, 1); - - compact_files( - &mut dataset, - crate::dataset::optimize::CompactionOptions { - target_rows_per_fragment: 512, + let appended = RecordBatchIterator::new(vec![Ok(make_batch(192, 256))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Append, ..Default::default() - }, - None, + }), ) .await .unwrap(); - - let frags = dataset.get_fragments(); - assert!(!frags.is_empty()); - assert!(frags.iter().all(|frag| frag.id() > 0)); - assert!( - dataset - .unindexed_fragments("id_idx") - .await - .unwrap() - .is_empty() - ); + assert_eq!(dataset.get_fragments().len(), 4); dataset .optimize_indices(&OptimizeOptions::default()) .await .unwrap(); + // Reload from disk to ensure we're reading committed manifest state. 
let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); - assert_eq!(query_id_count(&dataset, "song-42").await, 1); + + // Each of these IDs lives in a distinct old segment / fragment. + // song-10 lives in fragment 0, song-80 in fragment 1, song-160 in + // fragment 2, and song-200 in the appended fragment. After optimize + // every row must still be reachable through the logical index, + // regardless of which segment absorbed the new data. + for id in ["song-10", "song-80", "song-160", "song-200"] { + assert_eq!( + query_id_count(&dataset, id).await, + 1, + "expected exactly one row for {id} after multi-segment optimize" + ); + } + + // `OptimizeOptions::default()` (= num_indices_to_merge: None) merges + // the newest segment with the unindexed fragment, like the + // inverted/vector default. The three old segments minus the merged one + // plus the new delta means three segments remain, and together they + // must still cover every dataset fragment without overlap. + let segments_after = dataset.load_indices_by_name("id_idx").await.unwrap(); + assert_eq!( + segments_after.len(), + 3, + "default optimize must merge one delta, not all segments, got {segments_after:?}" + ); + let mut covered = RoaringBitmap::new(); + for segment in &segments_after { + let bitmap = segment + .fragment_bitmap + .as_ref() + .expect("each segment should carry fragment coverage"); + assert!( + covered.is_disjoint(bitmap), + "post-optimize segments must not overlap, got {segments_after:?}" + ); + covered |= bitmap; + } + let mut expected = RoaringBitmap::new(); + for frag in dataset.get_fragments() { + expected.insert(frag.id() as u32); + } + assert_eq!( + covered, expected, + "post-optimize segments should cover every dataset fragment" + ); } #[tokio::test] - async fn test_optimize_scalar_no_unindexed_fragments() { + async fn test_optimize_fmindex_default_rebuilds_old_and_new_rows() { let test_dir = TempStrDir::default(); let test_uri = test_dir.as_str(); - let schema = 
Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); - let ids = StringArray::from_iter_values((0..32).map(|i| format!("song-{i}"))); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap(); - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)])); + let make_batch = |values: &[&str]| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(StringArray::from_iter_values( + values.iter().copied(), + ))], + ) + .unwrap() + }; + + let reader = RecordBatchIterator::new( + vec![Ok(make_batch(&["old alpha needle", "old beta"]))], + schema.clone(), + ); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: 2, + ..Default::default() + }), + ) + .await + .unwrap(); + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Fm); dataset .create_index( - &["id"], - IndexType::BTree, - Some("id_idx".into()), - &ScalarIndexParams::default(), + &["text"], + IndexType::Fm, + Some("text_fmindex".to_string()), + ¶ms, true, ) .await .unwrap(); - let before = dataset.load_indices_by_name("id_idx").await.unwrap(); - assert_eq!(before.len(), 1); - let original_uuid = before[0].uuid; - let original_version = dataset.manifest.version; + let appended = RecordBatchIterator::new( + vec![Ok(make_batch(&["new gamma needle", "new delta"]))], + schema.clone(), + ); + dataset.append(appended, None).await.unwrap(); + + assert!( + !dataset + .unindexed_fragments("text_fmindex") + .await + .unwrap() + .is_empty() + ); - // `merge(1)` would historically rebuild the single existing segment - // (steady state, nothing unindexed) and replace its UUID; with the - // short-circuit it must skip work entirely. 
dataset - .optimize_indices(&OptimizeOptions::merge(1)) + .optimize_indices(&OptimizeOptions::default()) .await .unwrap(); - let after = dataset.load_indices_by_name("id_idx").await.unwrap(); - assert_eq!(after.len(), 1, "no new segment should be produced"); - assert_eq!( - after[0].uuid, original_uuid, - "no-op optimize must not churn the index UUID" + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + assert!( + dataset + .unindexed_fragments("text_fmindex") + .await + .unwrap() + .is_empty() ); + + let committed = dataset.load_indices_by_name("text_fmindex").await.unwrap(); + assert_eq!(committed.len(), 1); assert_eq!( - dataset.manifest.version, original_version, - "no-op optimize must not advance the dataset version" + committed[0] + .fragment_bitmap + .as_ref() + .expect("FMIndex segment should carry fragment coverage") + .len(), + 2 ); - // The default options also short-circuit (num_to_merge defaults to 1 - // when there is a single old segment). - dataset - .optimize_indices(&OptimizeOptions::default()) - .await - .unwrap(); + let logical = crate::index::scalar_logical::open_named_scalar_index( + &dataset, + "text", + "text_fmindex", + &NoOpMetricsCollector, + ) + .await + .unwrap(); + + for (pattern, expected) in [("old alpha", 1), ("new gamma", 1), ("needle", 2)] { + let query = TextQuery::StringContains(pattern.to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!("expected exact result for {pattern}, got {other:?}"), + }; + let count = row_addrs.true_rows().row_addrs().unwrap().count(); + assert_eq!( + count, expected, + "expected {expected} matches for {pattern}, got {count}" + ); + } + } + + #[tokio::test] + async fn test_optimize_btree_optimize_append() { + async fn query_id_count(dataset: &Dataset, id: &str) -> usize { + dataset + .scan() + .filter(&format!("id = '{}'", id)) + .unwrap() + 
.project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); + let make_batch = |start: i32, end: i32| { + let ids = StringArray::from_iter_values((start..end).map(|i| format!("song-{i}"))); + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap() + }; + + // Start with two fragments + two committed BTree segments. + let reader = RecordBatchIterator::new( + vec![Ok(make_batch(0, 64)), Ok(make_batch(64, 128))], + schema.clone(), + ); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let original_segment_uuids: Vec<_> = { + let mut staged = Vec::new(); + for fragment in dataset.get_fragments() { + let segment = crate::index::create::CreateIndexBuilder::new( + &mut dataset, + &["id"], + IndexType::BTree, + ¶ms, + ) + .name("id_idx".into()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + let uuids = staged.iter().map(|s| s.uuid).collect::>(); + dataset + .commit_existing_index_segments("id_idx", "id", staged) + .await + .unwrap(); + uuids + }; + assert_eq!(original_segment_uuids.len(), 2); + + // Append a third fragment, leave it unindexed, then run append-mode optimize. 
+ let appended = RecordBatchIterator::new(vec![Ok(make_batch(128, 192))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + + // Read fresh from disk to make sure we're inspecting committed state. + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // append() must preserve every original old segment unchanged and add + // exactly one new segment covering only the newly appended fragments. + let committed = dataset.load_indices_by_name("id_idx").await.unwrap(); + let committed_uuids: std::collections::HashSet<_> = + committed.iter().map(|idx| idx.uuid).collect(); + for original in &original_segment_uuids { + assert!( + committed_uuids.contains(original), + "append() must not remove pre-existing segment {original}, \ + but the committed UUIDs are {committed_uuids:?}" + ); + } + assert_eq!( + committed.len(), + original_segment_uuids.len() + 1, + "append() should add exactly one new delta segment, got {committed:?}" + ); + let new_segment = committed + .iter() + .find(|idx| !original_segment_uuids.contains(&idx.uuid)) + .expect("append() must add a new delta segment"); + let new_segment_frags: Vec<_> = new_segment + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect(); + // The appended fragment should be the only one covered by the new delta; + // old segments retain their own coverage. + assert_eq!(new_segment_frags.len(), 1); + + // Sanity check: queries across all fragments still return their rows. 
+ for id in ["song-10", "song-100", "song-160"] { + assert_eq!(query_id_count(&dataset, id).await, 1, "missing row {id}"); + } + } + + #[tokio::test] + async fn test_optimize_bitmap_index_append() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "category", + DataType::Utf8, + false, + )])); + let make_batch = |labels: &[&str]| { + let arr = StringArray::from_iter_values(labels.iter().copied()); + RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap() + }; + + // One fragment + one Bitmap segment. + let reader = + RecordBatchIterator::new(vec![Ok(make_batch(&["a", "b", "a", "c"]))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 4, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("cat_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + let original_uuid = { + let committed = dataset.load_indices_by_name("cat_idx").await.unwrap(); + assert_eq!(committed.len(), 1); + committed[0].uuid + }; + + // Append a second fragment, leave it unindexed, then optimize with + // `append()` (= num_indices_to_merge: Some(0)). 
+ let appended = + RecordBatchIterator::new(vec![Ok(make_batch(&["b", "d", "d", "a"]))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 4, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // append() (= num_indices_to_merge: Some(0)) is now honored uniformly: + // Bitmap, like BTree, must keep the original segment untouched and add + // exactly one delta segment covering only the appended fragment. + let committed = dataset.load_indices_by_name("cat_idx").await.unwrap(); + assert_eq!( + committed.len(), + 2, + "Bitmap optimize append() must add a delta segment, not merge, got {committed:?}" + ); + assert!( + committed.iter().any(|idx| idx.uuid == original_uuid), + "append() must preserve the pre-existing segment {original_uuid}, got {committed:?}" + ); + let new_segment = committed + .iter() + .find(|idx| idx.uuid != original_uuid) + .expect("append() must add a new delta segment"); + let new_segment_frags: std::collections::BTreeSet = new_segment + .fragment_bitmap + .as_ref() + .expect("delta Bitmap should carry fragment coverage") + .iter() + .collect(); + assert_eq!( + new_segment_frags, + [1u32].into_iter().collect(), + "the delta segment must cover only the appended fragment" + ); + + // Data correctness: a value that lives only in the appended fragment + // is queryable through the (now multi-segment) index. 
+ let rows = dataset + .scan() + .filter("category = 'd'") + .unwrap() + .project(&["category"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 2, "value 'd' lives in appended fragment"); + } + + #[tokio::test] + async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { + async fn query_id_count(dataset: &Dataset, id: &str) -> usize { + dataset + .scan() + .filter(&format!("id = '{}'", id)) + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); + let ids = StringArray::from_iter_values((0..256).map(|i| format!("song-{i}"))); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + assert_eq!(query_id_count(&dataset, "song-42").await, 1); + + compact_files( + &mut dataset, + crate::dataset::optimize::CompactionOptions { + target_rows_per_fragment: 512, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let frags = dataset.get_fragments(); + assert!(!frags.is_empty()); + assert!(frags.iter().all(|frag| frag.id() > 0)); + assert!( + dataset + .unindexed_fragments("id_idx") + .await + .unwrap() + .is_empty() + ); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + assert_eq!(query_id_count(&dataset, "song-42").await, 1); + } + + /// Under 
stable row ids, updating an indexed column and then calling + /// `optimize_indices` must not leave stale entries (old value -> updated row) + /// in the scalar index. An update deletes the old copy of each row and + /// rewrites it under the same stable row id, so the old index entry is stale + /// and must be dropped on merge. Covers BTree, Bitmap, and Inverted (FTS), + /// which take three different merge paths. + #[tokio::test] + async fn test_optimize_scalar_index_drops_stale_rows_after_update() { + use crate::dataset::UpdateBuilder; + use arrow_array::Int32Array; + use lance_index::scalar::FullTextSearchQuery; + use lance_index::scalar::inverted::InvertedIndexParams; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + // 100 rows: num == id; cat = "A" for id<50 else "B"; body = "alpha" for + // id<50 else "beta". + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + Field::new("cat", DataType::Utf8, false), + Field::new("body", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "A" } else { "B" }), + )), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "alpha" } else { "beta" }), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["num"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + dataset + .create_index( + &["cat"], + IndexType::Bitmap, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + dataset 
+ .create_index( + &["body"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Update the first 25 rows (id < 25): num -> -1, cat -> 'B', body -> 'beta'. + let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .set("cat", "'B'") + .unwrap() + .set("body", "'beta'") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // BTree: `num >= 0` matches ids 25..99 (75 rows); the 25 updated rows + // hold num = -1 and must not appear. + let btree_count = dataset + .scan() + .filter("num >= 0") + .unwrap() + .count_rows() + .await + .unwrap(); + assert_eq!(btree_count, 75, "btree returned stale/incorrect rows"); + + // Bitmap: only the 25 rows (ids 25..49) that still carry cat = 'A' match; + // the 25 rows updated to 'B' must not. + let bitmap_count = dataset + .scan() + .filter("cat = 'A'") + .unwrap() + .count_rows() + .await + .unwrap(); + assert_eq!(bitmap_count, 25, "bitmap returned stale rows"); + + // FTS: only the 25 rows (ids 25..49) whose body still reads "alpha" match; + // the 25 rows updated to "beta" must not. + let mut scan = dataset.scan(); + scan.full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + let fts_count = scan.count_rows().await.unwrap(); + assert_eq!(fts_count, 25, "FTS index returned stale rows"); + } + + /// Multi-segment variant (Jack Ye's repro, PR #7359): with one BTree segment + /// per fragment, default optimize merges only the tail segment. A stable-row-id + /// update to a row in an older segment's fragment must still drop that + /// segment's stale postings -- the merge has to reach back to cover it. 
+ #[tokio::test] + async fn test_optimize_btree_drops_stale_rows_across_segments_after_update() { + use crate::dataset::UpdateBuilder; + use crate::index::CreateIndexBuilder; + use arrow_array::Int32Array; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + // Two fragments (0..49, 50..99) -> one BTree segment each. + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + let fragments = dataset.get_fragments(); + let mut segments = Vec::new(); + for fragment in &fragments { + segments.push( + CreateIndexBuilder::new(&mut dataset, &["num"], IndexType::BTree, ¶ms) + .name("num_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("num_idx", "num", segments) + .await + .unwrap(); + + // Update the first 25 rows (in the first/older segment's fragment). 
+ let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + assert_eq!( + dataset + .scan() + .filter("num = 0") + .unwrap() + .count_rows() + .await + .unwrap(), + 0, + "stale entry leaked from the older, unmerged segment" + ); + assert_eq!( + dataset + .scan() + .filter("num >= 0") + .unwrap() + .count_rows() + .await + .unwrap(), + 75 + ); + } + + /// Same multi-segment gap for FTS, which takes the separate Inverted dispatch + /// path. One Inverted segment per fragment; an update to the older segment's + /// fragment must not leave its old-token postings searchable. + #[tokio::test] + async fn test_optimize_fts_drops_stale_rows_across_segments_after_update() { + use crate::dataset::UpdateBuilder; + use crate::index::CreateIndexBuilder; + use arrow_array::Int32Array; + use lance_index::scalar::FullTextSearchQuery; + use lance_index::scalar::inverted::InvertedIndexParams; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("body", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "alpha" } else { "beta" }), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = InvertedIndexParams::default(); + let 
fragments = dataset.get_fragments(); + let mut segments = Vec::new(); + for fragment in &fragments { + segments.push( + CreateIndexBuilder::new(&mut dataset, &["body"], IndexType::Inverted, ¶ms) + .name("body_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("body_idx", "body", segments) + .await + .unwrap(); + + // Update the first 25 rows (older segment's fragment): body -> "beta". + let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("body", "'beta'") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + let mut scan = dataset.scan(); + scan.full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + assert_eq!( + scan.count_rows().await.unwrap(), + 25, + "FTS stale rows leaked from the older, unmerged segment" + ); + } + + /// `optimize_indices` builds the stable-row-id allow-list by subtracting each + /// fragment's deletion vector. If a deletion vector cannot be read, the merge + /// must fail loudly: swallowing the error (treating the load as "no + /// deletions") would put every deleted row back into the allow-list and + /// silently reintroduce the stale entries this fix removes. Simulate an + /// unreadable deletion vector by deleting the file the manifest still + /// references, then assert optimize errors instead of succeeding. 
+ #[tokio::test] + async fn test_optimize_errors_when_deletion_vector_unreadable() { + use crate::dataset::UpdateBuilder; + use arrow_array::Int32Array; + use lance_table::io::deletion::deletion_file_path; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["num"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Update rewrites the first 25 rows under the same stable row ids, + // leaving a deletion vector on the original fragment. + UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + + // Reload cold (nothing has cached the deletion vector), then remove the + // deletion file the manifest still references so the next read fails. 
+ let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let mut removed = 0; + for fragment in dataset.get_fragments() { + if let Some(deletion_file) = fragment.metadata().deletion_file.clone() { + let path = + deletion_file_path(&dataset.base, fragment.metadata().id, &deletion_file); + dataset.object_store.delete(&path).await.unwrap(); + removed += 1; + } + } + assert_eq!( + removed, 1, + "update should have left exactly one deletion file" + ); + + let result = dataset.optimize_indices(&OptimizeOptions::default()).await; + assert!( + result.is_err(), + "optimize must fail when a deletion vector cannot be read, not \ + silently keep the deleted rows in the index" + ); + } + + #[tokio::test] + async fn test_optimize_scalar_no_unindexed_fragments() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); + let ids = StringArray::from_iter_values((0..32).map(|i| format!("song-{i}"))); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let before = dataset.load_indices_by_name("id_idx").await.unwrap(); + assert_eq!(before.len(), 1); + let original_uuid = before[0].uuid; + let original_version = dataset.manifest.version; + + // `merge(1)` would historically rebuild the single existing segment + // (steady state, nothing unindexed) and replace its UUID; with the + // short-circuit it must skip work entirely. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(1)) + .await + .unwrap(); + + let after = dataset.load_indices_by_name("id_idx").await.unwrap(); + assert_eq!(after.len(), 1, "no new segment should be produced"); + assert_eq!( + after[0].uuid, original_uuid, + "no-op optimize must not churn the index UUID" + ); + assert_eq!( + dataset.manifest.version, original_version, + "no-op optimize must not advance the dataset version" + ); + + // The default options also short-circuit (num_to_merge defaults to 1 + // when there is a single old segment). + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); let after_default = dataset.load_indices_by_name("id_idx").await.unwrap(); assert_eq!(after_default[0].uuid, original_uuid); assert_eq!(dataset.manifest.version, original_version); } + + #[rstest] + #[case::address_row_ids(false)] + #[case::stable_row_ids(true)] + #[tokio::test] + async fn test_optimize_btree_no_duplicate_row_addr(#[case] use_stable_row_ids: bool) { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("payload", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![10])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + enable_stable_row_ids: use_stable_row_ids, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + + // Reordered source columns (payload, id) force the partial-schema + // RewriteColumns path instead of a full row rewrite. 
+ let source_schema = Arc::new(Schema::new(vec![ + Field::new("payload", DataType::Int32, false), + Field::new("id", DataType::Int32, false), + ])); + let source_batch = RecordBatch::try_new( + source_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![100])), + Arc::new(Int32Array::from(vec![1])), + ], + ) + .unwrap(); + let merge_job = + MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .try_build() + .unwrap(); + let source_reader = Box::new(RecordBatchIterator::new( + [Ok(source_batch)], + source_schema.clone(), + )); + merge_job + .execute(reader_to_stream(source_reader)) + .await + .unwrap(); + + // Build a delta BTree segment over the now-unindexed fragment. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 2, + "append must create a delta segment over the rewritten fragment" + ); + + // Force the old segment + delta segment to merge. + dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let rows = dataset + .scan() + .filter("id = 1") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 1, "id = 1 must return exactly one row after merge"); + } + + #[tokio::test] + async fn test_optimize_btree_merge_remaps_deferred_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let make = |range: std::ops::Range| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(range))], + ) + .unwrap() + }; + + // Two fragments: [0, 50) and [50, 100). 
+ let reader = + RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + + // Deferred-remap compaction fuses the two fragments into one and leaves a + // pending FragReuseIndex; the index segment is not eagerly remapped. + compact_files( + &mut dataset, + CompactionOptions { + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + // Append a third fragment, left unindexed. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .append( + RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()), + None, + ) + .await + .unwrap(); + + // Merge the deferred-remapped old segment with the new delta. + dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + // A value from the compacted fragments must still be found via the index. 
+ let hit = dataset + .scan() + .filter("id = 25") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!( + hit, 1, + "compacted-then-merged row must remain queryable via the index" + ); + let total = dataset + .scan() + .filter("id >= 0") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(total, 150, "no rows may be lost across compaction + merge"); + } } diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index af7ea7ce19c..bbb055463dc 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -10,7 +10,7 @@ use crate::{ index::{ DatasetIndexExt, DatasetIndexInternalExt, IntoIndexSegment, build_index_metadata_from_segments, - scalar::build_scalar_index, + scalar::{build_bitmap_index_segment, build_scalar_index}, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, @@ -53,7 +53,7 @@ pub struct CreateIndexBuilder<'a> { replace: bool, train: bool, fragments: Option>, - index_uuid: Option, + index_uuid: Option, preprocessed_data: Option>, progress: Arc, /// Transaction properties to store with this commit. 
@@ -103,7 +103,7 @@ impl<'a> CreateIndexBuilder<'a> { self } - pub fn index_uuid(mut self, uuid: String) -> Self { + pub fn index_uuid(mut self, uuid: Uuid) -> Self { self.index_uuid = Some(uuid); self } @@ -207,14 +207,10 @@ impl<'a> CreateIndexBuilder<'a> { self.index_type, self.params, self.fragments.as_ref(), - self.index_uuid.as_deref(), + self.index_uuid.as_ref(), )?; - let index_id = match &self.index_uuid { - Some(uuid_str) => Uuid::parse_str(uuid_str) - .map_err(|e| Error::index(format!("Invalid UUID string provided: {}", e)))?, - None => Uuid::new_v4(), - }; + let index_id = self.index_uuid.unwrap_or_else(Uuid::new_v4); let mut output_index_uuid = index_id; let created_index = match (self.index_type, self.params.index_name()) { ( @@ -222,6 +218,7 @@ impl<'a> CreateIndexBuilder<'a> { | IndexType::BTree | IndexType::Inverted | IndexType::NGram + | IndexType::Fm | IndexType::ZoneMap | IndexType::BloomFilter | IndexType::LabelList @@ -258,17 +255,44 @@ impl<'a> CreateIndexBuilder<'a> { .preprocessed_data .take() .map(|reader| lance_datafusion::utils::reader_to_stream(Box::new(reader))); - build_scalar_index( - self.dataset, - column, - &index_id.to_string(), - ¶ms, - train, - self.fragments.clone(), - preprocesssed_data, - self.progress.clone(), - ) - .await? + if self.index_type == IndexType::Bitmap && self.fragments.is_some() { + if !train { + return Err(Error::invalid_input( + "canonical bitmap segment build requires train=true".to_string(), + )); + } + if preprocesssed_data.is_some() { + return Err(Error::invalid_input( + "canonical bitmap segment build does not accept preprocessed data" + .to_string(), + )); + } + let fragments = self.fragments.clone().ok_or_else(|| { + Error::invalid_input( + "canonical bitmap segment build requires fragment ids".to_string(), + ) + })?; + build_bitmap_index_segment( + self.dataset, + column, + index_id, + fragments, + self.progress.clone(), + ) + .await? 
+ } else { + build_scalar_index( + self.dataset, + column, + index_id, + ¶ms, + train, + self.fragments.clone(), + preprocesssed_data, + self.progress.clone(), + ) + .await? + } } (IndexType::Scalar, LANCE_SCALAR_INDEX) => { // Guess the index type @@ -282,7 +306,7 @@ impl<'a> CreateIndexBuilder<'a> { build_scalar_index( self.dataset, column, - &index_id.to_string(), + index_id, params, train, self.fragments.clone(), @@ -308,7 +332,7 @@ impl<'a> CreateIndexBuilder<'a> { build_scalar_index( self.dataset, column, - &index_id.to_string(), + index_id, ¶ms, train, self.fragments.clone(), @@ -338,16 +362,16 @@ impl<'a> CreateIndexBuilder<'a> { })?; let index_version = vec_params.index_type().version() as u32; - if train { + let files = if train { // Check if this is distributed indexing (fragment-level) if let Some(fragments) = &self.fragments { // For distributed indexing, build only on specified fragments // This creates temporary index metadata without committing - let segment_uuid = Box::pin(build_distributed_vector_index( + let (segment_uuid, files) = Box::pin(build_distributed_vector_index( self.dataset, column, &index_name, - &index_id.to_string(), + index_id, vec_params, fri, fragments, @@ -355,18 +379,19 @@ impl<'a> CreateIndexBuilder<'a> { )) .await?; output_index_uuid = segment_uuid; + files } else { // Standard full dataset indexing Box::pin(build_vector_index( self.dataset, column, &index_name, - &index_id.to_string(), + index_id, vec_params, fri, self.progress.clone(), )) - .await?; + .await? } } else { // Create empty vector index @@ -374,22 +399,15 @@ impl<'a> CreateIndexBuilder<'a> { self.dataset, column, &index_name, - &index_id.to_string(), + index_id, vec_params, ) - .await?; - } - // Capture file sizes after vector index creation - let index_dir = self - .dataset - .indices_dir() - .join(output_index_uuid.to_string()); - let files = - list_index_files_with_sizes(&self.dataset.object_store, &index_dir).await?; + .await? 
+ }; CreatedIndex { index_details: vector_index_details(vec_params), index_version, - files: Some(files), + files, } } // Can't use if let Some(...) here because it's not stable yet. @@ -416,7 +434,7 @@ impl<'a> CreateIndexBuilder<'a> { ))?; if train { - ext.create_index(self.dataset, column, &index_id.to_string(), self.params) + ext.create_index(self.dataset, column, &index_id, self.params) .await?; } else { todo!("create empty vector index when train=false"); @@ -428,7 +446,7 @@ impl<'a> CreateIndexBuilder<'a> { CreatedIndex { index_details: vector_index_details_default(), index_version: self.index_type.version() as u32, - files: Some(files), + files, } } (IndexType::FragmentReuse, _) => { @@ -461,12 +479,20 @@ impl<'a> CreateIndexBuilder<'a> { index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), }) } #[instrument(skip_all)] async fn execute(mut self) -> Result { + // Multi-segment FM-Index path: when num_segments > 1, build one segment + // per fragment group and commit them all atomically. 
+ if let Some(num_segments) = self.fmindex_num_segments() + && num_segments > 1 + { + return self.execute_multi_segment_fmindex(num_segments).await; + } + let new_idx = self.execute_uncommitted().await?; let index_uuid = new_idx.uuid; let removed_indices = if self.replace { @@ -480,23 +506,12 @@ impl<'a> CreateIndexBuilder<'a> { } else { vec![] }; - let transaction = if uses_segment_commit_path(self.index_type, &new_idx.name, self.params) { - let field_id = *new_idx.fields.first().ok_or_else(|| { - Error::internal(format!( - "Index '{}' is missing field ids after build", - new_idx.name - )) - })?; - let index_name = new_idx.name.clone(); + let transaction = if uses_segment_commit_path(self.index_type, self.params) { let dataset_version = new_idx.dataset_version; - let segments = vec![new_idx.into_index_segment()?]; - let new_indices = - build_index_metadata_from_segments(self.dataset, &index_name, field_id, segments) - .await?; TransactionBuilder::new( dataset_version, Operation::CreateIndex { - new_indices, + new_indices: vec![new_idx], removed_indices, }, ) @@ -532,6 +547,225 @@ impl<'a> CreateIndexBuilder<'a> { )) }) } + /// Extract `num_segments` from FM-Index params if this is an FM-Index build. + fn fmindex_num_segments(&self) -> Option { + if self.index_type != IndexType::Fm { + return None; + } + let scalar_params = self.params.as_any().downcast_ref::()?; + let params_json = scalar_params.params.as_deref()?; + let json: serde_json::Value = serde_json::from_str(params_json).ok()?; + json.get("num_segments")?.as_u64().map(|n| n as u32) + } + + /// Build FM-Index with multiple segments, each covering a subset of fragments. 
+ async fn execute_multi_segment_fmindex(&mut self, num_segments: u32) -> Result { + // Validate column count: same check as execute_uncommitted + if self.columns.len() != 1 { + return Err(Error::index( + "Only support building index on 1 column at the moment".to_string(), + )); + } + + let column_input = &self.columns[0]; + let Some(field_path) = self.dataset.schema().resolve_case_insensitive(column_input) else { + return Err(Error::index(format!( + "CreateIndex: column '{column_input}' does not exist" + ))); + }; + let field = *field_path.last().unwrap(); + let names: Vec<&str> = field_path.iter().map(|f| f.name.as_str()).collect(); + let column = format_field_path(&names); + + let train = if self.train { + self.dataset.count_rows(None).await? > 0 + } else { + false + }; + + let indices = self.dataset.load_indices().await?; + let index_name = if let Some(name) = self.name.take() { + name + } else { + let column_path = default_index_name(&names); + let base_name = format!("{column_path}_idx"); + let mut candidate = base_name.clone(); + let mut counter = 2; + while indices + .iter() + .any(|idx| idx.name == candidate && idx.fields != [field.id]) + { + candidate = format!("{base_name}_{counter}"); + counter += 1; + } + candidate + }; + let existing_named_indices = indices + .iter() + .filter(|idx| idx.name == index_name) + .collect::>(); + if existing_named_indices + .iter() + .any(|idx| idx.fields != [field.id]) + { + return Err(Error::index(format!( + "Index name '{index_name}' already exists with different fields, \ + please specify a different name" + ))); + } + if !existing_named_indices.is_empty() && !self.replace { + return Err(Error::index(format!( + "Index name '{index_name}' already exists, \ + please specify a different name or use replace=True" + ))); + } + + let all_fragment_ids: Vec = self.dataset.fragment_bitmap.as_ref().iter().collect(); + if !train || all_fragment_ids.is_empty() { + let segment_uuid = Uuid::new_v4(); + let created_index = 
build_scalar_index( + self.dataset, + &column, + segment_uuid, + &ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Fm), + false, + None, + None, + self.progress.clone(), + ) + .await?; + let metadata = IndexMetadata { + uuid: segment_uuid, + name: index_name.clone(), + fields: vec![field.id], + dataset_version: self.dataset.manifest.version, + fragment_bitmap: Some(roaring::RoaringBitmap::new()), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + }; + let segments = vec![metadata.into_index_segment()?]; + let new_indices = + build_index_metadata_from_segments(self.dataset, &index_name, field.id, segments) + .await?; + + // Collect all same-name indices for removal when replace is set + let removed_indices = if self.replace { + existing_named_indices + .into_iter() + .cloned() + .collect::>() + } else { + vec![] + }; + + let transaction = TransactionBuilder::new( + self.dataset.manifest.version, + Operation::CreateIndex { + new_indices, + removed_indices, + }, + ) + .transaction_properties(self.transaction_properties.clone()) + .build(); + + self.dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await?; + + let indices = self.dataset.load_indices_by_name(&index_name).await?; + return indices.into_iter().next().ok_or_else(|| { + Error::internal(format!( + "FM-Index segments for '{}' not found after commit", + index_name + )) + }); + } + + let num_segments = (num_segments as usize).min(all_fragment_ids.len()).max(1); + let chunk_size = all_fragment_ids.len().div_ceil(num_segments); + + let mut segment_metadatas = Vec::with_capacity(num_segments); + for chunk in all_fragment_ids.chunks(chunk_size) { + let fragment_ids = chunk.to_vec(); + let segment_uuid = Uuid::new_v4(); + let created_index = build_scalar_index( + self.dataset, + &column, + segment_uuid, + 
&ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Fm), + true, + Some(fragment_ids.clone()), + None, + self.progress.clone(), + ) + .await?; + + segment_metadatas.push(IndexMetadata { + uuid: segment_uuid, + name: index_name.clone(), + fields: vec![field.id], + dataset_version: self.dataset.manifest.version, + fragment_bitmap: Some(fragment_ids.into_iter().collect()), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + }); + } + + // Convert to IndexSegments and build proper transaction metadata + let segments = segment_metadatas + .into_iter() + .map(IntoIndexSegment::into_index_segment) + .collect::>>()?; + let new_indices = + build_index_metadata_from_segments(self.dataset, &index_name, field.id, segments) + .await?; + + // Collect all same-name indices for removal when replace is set, + // matching the standard execute() path behavior. + let removed_indices = if self.replace { + existing_named_indices + .into_iter() + .cloned() + .collect::>() + } else { + vec![] + }; + + let transaction = TransactionBuilder::new( + self.dataset.manifest.version, + Operation::CreateIndex { + new_indices, + removed_indices, + }, + ) + .transaction_properties(self.transaction_properties.clone()) + .build(); + + self.dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await?; + + let indices = self.dataset.load_indices_by_name(&index_name).await?; + indices.into_iter().next().ok_or_else(|| { + Error::internal(format!( + "FM-Index segments for '{}' not found after commit", + index_name + )) + }) + } +} + +fn is_btree_scalar_params(params: &dyn IndexParams) -> bool { + params + .as_any() + .downcast_ref::() + .is_some_and(|p| p.index_type.eq_ignore_ascii_case("btree")) } /// Validate that a user-supplied `index_uuid` is permitted for this build. 
@@ -539,7 +773,7 @@ fn ensure_index_uuid_allowed( index_type: IndexType, params: &dyn IndexParams, fragments: Option<&Vec>, - index_uuid: Option<&str>, + index_uuid: Option<&Uuid>, ) -> Result<()> { let is_btree = index_type == IndexType::BTree || params @@ -560,26 +794,35 @@ fn ensure_index_uuid_allowed( Ok(()) } -fn uses_segment_commit_path( - index_type: IndexType, - index_name: &str, - params: &dyn IndexParams, -) -> bool { - if index_name != LANCE_VECTOR_INDEX { - return false; +fn uses_segment_commit_path(index_type: IndexType, params: &dyn IndexParams) -> bool { + let params_family = params.index_name(); + + if params_family == LANCE_VECTOR_INDEX + && matches!( + index_type, + IndexType::Vector + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfFlat + | IndexType::IvfRq + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq + ) + && params.as_any().is::() + { + return true; + } + + if params_family == LANCE_SCALAR_INDEX { + match index_type { + IndexType::BTree => return true, + IndexType::Scalar if is_btree_scalar_params(params) => return true, + _ => {} + } } - matches!( - index_type, - IndexType::Vector - | IndexType::IvfPq - | IndexType::IvfSq - | IndexType::IvfFlat - | IndexType::IvfRq - | IndexType::IvfHnswFlat - | IndexType::IvfHnswPq - | IndexType::IvfHnswSq - ) && params.as_any().is::() + false } impl<'a> IntoFuture for CreateIndexBuilder<'a> { @@ -615,7 +858,6 @@ mod tests { use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_linalg::distance::{DistanceType, MetricType}; - use serde_json::json; use std::{collections::BTreeSet, ops::Bound, sync::Arc}; use uuid::Uuid; @@ -1014,7 +1256,7 @@ mod tests { let params = InvertedIndexParams::default(); let fragments = dataset.get_fragments(); let fragment_ids: Vec = fragments.iter().map(|f| f.id() as u32).collect(); - let shared_uuid = Uuid::new_v4().to_string(); + let shared_uuid = Uuid::new_v4(); let 
build_progress = Arc::new(RecordingProgress::default()); for &fragment_id in &fragment_ids { @@ -1022,11 +1264,11 @@ mod tests { CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Inverted, ¶ms) .name("distributed_index".to_string()) .fragments(vec![fragment_id]) - .index_uuid(shared_uuid.clone()) + .index_uuid(shared_uuid) .progress(build_progress.clone()); let index_metadata = builder.execute_uncommitted().await.unwrap(); - assert_eq!(index_metadata.uuid.to_string(), shared_uuid); + assert_eq!(index_metadata.uuid, shared_uuid); assert_eq!(index_metadata.name, "distributed_index"); let fragment_bitmap = index_metadata.fragment_bitmap.as_ref().unwrap(); @@ -1135,7 +1377,7 @@ mod tests { let err = dataset .merge_index_metadata( - &Uuid::new_v4().to_string(), + &Uuid::new_v4(), IndexType::BTree, None, Arc::new(NoopIndexBuildProgress), @@ -1286,7 +1528,7 @@ mod tests { let err = CreateIndexBuilder::new(&mut dataset, &["value"], index_type, ¶ms) .name("value_btree_segments".to_string()) .fragments(vec![fragment_id]) - .index_uuid(Uuid::new_v4().to_string()) + .index_uuid(Uuid::new_v4()) .execute_uncommitted() .await .unwrap_err(); @@ -1389,10 +1631,8 @@ mod tests { } #[tokio::test] - async fn test_distributed_build_bitmap() { - use datafusion::common::ScalarValue; - use lance_index::scalar::{SargableQuery, SearchResult, bitmap::BITMAP_LOOKUP_NAME}; - use lance_select::RowSetOps; + async fn test_bitmap_execute_uncommitted_writes_canonical_segment() { + use lance_index::scalar::bitmap::BITMAP_LOOKUP_NAME; let tmpdir = TempStrDir::default(); let dataset_uri = format!("file://{}", tmpdir.as_str()); @@ -1432,69 +1672,15 @@ mod tests { ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap); let fragments = dataset.get_fragments(); let fragment_ids: Vec = fragments.iter().map(|f| f.id() as u32).collect(); - let shared_uuid = Uuid::new_v4().to_string(); - let mut shard_metadata = None; - let shard_groups = fragment_ids.chunks(2).collect::>(); - 
- for (shard_id, fragment_group) in shard_groups.iter().enumerate() { - let params = base_params - .clone() - .with_params(&json!({ "shard_id": shard_id as u32 })); - let index_metadata = - CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, ¶ms) - .name("distributed_bitmap".to_string()) - .fragments(fragment_group.to_vec()) - .index_uuid(shared_uuid.clone()) - .execute_uncommitted() - .await - .unwrap(); - if shard_metadata.is_none() { - shard_metadata = Some(index_metadata); - } - } - - dataset - .merge_index_metadata( - &shared_uuid, - IndexType::Bitmap, - None, - Arc::new(NoopIndexBuildProgress), - ) - .await - .unwrap(); - - let mut committed_index_metadata = shard_metadata.unwrap(); - committed_index_metadata.fragment_bitmap = Some(fragment_ids.iter().copied().collect()); - committed_index_metadata.files = Some( - list_index_files_with_sizes( - dataset.object_store.as_ref(), - &dataset.indices_dir().clone().join(shared_uuid.clone()), - ) - .await - .unwrap(), - ); - committed_index_metadata.dataset_version = dataset.manifest.version; - - let transaction = TransactionBuilder::new( - dataset.manifest.version, - Operation::CreateIndex { - new_indices: vec![committed_index_metadata], - removed_indices: vec![], - }, - ) - .build(); - dataset - .apply_commit(transaction, &Default::default(), &Default::default()) - .await - .unwrap(); + let selected_fragments = fragment_ids[..2].to_vec(); + let index = + CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, &base_params) + .name("bitmap_segment".to_string()) + .fragments(selected_fragments.clone()) + .execute_uncommitted() + .await + .unwrap(); - let dataset = Dataset::open(&dataset_uri).await.unwrap(); - let indices = dataset - .load_indices_by_name("distributed_bitmap") - .await - .unwrap(); - assert_eq!(indices.len(), 1); - let index = &indices[0]; assert_eq!( index .fragment_bitmap @@ -1502,37 +1688,15 @@ mod tests { .unwrap() .iter() .collect::>(), - fragment_ids + 
selected_fragments ); let files = index.files.as_ref().unwrap(); assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); assert!( files.iter().all(|file| !file.path.starts_with("part_")), - "committed bitmap index should only reference merged files" + "staged bitmap segment should only reference canonical files" ); - - let scalar_index = crate::index::scalar::open_scalar_index( - &dataset, - "category", - index, - &NoOpMetricsCollector, - ) - .await - .unwrap(); - assert_eq!(scalar_index.index_type(), IndexType::Bitmap); - - let query_result = scalar_index - .search( - &SargableQuery::Equals(ScalarValue::Int32(Some(2))), - &NoOpMetricsCollector, - ) - .await - .unwrap(); - let SearchResult::Exact(query_rows) = query_result else { - panic!("expected exact bitmap result"); - }; - assert_eq!(query_rows.true_rows().len(), Some(2)); } #[tokio::test] @@ -1829,6 +1993,23 @@ mod tests { let segments = input_segments.clone(); assert_eq!(segments.len(), input_segments.len()); + crate::index::scalar::inverted::finalize_segment_files_if_needed( + &dataset, + &input_segments[0], + ) + .await + .unwrap(); + let stale_staging_path = dataset + .indices_dir() + .join(input_segments[0].uuid.to_string()) + .join("staging") + .join("orphan.lance"); + dataset + .object_store + .put(&stale_staging_path, b"stale") + .await + .unwrap(); + dataset .commit_existing_index_segments("text_idx", "text", segments) .await @@ -1852,6 +2033,19 @@ mod tests { let indices = dataset.load_indices_by_name("text_idx").await.unwrap(); assert_eq!(indices.len(), input_segments.len()); + let finalized_segment = indices + .iter() + .find(|index| index.uuid == input_segments[0].uuid) + .expect("finalized segment should be committed"); + assert!( + finalized_segment + .files + .as_ref() + .expect("committed segment should track files") + .iter() + .all(|file| !file.path.starts_with("staging/")), + "stale staging files must not be committed in IndexMetadata.files" + ); } #[tokio::test] @@ -1933,6 
+2127,143 @@ mod tests { assert_eq!(results.num_rows(), 20); } + #[tokio::test] + async fn test_btree_merge_existing_index_segments() { + use datafusion::common::ScalarValue; + use lance_index::scalar::{SargableQuery, SearchResult}; + use std::ops::Bound; + + // Open `segment` and count rows whose `id` falls in `[lo, hi)`. + async fn count_in_range( + dataset: &Dataset, + segment: &IndexMetadata, + lo: i32, + hi: i32, + ) -> usize { + let field_path = dataset.schema().field_path(segment.fields[0]).unwrap(); + let index = crate::index::scalar::open_scalar_index( + dataset, + &field_path, + segment, + &NoOpMetricsCollector, + ) + .await + .unwrap(); + let query = SargableQuery::Range( + Bound::Included(ScalarValue::Int32(Some(lo))), + Bound::Excluded(ScalarValue::Int32(Some(hi))), + ); + match index.search(&query, &NoOpMetricsCollector).await.unwrap() { + SearchResult::Exact(row_addrs) => { + row_addrs.true_rows().row_addrs().unwrap().count() + } + other => panic!("expected exact result, got {other:?}"), + } + } + + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + // 128 rows across two 64-row fragments. Stable row ids so the + // retired-fragment filter below exercises the exact row-id allow-list. + let reader = gen_batch() + .col("id", lance_datagen::array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(64), + lance_datagen::BatchCount::from(2), + ); + let mut dataset = Dataset::write( + reader, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // One staged BTree segment per fragment, committed as a multi-segment + // logical index. 
+ let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let mut staged = Vec::new(); + for fragment in dataset.get_fragments() { + staged.push( + CreateIndexBuilder::new(&mut dataset, &["id"], IndexType::BTree, ¶ms) + .name("id_btree".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("id_btree", "id", staged) + .await + .unwrap(); + + // Phase 1 — healthy merge: the two per-fragment segments consolidate + // into a single canonical segment covering both fragments, and a range + // spanning both (ids 50..100) returns every matching row. + let merged = dataset + .merge_existing_index_segments(dataset.load_indices_by_name("id_btree").await.unwrap()) + .await + .unwrap(); + assert_eq!( + merged.fragment_bitmap.as_ref().unwrap(), + &roaring::RoaringBitmap::from_iter([0u32, 1]) + ); + assert!( + merged + .index_details + .as_ref() + .unwrap() + .type_url + .ends_with("BTreeIndexDetails") + ); + assert_eq!(count_in_range(&dataset, &merged, 50, 100).await, 50); + + // Phase 2 — retire fragment 0: delete >10% of its rows so compaction + // rewrites only frag 0 (frag 1 has no deletions and is at target size). + // The committed per-fragment segment now claims a fragment the dataset + // no longer has. 
+ dataset.delete("id < 16").await.unwrap(); + crate::dataset::optimize::compact_files( + &mut dataset, + crate::dataset::optimize::CompactionOptions { + target_rows_per_fragment: 64, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + let live_frags: roaring::RoaringBitmap = dataset + .get_fragments() + .iter() + .map(|f| f.id() as u32) + .collect(); + assert!(!live_frags.contains(0), "compaction should retire frag 0"); + + // Filtered merge: coverage drops the retired fragment but keeps the + // live one, and the merged page data does not leak the retired row ids + // (ids < 16 lived only in frag 0, so the range now returns nothing). + let merged = dataset + .merge_existing_index_segments(dataset.load_indices_by_name("id_btree").await.unwrap()) + .await + .unwrap(); + let coverage = merged.fragment_bitmap.as_ref().unwrap(); + assert!(!coverage.contains(0), "must drop retired frag 0"); + assert!(coverage.contains(1), "must keep live frag 1"); + assert_eq!( + count_in_range(&dataset, &merged, 0, 16).await, + 0, + "must filter retired-fragment row ids" + ); + } + #[tokio::test] async fn test_commit_existing_index_supports_local_hnsw_segments() { let tmpdir = TempStrDir::default(); @@ -1969,7 +2300,7 @@ mod tests { CreateIndexBuilder::new(&mut dataset, &["vector"], IndexType::Vector, ¶ms) .name("vector_idx".to_string()) - .index_uuid(uuid.to_string()) + .index_uuid(uuid) .execute_uncommitted() .await .unwrap(); @@ -2194,39 +2525,38 @@ mod tests { // Load indices after optimization let indices_after = dataset.load_indices().await.unwrap(); - // There should be 3 indices: - // 1. one scalar index with name "id_idx", and the bitmap is [0,1] - // 2. one delta vector index with name "vector_idx", and the bitmap is [0] - // 3. 
one delta vector index with name "vector_idx", and the bitmap is [1] - assert_eq!(indices_after.len(), 3, "{:?}", indices_after); - let id_idx = indices_after + // After unifying scalar optimize, `OptimizeOptions::append()` honors + // `Some(0)` for BTree the same way it does for vector: keep the old + // segment, add a delta for the unindexed fragment. So we now expect: + // 1. id_idx old segment, bitmap [0] + // 2. id_idx delta segment, bitmap [1] + // 3. vector_idx old segment, bitmap [0] + // 4. vector_idx delta segment, bitmap [1] + // Previously BTree silently merged into 1 segment because legacy + // scalar ignored `num_indices_to_merge`. + assert_eq!(indices_after.len(), 4, "{:?}", indices_after); + let id_indices = indices_after .iter() - .find(|idx| idx.name == "id_idx") - .unwrap(); + .filter(|idx| idx.name == "id_idx") + .collect::>(); let vector_indices = indices_after .iter() .filter(|idx| idx.name == "vector_idx") .collect::>(); - assert!( - id_idx - .fragment_bitmap - .as_ref() - .unwrap() - .contains_range(0..2) - && id_idx.fragment_bitmap.as_ref().unwrap().len() == 2 - ); - assert_eq!(vector_indices.len(), 2); - assert!( - vector_indices - .iter() - .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(0) - && idx.fragment_bitmap.as_ref().unwrap().len() == 1) - ); - assert!( - vector_indices - .iter() - .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(1) - && idx.fragment_bitmap.as_ref().unwrap().len() == 1) - ); + for indices in [&id_indices, &vector_indices] { + assert_eq!(indices.len(), 2); + assert!( + indices + .iter() + .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(0) + && idx.fragment_bitmap.as_ref().unwrap().len() == 1) + ); + assert!( + indices + .iter() + .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(1) + && idx.fragment_bitmap.as_ref().unwrap().len() == 1) + ); + } } } diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 18c218ef4f7..ae2478589fb 100644 --- 
a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -4,12 +4,20 @@ //! Utilities for integrating scalar indices with datasets //! +pub(crate) mod bitmap; +pub(crate) mod btree; +pub(crate) mod fmindex; pub(crate) mod inverted; +pub(crate) mod zonemap; pub use inverted::{load_segment_details, load_segments}; +pub use crate::index::scalar_logical::{LogicalScalarIndex, load_named_scalar_segments}; + use std::sync::{Arc, LazyLock}; +use uuid::Uuid; + use crate::index::DatasetIndexExt; use crate::index::DatasetIndexInternalExt; use crate::session::index_caches::ProstAny; @@ -40,7 +48,7 @@ use lance_index::scalar::label_list::{ use lance_index::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, VALUE_COLUMN_NAME, }; -use lance_index::scalar::{CreatedIndex, InvertedIndexParams}; +use lance_index::scalar::{BuiltinIndexType, CreatedIndex, InvertedIndexParams}; use lance_index::scalar::{ ScalarIndex, ScalarIndexParams, bitmap::BITMAP_LOOKUP_NAME, inverted::INVERT_LIST_FILE, lance_format::LanceIndexStore, @@ -270,7 +278,7 @@ impl IndexDetails { pub(super) async fn build_scalar_index( dataset: &Dataset, column: &str, - uuid: &str, + uuid: Uuid, params: &ScalarIndexParams, train: bool, fragment_ids: Option>, @@ -285,7 +293,7 @@ pub(super) async fn build_scalar_index( ))?; let field: arrow_schema::Field = field.into(); - let index_store = LanceIndexStore::from_dataset_for_new(dataset, uuid)?; + let index_store = LanceIndexStore::from_dataset_for_new(dataset, &uuid)?; let plugin = SCALAR_INDEX_PLUGIN_REGISTRY.get_plugin_by_name(¶ms.index_type)?; let training_request = @@ -321,6 +329,51 @@ pub(super) async fn build_scalar_index( Ok(created_index) } +/// Build a canonical bitmap index segment over a caller-selected fragment set. +/// +/// This is intentionally separate from `build_scalar_index(..., fragment_ids=Some(...))`. +/// The latter is the legacy distributed scalar-index shard path. 
Here fragment ids only +/// restrict the scanned rows; the bitmap plugin receives no shard id and writes the +/// canonical bitmap layout for the staged segment root. +#[instrument(level = "debug", skip_all)] +pub(super) async fn build_bitmap_index_segment( + dataset: &Dataset, + column: &str, + uuid: Uuid, + fragment_ids: Vec, + progress: Arc, +) -> Result { + let field = dataset + .schema() + .field(column) + .ok_or(Error::invalid_input_source( + format!("No column with name {}", column).into(), + ))?; + let field: arrow_schema::Field = field.into(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + let plugin = SCALAR_INDEX_PLUGIN_REGISTRY.get_plugin_by_name(¶ms.index_type)?; + let training_request = + plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?; + let criteria = training_request.criteria(); + + progress.stage_start("load_data", None, "rows").await?; + let training_data = + load_training_data(dataset, column, criteria, None, true, Some(fragment_ids)).await?; + progress.stage_complete("load_data").await?; + + let index_store = LanceIndexStore::from_dataset_for_new(dataset, &uuid)?; + plugin + .train_index( + training_data, + &index_store, + training_request, + None, + progress, + ) + .await +} + /// Fetches the scalar index plugin for a given index metadata /// /// The fast path, on newer datasets, is just a plugin lookup by the type URL of the index details. 
@@ -383,7 +436,7 @@ pub async fn open_scalar_index( index: &IndexMetadata, metrics: &dyn MetricsCollector, ) -> Result> { - let uuid_str = index.uuid.to_string(); + let index_uuid = index.uuid; let index_store = Arc::new(LanceIndexStore::from_dataset_for_existing(dataset, index).await?); let index_details = fetch_index_details(dataset, column, index).await?; @@ -393,7 +446,7 @@ pub async fn open_scalar_index( let index_cache = dataset .index_cache - .for_index(&uuid_str, frag_reuse_index.as_ref().map(|f| &f.uuid)); + .for_index(&index.uuid, frag_reuse_index.as_ref().map(|f| &f.uuid)); if let Some(index) = plugin .get_from_cache(index_store.clone(), frag_reuse_index.clone(), &index_cache) @@ -413,7 +466,7 @@ pub async fn open_scalar_index( .load_index(index_store, &index_details, frag_reuse_index, &index_cache) .await?; - tracing::info!(target: TRACE_IO_EVENTS, index_uuid = uuid_str, r#type = IO_TYPE_OPEN_SCALAR, index_type = index.index_type().to_string()); + tracing::info!(target: TRACE_IO_EVENTS, index_uuid = %index_uuid, r#type = IO_TYPE_OPEN_SCALAR, index_type = index.index_type().to_string()); metrics.record_index_load(); plugin.put_in_cache(&index_cache, index.clone()).await?; @@ -425,13 +478,14 @@ pub(crate) async fn infer_scalar_index_details( column: &str, index: &IndexMetadata, ) -> Result> { - let uuid = index.uuid.to_string(); - let type_key = crate::session::index_caches::ScalarIndexDetailsKey { uuid: &uuid }; + let type_key = crate::session::index_caches::ScalarIndexDetailsKey { uuid: &index.uuid }; if let Some(index_details) = dataset.index_cache.get_with_key(&type_key).await { return Ok(index_details.0.clone()); } - let index_dir = dataset.indice_files_dir(index)?.join(uuid.clone()); + let index_dir = dataset + .indice_files_dir(index)? 
+ .join(index.uuid.to_string()); let col = dataset .schema() .field(column) @@ -564,11 +618,7 @@ pub async fn initialize_scalar_index( let column_name = field_names[0]; let source_scalar_index = source_dataset - .open_scalar_index( - column_name, - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index(column_name, &source_index.uuid, &NoOpMetricsCollector) .await?; let params = source_scalar_index.derive_index_params()?; @@ -1111,11 +1161,7 @@ mod tests { // Verify the index type is correct let target_scalar_index = target_dataset - .open_scalar_index( - "id", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index("id", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -1189,7 +1235,7 @@ mod tests { // Verify the index type is correct let scalar_index = dataset - .open_scalar_index("id", &indices[0].uuid.to_string(), &NoOpMetricsCollector) + .open_scalar_index("id", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -1234,7 +1280,7 @@ mod tests { ); let scalar_index = dataset - .open_scalar_index("id", &indices[0].uuid.to_string(), &NoOpMetricsCollector) + .open_scalar_index("id", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -1417,11 +1463,7 @@ mod tests { // Verify the index type is correct let target_scalar_index = target_dataset - .open_scalar_index( - "text", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index("text", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -1555,11 +1597,7 @@ mod tests { // Verify the index type is correct let target_scalar_index = target_dataset - .open_scalar_index( - "value", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index("value", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs new file mode 100644 index 
00000000000..2eb5702ee28 --- /dev/null +++ b/rust/lance/src/index/scalar/bitmap.rs @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::scalar::bitmap::BitmapIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use std::sync::Arc; +use uuid::Uuid; + +use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; + +/// Merge one caller-defined group of source bitmap segments into a single segment. +pub(in crate::index) async fn merge_segments( + dataset: &Dataset, + segments: Vec, +) -> Result { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let field_path = dataset.schema().field_path(field_id)?; + + let mut source_indices = Vec::with_capacity(segments.len()); + let mut fragment_bitmap = RoaringBitmap::new(); + for segment in &segments { + fragment_bitmap |= segment.fragment_bitmap.as_ref().cloned().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + )) + })?; + let scalar_index = + super::open_scalar_index(dataset, &field_path, segment, &NoOpMetricsCollector).await?; + let bitmap_index = scalar_index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index(format!( + "merge_existing_index_segments: expected bitmap segment {}, got {:?}", + segment.uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(Arc::new(bitmap_index.clone())); + } + + let new_uuid = Uuid::new_v4(); + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid)?; + let created_index = lance_index::scalar::bitmap::merge_bitmap_indices( + 
&source_indices, + &new_store, + lance_index::progress::noop_progress(), + ) + .await?; + + Ok(IndexMetadata { + uuid: new_uuid, + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + ..segments[0].clone() + }) +} diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs new file mode 100644 index 00000000000..4339b8c183b --- /dev/null +++ b/rust/lance/src/index/scalar/btree.rs @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +#![allow(clippy::redundant_pub_crate)] + +//! BTree-specific helpers for the segmented index workflow. +use std::sync::Arc; + +use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use lance_core::ROW_ID; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::pbold::BTreeIndexDetails; +use lance_index::scalar::btree::BTreeIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::registry::VALUE_COLUMN_NAME; +use lance_index::scalar::{CreatedIndex, OldIndexDataFilter}; +use lance_table::format::IndexMetadata; +use uuid::Uuid; + +use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; + +/// Build a row-empty `new_data` stream for the BTree merge API. 
+fn empty_btree_update_stream( + dataset: &Dataset, + field_id: i32, +) -> Result { + let field = dataset.schema().field_by_id(field_id).ok_or_else(|| { + Error::invalid_input(format!( + "merge_existing_index_segments: field id {} does not exist", + field_id + )) + })?; + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new(VALUE_COLUMN_NAME, field.data_type(), true), + ArrowField::new(ROW_ID, arrow_schema::DataType::UInt64, false), + ])); + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::empty(), + ))) +} + +fn ensure_btree_details(segment: &IndexMetadata) -> Result<()> { + if let Some(details) = segment.index_details.as_ref() + && !details.type_url.ends_with("BTreeIndexDetails") + { + return Err(Error::invalid_input(format!( + "Segment '{}' is not a BTree segment (details type_url = '{}')", + segment.uuid, details.type_url + ))); + } + Ok(()) +} + +/// Open the given BTree `segments` and k-way merge their already-sorted page +/// data, together with `new_data`, into a single canonical BTree written to +/// `new_store`. +pub(crate) async fn open_and_merge_segments( + dataset: &Dataset, + field_path: &str, + segments: &[&IndexMetadata], + new_data: SendableRecordBatchStream, + new_store: &LanceIndexStore, + old_data_filters: &[Option], +) -> Result { + let mut source_indices = Vec::with_capacity(segments.len()); + for &segment in segments { + let scalar_index = + super::open_scalar_index(dataset, field_path, segment, &NoOpMetricsCollector).await?; + let btree = scalar_index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index(format!( + "BTree merge: expected BTree segment {}, got {:?}", + segment.uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(Arc::new(btree.clone())); + } + BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filters).await +} + +/// Merge one caller-defined group of source BTree segments into a single +/// physical segment. 
+pub(crate) async fn merge_segments( + dataset: &Dataset, + segments: Vec, +) -> Result { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + for segment in &segments { + ensure_btree_details(segment)?; + } + + // All source segments must belong to the same column. + let reference_fields = segments[0].fields.as_slice(); + for segment in segments.iter().skip(1) { + if segment.fields.as_slice() != reference_fields { + return Err(Error::invalid_input(format!( + "BTree merge_segments: segment {} has fields {:?}, expected {:?}", + segment.uuid, segment.fields, reference_fields, + ))); + } + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let field_path = dataset.schema().field_path(field_id)?; + + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); + let (fragment_bitmap, old_data_filters) = + crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?; + + let output_uuid = Uuid::new_v4(); + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?; + // Pure segment consolidation: no dataset scan, so `new_data` is an empty + // stream and the merge is driven entirely by the source page data. 
+ let empty_new_data = empty_btree_update_stream(dataset, field_id)?; + let created_index = open_and_merge_segments( + dataset, + &field_path, + &segment_refs, + empty_new_data, + &new_store, + &old_data_filters, + ) + .await?; + + if !created_index + .index_details + .type_url + .ends_with("BTreeIndexDetails") + { + return Err(Error::internal(format!( + "merge_existing_index_segments: BTree merge produced unexpected details type_url '{}'", + created_index.index_details.type_url + ))); + } + debug_assert_eq!( + created_index.index_details, + prost_types::Any::from_msg(&BTreeIndexDetails::default()).unwrap(), + ); + + Ok(IndexMetadata { + uuid: output_uuid, + name: segments[0].name.clone(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + }) +} diff --git a/rust/lance/src/index/scalar/fmindex.rs b/rust/lance/src/index/scalar/fmindex.rs new file mode 100644 index 00000000000..6c33498d929 --- /dev/null +++ b/rust/lance/src/index/scalar/fmindex.rs @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use std::sync::Arc; +use uuid::Uuid; + +use crate::{Dataset, Error, Result}; + +/// Merge one caller-defined group of source FM-Index segments into a single segment. +/// +/// FM-Index merge requires rebuilding from source text — there is no cheap way +/// to combine two BWT structures. This function re-reads text data from the +/// dataset for all fragments covered by the source segments and builds a fresh +/// FM-Index over the combined data. 
+pub(in crate::index) async fn merge_segments( + dataset: &Dataset, + segments: Vec, +) -> Result { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let column = dataset.schema().field_path(field_id)?; + + let mut fragment_bitmap = RoaringBitmap::new(); + for segment in &segments { + fragment_bitmap |= segment.fragment_bitmap.as_ref().cloned().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + )) + })?; + } + + // Intersect with the dataset's current live fragments to drop retired/compacted + // fragments, mirroring the btree merge behavior. + fragment_bitmap &= dataset.fragment_bitmap.as_ref(); + + if fragment_bitmap.is_empty() { + // All covered fragments have been retired; produce an empty index. 
+ let new_uuid = Uuid::new_v4(); + let created_index = super::build_scalar_index( + dataset, + &column, + new_uuid, + &lance_index::scalar::ScalarIndexParams::for_builtin( + lance_index::scalar::BuiltinIndexType::Fm, + ), + false, + None, + None, + Arc::new(lance_index::progress::NoopIndexBuildProgress), + ) + .await?; + + return Ok(IndexMetadata { + uuid: new_uuid, + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + ..segments[0].clone() + }); + } + + let fragment_ids: Vec = fragment_bitmap.iter().collect(); + let new_uuid = Uuid::new_v4(); + + let created_index = super::build_scalar_index( + dataset, + &column, + new_uuid, + &lance_index::scalar::ScalarIndexParams::for_builtin( + lance_index::scalar::BuiltinIndexType::Fm, + ), + true, + Some(fragment_ids), + None, + Arc::new(lance_index::progress::NoopIndexBuildProgress), + ) + .await?; + + Ok(IndexMetadata { + uuid: new_uuid, + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + ..segments[0].clone() + }) +} diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index 44cf6ff2e08..000d2c3139c 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -64,7 +64,7 @@ pub(crate) async fn finalize_segment_files_if_needed( let store = Arc::new(LanceIndexStore::from_dataset_for_new( dataset, - &segment.uuid.to_string(), + &segment.uuid, )?); lance_index::scalar::inverted::builder::merge_index_files( 
dataset.object_store.as_ref(), @@ -118,7 +118,7 @@ pub(crate) async fn merge_segments( } let new_uuid = Uuid::new_v4(); - let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid.to_string())?; + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid)?; let created_index = InvertedIndex::merge_segments( &source_indices, empty_inverted_update_stream(dataset, field_id)?, @@ -137,7 +137,7 @@ pub(crate) async fn merge_segments( index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), ..segments[0].clone() }) } diff --git a/rust/lance/src/index/scalar/zonemap.rs b/rust/lance/src/index/scalar/zonemap.rs new file mode 100644 index 00000000000..0cbd98f2c40 --- /dev/null +++ b/rust/lance/src/index/scalar/zonemap.rs @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::Arc; + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::zonemap::ZoneMapIndex; +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use uuid::Uuid; + +use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; + +/// Merge one caller-defined group of source ZoneMap segments into a single segment. 
+pub(in crate::index) async fn merge_segments( + dataset: &Dataset, + segments: Vec, +) -> Result { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let field_path = dataset.schema().field_path(field_id)?; + + let mut scalar_indices = Vec::with_capacity(segments.len()); + let mut fragment_bitmap = RoaringBitmap::new(); + let dataset_fragments = dataset.fragment_bitmap.as_ref(); + for segment in &segments { + let effective = segment + .effective_fragment_bitmap(dataset_fragments) + .ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + )) + })?; + fragment_bitmap |= effective; + let scalar_index = + super::open_scalar_index(dataset, &field_path, segment, &NoOpMetricsCollector).await?; + scalar_indices.push((segment.uuid, scalar_index)); + } + + let mut source_indices = Vec::with_capacity(scalar_indices.len()); + for (segment_uuid, scalar_index) in &scalar_indices { + let zonemap_index = scalar_index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index(format!( + "merge_existing_index_segments: expected zonemap segment {}, got {:?}", + segment_uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(zonemap_index); + } + + let new_uuid = Uuid::new_v4(); + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &new_uuid)?; + let created_index = lance_index::scalar::zonemap::merge_zonemap_indices( + &source_indices, + &new_store, + &fragment_bitmap, + ) + .await?; + + Ok(IndexMetadata { + uuid: new_uuid, + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + 
created_at: Some(chrono::Utc::now()), + base_id: None, + files: Some(created_index.files), + ..segments[0].clone() + }) +} diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index 162a36a0c97..8ef86a6cb5f 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ b/rust/lance/src/index/scalar_logical.rs @@ -7,8 +7,8 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; -use deepsize::{Context, DeepSizeOf}; use futures::future::try_join_all; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{Error, Result}; use lance_index::metrics::MetricsCollector; use lance_index::scalar::{AnyQuery, CreatedIndex, ScalarIndex, SearchResult, UpdateCriteria}; @@ -31,7 +31,17 @@ pub struct LogicalScalarIndex { } impl LogicalScalarIndex { - fn try_new(name: String, column: String, segments: Vec>) -> Result { + /// Merge several already-opened segments of one scalar index into a single + /// searchable [`ScalarIndex`]. + /// + /// Used internally by `open_named_scalar_index`, and exposed so a + /// distributed query engine can open an explicit subset of a scalar + /// index's segments and present them as one index. 
+ pub fn try_new( + name: String, + column: String, + segments: Vec>, + ) -> Result { let Some(first) = segments.first() else { return Err(Error::invalid_input(format!( "LogicalScalarIndex '{}' on column '{}' must contain at least one segment", @@ -76,13 +86,6 @@ impl Index for LogicalScalarIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input(format!( - "LogicalScalarIndex '{}' is not a vector index", - self.name - ))) - } - fn statistics(&self) -> Result { Ok(json!({ "index_name": self.name, @@ -210,7 +213,14 @@ fn index_intersects_dataset(index: &IndexMetadata, dataset: &Dataset) -> bool { .is_some_and(|index_bitmap| index_bitmap.intersection_len(&dataset.fragment_bitmap) > 0) } -async fn load_named_scalar_segments( +/// List the committed, dataset-intersecting segments of a named scalar index. +/// +/// Returns one [`IndexMetadata`] per usable segment. The result length is the +/// segment count: `1` means a single (non-segmented) index, `> 1` means the +/// index is split across multiple segments that a distributed engine may route +/// to different executors. All returned segments are validated to share the +/// same underlying index type. 
+pub async fn load_named_scalar_segments( dataset: &Dataset, column: &str, index_name: &str, @@ -292,13 +302,15 @@ pub async fn open_named_scalar_index( index_name, column ))), 1 => { - let uuid = indices[0].uuid.to_string(); - dataset.open_scalar_index(column, &uuid, metrics).await + dataset + .open_scalar_index(column, &indices[0].uuid, metrics) + .await } _ => { - let segments = try_join_all(indices.iter().map(|index| { - let uuid = index.uuid.to_string(); - async move { dataset.open_scalar_index(column, &uuid, metrics).await } + let segments = try_join_all(indices.iter().map(|index| async move { + dataset + .open_scalar_index(column, &index.uuid, metrics) + .await })) .await?; @@ -323,8 +335,13 @@ mod tests { use lance_datagen::array; use lance_index::IndexType; use lance_index::metrics::NoOpMetricsCollector; + use lance_index::scalar::bitmap::BITMAP_LOOKUP_NAME; use lance_index::scalar::{BuiltinIndexType, SargableQuery, ScalarIndexParams}; + use crate::Dataset; + use crate::dataset::WriteParams; + use crate::dataset::optimize::{CompactionOptions, compact_files}; + use crate::dataset::write::WriteMode; use crate::index::create::CreateIndexBuilder; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; @@ -498,6 +515,112 @@ mod tests { ); } + #[tokio::test] + async fn test_bitmap_segments_commit_and_query_as_logical_index() { + let test_dir = TempStrDir::default(); + let dataset = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_dataset( + test_dir.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(16), + ) + .await + .unwrap(); + let mut dataset = dataset; + let fragments = dataset.get_fragments(); + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + let mut staged = Vec::new(); + + for fragment_group in fragments.chunks(2) { + let fragment_ids = fragment_group + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let segment = + CreateIndexBuilder::new(&mut dataset, &["value"], 
IndexType::Bitmap, ¶ms) + .name("value_bitmap".to_string()) + .fragments(fragment_ids.clone()) + .execute_uncommitted() + .await + .unwrap(); + assert_eq!( + segment + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragment_ids + ); + let files = segment.files.as_ref().unwrap(); + assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); + assert!(files.iter().all(|file| !file.path.starts_with("part_"))); + staged.push(segment); + } + + let staged_uuids = staged + .iter() + .map(|segment| segment.uuid) + .collect::>(); + let merged = dataset.merge_existing_index_segments(staged).await.unwrap(); + assert!(!staged_uuids.contains(&merged.uuid)); + assert_eq!( + merged + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragments + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>() + ); + let files = merged.files.as_ref().unwrap(); + assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); + assert!(files.iter().all(|file| !file.path.starts_with("part_"))); + + dataset + .commit_existing_index_segments("value_bitmap", "value", vec![merged]) + .await + .unwrap(); + + let committed = dataset.load_indices_by_name("value_bitmap").await.unwrap(); + assert_eq!(committed.len(), 1); + assert_eq!( + scalar_index_fragment_bitmap(&dataset, "value", "value_bitmap") + .await + .unwrap() + .unwrap(), + dataset.fragment_bitmap.as_ref().clone() + ); + + let logical = + open_named_scalar_index(&dataset, "value", "value_bitmap", &NoOpMetricsCollector) + .await + .unwrap(); + assert_eq!(logical.index_type(), IndexType::Bitmap); + + let query = SargableQuery::Equals(ScalarValue::Int32(Some(20))); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!( + "expected exact result from segmented bitmap, got {:?}", + other + ), + }; + + let searched_fragments = row_addrs + .true_rows() + .row_addrs() + 
.unwrap() + .map(|row_addr| RowAddress::from(u64::from(row_addr)).fragment_id()) + .collect::>(); + assert_eq!(searched_fragments, vec![1]); + } + #[tokio::test] async fn test_zonemap_segment_search_keeps_fragment_ids() { let dataset = lance_datagen::gen_batch() @@ -561,6 +684,348 @@ mod tests { ); } + #[tokio::test] + async fn test_merge_existing_index_segments_supports_zonemap_segments() { + let dataset = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_ram_dataset(FragmentCount::from(4), FragmentRowCount::from(16)) + .await + .unwrap(); + let mut dataset = dataset; + let fragments = dataset.get_fragments(); + let zonemap_params = lance_index::scalar::zonemap::ZoneMapIndexBuilderParams::new(8); + let params_json = serde_json::to_value(&zonemap_params).unwrap(); + let params = + ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap).with_params(¶ms_json); + let mut staged = Vec::new(); + + for fragment in &fragments { + let segment = + CreateIndexBuilder::new(&mut dataset, &["value"], IndexType::ZoneMap, ¶ms) + .name("value_zonemap_merged".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + + let staged_uuids = staged + .iter() + .map(|segment| segment.uuid) + .collect::>(); + let merged = dataset.merge_existing_index_segments(staged).await.unwrap(); + assert!(!staged_uuids.contains(&merged.uuid)); + assert_eq!( + merged + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragments + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>() + ); + assert!( + merged + .files + .as_ref() + .unwrap() + .iter() + .any(|file| file.path == "zonemap.lance") + ); + + dataset + .commit_existing_index_segments("value_zonemap_merged", "value", vec![merged]) + .await + .unwrap(); + + let committed = dataset + .load_indices_by_name("value_zonemap_merged") + .await + .unwrap(); + assert_eq!(committed.len(), 1); + + let logical = open_named_scalar_index( 
+ &dataset, + "value", + "value_zonemap_merged", + &NoOpMetricsCollector, + ) + .await + .unwrap(); + assert_eq!(logical.index_type(), IndexType::ZoneMap); + assert_eq!( + logical.statistics().unwrap()["rows_per_zone"], + serde_json::json!(8) + ); + assert_eq!( + logical.calculate_included_frags().await.unwrap(), + dataset.fragment_bitmap.as_ref().clone() + ); + + let query = SargableQuery::Range( + Bound::Included(ScalarValue::Int32(Some(0))), + Bound::Included(ScalarValue::Int32(Some(10_000))), + ); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let searched_fragments = result + .row_addrs() + .true_rows() + .row_addrs() + .unwrap() + .map(|row_addr| RowAddress::from(u64::from(row_addr)).fragment_id()) + .collect::>(); + assert_eq!( + searched_fragments, + fragments + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>() + ); + + let selective_query = SargableQuery::Range( + Bound::Included(ScalarValue::Int32(Some(20))), + Bound::Included(ScalarValue::Int32(Some(43))), + ); + let selective_result = logical + .search(&selective_query, &NoOpMetricsCollector) + .await + .unwrap(); + let selective_fragments = selective_result + .row_addrs() + .true_rows() + .row_addrs() + .unwrap() + .map(|row_addr| RowAddress::from(u64::from(row_addr)).fragment_id()) + .collect::>(); + assert_eq!( + selective_fragments, + fragments[1..=2] + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>() + ); + } + + #[tokio::test] + async fn test_merge_existing_zonemap_segments_drops_retired_fragments() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + let reader = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(64), + lance_datagen::BatchCount::from(2), + ); + let mut dataset = Dataset::write( + reader, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + enable_stable_row_ids: true, + 
..Default::default() + }), + ) + .await + .unwrap(); + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap); + let mut staged = Vec::new(); + for fragment in dataset.get_fragments() { + staged.push( + CreateIndexBuilder::new(&mut dataset, &["value"], IndexType::ZoneMap, ¶ms) + .name("value_zonemap_retired".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("value_zonemap_retired", "value", staged) + .await + .unwrap(); + + dataset.delete("value < 16").await.unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 64, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + let live_frags = dataset.fragment_bitmap.as_ref().clone(); + assert!(!live_frags.contains(0), "compaction should retire frag 0"); + + let merged = dataset + .merge_existing_index_segments( + dataset + .load_indices_by_name("value_zonemap_retired") + .await + .unwrap(), + ) + .await + .unwrap(); + let coverage = merged.fragment_bitmap.as_ref().unwrap(); + assert!(!coverage.contains(0), "must drop retired frag 0"); + assert!(coverage.contains(1), "must keep live indexed frag 1"); + + let field_path = dataset.schema().field_path(merged.fields[0]).unwrap(); + let index = crate::index::scalar::open_scalar_index( + &dataset, + &field_path, + &merged, + &NoOpMetricsCollector, + ) + .await + .unwrap(); + let query = SargableQuery::Range( + Bound::Included(ScalarValue::Int32(Some(0))), + Bound::Excluded(ScalarValue::Int32(Some(16))), + ); + let searched_fragments = index + .search(&query, &NoOpMetricsCollector) + .await + .unwrap() + .row_addrs() + .true_rows() + .row_addrs() + .unwrap() + .map(|row_addr| RowAddress::from(u64::from(row_addr)).fragment_id()) + .collect::>(); + assert!( + searched_fragments.is_empty(), + "must filter retired-fragment zones" + ); + } + + #[tokio::test] + async fn 
test_merge_then_commit_zonemap_segment_ignores_retired_fragment_coverage() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + let reader = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(64), + lance_datagen::BatchCount::from(2), + ); + let mut dataset = Dataset::write( + reader, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap); + let segment = + CreateIndexBuilder::new(&mut dataset, &["value"], IndexType::ZoneMap, ¶ms) + .name("value_zonemap_replace_retired".to_string()) + .execute_uncommitted() + .await + .unwrap(); + let original_coverage = segment.fragment_bitmap.as_ref().unwrap().clone(); + assert!(original_coverage.contains(0)); + assert!(original_coverage.contains(1)); + + dataset + .commit_existing_index_segments("value_zonemap_replace_retired", "value", vec![segment]) + .await + .unwrap(); + + dataset.delete("value < 16").await.unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 64, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + let live_frags = dataset.fragment_bitmap.as_ref().clone(); + assert!(!live_frags.contains(0), "compaction should retire frag 0"); + + let merged = dataset + .merge_existing_index_segments( + dataset + .load_indices_by_name("value_zonemap_replace_retired") + .await + .unwrap(), + ) + .await + .unwrap(); + let merged_coverage = merged.fragment_bitmap.as_ref().unwrap().clone(); + let merged_uuid = merged.uuid; + + dataset + .commit_existing_index_segments("value_zonemap_replace_retired", "value", vec![merged]) + .await + .unwrap(); + + let committed = dataset + .load_indices_by_name("value_zonemap_replace_retired") + .await + .unwrap(); + assert_eq!(committed.len(), 
1); + assert_eq!(committed[0].uuid, merged_uuid); + + let combined_bitmap = + scalar_index_fragment_bitmap(&dataset, "value", "value_zonemap_replace_retired") + .await + .unwrap() + .unwrap(); + assert_eq!(combined_bitmap, merged_coverage); + } + + #[tokio::test] + async fn test_merge_existing_index_segments_rejects_mismatched_zonemap_params() { + let dataset = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_ram_dataset(FragmentCount::from(2), FragmentRowCount::from(16)) + .await + .unwrap(); + let mut dataset = dataset; + let fragments = dataset.get_fragments(); + let mut staged = Vec::new(); + + for (fragment, rows_per_zone) in fragments.iter().zip([8, 16]) { + let zonemap_params = + lance_index::scalar::zonemap::ZoneMapIndexBuilderParams::new(rows_per_zone); + let params_json = serde_json::to_value(&zonemap_params).unwrap(); + let params = + ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap).with_params(¶ms_json); + let segment = + CreateIndexBuilder::new(&mut dataset, &["value"], IndexType::ZoneMap, ¶ms) + .name("value_zonemap_mismatched".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + + let err = dataset + .merge_existing_index_segments(staged) + .await + .unwrap_err(); + assert!( + err.to_string().contains("different rows_per_zone values"), + "unexpected error: {err}" + ); + } + #[tokio::test] async fn test_commit_existing_zonemap_segments_replaces_overlapping_segments() { let dataset = lance_datagen::gen_batch() @@ -631,4 +1096,368 @@ mod tests { dataset.fragment_bitmap.as_ref().clone() ); } + + #[tokio::test] + async fn test_fmindex_segments_commit_and_query_as_logical_index() { + let test_dir = TempStrDir::default(); + + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + "text", + arrow_schema::DataType::Utf8, + false, + )])); + let write_params = crate::dataset::write::WriteParams { + max_rows_per_file: 4, + 
..Default::default() + }; + let batches = vec![ + arrow_array::RecordBatch::try_new( + schema.clone(), + vec![Arc::new(arrow_array::StringArray::from(vec![ + "the quick brown fox", + "jumps over the lazy dog", + "hello world from rust", + "pack my box with five dozen liquor jugs", + "how vexingly quick daft zebras jump", + "the five boxing wizards jump quickly", + "sphinx of black quartz judge my vow", + "two driven jocks help fax my big quiz", + "waltz bad nymph for quick jigs vex", + "glib jocks quiz nymph to vex dwarf", + "quick brown fox jumps again here", + "lazy dog sleeps under the tree", + ]))], + ) + .unwrap(), + ]; + let reader = + arrow_array::RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + let mut dataset = Dataset::write(reader, test_dir.as_str(), Some(write_params)) + .await + .unwrap(); + + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 3); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Fm); + let mut segments = Vec::new(); + for fragment in &fragments { + let segment = CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Fm, ¶ms) + .name("text_fmindex".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + + assert_eq!( + segment + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + vec![fragment.id() as u32] + ); + segments.push(segment); + } + + dataset + .commit_existing_index_segments("text_fmindex", "text", segments) + .await + .unwrap(); + + let committed = dataset.load_indices_by_name("text_fmindex").await.unwrap(); + assert_eq!(committed.len(), fragments.len()); + + let logical = + open_named_scalar_index(&dataset, "text", "text_fmindex", &NoOpMetricsCollector) + .await + .unwrap(); + assert_eq!(logical.index_type(), IndexType::Fm); + + let query = lance_index::scalar::TextQuery::StringContains("quick".to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let 
row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!( + "expected exact result from segmented fmindex, got {:?}", + other + ), + }; + let match_count = row_addrs.true_rows().row_addrs().unwrap().count(); + assert_eq!( + match_count, 5, + "expected exactly 5 matches for 'quick', got {match_count}" + ); + + // Verify fragment coverage via manifest metadata (not calculate_included_frags, + // which derives from row addresses and may not encode fragment IDs for all layouts) + assert_eq!( + scalar_index_fragment_bitmap(&dataset, "text", "text_fmindex") + .await + .unwrap() + .unwrap(), + dataset.fragment_bitmap.as_ref().clone() + ); + } + + #[tokio::test] + async fn test_fmindex_segments_merge_and_query() { + let test_dir = TempStrDir::default(); + + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + "text", + arrow_schema::DataType::Utf8, + false, + )])); + let write_params = crate::dataset::write::WriteParams { + max_rows_per_file: 4, + ..Default::default() + }; + let batches = vec![ + arrow_array::RecordBatch::try_new( + schema.clone(), + vec![Arc::new(arrow_array::StringArray::from(vec![ + "alpha beta gamma delta", + "beta gamma delta epsilon", + "gamma delta epsilon zeta", + "delta epsilon zeta eta", + "epsilon zeta eta theta", + "zeta eta theta iota", + "eta theta iota kappa", + "theta iota kappa lambda", + ]))], + ) + .unwrap(), + ]; + let reader = + arrow_array::RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + let mut dataset = Dataset::write(reader, test_dir.as_str(), Some(write_params)) + .await + .unwrap(); + + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 2); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Fm); + let mut staged = Vec::new(); + for fragment in &fragments { + let segment = CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Fm, ¶ms) + .name("text_fmindex_merge".to_string()) + 
.fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + assert_eq!(staged.len(), 2); + + let staged_uuids = staged.iter().map(|s| s.uuid).collect::>(); + let merged = dataset.merge_existing_index_segments(staged).await.unwrap(); + + assert!(!staged_uuids.contains(&merged.uuid)); + assert_eq!( + merged + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragments.iter().map(|f| f.id() as u32).collect::>() + ); + + dataset + .commit_existing_index_segments("text_fmindex_merge", "text", vec![merged]) + .await + .unwrap(); + + let committed = dataset + .load_indices_by_name("text_fmindex_merge") + .await + .unwrap(); + assert_eq!(committed.len(), 1); + + let logical = open_named_scalar_index( + &dataset, + "text", + "text_fmindex_merge", + &NoOpMetricsCollector, + ) + .await + .unwrap(); + assert_eq!(logical.index_type(), IndexType::Fm); + + let query = lance_index::scalar::TextQuery::StringContains("delta".to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!("expected exact result from merged fmindex, got {:?}", other), + }; + assert_eq!(row_addrs.true_rows().row_addrs().unwrap().count(), 4); + + let query = lance_index::scalar::TextQuery::StringContains("nonexistent".to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!("expected exact result from merged fmindex, got {:?}", other), + }; + assert_eq!(row_addrs.true_rows().row_addrs().unwrap().count(), 0); + } + + #[tokio::test] + async fn test_fmindex_merge_after_compaction_drops_retired_fragments() { + use crate::dataset::write::WriteParams; + + let test_dir = TempStrDir::default(); + + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + 
"text", + arrow_schema::DataType::Utf8, + false, + )])); + // Create two fragments with 4 rows each so compaction can retire one + let write_params = WriteParams { + max_rows_per_file: 4, + enable_stable_row_ids: true, + ..Default::default() + }; + let batches = vec![ + arrow_array::RecordBatch::try_new( + schema.clone(), + vec![Arc::new(arrow_array::StringArray::from(vec![ + "alpha beta gamma", + "beta gamma delta", + "gamma delta epsilon", + "delta epsilon zeta", + "epsilon zeta eta", + "zeta eta theta", + "eta theta iota", + "theta iota kappa", + ]))], + ) + .unwrap(), + ]; + let reader = + arrow_array::RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + let mut dataset = Dataset::write(reader, test_dir.as_str(), Some(write_params)) + .await + .unwrap(); + + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 2); + + // Build per-fragment FM-Index segments and commit + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Fm); + let mut staged = Vec::new(); + for fragment in &fragments { + let segment = CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Fm, ¶ms) + .name("text_fmindex_compact".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + dataset + .commit_existing_index_segments("text_fmindex_compact", "text", staged) + .await + .unwrap(); + + // Verify initial state: 2 segments, both fragments live + let committed = dataset + .load_indices_by_name("text_fmindex_compact") + .await + .unwrap(); + assert_eq!(committed.len(), 2); + + // Delete rows from fragment 0 to trigger compaction retirement + dataset.delete("text = 'alpha beta gamma'").await.unwrap(); + dataset.delete("text = 'beta gamma delta'").await.unwrap(); + crate::dataset::optimize::compact_files( + &mut dataset, + crate::dataset::optimize::CompactionOptions { + target_rows_per_fragment: 4, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let 
live_frags: RoaringBitmap = dataset + .get_fragments() + .iter() + .map(|f| f.id() as u32) + .collect(); + assert!( + !live_frags.contains(0), + "compaction should retire fragment 0" + ); + + // Merge: the retired fragment should be dropped from coverage + let segments = dataset + .load_indices_by_name("text_fmindex_compact") + .await + .unwrap(); + let merged = dataset + .merge_existing_index_segments(segments) + .await + .unwrap(); + + let coverage = merged.fragment_bitmap.as_ref().unwrap(); + assert!( + !coverage.contains(0), + "merged coverage must drop retired fragment 0" + ); + assert!( + coverage.contains(1), + "merged coverage must keep live fragment 1" + ); + + // Commit the merged segment and verify search works + dataset + .commit_existing_index_segments("text_fmindex_compact", "text", vec![merged]) + .await + .unwrap(); + + let committed = dataset + .load_indices_by_name("text_fmindex_compact") + .await + .unwrap(); + assert_eq!(committed.len(), 1); + + let logical = open_named_scalar_index( + &dataset, + "text", + "text_fmindex_compact", + &NoOpMetricsCollector, + ) + .await + .unwrap(); + + // "alpha" only existed in the deleted/retired rows + let query = lance_index::scalar::TextQuery::StringContains("alpha".to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!("expected exact result from merged fmindex, got {:?}", other), + }; + assert_eq!( + row_addrs.true_rows().row_addrs().unwrap().count(), + 0, + "deleted rows from retired fragment should not appear in merged index" + ); + + // "theta" exists in fragment 1 rows only + let query = lance_index::scalar::TextQuery::StringContains("theta".to_string()); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!("expected exact result from merged fmindex, 
got {:?}", other), + }; + assert!( + row_addrs.true_rows().row_addrs().unwrap().count() > 0, + "rows from live fragment should still be searchable" + ); + } } diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index ff7d2383c67..af48bc94c41 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -9,6 +9,7 @@ use std::{any::Any, collections::HashMap}; pub mod builder; pub(crate) mod details; +pub mod hamming; pub mod ivf; pub mod pq; pub mod utils; @@ -18,7 +19,7 @@ mod fixture_test; use self::{ivf::*, pq::PQIndex}; use arrow_schema::{DataType, Schema}; -use builder::IvfIndexBuilder; +use builder::{IvfIndexBuilder, VectorIndexBuildSummary}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use futures::stream; @@ -29,7 +30,7 @@ use lance_index::metrics::NoOpMetricsCollector; use lance_index::optimize::OptimizeOptions; use lance_index::progress::{IndexBuildProgress, noop_progress}; use lance_index::vector::bq::builder::RabitQuantizer; -use lance_index::vector::bq::{RQBuildParams, RQRotationType}; +use lance_index::vector::bq::{RQBuildParams, RQRotationType, validate_supported_rq_num_bits}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::hnsw::HNSW; use lance_index::vector::ivf::builder::recommended_num_partitions; @@ -52,9 +53,10 @@ use lance_index::vector::{ sq::{ScalarQuantizer, builder::SQBuildParams}, }; use lance_index::{INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, IndexType}; +use lance_io::object_store::ObjectStore; use lance_io::traits::Reader; use lance_linalg::distance::*; -use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata}; use serde::Serialize; use tracing::instrument; use utils::get_vector_type; @@ -543,8 +545,18 @@ async fn prepare_vector_segment_build( ))); } - let num_rows = 
dataset.count_rows(None).await?; let index_type = params.index_type(); + if index_type == IndexType::IvfRq { + let Some(StageParams::RQ(rq_params)) = stages.last() else { + return Err(Error::index(format!( + "{mode}: invalid stages: {:?}", + stages + ))); + }; + validate_supported_rq_num_bits(rq_params.num_bits)?; + } + + let num_rows = dataset.count_rows(None).await?; let num_partitions = ivf_params0.num_partitions.unwrap_or_else(|| { recommended_num_partitions( num_rows, @@ -576,12 +588,12 @@ pub(crate) async fn build_distributed_vector_index( dataset: &Dataset, column: &str, _name: &str, - uuid: &str, + uuid: Uuid, params: &VectorIndexParams, frag_reuse_index: Option>, fragment_ids: &[u32], progress: Arc, -) -> Result { +) -> Result<(Uuid, Vec)> { let (element_type, index_type, ivf_params, shuffler) = prepare_vector_segment_build( dataset, column, @@ -602,8 +614,7 @@ pub(crate) async fn build_distributed_vector_index( let filtered_dataset = dataset.clone(); - let segment_uuid = Uuid::parse_str(uuid) - .map_err(|err| Error::invalid_input(format!("Invalid index UUID '{uuid}': {err}")))?; + let segment_uuid = uuid; let index_dir = dataset.indices_dir().join(segment_uuid.to_string()); let fragment_filter = fragment_ids.to_vec(); @@ -647,7 +658,7 @@ pub(crate) async fn build_distributed_vector_index( DataType::Float16 | DataType::Float32 | DataType::Float64 => { let ivf_model = make_ivf_model(); - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -663,11 +674,12 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } DataType::UInt8 => { let ivf_model = make_ivf_model(); - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -683,6 +695,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) 
.build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { return Err(Error::index(format!( @@ -711,7 +724,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); let global_pq = make_global_pq(pq_params)?; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -731,6 +744,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } } } @@ -742,7 +756,7 @@ pub(crate) async fn build_distributed_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -757,6 +771,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfHnswFlat => { @@ -769,7 +784,7 @@ pub(crate) async fn build_distributed_vector_index( match element_type { DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -784,9 +799,10 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -801,6 +817,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } } } @@ -822,7 +839,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); let global_pq = make_global_pq(pq_params)?; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -842,6 +859,7 @@ pub(crate) async fn 
build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfHnswSq => { @@ -857,7 +875,7 @@ pub(crate) async fn build_distributed_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -872,6 +890,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfRq => { @@ -884,7 +903,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -903,6 +922,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { @@ -911,9 +931,7 @@ pub(crate) async fn build_distributed_vector_index( index_type ))); } - }; - - Ok(segment_uuid) + } } /// Build a Vector Index @@ -922,11 +940,11 @@ pub(crate) async fn build_vector_index( dataset: &Dataset, column: &str, name: &str, - uuid: &str, + uuid: Uuid, params: &VectorIndexParams, frag_reuse_index: Option>, progress: Arc, -) -> Result<()> { +) -> Result> { let (element_type, index_type, ivf_params, shuffler) = prepare_vector_segment_build( dataset, column, @@ -941,10 +959,10 @@ pub(crate) async fn build_vector_index( match index_type { IndexType::IvfFlat => match element_type { DataType::Float16 | DataType::Float32 | DataType::Float64 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -955,12 +973,13 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + 
return Ok(summary.files); } DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -971,6 +990,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { return Err(Error::index(format!( @@ -990,7 +1010,7 @@ pub(crate) async fn build_vector_index( match params.version { IndexFileVersion::Legacy => { - build_ivf_pq_index( + let files = build_ivf_pq_index( dataset, column, name, @@ -1001,12 +1021,13 @@ pub(crate) async fn build_vector_index( progress.clone(), ) .await?; + return Ok(files); } IndexFileVersion::V3 => { let mut builder = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().join(uuid), + dataset.indices_dir().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1015,11 +1036,12 @@ pub(crate) async fn build_vector_index( frag_reuse_index, )?; - builder + let summary = builder .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } } } @@ -1031,10 +1053,10 @@ pub(crate) async fn build_vector_index( ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1045,6 +1067,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfRq => { let StageParams::RQ(rq_params) = &stages[1] else { @@ -1057,7 +1080,7 @@ pub(crate) async fn build_vector_index( let mut builder = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().join(uuid), + 
dataset.indices_dir().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1066,11 +1089,12 @@ pub(crate) async fn build_vector_index( frag_reuse_index, )?; - builder + let summary = builder .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfHnswFlat => { let StageParams::Hnsw(hnsw_params) = &stages[1] else { @@ -1081,10 +1105,10 @@ pub(crate) async fn build_vector_index( }; match element_type { DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1095,12 +1119,13 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1111,6 +1136,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } } } @@ -1127,10 +1153,10 @@ pub(crate) async fn build_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1141,6 +1167,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfHnswSq => { let StageParams::Hnsw(hnsw_params) = &stages[1] else { @@ -1155,10 +1182,10 @@ pub(crate) async fn build_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = 
IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), - dataset.indices_dir().clone().join(uuid), + dataset.indices_dir().clone().join(uuid.to_string()), params.metric_type, shuffler, Some(ivf_params), @@ -1169,6 +1196,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { return Err(Error::index(format!( @@ -1176,8 +1204,7 @@ pub(crate) async fn build_vector_index( index_type ))); } - }; - Ok(()) + } } /// Build a Vector Index incrementally using an existing index's IVF model and quantizer @@ -1186,12 +1213,12 @@ pub(crate) async fn build_vector_index( pub(crate) async fn build_vector_index_incremental( dataset: &Dataset, column: &str, - uuid: &str, + uuid: Uuid, params: &VectorIndexParams, existing_index: Arc, frag_reuse_index: Option>, progress: Arc, -) -> Result<()> { +) -> Result { let stages = ¶ms.stages; if stages.is_empty() { @@ -1243,7 +1270,7 @@ pub(crate) async fn build_vector_index_incremental( Some(progress.clone()), ); - let index_dir = dataset.indices_dir().join(uuid); + let index_dir = dataset.indices_dir().join(uuid.to_string()); // Determine the index type and build incrementally let (sub_index_type, quantization_type) = existing_index.sub_index_type(); @@ -1251,7 +1278,7 @@ pub(crate) async fn build_vector_index_incremental( match (sub_index_type, quantization_type) { // IVF_FLAT (SubIndexType::Flat, QuantizationType::Flat) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1266,9 +1293,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } (SubIndexType::Flat, QuantizationType::FlatBin) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1283,6 +1311,7 @@ pub(crate) async fn 
build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_PQ (SubIndexType::Flat, QuantizationType::Product) => { @@ -1296,17 +1325,18 @@ pub(crate) async fn build_vector_index_incremental( frag_reuse_index, OptimizeOptions::append(), )?; - builder + let summary = builder .with_ivf(ivf_model) .with_quantizer(quantizer.try_into()?) .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_SQ (SubIndexType::Flat, QuantizationType::Scalar) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1321,6 +1351,7 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_RQ (SubIndexType::Flat, QuantizationType::Rabit) => { @@ -1334,13 +1365,14 @@ pub(crate) async fn build_vector_index_incremental( frag_reuse_index, OptimizeOptions::append(), )?; - builder + let summary = builder .with_ivf(ivf_model) .with_quantizer(quantizer.try_into()?) 
.with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_HNSW variants (SubIndexType::Hnsw, quantization_type) => { @@ -1353,7 +1385,7 @@ pub(crate) async fn build_vector_index_incremental( match quantization_type { QuantizationType::Flat => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1368,9 +1400,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::FlatBin => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1385,9 +1418,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Product => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1402,9 +1436,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Scalar => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1419,6 +1454,7 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Rabit => { return Err(Error::index( @@ -1428,8 +1464,6 @@ pub(crate) async fn build_vector_index_incremental( } } } - - Ok(()) } /// Build an empty vector index without training on data @@ -1438,9 +1472,9 @@ pub(crate) async fn build_empty_vector_index( _dataset: &Dataset, column: &str, name: &str, - _uuid: &str, + _uuid: Uuid, _params: &VectorIndexParams, -) -> Result<()> { +) -> Result> { // For now, return a 
NotImplementedError to indicate this functionality // is still being developed Err(Error::not_supported_source( @@ -1461,16 +1495,16 @@ pub(crate) async fn remap_vector_index( new_uuid: &Uuid, old_metadata: &IndexMetadata, mapping: &HashMap>, -) -> Result<()> { +) -> Result> { let old_index = dataset - .open_vector_index(column, &old_uuid.to_string(), &NoOpMetricsCollector) + .open_vector_index(column, old_uuid, &NoOpMetricsCollector) .await?; if let Some(ivf_index) = old_index.as_any().downcast_ref::() { - remap_index_file( + let file = remap_index_file( dataset.as_ref(), - &old_uuid.to_string(), - &new_uuid.to_string(), + old_uuid, + new_uuid, old_metadata.dataset_version, ivf_index, mapping, @@ -1482,26 +1516,26 @@ pub(crate) async fn remap_vector_index( vec![], ) .await?; + Ok(vec![file]) } else { // it's v3 index - remap_index_file_v3( + let files = remap_index_file_v3( dataset.as_ref(), - &new_uuid.to_string(), + new_uuid, old_index, mapping, column.to_string(), ) .await?; + Ok(files) } - - Ok(()) } /// Open the Vector index on dataset, specified by the `uuid`. #[instrument(level = "debug", skip(dataset, vec_idx, reader))] pub(crate) async fn open_vector_index( dataset: Arc, - uuid: &str, + uuid: &Uuid, vec_idx: &lance_index::pb::VectorIndex, reader: Arc, frag_reuse_index: Option>, @@ -1532,7 +1566,7 @@ pub(crate) async fn open_vector_index( } let ivf = IvfModel::try_from(ivf_pb.to_owned())?; last_stage = Some(Arc::new(IVFIndex::try_new( - uuid, + *uuid, ivf, reader.clone(), last_stage.unwrap(), @@ -1575,11 +1609,29 @@ pub(crate) async fn open_vector_index( Ok(idx) } +/// Open an index file without a HEAD request when the size is already known. +/// +/// `file_sizes` maps a file name to its size in bytes (see +/// `IndexMetadata::file_size_map`). If `file_name` is missing, which is the case +/// for older indices that did not record sizes, this falls back to `open`, which +/// issues a HEAD to learn the size. 
+pub(crate) async fn open_index_file( + object_store: &ObjectStore, + path: &Path, + file_name: &str, + file_sizes: &HashMap, +) -> Result> { + match file_sizes.get(file_name) { + Some(&size) => object_store.open_with_size(path, size as usize).await, + None => object_store.open(path).await, + } +} + #[instrument(level = "debug", skip(dataset, reader))] pub(crate) async fn open_vector_index_v2( dataset: Arc, column: &str, - uuid: &str, + uuid: &Uuid, reader: PreviousFileReader, frag_reuse_index: Option>, ) -> Result> { @@ -1599,11 +1651,21 @@ pub(crate) async fn open_vector_index_v2( .ok_or_else(|| Error::index(format!("Index with id {} does not exist", uuid)))?; let index_dir = dataset.indice_files_dir(&index_meta)?; let object_store = dataset.object_store_for_index(&index_meta).await?; + let file_sizes = index_meta.file_size_map(); let index: Arc = match index_metadata.index_type.as_str() { "IVF_HNSW_PQ" => { - let aux_path = index_dir.clone().join(uuid).join(INDEX_AUXILIARY_FILE_NAME); - let aux_reader = object_store.open(&aux_path).await?; + let aux_path = index_dir + .clone() + .join(uuid.to_string()) + .join(INDEX_AUXILIARY_FILE_NAME); + let aux_reader = open_index_file( + object_store.as_ref(), + &aux_path, + INDEX_AUXILIARY_FILE_NAME, + &file_sizes, + ) + .await?; let ivf_data = IvfModel::load(&reader).await?; let options = HNSWIndexOptions { use_residual: true }; @@ -1617,7 +1679,7 @@ pub(crate) async fn open_vector_index_v2( let ivf = IvfModel::try_from(pb_ivf)?; Arc::new(IVFIndex::try_new( - uuid, + *uuid, ivf, reader.object_reader.clone(), Arc::new(hnsw), @@ -1629,8 +1691,17 @@ pub(crate) async fn open_vector_index_v2( } "IVF_HNSW_SQ" => { - let aux_path = index_dir.clone().join(uuid).join(INDEX_AUXILIARY_FILE_NAME); - let aux_reader = object_store.open(&aux_path).await?; + let aux_path = index_dir + .clone() + .join(uuid.to_string()) + .join(INDEX_AUXILIARY_FILE_NAME); + let aux_reader = open_index_file( + object_store.as_ref(), + &aux_path, + 
INDEX_AUXILIARY_FILE_NAME, + &file_sizes, + ) + .await?; let ivf_data = IvfModel::load(&reader).await?; let options = HNSWIndexOptions { @@ -1647,7 +1718,7 @@ pub(crate) async fn open_vector_index_v2( let ivf = IvfModel::try_from(pb_ivf)?; Arc::new(IVFIndex::try_new( - uuid, + *uuid, ivf, reader.object_reader.clone(), Arc::new(hnsw), @@ -1704,11 +1775,7 @@ pub async fn initialize_vector_index( let column_name = field_names[0]; let source_vector_index = source_dataset - .open_vector_index( - column_name, - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index(column_name, &source_index.uuid, &NoOpMetricsCollector) .await?; let metric_type = source_vector_index.metric_type(); @@ -1777,10 +1844,10 @@ pub async fn initialize_vector_index( .open_frag_reuse_index(&NoOpMetricsCollector) .await?; - build_vector_index_incremental( + let summary = build_vector_index_incremental( target_dataset, column_name, - &new_uuid.to_string(), + new_uuid, ¶ms, source_vector_index, frag_reuse_index, @@ -1788,10 +1855,6 @@ pub async fn initialize_vector_index( ) .await?; - // Capture file sizes for the new vector index - let index_dir = target_dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&target_dataset.object_store, &index_dir).await?; - let field = target_dataset.schema().field(column_name).ok_or_else(|| { Error::index(format!( "Column '{}' not found in target dataset", @@ -1811,7 +1874,7 @@ pub async fn initialize_vector_index( index_version: source_index.index_version, created_at: Some(chrono::Utc::now()), base_id: None, - files: Some(files), + files: Some(summary.files), }; let transaction = Transaction::new( @@ -1880,6 +1943,7 @@ fn derive_rabit_params(rabit_quantizer: &RabitQuantizer) -> RQBuildParams { RQBuildParams { num_bits: rabit_quantizer.num_bits(), rotation_type: rabit_quantizer.rotation_type(), + rotation: None, } } @@ -1946,6 +2010,125 @@ mod tests { use lance_index::metrics::NoOpMetricsCollector; 
use lance_linalg::distance::MetricType; + /// `open_index_file` skips the HEAD when the size is known and still falls + /// back to a HEAD for older indices that did not record sizes. A HEAD is + /// issued as a `get_opts` call with `head = true`, so a proxy store counts + /// those against the index file. + /// + /// Regression test for . + #[tokio::test] + async fn test_open_index_file_skips_head_when_size_known() { + use lance_index::INDEX_FILE_NAME; + use lance_io::assert_io_eq; + use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; + + let (store, base) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "memory:///", + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + + let path = base.join(INDEX_FILE_NAME); + // Larger than the block size so size discovery needs a separate HEAD. + let data = vec![7u8; 2 * store.block_size()]; + store.put(&path, &data).await.unwrap(); + + let file_sizes = HashMap::from([(INDEX_FILE_NAME.to_string(), data.len() as u64)]); + + // Size recorded in the manifest, so reading the size issues no HEAD. + let _ = store.io_stats_incremental(); // reset + let reader = open_index_file(store.as_ref(), &path, INDEX_FILE_NAME, &file_sizes) + .await + .unwrap(); + assert_eq!(reader.size().await.unwrap(), data.len()); + let stats = store.io_stats_incremental(); + assert_io_eq!( + stats, + read_iops, + 0, + "a known file size must not trigger a HEAD request" + ); + + // Size unknown, as in an older index, so it falls back to a HEAD. 
+ let _ = store.io_stats_incremental(); // reset + let reader = open_index_file(store.as_ref(), &path, INDEX_FILE_NAME, &HashMap::new()) + .await + .unwrap(); + assert_eq!(reader.size().await.unwrap(), data.len()); + let stats = store.io_stats_incremental(); + assert_io_eq!( + stats, + read_iops, + 1, + "an unknown file size must fall back to exactly one HEAD request" + ); + } + + /// `open_index_file` looks up sizes in `IndexMetadata::file_size_map()` by + /// bare file name. This pins that a freshly created HNSW index records both + /// the main and auxiliary files under those exact names with nonzero sizes, + /// which is what lets the open path skip the HEAD. + #[tokio::test] + async fn test_hnsw_index_records_file_sizes() { + use lance_index::{INDEX_AUXILIARY_FILE_NAME, INDEX_FILE_NAME}; + + let test_dir = TempStrDir::default(); + let uri = format!("{}/ds", test_dir.as_str()); + + let reader = lance_datagen::gen_batch() + .col("vector", array::rand_vec::(32.into())) + .into_reader_rows(RowCount::from(400), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, &uri, None).await.unwrap(); + + let params = VectorIndexParams::with_ivf_hnsw_pq_params( + MetricType::L2, + IvfBuildParams { + num_partitions: Some(8), + ..Default::default() + }, + HnswBuildParams { + max_level: 6, + m: 24, + ef_construction: 120, + prefetch_distance: None, + }, + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + ..Default::default() + }, + ); + dataset + .create_index( + &["vector"], + IndexType::Vector, + Some("hnsw".to_string()), + ¶ms, + false, + ) + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + let index = indices.iter().find(|idx| idx.name == "hnsw").unwrap(); + let file_sizes = index.file_size_map(); + + assert!( + file_sizes.get(INDEX_FILE_NAME).copied().unwrap_or(0) > 0, + "manifest should record a nonzero {INDEX_FILE_NAME} size, got {file_sizes:?}" + ); + assert!( + file_sizes + .get(INDEX_AUXILIARY_FILE_NAME) + .copied() + 
.unwrap_or(0) + > 0, + "manifest should record a nonzero {INDEX_AUXILIARY_FILE_NAME} size, got {file_sizes:?}" + ); + } + #[tokio::test] async fn test_initialize_vector_index_ivf_pq() { let test_dir = TempStrDir::default(); @@ -2016,11 +2199,7 @@ mod tests { // Verify the index type and parameters match let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let stats = target_vector_index.statistics().unwrap(); @@ -2048,11 +2227,7 @@ mod tests { // Verify centroids are shared between source and target indices let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -2231,11 +2406,7 @@ mod tests { // Verify the index type and parameters match let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let stats = target_vector_index.statistics().unwrap(); @@ -2267,11 +2438,7 @@ mod tests { // Verify centroids are shared between source and target indices let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -2377,7 +2544,7 @@ mod tests { let invalid_id = max_id + 1000; // let params = VectorIndexParams::ivf_flat(4, MetricType::L2); - let uuid = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4(); let mut ivf_params = IvfBuildParams { num_partitions: Some(4), @@ -2405,7 +2572,7 @@ mod tests { &dataset, "vector", "vector_ivf_flat_dist", - &uuid, + uuid, ¶ms, 
None, &[invalid_id], @@ -2431,7 +2598,7 @@ mod tests { .into_reader_rows(RowCount::from(128), BatchCount::from(1)); let dataset = Dataset::write(reader, &uri, None).await.unwrap(); - let uuid = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4(); let mut ivf_params = IvfBuildParams { num_partitions: Some(4), ..Default::default() @@ -2458,7 +2625,7 @@ mod tests { &dataset, "vector", "vector_ivf_flat_dist", - &uuid, + uuid, ¶ms, None, &[], @@ -2520,7 +2687,7 @@ mod tests { let dataset = Dataset::write(reader, &uri, None).await.unwrap(); let params = VectorIndexParams::ivf_flat(4, MetricType::L2); - let uuid = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4(); let progress = Arc::new(RecordingProgress { train_ivf_complete: AtomicBool::new(false), saw_train_ivf_progress_before_complete: AtomicBool::new(false), @@ -2531,7 +2698,7 @@ mod tests { &dataset, "vector", "vector_ivf_flat_progress", - &uuid, + uuid, ¶ms, None, progress.clone(), @@ -2565,11 +2732,11 @@ mod tests { let dataset = Dataset::write(reader, &uri, None).await.unwrap(); let params = VectorIndexParams::ivf_flat(4, MetricType::L2); - let uuid = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4(); // Pre-create a malformed global training file that is missing the // `lance:global_ivf_centroids` metadata key. 
- let out_base = dataset.indices_dir().join(&*uuid); + let out_base = dataset.indices_dir().join(uuid.to_string()); let training_path = out_base.clone().join("global_training.idx"); let writer = dataset @@ -2600,7 +2767,7 @@ mod tests { &dataset, "vector", "vector_ivf_flat_dist", - &uuid, + uuid, ¶ms, None, &[valid_id], @@ -2689,20 +2856,12 @@ mod tests { // Open both indices to compare centroids let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -2816,11 +2975,7 @@ mod tests { // Verify that the optimized index still shares centroids with the source let target_indices = target_dataset.load_indices().await.unwrap(); let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -2926,11 +3081,7 @@ mod tests { // Verify the index type and parameters match let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let stats = target_vector_index.statistics().unwrap(); @@ -2962,11 +3113,7 @@ mod tests { // Verify centroids are shared between source and target indices let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); @@ 
-3152,11 +3299,7 @@ mod tests { // Verify the index type and parameters match let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let stats = target_vector_index.statistics().unwrap(); @@ -3184,11 +3327,7 @@ mod tests { // Verify centroids are shared between source and target indices let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -3410,11 +3549,7 @@ mod tests { // Verify the index type and parameters match let target_vector_index = target_dataset - .open_vector_index( - "vector", - &target_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &target_indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let stats = target_vector_index.statistics().unwrap(); @@ -3442,11 +3577,7 @@ mod tests { // Verify centroids are shared between source and target indices let source_vector_index = source_dataset - .open_vector_index( - "vector", - &source_index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &source_index.uuid, &NoOpMetricsCollector) .await .unwrap(); diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 08d8c32a001..1e4fec8c762 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -69,6 +69,7 @@ use lance_io::stream::RecordBatchStream; use lance_io::{object_store::ObjectStore, stream::RecordBatchStreamAdapter}; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; use lance_linalg::kernels::normalize_fsl; +use lance_table::format::IndexFile; use log::info; use object_store::path::Path; use prost::Message; @@ -171,6 +172,11 @@ type 
BuildStream = type UnindexedStream = Box> + Send + Unpin + 'static>; +pub struct VectorIndexBuildSummary { + pub indices_merged: usize, + pub files: Vec, +} + impl IvfIndexBuilder { #[allow(clippy::too_many_arguments)] pub fn new( @@ -282,9 +288,8 @@ impl IvfIndexBuilder }) } - // build the index with the all data in the dataset, - // return the number of indices merged - pub async fn build(&mut self) -> Result { + // build the index and return the files created by the writer. + pub async fn build(&mut self) -> Result { let progress = self.progress.clone(); // step 1. train IVF & quantizer @@ -318,13 +323,16 @@ impl IvfIndexBuilder .stage_start("merge_partitions", num_partitions, "partitions") .await?; let build_idx_stream = self.build_partitions().boxed().await?; - self.merge_partitions(build_idx_stream).await?; + let files = self.merge_partitions(build_idx_stream).await?; progress.stage_complete("merge_partitions").await?; - Ok(self.merged_num) + Ok(VectorIndexBuildSummary { + indices_merged: self.merged_num, + files, + }) } - pub async fn remap(&mut self, mapping: &HashMap>) -> Result<()> { + pub async fn remap(&mut self, mapping: &HashMap>) -> Result> { if self.existing_indices.is_empty() { return Err(Error::invalid_input( "No existing indices available for remapping", @@ -359,13 +367,14 @@ impl IvfIndexBuilder } }); - self.merge_partitions( - stream::iter(build_iter) - .buffered(get_num_compute_intensive_cpus()) - .boxed(), - ) - .await?; - Ok(()) + let files = self + .merge_partitions( + stream::iter(build_iter) + .buffered(get_num_compute_intensive_cpus()) + .boxed(), + ) + .await?; + Ok(files) } pub fn with_ivf(&mut self, ivf: IvfModel) -> &mut Self { @@ -1036,7 +1045,7 @@ impl IvfIndexBuilder continue; } - let part_storage = existing_index.load_partition_storage(part_id).await?; + let part_storage = existing_index.load_partition_storage(part_id, None).await?; let mut part_batches = part_storage.to_batches()?.collect::>(); // for PQ, the PQ codes are 
transposed, so we need to transpose them back match Q::quantization_type() { @@ -1108,7 +1117,10 @@ impl IvfIndexBuilder } #[instrument(name = "merge_partitions", level = "debug", skip_all)] - async fn merge_partitions(&mut self, mut build_stream: BuildStream) -> Result<()> { + async fn merge_partitions( + &mut self, + mut build_stream: BuildStream, + ) -> Result> { let Some(ivf) = self.ivf.as_ref() else { return Err(Error::invalid_input("IVF not set before merge partitions")); }; @@ -1347,12 +1359,21 @@ impl IvfIndexBuilder serde_json::to_string(&partition_index_metadata)?, ); - storage_writer.finish().await?; - index_writer.finish().await?; + let storage_summary = storage_writer.finish().await?; + let index_summary = index_writer.finish().await?; log::info!("merging {} partitions done", ivf.num_partitions()); - Ok(()) + Ok(vec![ + IndexFile { + path: INDEX_AUXILIARY_FILE_NAME.to_string(), + size_bytes: storage_summary.size_bytes, + }, + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: index_summary.size_bytes, + }, + ]) } // take raw vectors from the dataset @@ -2371,7 +2392,7 @@ mod tests { let indices = dataset.load_indices_by_name("idx").await.unwrap(); let initial_index = dataset - .open_vector_index("vec", &indices[0].uuid.to_string(), &NoOpMetricsCollector) + .open_vector_index("vec", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let initial_ivf = initial_index.ivf_model(); @@ -2401,7 +2422,7 @@ mod tests { let indices = dataset.load_indices_by_name("idx").await.unwrap(); assert_eq!(indices.len(), 1, "expected merge-all on split"); let optimized = dataset - .open_vector_index("vec", &indices[0].uuid.to_string(), &NoOpMetricsCollector) + .open_vector_index("vec", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let ivf = optimized.ivf_model(); diff --git a/rust/lance/src/index/vector/details.rs b/rust/lance/src/index/vector/details.rs index 83e9b92c209..63f9375792e 100644 --- a/rust/lance/src/index/vector/details.rs +++ 
b/rust/lance/src/index/vector/details.rs @@ -503,7 +503,15 @@ pub async fn infer_vector_index_details( let index_dir = dataset.indice_files_dir(index)?; let file_dir = index_dir.clone().join(uuid.as_str()); let index_file = file_dir.clone().join(INDEX_FILE_NAME); - let reader: Arc = dataset.object_store.open(&index_file).await?.into(); + let file_sizes = index.file_size_map(); + let reader: Arc = super::open_index_file( + dataset.object_store.as_ref(), + &index_file, + INDEX_FILE_NAME, + &file_sizes, + ) + .await? + .into(); let tailing_bytes = read_last_block(reader.as_ref()).await?; let (major_version, minor_version) = read_version(&tailing_bytes)?; diff --git a/rust/lance/src/index/vector/fixture_test.rs b/rust/lance/src/index/vector/fixture_test.rs index 6d577c73ff8..1b82a7f6941 100644 --- a/rust/lance/src/index/vector/fixture_test.rs +++ b/rust/lance/src/index/vector/fixture_test.rs @@ -18,8 +18,8 @@ mod test { use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; - use deepsize::{Context, DeepSizeOf}; use lance_arrow::FixedSizeListArrayExt; + use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{cache::LanceCache, utils::tempfile::TempStdFile}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::{Index, IndexType, vector::Query}; @@ -54,7 +54,8 @@ mod test { impl DeepSizeOf for ResidualCheckMockIndex { fn deep_size_of_children(&self, cx: &mut Context) -> usize { - self.assert_query_value.deep_size_of_children(cx) + self.ret_val.get_array_memory_size() + self.assert_query_value.deep_size_of_children(cx) + + self.ret_val.deep_size_of_children(cx) } } @@ -70,10 +71,6 @@ mod test { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } @@ -211,7 +208,7 @@ mod test { ]))), }); IVFIndex::try_new( - &Uuid::new_v4().to_string(), + Uuid::new_v4(), ivf, reader.into(), mock_sub_index, @@ -268,6 +265,7 @@ 
mod test { use_index: true, query_parallelism: lance_index::vector::DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let idx = make_idx.clone()(expected_query_at_subindex, metric).await; let (partition_ids, _) = idx.find_partitions(&q).unwrap(); diff --git a/rust/lance/src/index/vector/hamming.rs b/rust/lance/src/index/vector/hamming.rs new file mode 100644 index 00000000000..ba6ea98c42d --- /dev/null +++ b/rust/lance/src/index/vector/hamming.rs @@ -0,0 +1,938 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Hamming distance clustering for IVF_FLAT indices. +//! +//! This module provides functionality to perform pairwise hamming distance +//! computation and clustering on specific partitions of IVF_FLAT indices. + +use std::time::Instant; + +use arrow_array::RecordBatchReader; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_schema::DataType; +use lance_core::{Error, Result}; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::vector::VectorIndex; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex}; +use lance_index::vector::flat::storage::FLAT_COLUMN; +use lance_index::vector::storage::VectorStore; +use lance_linalg::distance::{ + ClusteringResult, cluster_pairwise_result, extract_hashes_from_fixed_list, + pairwise_hamming_distance_parallel, +}; +use rand::rng; +use rand::seq::index::sample; + +use crate::dataset::Dataset; +use crate::index::{DatasetIndexExt, DatasetIndexInternalExt}; + +use super::ivf::v2::IVFIndex; + +/// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. +/// +/// This function loads a specific partition from an IVF_FLAT index on a hash column, +/// computes pairwise hamming distances between all hashes in the partition, +/// filters by threshold, and clusters the results using union-find. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `index_name` - Name of the IVF_FLAT index on the hash column +/// * `partition_id` - The partition ID within the IVF_FLAT index +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The index doesn't exist or is not an IVF_FLAT index +/// - The indexed column has wrong type (must be `FixedSizeList`) +/// - The partition ID is out of range +pub async fn hamming_clustering_for_ivf_partition( + dataset: &Dataset, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, +) -> Result> { + // Load indices and find the IVF_FLAT index + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be 
FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Open the vector index + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + // Try to downcast to IVFIndex (IVF_FLAT for binary data) + let ivf_index = index + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + // Check partition ID is valid + let num_partitions = ivf_index.ivf_model().num_partitions(); + if partition_id >= num_partitions { + return Err(Error::invalid_input(format!( + "Partition ID {} is out of range (0..{})", + partition_id, num_partitions + ))); + } + + // Load the partition storage + let storage = ivf_index.load_partition_storage(partition_id, None).await?; + + // Get row IDs + let row_id_slice: Vec = storage.row_ids().copied().collect(); + + if row_id_slice.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Get vectors from the storage batches + let batches: Vec<_> = storage.to_batches()?.collect(); + if batches.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Extract the hash vectors from the FLAT_COLUMN + let mut all_hashes = Vec::new(); + for batch in &batches { + let vectors = batch + .column_by_name(FLAT_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in storage", FLAT_COLUMN)) + })? 
+ .as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(vectors)?; + all_hashes.extend(hashes); + } + + // Compute pairwise hamming distances with threshold filtering + let pairwise_result = pairwise_hamming_distance_parallel( + &all_hashes, + Some(&row_id_slice), + Some(hamming_threshold), + ); + + // Cluster the results + let clustering = cluster_pairwise_result(&pairwise_result); + + Ok(clustering.into_reader(None)) +} + +/// Get partition statistics for an IVF_FLAT index. +pub async fn get_ivf_partition_info( + dataset: &Dataset, + index_name: &str, +) -> Result> { + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + let ivf_index = index + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + let num_partitions = ivf_index.ivf_model().num_partitions(); + let mut partition_infos = Vec::with_capacity(num_partitions); + + for i in 0..num_partitions { + partition_infos.push(PartitionInfo { + partition_id: i, + size: ivf_index.ivf_model().partition_size(i), + }); + } + + Ok(partition_infos) +} + +/// Information about an IVF partition. 
+#[derive(Debug, Clone)] +pub struct PartitionInfo { + pub partition_id: usize, + pub size: usize, +} + +/// Perform pairwise hamming distance clustering on sampled rows from a dataset. +/// +/// This function samples N rows randomly from the dataset, extracts hashes, +/// computes pairwise hamming distances, and clusters the results. +/// It's useful for benchmarking and testing without requiring an IVF index. +/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `sample_size` - Number of rows to sample (if None or >= total rows, uses all rows) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub async fn hamming_clustering_for_sample( + dataset: &Dataset, + column: &str, + sample_size: Option, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get total row count + let total_rows: usize = dataset + .get_fragments() + .iter() + .filter_map(|f| f.metadata().physical_rows) + .sum(); + + let use_sampling = sample_size.is_some_and(|s| s < 
total_rows); + let effective_sample = sample_size.unwrap_or(total_rows).min(total_rows); + + // Read data + let (hashes, row_ids) = if use_sampling { + // Random sample using take() with _rowid (take uses positional indices) + let indices: Vec = sample(&mut rng(), total_rows, effective_sample) + .iter() + .map(|i| i as u64) + .collect(); + + let batch = dataset + .take( + &indices, + crate::dataset::ProjectionRequest::from_columns( + [column, "_rowid"], + dataset.schema(), + ), + ) + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in take result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + } else { + // Full scan + let batch = dataset + .scan() + .project(&[column])? 
+ .with_row_id() + .try_into_batch() + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + }; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. +/// +/// This function reads a contiguous range of rows from a specific fragment, +/// extracts hashes, computes pairwise hamming distances, and clusters the results. +/// Unlike sampling, this reads sequential rows which is useful for distributed +/// processing where each worker handles a specific range of a fragment. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `fragment_id` - The fragment ID to read from +/// * `start_row` - The starting row offset within the fragment +/// * `num_rows` - Number of rows to read from the start position +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The fragment doesn't exist +/// - The column has wrong type (must be `FixedSizeList`) +/// - The row range is out of bounds +pub async fn hamming_clustering_for_range( + dataset: &Dataset, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get the fragment + let fragment = dataset.get_fragment(fragment_id).ok_or_else(|| { + Error::invalid_input(format!("Fragment with ID {} not found", fragment_id)) + })?; + + // Get fragment metadata for physical row count + let fragment_meta = fragment.metadata().clone(); + let physical_rows = fragment_meta 
+ .physical_rows + .ok_or_else(|| Error::invalid_input("Fragment has no physical_rows metadata"))?; + + // Validate the range + if start_row >= physical_rows { + return Err(Error::invalid_input(format!( + "start_row {} is out of range for fragment with {} physical rows", + start_row, physical_rows + ))); + } + + // Adjust num_rows if it exceeds available rows + let effective_num_rows = num_rows.min(physical_rows - start_row); + + if effective_num_rows == 0 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Use scanner with the specific fragment and limit/offset + let batch = dataset + .scan() + .with_fragments(vec![fragment_meta]) + .project(&[column])? + .with_row_id() + .limit(Some(effective_num_rows as i64), Some(start_row as i64))? + .try_into_batch() + .await?; + + // Extract row IDs + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::(); + let row_id_vec: Vec = row_ids.values().to_vec(); + + // Extract hashes + let hash_col = batch + .column_by_name(column) + .ok_or_else(|| Error::invalid_input(format!("Column '{}' not found in result", column)))?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_id_vec), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on provided hashes (no I/O). +/// +/// This is useful for benchmarking the pure compute performance without I/O. +/// Logs timing information via tracing. 
+/// +/// # Arguments +/// +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub fn hamming_clustering_from_hashes( + hashes: &[u64], + row_ids: Option<&[u64]>, + hamming_threshold: u32, +) -> Box { + let num_rows = hashes.len(); + if num_rows < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return empty.into_reader(None); + } + + let total_pairs = (num_rows as u64) * (num_rows as u64 - 1) / 2; + + // Compute pairwise hamming distances + let t_compute_start = Instant::now(); + let pairwise = pairwise_hamming_distance_parallel(hashes, row_ids, Some(hamming_threshold)); + let compute_time = t_compute_start.elapsed(); + + // Cluster edges + let t_cluster_start = Instant::now(); + let clustering = cluster_pairwise_result(&pairwise); + let cluster_time = t_cluster_start.elapsed(); + + // Log timing info + let pairs_per_sec = if compute_time.as_secs_f64() > 0.0 { + total_pairs as f64 / compute_time.as_secs_f64() + } else { + 0.0 + }; + tracing::info!( + num_rows, + total_pairs, + edges = pairwise.len(), + compute_time_ms = compute_time.as_millis(), + cluster_time_ms = cluster_time.as_millis(), + pairs_per_sec_millions = pairs_per_sec / 1_000_000.0, + num_clusters = clustering.num_clusters(), + num_duplicates = clustering.num_duplicates(), + "Hamming clustering completed" + ); + + clustering.into_reader(None) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Array; + + /// Helper to collect all clusters from a reader. 
+ fn collect_clusters(reader: Box) -> Vec<(u64, Vec)> { + let mut clusters = Vec::new(); + for batch in reader { + let batch = batch.unwrap(); + let reps = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let dups = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + let rep = reps.value(i); + let dup_arr = dups.value(i); + let dup_values = dup_arr + .as_any() + .downcast_ref::() + .unwrap(); + let duplicates: Vec = dup_values.values().to_vec(); + clusters.push((rep, duplicates)); + } + } + clusters + } + + #[test] + fn test_hamming_clustering_from_hashes_basic() { + // Create some test hashes with known distances + let hashes = vec![ + 0b0000u64, // hash 0 + 0b0001u64, // hash 1 - distance 1 from hash 0 + 0b0011u64, // hash 2 - distance 1 from hash 1, distance 2 from hash 0 + 0b1111u64, // hash 3 - distance 2 from hash 2, distance 4 from hash 0 + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 1); + let clusters = collect_clusters(reader); + + // With threshold 1, pairs (0,1) and (1,2) should be connected + // This forms one cluster: {0, 1, 2} + assert_eq!(clusters.len(), 1); + assert_eq!(clusters[0].1.len(), 2); // 2 duplicates in the cluster + } + + #[test] + fn test_hamming_clustering_from_hashes_no_clusters() { + // All hashes are far apart + let hashes = vec![ + 0x0000000000000000u64, + 0xFFFFFFFFFFFFFFFFu64, + 0xAAAAAAAAAAAAAAAAu64, + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 5); + let clusters = collect_clusters(reader); + + // With threshold 5, no pairs should be connected (min distance is 32) + assert_eq!(clusters.len(), 0); + } + + #[test] + fn test_hamming_clustering_from_hashes_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001u64]; + let row_ids = vec![100u64, 200u64]; + + let reader = hamming_clustering_from_hashes(&hashes, Some(&row_ids), 1); + let clusters = collect_clusters(reader); + + assert_eq!(clusters.len(), 1); + 
assert_eq!(clusters[0].0, 100); // representative + assert_eq!(clusters[0].1, vec![200]); // duplicates + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ivf::IvfBuildParams; + use std::sync::Arc; + use tempfile::tempdir; + + // Create test data with hash column (FixedSizeList) + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Generate hashes with some duplicates (similar hashes) + let num_rows = 100; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create groups of similar hashes + let base = (i / 10) as u64; // 10 groups + let variation = (i % 10) as u64; + let hash = base.wrapping_mul(0x123456789) ^ variation; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = + FixedSizeListArray::try_new_from_values(values, 8).expect("create hash array"); + + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + // Write dataset + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Create IVF_FLAT index with 4 partitions + let ivf_params = IvfBuildParams::new(4); + let params = crate::index::vector::VectorIndexParams::with_ivf_flat_params( + lance_linalg::distance::MetricType::Hamming, + ivf_params, + ); + + dataset + .create_index( + &["hash"], + crate::index::IndexType::Vector, + None, + ¶ms, + false, + ) + .await + .unwrap(); + + // Load and test + let dataset = 
crate::Dataset::open(uri).await.unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let index_name = &indices[0].name; + + // Test clustering on partition 0 + let reader = hamming_clustering_for_ivf_partition(&dataset, index_name, 0, 10) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Verify we get valid results (may or may not have clusters depending on data distribution) + // At minimum, verify no panics and valid schema + for (rep, dups) in &clusters { + assert!(*rep < num_rows as u64 * 10); // row IDs should be reasonable + for dup in dups { + assert!(*dup < num_rows as u64 * 10); + } + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition_invalid_index() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Test with non-existent index + let result = hamming_clustering_for_ivf_partition(&dataset, "nonexistent", 0, 10).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + } + + #[tokio::test] + async fn test_hamming_clustering_for_sample_integration() { + use arrow_array::{FixedSizeListArray, 
RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create some identical hashes (groups of 5) + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test full scan (no sampling) + let reader = hamming_clustering_for_sample(&dataset, "hash", None, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 (exact match) and groups of 5 identical hashes, + // we should have 10 clusters with 4 duplicates each + assert_eq!(clusters.len(), 10); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test with sampling + let reader = hamming_clustering_for_sample(&dataset, "hash", Some(20), 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + // With sampling, we may get fewer clusters + assert!(clusters.len() <= 10); + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_integration() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use 
lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes (groups of 5 identical hashes) + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Get fragment info + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 1); + let fragment_id = fragments[0].id() as usize; + + // Test reading range from the fragment + // Reading rows 0-25 should cover groups 0-4 (5 groups, each with 5 rows) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 0, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 and 25 rows (groups 0-4), we should have 5 clusters + // Each cluster has 4 duplicates (5 identical hashes - 1 representative = 4 duplicates) + assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test reading a different range (rows 25-50) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 25, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Should have 5 clusters (groups 5-9) + 
assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_invalid_fragment() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test with non-existent fragment + let result = hamming_clustering_for_range(&dataset, "hash", 999, 0, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + + // Test with out-of-range start_row + let result = hamming_clustering_for_range(&dataset, "hash", 0, 1000, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("out of range"), "Error: {}", err); + } +} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index b7e6567b025..5d477a2e8dd 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -13,7 +13,9 @@ use super::{ utils::PartitionLoadLock, }; use crate::dataset::index::dataset_format_version; +use crate::index::DatasetIndexExt; use 
crate::index::DatasetIndexInternalExt; +use crate::index::vector::open_index_file; use crate::index::vector::utils::{get_vector_dim, get_vector_type}; use crate::{ dataset::Dataset, @@ -21,6 +23,7 @@ use crate::{ }; use crate::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion}; use arrow::array::ArrayData; +use arrow::compute::concat_batches; use arrow::datatypes::UInt8Type; use arrow_arith::numeric::sub; use arrow_array::Float32Array; @@ -33,7 +36,6 @@ use arrow_buffer::MutableBuffer; use arrow_schema::{DataType, Schema}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; -use deepsize::DeepSizeOf; use futures::TryFutureExt; use futures::{ Stream, TryStreamExt, @@ -41,6 +43,7 @@ use futures::{ }; use io::write_hnsw_quantization_index_partitions; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID_FIELD, Result, cache::{LanceCache, UnsizedCacheKey, WeakLanceCache}, @@ -48,6 +51,7 @@ use lance_core::{ utils::parse::parse_env_as_bool, utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS}, }; +use lance_encoding::decoder::FilterExpression; use lance_file::{ format::MAGIC, previous::writer::{ @@ -60,14 +64,18 @@ use lance_index::metrics::MetricsCollector; use lance_index::metrics::NoOpMetricsCollector; use lance_index::vector::DISTANCE_TYPE_KEY; use lance_index::vector::bq::builder::RabitQuantizer; -use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatMetadata, FlatQuantizer}; +use lance_index::vector::flat::storage::{FLAT_COLUMN, FlatBinStorage, FlatFloatStorage}; use lance_index::vector::hnsw::HnswMetadata; use lance_index::vector::hnsw::builder::HNSW_METADATA_KEY; use lance_index::vector::ivf::storage::IVF_METADATA_KEY; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::kmeans::{KMeans, KMeansParams}; -use 
lance_index::vector::pq::storage::transpose; +use lance_index::vector::pq::storage::{ + PQ_METADATA_KEY, ProductQuantizationMetadata, ProductQuantizationStorage, transpose, +}; use lance_index::vector::quantizer::QuantizationType; +use lance_index::vector::storage::STORAGE_METADATA_KEY; use lance_index::vector::v3::shuffler::create_ivf_shuffler; use lance_index::vector::v3::subindex::{IvfSubIndex, SubIndexType}; use lance_index::{ @@ -82,13 +90,17 @@ use lance_index::{ storage::IVF_PARTITION_KEY, }, pq::{PQBuildParams, ProductQuantizer}, - quantizer::{Quantization, QuantizationMetadata, Quantizer}, - sq::ScalarQuantizer, + quantizer::{Quantization, QuantizationMetadata, Quantizer, QuantizerStorage}, + sq::{ + ScalarQuantizer, + storage::{SQ_METADATA_KEY, ScalarQuantizationMetadata, ScalarQuantizationStorage}, + }, }, }; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::utils::CachedFileSize; use lance_io::{ + ReadBatchParams, encodings::plain::PlainEncoder, local::to_local_path, object_store::ObjectStore, @@ -97,7 +109,7 @@ use lance_io::{ }; use lance_linalg::distance::{DistanceType, Dot, L2, MetricType}; use lance_linalg::{distance::Normalize, kernels::normalize_fsl_owned}; -use lance_table::format::{IndexMetadata as TableIndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata as TableIndexMetadata}; use log::{info, warn}; use object_store::path::Path; use prost::Message; @@ -149,7 +161,7 @@ impl UnsizedCacheKey for LegacyIVFPartitionKey { /// IVF Index. /// WARNING: Internal API with no stability guarantees. 
pub struct IVFIndex { - uuid: String, + uuid: Uuid, /// Ivf model pub ivf: IvfModel, @@ -167,17 +179,16 @@ pub struct IVFIndex { } impl DeepSizeOf for IVFIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.uuid.deep_size_of_children(context) - + self.reader.deep_size_of_children(context) - + self.sub_index.deep_size_of_children(context) + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + // `Uuid` is a fixed 16-byte struct with no heap children, so contributes 0. + self.reader.deep_size_of_children(context) + self.sub_index.deep_size_of_children(context) } } impl IVFIndex { /// Create a new IVF index. pub(crate) fn try_new( - uuid: &str, + uuid: Uuid, ivf: IvfModel, reader: Arc, sub_index: Arc, @@ -193,7 +204,7 @@ impl IVFIndex { let num_partitions = ivf.num_partitions(); Ok(Self { - uuid: uuid.to_owned(), + uuid, ivf, reader, sub_index, @@ -376,14 +387,14 @@ pub(crate) fn select_segment_for_single_rebalance( // TODO: move to `lance-index` crate. /// -/// Returns (new_uuid, num_indices_merged) +/// Returns (new_uuid, num_indices_merged, files) pub(crate) async fn optimize_vector_indices( dataset: Dataset, unindexed: Option, vector_column: &str, logical_index: &LogicalIvfView<'_>, options: &OptimizeOptions, -) -> Result<(Uuid, usize)> { +) -> Result<(Uuid, usize, Vec)> { let existing_indices = logical_index.indices().cloned().collect::>(); // Sanity check the indices if existing_indices.is_empty() { @@ -420,49 +431,51 @@ pub(crate) async fn optimize_vector_indices( "optimizing vector index: the first index isn't IVF".to_string(), ))?; - let merged = if let Some(pq_index) = first_idx.sub_index.as_any().downcast_ref::() { - optimize_ivf_pq_indices( - first_idx, - pq_index, - vector_column, - unindexed, - &existing_indices, - options, - writer, - dataset.version().version, - ) - .await? 
- } else if let Some(hnsw_sq) = first_idx - .sub_index - .as_any() - .downcast_ref::>() - { - let aux_file = dataset - .indices_dir() - .join(new_uuid.to_string()) - .join(INDEX_AUXILIARY_FILE_NAME); - let aux_writer = object_store.create(&aux_file).await?; - optimize_ivf_hnsw_indices( - Arc::new(dataset), - first_idx, - hnsw_sq, - vector_column, - unindexed, - &existing_indices, - options, - writer, - aux_writer, - ) - .await? - } else { - return Err(Error::index( - "optimizing vector index: the sub index isn't PQ or HNSW".to_string(), - )); - }; + let (merged, files) = + if let Some(pq_index) = first_idx.sub_index.as_any().downcast_ref::() { + let (merged, file) = optimize_ivf_pq_indices( + first_idx, + pq_index, + vector_column, + unindexed, + &existing_indices, + options, + writer, + dataset.version().version, + ) + .await?; + (merged, vec![file]) + } else if let Some(hnsw_sq) = first_idx + .sub_index + .as_any() + .downcast_ref::>() + { + let aux_file = dataset + .indices_dir() + .join(new_uuid.to_string()) + .join(INDEX_AUXILIARY_FILE_NAME); + let aux_writer = object_store.create(&aux_file).await?; + optimize_ivf_hnsw_indices( + Arc::new(dataset), + first_idx, + hnsw_sq, + vector_column, + unindexed, + &existing_indices, + options, + writer, + aux_writer, + ) + .await? 
+ } else { + return Err(Error::index( + "optimizing vector index: the sub index isn't PQ or HNSW".to_string(), + )); + }; // never change the index version, // because we won't update the legacy vector index format - Ok((new_uuid, merged)) + Ok((new_uuid, merged, files)) } pub(crate) async fn optimize_vector_indices_v2( @@ -471,7 +484,7 @@ pub(crate) async fn optimize_vector_indices_v2( vector_column: &str, existing_indices: &[Arc], options: &OptimizeOptions, -) -> Result<(Uuid, usize)> { +) -> Result<(Uuid, usize, Vec)> { // Sanity check the indices if existing_indices.is_empty() { return Err(Error::index( @@ -496,7 +509,7 @@ pub(crate) async fn optimize_vector_indices_v2( let shuffler = create_ivf_shuffler(temp_dir_path, num_partitions, format_version, None); let (_, element_type) = get_vector_type(dataset.schema(), vector_column)?; - let merged_num = match index_type { + let summary = match index_type { // IVF_FLAT (SubIndexType::Flat, QuantizationType::Flat) => { if element_type == DataType::UInt8 { @@ -705,7 +718,7 @@ pub(crate) async fn optimize_vector_indices_v2( } }; - Ok((new_uuid, merged_num)) + Ok((new_uuid, summary.indices_merged, summary.files)) } #[allow(clippy::too_many_arguments)] @@ -718,7 +731,7 @@ async fn optimize_ivf_pq_indices( options: &OptimizeOptions, mut writer: Box, dataset_version: u64, -) -> Result { +) -> Result<(usize, IndexFile)> { let metric_type = first_idx.metric_type; let dim = first_idx.ivf.dimension(); @@ -785,9 +798,16 @@ async fn optimize_ivf_pq_indices( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? 
as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(existing_indices.len() - start_pos) + Ok(( + existing_indices.len() - start_pos, + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }, + )) } #[allow(clippy::too_many_arguments)] @@ -801,7 +821,7 @@ async fn optimize_ivf_hnsw_indices( options: &OptimizeOptions, writer: Box, aux_writer: Box, -) -> Result { +) -> Result<(usize, Vec)> { let distance_type = first_idx.metric_type; let quantizer = hnsw_index.quantizer().clone(); let ivf = lance_index::vector::ivf::new_ivf_transformer_with_quantizer( @@ -938,13 +958,27 @@ async fn optimize_ivf_hnsw_indices( writer.add_metadata(IVF_PARTITION_KEY, &hnsw_metadata_json.to_string()); ivf_mut.write(&mut writer).await?; + let index_size = writer.tell().await? as u64; writer.finish().await?; // Write the aux file aux_ivf.write(&mut aux_writer).await?; + let aux_size = aux_writer.tell().await? as u64; aux_writer.finish().await?; - Ok(existing_indices.len() - start_pos) + Ok(( + existing_indices.len() - start_pos, + vec![ + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: index_size, + }, + IndexFile { + path: INDEX_AUXILIARY_FILE_NAME.to_string(), + size_bytes: aux_size, + }, + ], + )) } #[derive(Serialize)] @@ -1054,10 +1088,6 @@ impl Index for IVFIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn index_type(&self) -> IndexType { if self.sub_index.as_any().downcast_ref::().is_some() { IndexType::IvfPq @@ -1101,7 +1131,7 @@ impl Index for IVFIndex { Ok(serde_json::to_value(IvfIndexStatistics { index_type: self.index_type().to_string(), - uuid: self.uuid.clone(), + uuid: self.uuid.to_string(), uri: to_local_path(self.reader.path()), metric_type: self.metric_type.to_string(), num_partitions: self.ivf.num_partitions(), @@ -1589,12 +1619,12 @@ pub async fn build_ivf_pq_index( dataset: &Dataset, column: &str, index_name: &str, - uuid: &str, + uuid: Uuid, metric_type: MetricType, ivf_params: &IvfBuildParams, pq_params: 
&PQBuildParams, progress: std::sync::Arc, -) -> Result<()> { +) -> Result> { let (ivf_model, pq) = build_ivf_model_and_pq( dataset, column, @@ -1607,7 +1637,7 @@ pub async fn build_ivf_pq_index( let stream = scan_index_field_stream(dataset, column).await?; let precomputed_partitions = load_precomputed_partitions_if_available(ivf_params).await?; - write_ivf_pq_file( + let file = write_ivf_pq_file( dataset.object_store.as_ref(), dataset.indices_dir(), column, @@ -1623,7 +1653,8 @@ pub async fn build_ivf_pq_index( ivf_params.shuffle_partition_concurrency, ivf_params.precomputed_shuffle_buffers.clone(), ) - .await + .await?; + Ok(vec![file]) } #[allow(clippy::too_many_arguments)] @@ -1631,7 +1662,7 @@ pub async fn build_ivf_hnsw_pq_index( dataset: &Dataset, column: &str, index_name: &str, - uuid: &str, + uuid: Uuid, metric_type: MetricType, ivf_params: &IvfBuildParams, hnsw_params: &HnswBuildParams, @@ -1732,13 +1763,13 @@ fn generate_remap_tasks(offsets: &[usize], lengths: &[u32]) -> Result, mapping: &HashMap>, column: String, -) -> Result<()> { +) -> Result> { let dataset = dataset.clone(); - let index_dir = dataset.indices_dir().join(new_uuid); + let index_dir = dataset.indices_dir().join(new_uuid.to_string()); let (_, element_type) = get_vector_type(dataset.schema(), &column)?; match index.sub_index_type() { (SubIndexType::Flat, QuantizationType::Flat) => match element_type { @@ -1829,20 +1860,34 @@ pub(crate) async fn remap_index_file_v3( #[allow(clippy::too_many_arguments)] pub(crate) async fn remap_index_file( dataset: &Dataset, - old_uuid: &str, - new_uuid: &str, + old_uuid: &Uuid, + new_uuid: &Uuid, old_version: u64, index: &IVFIndex, mapping: &HashMap>, name: String, column: String, transforms: Vec, -) -> Result<()> { +) -> Result { let object_store = dataset.object_store.as_ref(); - let old_path = dataset.indices_dir().join(old_uuid).join(INDEX_FILE_NAME); - let new_path = dataset.indices_dir().join(new_uuid).join(INDEX_FILE_NAME); + let old_path = dataset + 
.indices_dir() + .join(old_uuid.to_string()) + .join(INDEX_FILE_NAME); + let new_path = dataset + .indices_dir() + .join(new_uuid.to_string()) + .join(INDEX_FILE_NAME); - let reader: Arc = object_store.open(&old_path).await?.into(); + let file_sizes = dataset + .load_index(old_uuid) + .await? + .map(|index| index.file_size_map()) + .unwrap_or_default(); + let reader: Arc = + open_index_file(object_store, &old_path, INDEX_FILE_NAME, &file_sizes) + .await? + .into(); let mut writer = object_store.create(&new_path).await?; let tasks = generate_remap_tasks(&index.ivf.offsets, &index.ivf.lengths)?; @@ -1883,9 +1928,13 @@ pub(crate) async fn remap_index_file( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(()) + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }) } /// Write the index to the index file. 
@@ -1896,7 +1945,7 @@ async fn write_ivf_pq_file( index_dir: Path, column: &str, index_name: &str, - uuid: &str, + uuid: Uuid, dataset_version: u64, mut ivf: IvfModel, pq: ProductQuantizer, @@ -1906,8 +1955,11 @@ async fn write_ivf_pq_file( shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, precomputed_shuffle_buffers: Option<(Path, Vec)>, -) -> Result<()> { - let path = index_dir.clone().join(uuid).join(INDEX_FILE_NAME); +) -> Result { + let path = index_dir + .clone() + .join(uuid.to_string()) + .join(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; let start = std::time::Instant::now(); @@ -1944,9 +1996,13 @@ async fn write_ivf_pq_file( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(()) + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }) } pub async fn write_ivf_pq_file_from_existing_index( @@ -1989,7 +2045,7 @@ async fn write_ivf_hnsw_file( dataset: &Dataset, column: &str, _index_name: &str, - uuid: &str, + uuid: Uuid, mut ivf: IvfModel, quantizer: Quantizer, distance_type: DistanceType, @@ -2001,7 +2057,10 @@ async fn write_ivf_hnsw_file( precomputed_shuffle_buffers: Option<(Path, Vec)>, ) -> Result<()> { let object_store = dataset.object_store.as_ref(); - let path = dataset.indices_dir().join(uuid).join(INDEX_FILE_NAME); + let path = dataset + .indices_dir() + .join(uuid.to_string()) + .join(INDEX_FILE_NAME); let writer = object_store.create(&path).await?; let schema = lance_core::datatypes::Schema::try_from(HNSW::schema().as_ref())?; @@ -2022,7 +2081,7 @@ async fn write_ivf_hnsw_file( let aux_path = dataset .indices_dir() - .join(uuid) + .join(uuid.to_string()) .join(INDEX_AUXILIARY_FILE_NAME); let aux_writer = object_store.create(&aux_path).await?; 
let schema = Schema::new(vec![ @@ -2158,7 +2217,7 @@ pub(crate) async fn merge_segments_with_progress( let index_version = infer_source_index_version(&segments)?; let segment_uuid = Uuid::new_v4(); let final_dir = indices_dir.clone().join(segment_uuid.to_string()); - merge_segments_to_dir( + let files = merge_segments_to_dir( object_store, indices_dir, &final_dir, @@ -2167,7 +2226,6 @@ pub(crate) async fn merge_segments_with_progress( progress, ) .await?; - let files = list_index_files_with_sizes(object_store, &final_dir).await?; merged_segment = TableIndexMetadata { uuid: segment_uuid, @@ -2194,7 +2252,7 @@ async fn merge_segments_to_dir( segments: &[TableIndexMetadata], _requested_index_type: Option, progress: Arc, -) -> Result<()> { +) -> Result> { reset_final_segment_dir(object_store, final_dir).await?; debug_assert!( @@ -2221,14 +2279,15 @@ async fn merge_segments_to_dir( }) .collect::>(); - lance_index::vector::distributed::index_merger::merge_partial_vector_auxiliary_files( - object_store, - &aux_paths, - final_dir, - progress.clone(), - ) - .await?; - write_root_vector_index_from_auxiliary( + let auxiliary_file = + lance_index::vector::distributed::index_merger::merge_partial_vector_auxiliary_files( + object_store, + &aux_paths, + final_dir, + progress.clone(), + ) + .await?; + let index_file = write_root_vector_index_from_auxiliary( object_store, final_dir, None, @@ -2237,7 +2296,7 @@ async fn merge_segments_to_dir( ) .await?; - Ok(()) + Ok(vec![auxiliary_file, index_file]) } fn infer_source_index_version(group: &[TableIndexMetadata]) -> Result { @@ -2267,7 +2326,7 @@ async fn write_root_vector_index_from_auxiliary( requested_index_type: Option, centroid_source_index_paths: &[Path], progress: Arc, -) -> Result<()> { +) -> Result { let aux_path = index_dir.clone().join(INDEX_AUXILIARY_FILE_NAME); let scheduler = ScanScheduler::new( Arc::new(object_store.clone()), @@ -2336,9 +2395,9 @@ async fn write_root_vector_index_from_auxiliary( let ivf_bytes = 
pb_ivf.encode_to_vec().into(); // Determine index metadata JSON from auxiliary or requested index type. - let index_meta_json = + let mut idx_meta: IndexMetadata = if let Some(idx_json) = meta.file_schema.metadata.get(INDEX_METADATA_SCHEMA_KEY) { - idx_json.clone() + serde_json::from_str(idx_json)? } else { let dt = meta .file_schema @@ -2352,11 +2411,25 @@ async fn write_root_vector_index_from_auxiliary( .to_string(), ) })?; - serde_json::to_string(&IndexMetadata { + IndexMetadata { index_type: index_type.to_string(), distance_type: dt, - })? + } }; + if let Some(source_hnsw_index_metadata) = + read_hnsw_index_metadata_from_sources(object_store, &scheduler, centroid_source_index_paths) + .await? + { + if idx_meta.index_type.starts_with("IVF_HNSW") + && !index_metadata_eq(&idx_meta, &source_hnsw_index_metadata) + { + return Err(Error::invalid_input(format!( + "HNSW index metadata mismatch while merging index segments: expected {:?}, got {:?}", + idx_meta, source_hnsw_index_metadata + ))); + } + idx_meta = source_hnsw_index_metadata; + } // Write root index.idx via V2 writer so downstream opens through v2 path. let index_path = index_dir.clone().join(INDEX_FILE_NAME); @@ -2377,16 +2450,8 @@ async fn write_root_vector_index_from_auxiliary( }, )?; - // Attach precise index metadata (type + distance). - v2_writer.add_schema_metadata(INDEX_METADATA_SCHEMA_KEY, &index_meta_json); - - // Add IVF protobuf as a global buffer and reference via IVF_METADATA_KEY. - let pos = v2_writer.add_global_buffer(ivf_bytes).await?; - v2_writer.add_schema_metadata(IVF_METADATA_KEY, pos.to_string()); - // For HNSW variants, attach per-partition metadata list; for FLAT-based // variants, attach minimal placeholder metadata. 
- let idx_meta: IndexMetadata = serde_json::from_str(&index_meta_json)?; let is_hnsw = idx_meta.index_type.starts_with("IVF_HNSW"); let is_flat_based = matches!( idx_meta.index_type.as_str(), @@ -2394,27 +2459,361 @@ async fn write_root_vector_index_from_auxiliary( ); if is_hnsw { - let default_meta = HnswMetadata::default(); - let meta_vec: Vec = (0..nlist) - .map(|_| serde_json::to_string(&default_meta).unwrap()) - .collect(); - let meta_vec_json = serde_json::to_string(&meta_vec)?; - v2_writer.add_schema_metadata(HNSW_METADATA_KEY, meta_vec_json); - } else if is_flat_based { - let meta_vec: Vec = (0..nlist).map(|_| "{}".to_string()).collect(); - let meta_vec_json = serde_json::to_string(&meta_vec)?; - v2_writer.add_schema_metadata("lance:flat", meta_vec_json); + let hnsw_params = read_hnsw_build_params_from_sources( + object_store, + &scheduler, + centroid_source_index_paths, + ) + .await?; + write_hnsw_root_index_from_auxiliary( + &mut v2_writer, + &aux_reader, + &ivf_model, + &hnsw_params, + &idx_meta, + progress.clone(), + ) + .await?; + } else { + // Attach precise index metadata (type + distance). + let index_meta_json = serde_json::to_string(&idx_meta)?; + v2_writer.add_schema_metadata(INDEX_METADATA_SCHEMA_KEY, &index_meta_json); + + // Add IVF protobuf as a global buffer and reference via IVF_METADATA_KEY. 
+ let pos = v2_writer.add_global_buffer(ivf_bytes).await?; + v2_writer.add_schema_metadata(IVF_METADATA_KEY, pos.to_string()); + + if is_flat_based { + let meta_vec: Vec = (0..nlist).map(|_| "{}".to_string()).collect(); + let meta_vec_json = serde_json::to_string(&meta_vec)?; + v2_writer.add_schema_metadata("lance:flat", meta_vec_json); + } + + let empty_batch = RecordBatch::new_empty(arrow_schema); + v2_writer.write_batch(&empty_batch).await?; } - let empty_batch = RecordBatch::new_empty(arrow_schema); - v2_writer.write_batch(&empty_batch).await?; - v2_writer.finish().await?; + let summary = v2_writer.finish().await?; progress.stage_progress("write_root_index", 1).await?; progress.stage_complete("write_root_index").await?; + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: summary.size_bytes, + }) +} + +async fn read_hnsw_index_metadata_from_sources( + object_store: &ObjectStore, + scheduler: &Arc, + source_index_paths: &[Path], +) -> Result> { + let mut index_metadata: Option = None; + + for source_index_path in source_index_paths { + if !object_store.exists(source_index_path).await? 
{ + continue; + } + + let fh = scheduler + .open_file(source_index_path, &CachedFileSize::unknown()) + .await?; + let reader = V2Reader::try_open( + fh, + None, + Arc::default(), + &LanceCache::no_cache(), + V2ReaderOptions::default(), + ) + .await?; + let Some(metadata_json) = reader + .metadata() + .file_schema + .metadata + .get(INDEX_METADATA_SCHEMA_KEY) + else { + continue; + }; + let metadata: IndexMetadata = serde_json::from_str(metadata_json)?; + if !metadata.index_type.starts_with("IVF_HNSW") { + continue; + } + + if let Some(index_metadata) = index_metadata.as_ref() { + if !index_metadata_eq(index_metadata, &metadata) { + return Err(Error::invalid_input(format!( + "HNSW index metadata mismatch while merging index segments: \ + expected {:?}, got {:?} in {}", + index_metadata, metadata, source_index_path + ))); + } + } else { + index_metadata = Some(metadata); + } + } + + Ok(index_metadata) +} + +fn index_metadata_eq(left: &IndexMetadata, right: &IndexMetadata) -> bool { + left.index_type == right.index_type && left.distance_type == right.distance_type +} + +async fn read_hnsw_build_params_from_sources( + object_store: &ObjectStore, + scheduler: &Arc, + source_index_paths: &[Path], +) -> Result { + let mut build_params: Option = None; + + for source_index_path in source_index_paths { + if !object_store.exists(source_index_path).await? 
{ + continue; + } + + let fh = scheduler + .open_file(source_index_path, &CachedFileSize::unknown()) + .await?; + let reader = V2Reader::try_open( + fh, + None, + Arc::default(), + &LanceCache::no_cache(), + V2ReaderOptions::default(), + ) + .await?; + let Some(metadata_json) = reader + .metadata() + .file_schema + .metadata + .get(HNSW_METADATA_KEY) + else { + continue; + }; + let partition_metadata: Vec = serde_json::from_str(metadata_json)?; + for metadata in partition_metadata { + if metadata.is_empty() { + continue; + } + let metadata: HnswMetadata = serde_json::from_str(&metadata)?; + if let Some(build_params) = build_params.as_ref() { + if !hnsw_build_params_eq(build_params, &metadata.params) { + return Err(Error::invalid_input(format!( + "HNSW build parameters mismatch while merging index segments: \ + expected {:?}, got {:?} in {}", + build_params, metadata.params, source_index_path + ))); + } + } else { + build_params = Some(metadata.params); + } + } + } + + Ok(build_params.unwrap_or_default()) +} + +fn hnsw_build_params_eq(left: &HnswBuildParams, right: &HnswBuildParams) -> bool { + left.max_level == right.max_level + && left.m == right.m + && left.ef_construction == right.ef_construction + && left.prefetch_distance == right.prefetch_distance +} + +async fn write_hnsw_root_index_from_auxiliary( + writer: &mut V2Writer, + aux_reader: &V2Reader, + aux_ivf: &IvfModel, + hnsw_params: &HnswBuildParams, + index_metadata: &IndexMetadata, + progress: Arc, +) -> Result<()> { + let mut index_ivf = if let Some(centroids) = aux_ivf.centroids.clone() { + IvfModel::new(centroids, aux_ivf.loss) + } else { + IvfModel::empty() + }; + let distance_type = DistanceType::try_from(index_metadata.distance_type.as_str())?; + let mut partition_index_metadata = Vec::with_capacity(aux_ivf.num_partitions()); + + progress + .stage_start( + "rebuild_hnsw_graph", + Some(aux_ivf.num_partitions() as u64), + "partitions", + ) + .await?; + + for partition_id in 0..aux_ivf.num_partitions() 
{ + let row_range = aux_ivf.row_range(partition_id); + if row_range.is_empty() { + index_ivf.add_partition(0); + partition_index_metadata.push(String::new()); + progress + .stage_progress("rebuild_hnsw_graph", partition_id as u64 + 1) + .await?; + continue; + } + + let batch = read_v2_partition_batch(aux_reader, row_range).await?; + let hnsw = build_hnsw_from_storage_batch( + &index_metadata.index_type, + batch, + aux_reader, + distance_type, + hnsw_params, + ) + .await?; + let index_batch = hnsw.to_batch()?; + + writer.write_batch(&index_batch).await?; + index_ivf.add_partition(index_batch.num_rows() as u32); + partition_index_metadata.push(serde_json::to_string(&hnsw.metadata())?); + progress + .stage_progress("rebuild_hnsw_graph", partition_id as u64 + 1) + .await?; + } + + progress.stage_complete("rebuild_hnsw_graph").await?; + + write_hnsw_index_metadata(writer, &index_ivf, distance_type, index_metadata).await?; + writer.add_schema_metadata( + HNSW_METADATA_KEY, + serde_json::to_string(&partition_index_metadata)?, + ); + + Ok(()) +} + +async fn read_v2_partition_batch(reader: &V2Reader, range: Range) -> Result { + let schema = Arc::new(reader.schema().as_ref().into()); + let stream = reader + .read_stream( + ReadBatchParams::Range(range), + u32::MAX, + 4, + FilterExpression::no_filter(), + ) + .await?; + let batches = stream.try_collect::>().await?; + if batches.is_empty() { + Ok(RecordBatch::new_empty(schema)) + } else { + Ok(concat_batches(&schema, batches.iter())?) + } +} + +async fn build_hnsw_from_storage_batch( + index_type: &str, + batch: RecordBatch, + aux_reader: &V2Reader, + distance_type: DistanceType, + hnsw_params: &HnswBuildParams, +) -> Result { + match index_type { + "IVF_HNSW_FLAT" => { + let metadata = read_storage_metadata::(aux_reader, "")?; + let vector_type = batch + .column_by_name(FLAT_COLUMN) + .ok_or_else(|| { + Error::index(format!( + "{FLAT_COLUMN} column missing from HNSW_FLAT storage" + )) + })? 
+ .as_fixed_size_list() + .value_type(); + if vector_type == DataType::UInt8 && distance_type == DistanceType::Hamming { + let storage = + FlatBinStorage::try_from_batch(batch, &metadata, distance_type, None)?; + HNSW::index_vectors(&storage, hnsw_params.clone()) + } else { + let storage = + FlatFloatStorage::try_from_batch(batch, &metadata, distance_type, None)?; + HNSW::index_vectors(&storage, hnsw_params.clone()) + } + } + "IVF_HNSW_PQ" => { + let metadata = read_pq_storage_metadata(aux_reader).await?; + let storage = + ProductQuantizationStorage::try_from_batch(batch, &metadata, distance_type, None)?; + HNSW::index_vectors(&storage, hnsw_params.clone()) + } + "IVF_HNSW_SQ" => { + let metadata = + read_storage_metadata::(aux_reader, SQ_METADATA_KEY)?; + let storage = + ScalarQuantizationStorage::try_from_batch(batch, &metadata, distance_type, None)?; + HNSW::index_vectors(&storage, hnsw_params.clone()) + } + other => Err(Error::index(format!( + "Cannot rebuild HNSW graph for unsupported index type {other}" + ))), + } +} + +async fn write_hnsw_index_metadata( + writer: &mut V2Writer, + ivf: &IvfModel, + distance_type: DistanceType, + index_metadata: &IndexMetadata, +) -> Result<()> { + let pb_ivf: lance_index::pb::Ivf = ivf.try_into()?; + let pos = writer + .add_global_buffer(pb_ivf.encode_to_vec().into()) + .await?; + writer.add_schema_metadata(IVF_METADATA_KEY, pos.to_string()); + writer.add_schema_metadata( + INDEX_METADATA_SCHEMA_KEY, + serde_json::to_string(&IndexMetadata { + index_type: index_metadata.index_type.clone(), + distance_type: distance_type.to_string(), + })?, + ); Ok(()) } +async fn read_pq_storage_metadata(reader: &V2Reader) -> Result { + let mut metadata = + read_storage_metadata::(reader, PQ_METADATA_KEY)?; + if metadata.codebook.is_none() { + let tensor_bytes = reader + .read_global_buffer(metadata.codebook_position as u32) + .await?; + let codebook_tensor: lance_index::pb::Tensor = Message::decode(tensor_bytes)?; + metadata.codebook = 
Some(FixedSizeListArray::try_from(&codebook_tensor)?); + } + Ok(metadata) +} + +fn read_storage_metadata(reader: &V2Reader, storage_metadata_key: &str) -> Result +where + T: serde::de::DeserializeOwned, +{ + if !storage_metadata_key.is_empty() + && let Some(metadata) = reader + .metadata() + .file_schema + .metadata + .get(storage_metadata_key) + { + return Ok(serde_json::from_str(metadata)?); + } + + let storage_metadata = reader + .metadata() + .file_schema + .metadata + .get(STORAGE_METADATA_KEY) + .ok_or_else(|| Error::index(format!("{STORAGE_METADATA_KEY} missing from storage file")))?; + let metadata_entries: Vec = serde_json::from_str(storage_metadata)?; + let metadata = metadata_entries.first().ok_or_else(|| { + Error::index(format!( + "{STORAGE_METADATA_KEY} did not contain any storage metadata entries" + )) + })?; + Ok(serde_json::from_str(metadata)?) +} + async fn do_train_ivf_model( centroids: Option>, data: &PrimitiveArray, @@ -4467,6 +4866,7 @@ mod tests { use_index: true, query_parallelism: lance_index::vector::DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let (partitions, _) = index.find_partitions(&query).unwrap(); let nearest_partition_id = partitions.value(0) as usize; @@ -4642,13 +5042,12 @@ mod tests { let pq_params = PQBuildParams::new(NUM_SUBVECTORS as usize, NUM_BITS as usize); let uuid = Uuid::new_v4(); - let uuid_str = uuid.to_string(); build_ivf_pq_index( &dataset, WellKnownIvfPqData::COLUMN, INDEX_NAME, - &uuid_str, + uuid, MetricType::L2, &ivf_params, &pq_params, @@ -4693,7 +5092,7 @@ mod tests { .unwrap(); let index = dataset_mut - .open_vector_index(WellKnownIvfPqData::COLUMN, &uuid_str, &NoOpMetricsCollector) + .open_vector_index(WellKnownIvfPqData::COLUMN, &uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -4737,12 +5136,11 @@ mod tests { let mapping = build_mapping(row_ids_to_modify, row_ids_to_remove, max_id); let new_uuid = Uuid::new_v4(); - let new_uuid_str = new_uuid.to_string(); 
remap_index_file( &dataset_mut, - &uuid_str, - &new_uuid_str, + &uuid, + &new_uuid, dataset_mut.version().version, ivf_index, &mapping, @@ -4789,11 +5187,7 @@ mod tests { .unwrap(); let remapped = dataset_mut - .open_vector_index( - WellKnownIvfPqData::COLUMN, - &new_uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index(WellKnownIvfPqData::COLUMN, &new_uuid, &NoOpMetricsCollector) .await .unwrap(); let ivf_remapped = remapped.as_any().downcast_ref::().unwrap(); @@ -5849,11 +6243,7 @@ mod tests { .unwrap(); let indices = dataset.load_indices().await.unwrap(); let idx = dataset - .open_generic_index( - "vector", - indices[0].uuid.to_string().as_str(), - &NoOpMetricsCollector, - ) + .open_generic_index("vector", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let ivf_idx = idx.as_any().downcast_ref::().unwrap(); @@ -5872,7 +6262,7 @@ mod tests { ); // PQ code is on residual space - let pq_store = ivf_idx.load_partition_storage(0).await.unwrap(); + let pq_store = ivf_idx.load_partition_storage(0, None).await.unwrap(); pq_store .codebook() .values() diff --git a/rust/lance/src/index/vector/ivf/io.rs b/rust/lance/src/index/vector/ivf/io.rs index bb52dd4b484..56d220aeed2 100644 --- a/rust/lance/src/index/vector/ivf/io.rs +++ b/rust/lance/src/index/vector/ivf/io.rs @@ -579,11 +579,7 @@ mod tests { assert_eq!(ds.get_fragments().len(), 2); let idx = ds - .open_vector_index( - "vector", - &indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let _ivf_idx = idx diff --git a/rust/lance/src/index/vector/ivf/partition_serde.rs b/rust/lance/src/index/vector/ivf/partition_serde.rs index f8d13a2f0b5..ad737620a94 100644 --- a/rust/lance/src/index/vector/ivf/partition_serde.rs +++ b/rust/lance/src/index/vector/ivf/partition_serde.rs @@ -3,46 +3,35 @@ //! Serialization and zero-copy deserialization for IVF partition cache entries. //! -//! The format is: -//! -//! 
```text -//! [header_len: u64 LE] -//! [header: JSON bytes] -//! [sub_index Arrow IPC stream] -//! [... quantizer-specific IPC streams ...] -//! [storage Arrow IPC stream] -//! ``` -//! -//! Each IPC section is a self-delimiting Arrow IPC stream (schema + batches + EOS -//! marker), written directly to the underlying writer without buffering. On -//! deserialization, each message is read into a per-message buffer and zero-copy -//! decoded via [`lance_arrow::ipc`]. +//! Each entry is a protobuf header (see `lance-index/protos-cache/cache.proto`, with the +//! distance and rotation types as proto enums) followed by 64-byte-aligned +//! Arrow IPC sections in a fixed, version-keyed order: the sub-index, then any +//! quantizer-specific arrays (PQ codebook, RabitQ Matrix rotation), then the +//! quantizer storage batches. Sections decode zero-copy via [`lance_arrow::ipc`]. -use std::io::Write; use std::sync::Arc; use arrow_array::{FixedSizeListArray, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_at, read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_ipc_stream_batches, write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; -use lance_index::vector::bq::storage::RabitQuantizationMetadata; +use lance_index::vector::bq::storage::{RabitQuantizationMetadata, RabitQueryEstimator}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatMetadata, FlatQuantizer}; use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::pq::storage::ProductQuantizationMetadata; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use lance_index::vector::sq::ScalarQuantizer; -use 
lance_index::vector::sq::storage::ScalarQuantizationMetadata; use lance_index::vector::storage::VectorStore; use lance_index::vector::v3::subindex::IvfSubIndex; use lance_linalg::distance::DistanceType; -use serde::{Deserialize, Serialize}; + +use lance_index::cache_pb::{ + DistanceType as PbDistanceType, FlatPartitionHeader, PqPartitionHeader, RabitPartitionHeader, + RabitQueryEstimator as PbRabitQueryEstimator, RotationType as PbRotationType, + SqPartitionHeader, +}; use super::v2::PartitionEntry; @@ -68,7 +57,7 @@ type ArcAny = Arc; fn serialize_partition_entry( any: &ArcAny, - writer: &mut dyn Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> where S: IvfSubIndex + 'static, @@ -81,14 +70,16 @@ where concrete.serialize(writer) } -fn deserialize_partition_entry(data: &Bytes) -> lance_core::Result +fn deserialize_partition_entry( + reader: &mut CacheEntryReader<'_>, +) -> lance_core::Result where S: IvfSubIndex + 'static, Q: Quantization + 'static, Concrete: Quantization + 'static, PartitionEntry: CacheCodecImpl, { - let concrete = PartitionEntry::::deserialize(data)?; + let concrete = PartitionEntry::::deserialize(reader)?; let any: ArcAny = Arc::new(concrete); Ok(any .downcast::>() @@ -109,6 +100,8 @@ where PartitionEntry: CacheCodecImpl, { lance_core::cache::CacheCodec::new( + as CacheCodecImpl>::TYPE_ID, + as CacheCodecImpl>::CURRENT_VERSION, serialize_partition_entry::, deserialize_partition_entry::, ) @@ -118,51 +111,64 @@ where // Common helpers // --------------------------------------------------------------------------- -fn distance_type_to_u8(dt: DistanceType) -> u8 { +// Distance and rotation discriminants travel as proto enums in the header; +// these map to/from the in-memory Rust enums. 
+ +fn distance_type_to_proto(dt: DistanceType) -> PbDistanceType { match dt { - DistanceType::L2 => 0, - DistanceType::Cosine => 1, - DistanceType::Dot => 2, - DistanceType::Hamming => 3, + DistanceType::L2 => PbDistanceType::L2, + DistanceType::Cosine => PbDistanceType::Cosine, + DistanceType::Dot => PbDistanceType::Dot, + DistanceType::Hamming => PbDistanceType::Hamming, } } -fn u8_to_distance_type(v: u8) -> Result { - match v { - 0 => Ok(DistanceType::L2), - 1 => Ok(DistanceType::Cosine), - 2 => Ok(DistanceType::Dot), - 3 => Ok(DistanceType::Hamming), - _ => Err(Error::io(format!("unknown distance type: {v}"))), +fn proto_to_distance_type(dt: PbDistanceType) -> DistanceType { + match dt { + PbDistanceType::L2 => DistanceType::L2, + PbDistanceType::Cosine => DistanceType::Cosine, + PbDistanceType::Dot => DistanceType::Dot, + PbDistanceType::Hamming => DistanceType::Hamming, } } -fn rotation_type_to_u8(rt: RQRotationType) -> u8 { +fn rotation_type_to_proto(rt: RQRotationType) -> PbRotationType { match rt { - RQRotationType::Matrix => 0, - RQRotationType::Fast => 1, + RQRotationType::Matrix => PbRotationType::Matrix, + RQRotationType::Fast => PbRotationType::Fast, } } -fn u8_to_rotation_type(v: u8) -> Result { - match v { - 0 => Ok(RQRotationType::Matrix), - 1 => Ok(RQRotationType::Fast), - _ => Err(Error::io(format!("unknown rotation type: {v}"))), +fn proto_to_rotation_type(rt: PbRotationType) -> RQRotationType { + match rt { + PbRotationType::Matrix => RQRotationType::Matrix, + PbRotationType::Fast => RQRotationType::Fast, } } -/// Write a JSON-serializable header using [`write_len_prefixed_bytes`]. 
-fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let header_json = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &header_json)?; - Ok(()) +fn query_estimator_to_proto(qe: RabitQueryEstimator) -> PbRabitQueryEstimator { + match qe { + RabitQueryEstimator::ResidualQuery => PbRabitQueryEstimator::ResidualQuery, + RabitQueryEstimator::RawQuery => PbRabitQueryEstimator::RawQuery, + } } -/// Read a JSON header written by [`write_json_header`]. -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes).map_err(|e| Error::io(e.to_string())) +fn proto_to_query_estimator(qe: PbRabitQueryEstimator) -> RabitQueryEstimator { + match qe { + PbRabitQueryEstimator::ResidualQuery => RabitQueryEstimator::ResidualQuery, + PbRabitQueryEstimator::RawQuery => RabitQueryEstimator::RawQuery, + } +} + +/// Read a storage section expected to hold exactly one batch. 
+fn read_single_storage_batch(r: &mut CacheEntryReader<'_>) -> Result { + let mut batches = r.read_ipc_batches()?; + match batches.len() { + 1 => Ok(batches.remove(0)), + n => Err(Error::io(format!( + "expected exactly 1 storage batch, got {n}" + ))), + } } /// Wrap a `FixedSizeListArray` in a single-column `RecordBatch` with the given @@ -202,17 +208,11 @@ fn batch_to_codebook(batch: &RecordBatch) -> Result { // PQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PqPartitionHeader { - distance_type: u8, - nbits: u32, - num_sub_vectors: usize, - dimension: usize, - transposed: bool, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.PQ"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let metadata = self.storage.metadata(); let distance_type = self.storage.distance_type(); @@ -221,32 +221,28 @@ impl CacheCodecImpl for PartitionEntry { })?; let header = PqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), + distance_type: distance_type_to_proto(distance_type) as i32, nbits: metadata.nbits, - num_sub_vectors: metadata.num_sub_vectors, - dimension: metadata.dimension, + num_sub_vectors: metadata.num_sub_vectors as u64, + dimension: metadata.dimension as u64, transposed: metadata.transposed, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream(&codebook_to_batch(codebook)?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc(&codebook_to_batch(codebook)?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: PqPartitionHeader = read_json_header(data, 
&mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let codebook_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let codebook_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; let codebook = batch_to_codebook(&codebook_batch)?; @@ -254,8 +250,8 @@ impl CacheCodecImpl for PartitionEntry { let metadata = ProductQuantizationMetadata { codebook_position: 0, nbits: header.nbits, - num_sub_vectors: header.num_sub_vectors, - dimension: header.dimension, + num_sub_vectors: header.num_sub_vectors as usize, + dimension: header.dimension as usize, codebook: Some(codebook), codebook_tensor: Vec::new(), transposed: header.transposed, @@ -276,41 +272,35 @@ impl CacheCodecImpl for PartitionEntry { // Flat (Float32) // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct FlatPartitionHeader { - distance_type: u8, - dim: usize, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Flat"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: 
metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -327,34 +317,34 @@ impl CacheCodecImpl for PartitionEntry { // --------------------------------------------------------------------------- impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.FlatBin"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: metadata.dim, 
+ distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -370,56 +360,41 @@ impl CacheCodecImpl for PartitionEntry { // SQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct SqPartitionHeader { - distance_type: u8, - num_bits: u16, - dim: usize, - bounds_start: f64, - bounds_end: f64, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.SQ"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + 
let metadata = self.storage.metadata(); let header = SqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, - dim: metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, + dim: metadata.dim as u64, bounds_start: metadata.bounds.start, bounds_end: metadata.bounds.end, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - // SQ storage may contain multiple batches; stream them all in one IPC stream. - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + // SQ storage may contain multiple batches; write them all in one section. + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: SqPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: SqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batches = - read_ipc_stream_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batches = r.read_ipc_batches()?; let index = S::load(sub_index_batch)?; - let metadata = ScalarQuantizationMetadata { - dim: header.dim, - num_bits: header.num_bits, - bounds: header.bounds_start..header.bounds_end, - }; + let num_bits = header.num_bits as u16; let storage = ::Storage::try_new( - metadata.num_bits, + num_bits, distance_type, - metadata.bounds, + header.bounds_start..header.bounds_end, storage_batches, None, )?; @@ -432,80 +407,69 @@ impl CacheCodecImpl for PartitionEntry { // RabitQ 
// --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct RabitPartitionHeader { - distance_type: u8, - num_bits: u8, - code_dim: u32, - /// 0 = Matrix, 1 = Fast - rotation_type: u8, - /// Fast rotation signs (only set when rotation_type == Fast). - fast_rotation_signs: Option>, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Rabit"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = RabitPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, code_dim: metadata.code_dim, - rotation_type: rotation_type_to_u8(metadata.rotation_type), + rotation_type: rotation_type_to_proto(metadata.rotation_type) as i32, + query_estimator: query_estimator_to_proto(metadata.query_estimator) as i32, fast_rotation_signs: metadata.fast_rotation_signs.clone(), }; - write_json_header(writer, &header)?; - - write_ipc_stream(&self.index.to_batch()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; - // Write the rotation matrix IPC stream only for Matrix rotation; the - // Fast rotation case stores its signs compactly in the JSON header. + // Write the rotation matrix IPC section only for Matrix rotation; the + // Fast rotation case stores its signs compactly in the proto header. 
if metadata.rotation_type == RQRotationType::Matrix { let mat = metadata.rotate_mat.as_ref().ok_or_else(|| { Error::io( "RabitQ Matrix metadata missing rotate_mat during serialization".to_string(), ) })?; - write_ipc_stream(&fsl_to_batch(mat, "rotate_mat")?, writer)?; + w.write_ipc(&fsl_to_batch(mat, "rotate_mat")?)?; } - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: RabitPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; - let rotation_type = u8_to_rotation_type(header.rotation_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: RabitPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); + let rotation_type = proto_to_rotation_type(header.rotation_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; let rotate_mat = if rotation_type == RQRotationType::Matrix { - let mat_batch = read_ipc_stream_single_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; + let mat_batch = r.read_ipc()?; Some(batch_to_fsl(&mat_batch)?) } else { None }; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; + // Read the proto enum accessor before moving fields out of `header`. 
+ let query_estimator = proto_to_query_estimator(header.query_estimator()); let metadata = RabitQuantizationMetadata { rotate_mat, rotate_mat_position: None, fast_rotation_signs: header.fast_rotation_signs, rotation_type, code_dim: header.code_dim, - num_bits: header.num_bits, + num_bits: header.num_bits as u8, // The storage batch already has packed codes; skip re-packing. packed: true, + query_estimator, }; let storage = ::Storage::try_from_batch( storage_batch, @@ -543,6 +507,21 @@ mod tests { use lance_index::vector::flat::storage::FlatFloatStorage; use lance_index::vector::sq::storage::ScalarQuantizationStorage; + /// Serialize a codec body (no envelope) for tests. + fn ser_body(entry: &T) -> Vec { + let mut buf = Vec::new(); + entry + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a codec body (no envelope) at the current build's version. + fn de_body(bytes: Vec) -> Result { + let data = bytes::Bytes::from(bytes); + T::deserialize(&mut CacheEntryReader::new(&data, 0, T::CURRENT_VERSION)) + } + // ----- PQ helpers ------------------------------------------------------- fn make_test_codebook(dim: usize, num_sub_vectors: usize) -> FixedSizeListArray { @@ -610,12 +589,9 @@ mod tests { storage, }; - let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -663,12 +639,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.distance_type(), entry.storage.distance_type() @@ -686,12 +658,9 @@ mod tests { storage, }; 
- let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -704,13 +673,9 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); + let mut bytes = ser_body(&entry); bytes.truncate(3); - assert!( - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .is_err() - ); + assert!(de_body::>(bytes).is_err()); } // ----- Flat helpers ----------------------------------------------------- @@ -748,11 +713,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.metadata().dim, @@ -778,11 +740,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -795,11 +754,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -820,11 +776,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - 
PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -876,11 +829,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); @@ -906,12 +856,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -952,11 +898,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.len(), 30); let orig_ids: Vec = entry.storage.row_ids().copied().collect(); @@ -970,14 +913,27 @@ mod tests { num_rows: usize, code_dim: usize, distance_type: DistanceType, + ) -> ::Storage { + make_rabit_storage( + num_rows, + code_dim, + distance_type, + RQRotationType::Fast, + RabitQueryEstimator::ResidualQuery, + ) + } + + fn make_rabit_storage( + num_rows: usize, + code_dim: usize, + distance_type: DistanceType, + rotation_type: RQRotationType, + query_estimator: RabitQueryEstimator, ) -> ::Storage { use lance_arrow::FixedSizeListArrayExt; - let quantizer = RabitQuantizer::new_with_rotation::( - 1, - code_dim as i32, - RQRotationType::Fast, - ); + let 
quantizer = + RabitQuantizer::new_with_rotation::(1, code_dim as i32, rotation_type); let values: Vec = (0..num_rows * code_dim) .map(|i| (i % 100) as f32 / 100.0 - 0.5) .collect(); @@ -989,7 +945,8 @@ mod tests { .as_fixed_size_list() .clone(); - let metadata = quantizer.metadata(None); + let mut metadata = quantizer.metadata(None); + metadata.query_estimator = query_estimator; let batch = RecordBatch::try_from_iter(vec![ ( lance_core::ROW_ID, @@ -1036,17 +993,15 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); assert_eq!(rm.num_bits, m.num_bits); assert_eq!(rm.code_dim, m.code_dim); assert_eq!(rm.rotation_type, m.rotation_type); + assert_eq!(rm.query_estimator, m.query_estimator); assert_eq!(rm.fast_rotation_signs, m.fast_rotation_signs); assert!(rm.packed); assert_eq!( @@ -1077,19 +1032,129 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + // The codec round-trips the distance type faithfully. + assert_eq!( + restored.storage.distance_type(), + entry.storage.distance_type() + ); + } + } + + #[test] + fn test_roundtrip_rabitq_raw_query_estimator() { + // The query estimator is a non-default value here; it must survive the + // round trip so raw-query search keeps working after a cache reload. 
+ let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Fast, + RabitQueryEstimator::RawQuery, + ); + assert_eq!( + storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + assert_eq!( + restored.storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + } + + /// Matrix rotation writes an extra `rotate_mat` IPC section between the + /// sub-index and storage sections; exercise that the codec preserves it. + #[test] + fn test_roundtrip_flat_rabitq_matrix() { + let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Matrix, + RabitQueryEstimator::ResidualQuery, + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + + let m = entry.storage.metadata(); + let rm = restored.storage.metadata(); + assert_eq!(rm.rotation_type, RQRotationType::Matrix); + assert_eq!(rm.code_dim, m.code_dim); + assert_eq!(rm.num_bits, m.num_bits); + // The rotation matrix itself must survive the round trip. + let orig_mat = m + .rotate_mat + .as_ref() + .expect("matrix rotation has rotate_mat"); + let rest_mat = rm + .rotate_mat + .as_ref() + .expect("restored matrix rotation has rotate_mat"); + assert_eq!( + orig_mat.values().as_primitive::().values(), + rest_mat.values().as_primitive::().values(), + ); + } + + /// SQ storage (a multi-batch IPC section) must decode zero-copy through the + /// full envelope even though the proto header and sub-index section push it + /// to a non-aligned starting offset. 
+ #[test] + fn test_partition_storage_is_zero_copy_through_envelope() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage: make_sq_storage(64, 32, DistanceType::L2), + }; + let codec = CacheCodec::from_impl::>(); + let any: Arc = Arc::new(entry); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored + .downcast::>() .unwrap(); - assert_eq!(restored.storage.distance_type(), dt); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + let first = restored.storage.to_batches().unwrap().next().unwrap(); + for col in first.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "storage buffer was realigned out of the input — misaligned IPC section", + ); + } } } #[test] fn test_ivf_index_state_roundtrip() { - use crate::index::vector::ivf::v2::{IvfIndexState, IvfStateEntryBox}; + use crate::index::vector::ivf::v2::{ + IvfIndexState, IvfStateEntryBox, empty_rabit_search_cache_cell, + }; use lance_index::vector::flat::index::FlatQuantizer; use lance_index::vector::ivf::storage::IvfModel; use lance_index::vector::quantizer::QuantizationType; @@ -1114,21 +1179,17 @@ mod tests { cache_key_prefix: "prefix/".to_string(), index_file_size: 1024, aux_file_size: 512, + rq_search_cache: empty_rabit_search_cache_cell(), }; let entry = IvfStateEntryBox(Arc::new(state)); - let mut bytes = Vec::new(); - CacheCodecImpl::serialize(&entry, &mut bytes).unwrap(); - - let restored = - ::deserialize(&bytes::Bytes::from(bytes.clone())) - .unwrap(); + let bytes = ser_body(&entry); + let restored 
= de_body::(bytes.clone()).unwrap(); // Re-serialize the restored entry and compare bytes — a stronger check // than field-by-field comparison and avoids needing to downcast. - let mut restored_bytes = Vec::new(); - CacheCodecImpl::serialize(&restored, &mut restored_bytes).unwrap(); + let restored_bytes = ser_body(&restored); assert_eq!(bytes, restored_bytes); } } diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 672264f0dac..40227d2d020 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -3,12 +3,12 @@ //! IVF - Inverted File index. -use std::io::Write as IoWrite; use std::marker::PhantomData; use std::{ any::Any, + borrow::Cow, collections::{BinaryHeap, HashMap}, - sync::Arc, + sync::{Arc, Mutex}, }; use crate::index::vector::{IndexFileVersion, builder::index_type_string}; @@ -21,23 +21,29 @@ use async_trait::async_trait; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::prelude::stream::{self, TryStreamExt}; use futures::{StreamExt, TryFutureExt}; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::write_len_prefixed_bytes; -use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, +}; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS}; use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::LanceEncodingsIo; use lance_file::reader::{CachedFileMetadata, FileReader, FileReaderOptions}; +use 
lance_index::cache_pb::IvfStateHeader; use lance_index::frag_reuse::FragReuseIndex; use lance_index::metrics::{LocalMetricsCollector, MetricsCollector, NoOpMetricsCollector}; use lance_index::vector::VectorIndexCacheEntry; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{blocked_ex_code_bytes, padded_query_len}; +use lance_index::vector::bq::rabit_ex_bits; +use lance_index::vector::bq::storage::{RabitQueryEstimator, SEGMENT_NUM_CODES}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::graph::OrderedNode; use lance_index::vector::hnsw::HNSW; @@ -48,7 +54,8 @@ use lance_index::vector::quantizer::{ }; use lance_index::vector::sq::ScalarQuantizer; use lance_index::vector::storage::{ - QueryResidual, QueryScratch, QueryScratchCapacity, QueryScratchPool, VectorStore, + QueryResidual, QueryScratch, QueryScratchCapacity, QueryScratchPool, RabitRawQueryContext, + VectorStore, }; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::{ @@ -61,7 +68,7 @@ use lance_index::{ }; use lance_index::{INDEX_METADATA_SCHEMA_KEY, IndexMetadata}; use lance_io::local::to_local_path; -use lance_io::scheduler::SchedulerConfig; +use lance_io::scheduler::{IoStats, ScanStats, SchedulerConfig}; use lance_io::utils::CachedFileSize; use lance_io::{ ReadBatchParams, object_store::ObjectStore, scheduler::ScanScheduler, traits::Reader, @@ -73,9 +80,12 @@ use roaring::RoaringBitmap; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tracing::{info, instrument}; +use uuid::Uuid; use super::{IvfIndexPartitionStatistics, IvfIndexStatistics, maybe_centroids_for_stats}; +pub(crate) type RabitSearchCacheCell = Arc>>>>; + /// Serializable state of an IVF index, sufficient to reconstruct the index /// without re-reading global buffers from object storage. /// @@ -107,6 +117,8 @@ pub(crate) struct IvfIndexState { /// when reconstructing from cache. 
pub(crate) index_file_size: u64, pub(crate) aux_file_size: u64, + /// Runtime-only cache, intentionally excluded from the CacheCodec wire format. + pub(crate) rq_search_cache: RabitSearchCacheCell, } struct PreparedPartitionSearch { @@ -114,12 +126,82 @@ struct PreparedPartitionSearch { pre_filter: Arc, partition_id: usize, partition_centroid: Option, + rq_search_cache: Option>, + raw_query_context: Option>, part_entry: Arc, _marker: PhantomData<(S, Q)>, } +#[derive(Debug)] +pub(crate) struct RabitSearchCache { + rotated_centroids: Vec, + code_dim: usize, +} + +pub(crate) fn empty_rabit_search_cache_cell() -> RabitSearchCacheCell { + Arc::new(Mutex::new(None)) +} + +fn rabit_search_cache_cell(cache: Option>) -> RabitSearchCacheCell { + Arc::new(Mutex::new(Some(cache))) +} + +fn rotated_partition_centroid_slice( + cache: Option<&RabitSearchCache>, + partition_id: usize, +) -> Option<&[f32]> { + let cache = cache?; + let start = partition_id.checked_mul(cache.code_dim)?; + let end = start.checked_add(cache.code_dim)?; + cache.rotated_centroids.get(start..end) +} + +/// `f32` scratch needed for the ex-bit query state: a zero-padded query copy +/// when the rotated dim is not a multiple of the 64-dim kernel block (the +/// FastScan ex LUT is built directly from the query, with no f32 table). 
+fn rabit_ex_scratch_len(dim: usize, num_bits: u8) -> usize { + let multi_bit = rabit_ex_bits(num_bits) + .map(|ex_bits| ex_bits > 0) + .unwrap_or(true); + if !multi_bit || dim.is_multiple_of(64) { + 0 + } else { + padded_query_len(dim) + } +} + +fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { + let binary_dist_table_len = dim * 4; + let ex_dist_table_len = rabit_ex_bits(num_bits) + .ok() + .and_then(|ex_bits| match ex_bits { + 2 | 4 | 8 => Some(blocked_ex_code_bytes(dim, ex_bits)), + _ => None, + }) + .map(|ex_code_len| ex_code_len * 2 * SEGMENT_NUM_CODES) + .unwrap_or_default(); + binary_dist_table_len.max(ex_dist_table_len) +} + +fn rabit_query_scratch_capacity( + dim: usize, + max_partition_len: usize, + num_bits: u8, +) -> QueryScratchCapacity { + let dist_table_len = dim * 4; + let ex_scratch_len = rabit_ex_scratch_len(dim, num_bits); + let u8_scratch_len = rabit_u8_scratch_len(dim, num_bits); + + QueryScratchCapacity::new( + max_partition_len, + dim + dist_table_len + ex_scratch_len, + max_partition_len.max(dist_table_len), + u8_scratch_len, + ) +} + impl DeepSizeOf for IvfIndexState { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.index_file_path.deep_size_of_children(context) + self.uuid.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) @@ -127,31 +209,16 @@ impl DeepSizeOf for IvfIndexState { + self.sub_index_metadata.deep_size_of_children(context) + self.metadata.deep_size_of_children(context) + self.cache_key_prefix.deep_size_of_children(context) + + self + .rq_search_cache + .lock() + .ok() + .and_then(|cache| cache.as_ref().and_then(|cache| cache.as_ref().cloned())) + .map(|cache| cache.rotated_centroids.len() * std::mem::size_of::()) + .unwrap_or_default() } } -/// Serialization header for the `IvfIndexState` wire format. 
-/// -/// Kept as a flat, non-generic struct so the JSON header format is stable -/// regardless of `Q`. `quantizer_metadata_json` holds the serialized -/// `Q::Metadata`; large blobs (PQ codebook, RQ matrix) follow as raw bytes. -#[derive(serde::Serialize, serde::Deserialize)] -struct IvfIndexStateHeader { - index_file_path: String, - uuid: String, - distance_type: String, - sub_index_metadata: Vec, - sub_index_type: String, - quantization_type: String, - quantizer_metadata_json: String, - #[serde(default)] - cache_key_prefix: String, - #[serde(default)] - index_file_size: u64, - #[serde(default)] - aux_file_size: u64, -} - /// Object-safe interface for a type-erased `IvfIndexState`. /// /// Stored as `Arc` inside [`IvfStateEntryBox`], which is @@ -159,13 +226,14 @@ struct IvfIndexStateHeader { /// wrapper lets the cache infrastructure work with a sized type while the /// hot paths call `reconstruct` without knowing `Q`. pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()>; + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()>; fn reconstruct<'a>( &'a self, object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>>; } @@ -178,47 +246,44 @@ pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { pub(crate) struct IvfStateEntryBox(pub(crate) Arc); impl DeepSizeOf for IvfStateEntryBox { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.deep_size_of_children(context) } } -/// Wire format (unchanged from the non-generic `IvfIndexState`): -/// `[header_json_len: u64 LE][header JSON][ivf_pb_len: u64 LE][ivf protobuf] -/// [extra_len: u64 LE][extra bytes][aux_ivf_pb_len: u64 LE][aux_ivf protobuf]` +/// Wire format: +/// ```text +/// HEADER : IvfStateHeader 
proto (paths, types, quantizer metadata JSON) +/// RAW_BLOB : IVF model protobuf +/// RAW_BLOB : quantizer extra-metadata buffer (may be empty) +/// RAW_BLOB : auxiliary IVF model protobuf +/// ``` impl CacheCodecImpl for IvfStateEntryBox { - fn serialize(&self, writer: &mut dyn IoWrite) -> Result<()> { - self.0.serialize_state(writer) - } + const TYPE_ID: &'static str = "lance.vector.ivf.IvfState"; + const CURRENT_VERSION: u32 = 1; - fn deserialize(data: &bytes::Bytes) -> Result { - use lance_arrow::ipc::read_len_prefixed_bytes_at; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + self.0.serialize_state(w) + } - // Parse the common wire format, then dispatch on quantization_type to + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + // Parse the common header, then dispatch on quantization_type to // construct the right IvfIndexState. - let mut offset = 0; - let header_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; - let header: IvfIndexStateHeader = serde_json::from_slice(&header_bytes) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; + let header: IvfStateHeader = r.read_header()?; - let ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let ivf_bytes = r.read_raw()?; let ivf = IvfModel::try_from( pb::Ivf::decode(ivf_bytes.as_ref()) .map_err(|e| lance_core::Error::io(format!("IvfIndexState IVF decode: {e}")))?, )?; - let extra_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let extra_bytes = r.read_raw()?; - // aux_ivf was added after initial deployment; fall back to ivf on - // clean EOF (legacy format without the field). - let aux_ivf = if offset + 8 <= data.len() { - let aux_ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let aux_ivf_bytes = r.read_raw()?; + let aux_ivf = IvfModel::try_from(pb::Ivf::decode(aux_ivf_bytes.as_ref()).map_err(|e| { lance_core::Error::io(format!("IvfIndexState aux IVF decode: {e}")) - })?)? 
- } else { - ivf.clone() - }; + })?)?; let distance_type = DistanceType::try_from(header.distance_type.as_str())?; let sub_index_type = SubIndexType::try_from(header.sub_index_type.as_str())?; @@ -227,7 +292,7 @@ impl CacheCodecImpl for IvfStateEntryBox { // Helper: parse Q::Metadata from the JSON+extra_bytes in the header, // then build an IvfStateEntryBox wrapping IvfIndexState. fn make_entry( - header: IvfIndexStateHeader, + header: IvfStateHeader, ivf: IvfModel, aux_ivf: IvfModel, extra_bytes: bytes::Bytes, @@ -258,6 +323,7 @@ impl CacheCodecImpl for IvfStateEntryBox { cache_key_prefix: header.cache_key_prefix, index_file_size: header.index_file_size, aux_file_size: header.aux_file_size, + rq_search_cache: empty_rabit_search_cache_cell(), }))) } @@ -312,15 +378,15 @@ impl CacheCodecImpl for IvfStateEntryBox { } impl IvfStateEntry for IvfIndexState { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()> { + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let quantizer_metadata_json = serde_json::to_string(&self.metadata) .map_err(|e| lance_core::Error::io(format!("IvfIndexState metadata: {e}")))?; let extra = self.metadata.extra_metadata()?; let extra = extra.as_deref().unwrap_or(&[]); - let header = IvfIndexStateHeader { + let header = IvfStateHeader { index_file_path: self.index_file_path.clone(), - uuid: self.uuid.clone(), + uuid: self.uuid.to_string(), distance_type: self.distance_type.to_string(), sub_index_metadata: self.sub_index_metadata.clone(), sub_index_type: self.sub_index_type.to_string(), @@ -330,15 +396,13 @@ impl IvfStateEntry for IvfIndexState { index_file_size: self.index_file_size, aux_file_size: self.aux_file_size, }; - let header_json = serde_json::to_vec(&header) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; let ivf_bytes = pb::Ivf::try_from(&self.ivf)?.encode_to_vec(); let aux_ivf_bytes = pb::Ivf::try_from(&self.aux_ivf)?.encode_to_vec(); - 
write_len_prefixed_bytes(writer, &header_json)?; - write_len_prefixed_bytes(writer, &ivf_bytes)?; - write_len_prefixed_bytes(writer, extra)?; - write_len_prefixed_bytes(writer, &aux_ivf_bytes)?; + w.write_header(&header)?; + w.write_raw(&ivf_bytes)?; + w.write_raw(extra)?; + w.write_raw(&aux_ivf_bytes)?; Ok(()) } @@ -347,6 +411,7 @@ impl IvfStateEntry for IvfIndexState { object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>> { Box::pin(async move { match self.sub_index_type { @@ -356,6 +421,7 @@ impl IvfStateEntry for IvfIndexState { object_store, file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -365,6 +431,7 @@ impl IvfStateEntry for IvfIndexState { object_store, file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -394,8 +461,8 @@ struct CachedIndexReaders { aux_reader: Arc, } -impl deepsize::DeepSizeOf for CachedIndexReaders { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { +impl lance_core::deepsize::DeepSizeOf for CachedIndexReaders { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // FileReader doesn't impl DeepSizeOf. We approximate by counting the // fixed struct size for each reader plus the Arc // heap contents. The metadata Arcs are also held by FileMetadataCacheKey @@ -516,7 +583,7 @@ pub struct IVFIndex { /// Object-store path to the index file (forward-slash separated). /// Used by `cacheable_state()` for cross-platform reconstruction. index_path: String, - uuid: String, + uuid: Uuid, /// Ivf model ivf: IvfModel, @@ -530,32 +597,115 @@ pub struct IVFIndex { index_cache: WeakLanceCache, io_parallelism: usize, + /// Cumulative I/O performed while opening this index (file footers, IVF + /// centroids, quantization metadata). 
Captured once in `try_new`; exposed + /// via [`VectorIndex::open_io_stats`] so the opening query can attribute the + /// one-time open cost to its plan metrics. + open_io_stats: ScanStats, scratch_pool: Arc, + use_query_residual: bool, use_residual_scratch: bool, + rq_search_cache: Option>, _marker: PhantomData<(S, Q)>, } impl DeepSizeOf for IVFIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + // `Uuid` is a fixed 16-byte struct with no heap children, so contributes 0. self.uri.deep_size_of_children(context) + self.index_path.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) + self.sub_index_metadata.deep_size_of_children(context) - + self.uuid.deep_size_of_children(context) + self.storage.deep_size_of_children(context) + self.scratch_pool.deep_size_of_children(context) + + self + .rq_search_cache + .as_ref() + .map(|cache| cache.rotated_centroids.len() * std::mem::size_of::()) + .unwrap_or_default() // Skipping session since it is a weak ref } } impl IVFIndex { + fn use_query_residual( + storage: &IvfQuantizationStorage, + distance_type: DistanceType, + ) -> bool { + if Q::quantization_type() == QuantizationType::Rabit + && let Ok(Quantizer::Rabit(rq)) = storage.quantizer() + { + return rq.metadata_ref().query_estimator == RabitQueryEstimator::ResidualQuery; + } + Q::use_residual(distance_type) + } + + fn build_rq_search_cache( + ivf: &IvfModel, + storage: &IvfQuantizationStorage, + ) -> Result>> { + if Q::quantization_type() != QuantizationType::Rabit { + return Ok(None); + } + let Quantizer::Rabit(rq) = storage.quantizer()? 
else { + return Ok(None); + }; + if rq.metadata_ref().query_estimator != RabitQueryEstimator::RawQuery { + return Ok(None); + } + let centroids = ivf + .centroids_array() + .ok_or_else(|| Error::index("IVF_RQ raw-query search requires centroids"))?; + let rotated_centroids = rq.rotate_fsl_to_f32(centroids)?; + Ok(Some(Arc::new(RabitSearchCache { + rotated_centroids, + code_dim: rq.code_dim(), + }))) + } + + fn rq_search_cache_from_state( + state: &IvfIndexState, + storage: &IvfQuantizationStorage, + ) -> Result>> { + let mut cache = state + .rq_search_cache + .lock() + .map_err(|_| Error::internal("RQ search cache lock was poisoned".to_string()))?; + if let Some(cache) = cache.as_ref() { + return Ok(cache.clone()); + } + let built = Self::build_rq_search_cache(&state.ivf, storage)?; + *cache = Some(built.clone()); + Ok(built) + } + + fn prepare_rq_raw_query_context( + &self, + query: &ArrayRef, + ) -> Result>> { + if Q::quantization_type() != QuantizationType::Rabit || self.use_query_residual { + return Ok(None); + } + let Quantizer::Rabit(rq) = self.storage.quantizer()? 
else { + return Ok(None); + }; + if rq.metadata_ref().query_estimator != RabitQueryEstimator::RawQuery { + return Ok(None); + } + Ok(Some(Arc::new( + rq.metadata_ref() + .prepare_raw_query_context(query.as_ref())?, + ))) + } + async fn prepare_partition( &self, partition_id: usize, query: &Query, pre_filter: Arc, metrics: &dyn MetricsCollector, + raw_query_context: Option>, ) -> Result> { let (part_entry, ()) = tokio::try_join!( self.load_partition(partition_id, true, metrics), @@ -566,6 +716,8 @@ impl IVFIndex { pre_filter, partition_id, partition_centroid: self.ivf.centroid(partition_id), + rq_search_cache: self.rq_search_cache.clone(), + raw_query_context, part_entry, _marker: PhantomData, }) @@ -577,6 +729,7 @@ impl IVFIndex { query: &Query, pre_filter: Arc, metrics: &dyn MetricsCollector, + raw_query_context: Option>, ) -> Result> { let part_entry = self.load_partition(partition_id, true, metrics).await?; Ok(PreparedPartitionSearch { @@ -584,13 +737,15 @@ impl IVFIndex { pre_filter, partition_id, partition_centroid: self.ivf.centroid(partition_id), + rq_search_cache: self.rq_search_cache.clone(), + raw_query_context, part_entry, _marker: PhantomData, }) } fn run_prepared_partition_search( - distance_type: DistanceType, + use_query_residual: bool, use_residual_scratch: bool, prepared: PreparedPartitionSearch, metrics: &dyn MetricsCollector, @@ -601,16 +756,23 @@ impl IVFIndex { pre_filter, partition_id, partition_centroid, + rq_search_cache, + raw_query_context, part_entry, _marker: _, } = prepared; - let residual = Self::residual_for_scratch( + let rotated_partition_centroid = + rotated_partition_centroid_slice(rq_search_cache.as_deref(), partition_id); + let residual = Self::query_context_for_scratch( + use_query_residual, use_residual_scratch, partition_id, partition_centroid.as_ref(), + rotated_partition_centroid, + raw_query_context.as_deref(), )?; let query = Self::preprocess_partition_query_owned( - distance_type, + use_query_residual, 
use_residual_scratch, partition_id, partition_centroid.as_ref(), @@ -640,7 +802,7 @@ impl IVFIndex { #[allow(clippy::too_many_arguments)] fn accumulate_prepared_partition_search( - distance_type: DistanceType, + use_query_residual: bool, use_residual_scratch: bool, prepared: PreparedPartitionSearch, heap: &mut BinaryHeap>, @@ -652,16 +814,23 @@ impl IVFIndex { pre_filter, partition_id, partition_centroid, + rq_search_cache, + raw_query_context, part_entry, _marker: _, } = prepared; - let residual = Self::residual_for_scratch( + let rotated_partition_centroid = + rotated_partition_centroid_slice(rq_search_cache.as_deref(), partition_id); + let residual = Self::query_context_for_scratch( + use_query_residual, use_residual_scratch, partition_id, partition_centroid.as_ref(), + rotated_partition_centroid, + raw_query_context.as_deref(), )?; let query = Self::preprocess_partition_query_owned( - distance_type, + use_query_residual, use_residual_scratch, partition_id, partition_centroid.as_ref(), @@ -689,16 +858,26 @@ impl IVFIndex { ) } - fn residual_for_scratch<'a>( + fn query_context_for_scratch<'a>( + use_query_residual: bool, use_residual_scratch: bool, partition_id: usize, partition_centroid: Option<&'a ArrayRef>, + rotated_partition_centroid: Option<&'a [f32]>, + raw_query_context: Option<&'a RabitRawQueryContext>, ) -> Result>> { if use_residual_scratch { let partition_centroid = partition_centroid.ok_or_else(|| { Error::index(format!("partition centroid {partition_id} does not exist")) })?; Ok(Some(QueryResidual::Centroid(partition_centroid.as_ref()))) + } else if !use_query_residual + && (rotated_partition_centroid.is_some() || raw_query_context.is_some()) + { + Ok(Some(QueryResidual::RabitRawQuery { + rotated_centroid: rotated_partition_centroid, + query: raw_query_context, + })) } else { Ok(None) } @@ -716,14 +895,14 @@ impl IVFIndex { } fn preprocess_partition_query( - distance_type: DistanceType, + use_query_residual: bool, use_residual_scratch: bool, 
partition_id: usize, partition_centroid: Option<&ArrayRef>, query: &Query, ) -> Result { Self::preprocess_partition_query_owned( - distance_type, + use_query_residual, use_residual_scratch, partition_id, partition_centroid, @@ -732,13 +911,13 @@ impl IVFIndex { } fn preprocess_partition_query_owned( - distance_type: DistanceType, + use_query_residual: bool, use_residual_scratch: bool, partition_id: usize, partition_centroid: Option<&ArrayRef>, mut query: Query, ) -> Result { - if Q::use_residual(distance_type) { + if use_query_residual { let partition_centroid = partition_centroid.ok_or_else(|| { Error::index(format!("partition centroid {partition_id} does not exist")) })?; @@ -751,36 +930,37 @@ impl IVFIndex { Ok(query) } - fn query_scratch_capacity(ivf: &IvfModel) -> QueryScratchCapacity { + fn query_scratch_capacity( + ivf: &IvfModel, + storage: &IvfQuantizationStorage, + ) -> QueryScratchCapacity { if Q::quantization_type() != QuantizationType::Rabit { return QueryScratchCapacity::default(); } let dim = ivf.dimension(); - let dist_table_len = dim * 4; let max_partition_len = ivf.lengths.iter().copied().max().unwrap_or_default() as usize; + let num_bits = match storage.quantizer() { + Ok(Quantizer::Rabit(rq)) => rq.metadata_ref().num_bits, + _ => 9, + }; - QueryScratchCapacity::new( - max_partition_len, - dim + dist_table_len, - max_partition_len, - dist_table_len, - ) + rabit_query_scratch_capacity(dim, max_partition_len, num_bits) } - fn use_residual_scratch(ivf: &IvfModel, distance_type: DistanceType) -> bool { + fn use_residual_scratch(ivf: &IvfModel, use_query_residual: bool) -> bool { Q::quantization_type() == QuantizationType::Rabit - && Q::use_residual(distance_type) + && use_query_residual && ivf .centroids_array() .map(|centroids| centroids.value_type() == DataType::Float32) .unwrap_or(false) } - fn query_scratch_pool(ivf: &IvfModel) -> QueryScratchPool { + fn query_scratch_pool(ivf: &IvfModel, storage: &IvfQuantizationStorage) -> QueryScratchPool { 
QueryScratchPool::with_capacity( get_num_compute_intensive_cpus(), - Self::query_scratch_capacity(ivf), + Self::query_scratch_capacity(ivf, storage), ) } @@ -788,7 +968,7 @@ impl IVFIndex { pub(crate) async fn try_new( object_store: Arc, index_dir: Path, - uuid: String, + uuid: Uuid, frag_reuse_index: Option>, file_metadata_cache: &LanceCache, index_cache: LanceCache, @@ -798,7 +978,11 @@ impl IVFIndex { let scheduler_config = SchedulerConfig::max_bandwidth(&object_store); let scheduler = ScanScheduler::new(object_store, scheduler_config); - let uri = index_dir.clone().join(uuid.as_str()).join(INDEX_FILE_NAME); + let uuid_str = uuid.to_string(); + let uri = index_dir + .clone() + .join(uuid_str.as_str()) + .join(INDEX_FILE_NAME); let cached_size = file_sizes .get(INDEX_FILE_NAME) .map(|&size| CachedFileSize::new(size)) @@ -847,7 +1031,7 @@ impl IVFIndex { .open_file( &index_dir .clone() - .join(uuid.as_str()) + .join(uuid_str.as_str()) .join(INDEX_AUXILIARY_FILE_NAME), &aux_cached_size, ) @@ -869,7 +1053,7 @@ impl IVFIndex { .await; let aux_path = index_dir .clone() - .join(uuid.as_str()) + .join(uuid_str.as_str()) .join(INDEX_AUXILIARY_FILE_NAME); file_metadata_cache .with_key_prefix(aux_path.as_ref()) @@ -879,7 +1063,9 @@ impl IVFIndex { // Cache open readers so the first reconstruction also skips file opens. 
file_metadata_cache .insert_with_key( - &CachedIndexReadersKey { uuid: uuid.clone() }, + &CachedIndexReadersKey { + uuid: uuid_str.clone(), + }, Arc::new(CachedIndexReaders { index_reader: Arc::new(index_reader.clone()), aux_reader: Arc::new(storage.reader().clone()), @@ -887,15 +1073,25 @@ impl IVFIndex { ) .await; - let scratch_pool = Arc::new(Self::query_scratch_pool(&ivf)); - let use_residual_scratch = Self::use_residual_scratch(&ivf, distance_type); + let scratch_pool = Arc::new(Self::query_scratch_pool(&ivf, &storage)); + let use_query_residual = Self::use_query_residual(&storage, distance_type); + let use_residual_scratch = Self::use_residual_scratch(&ivf, use_query_residual); + let rq_search_cache = Self::build_rq_search_cache(&ivf, &storage)?; + + // The scheduler is freshly created above and, at this point, has served + // only the open-time reads (file footers, IVF centroids, quantization + // metadata) -- partition reads happen later, during queries. So its + // cumulative stats are exactly the one-time index-open I/O. 
+ let open_io_stats = scheduler.stats(); Ok(Self { uri: to_local_path(&uri), index_path: uri.as_ref().to_string(), uuid, scratch_pool, + use_query_residual, use_residual_scratch, + rq_search_cache, ivf, reader: index_reader, storage, @@ -903,6 +1099,7 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + open_io_stats, _marker: PhantomData, }) } @@ -912,7 +1109,7 @@ impl IVFIndex { pub(crate) fn from_cached_state( uri: String, index_path: String, - uuid: String, + uuid: Uuid, ivf: IvfModel, reader: FileReader, storage: IvfQuantizationStorage, @@ -920,15 +1117,19 @@ impl IVFIndex { distance_type: DistanceType, index_cache: LanceCache, io_parallelism: usize, + rq_search_cache: Option>, ) -> Self { - let scratch_pool = Arc::new(Self::query_scratch_pool(&ivf)); - let use_residual_scratch = Self::use_residual_scratch(&ivf, distance_type); + let scratch_pool = Arc::new(Self::query_scratch_pool(&ivf, &storage)); + let use_query_residual = Self::use_query_residual(&storage, distance_type); + let use_residual_scratch = Self::use_residual_scratch(&ivf, use_query_residual); Self { uri, index_path, uuid, scratch_pool, + use_query_residual, use_residual_scratch, + rq_search_cache, ivf, reader, storage, @@ -936,6 +1137,10 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + // Reconstruction from cached state re-opens readers on its own path; + // the open-time I/O is not attributed here (it is a one-time cost, + // and the first open via `try_new` already accounts for it). 
+ open_io_stats: ScanStats::default(), _marker: PhantomData, } } @@ -963,7 +1168,8 @@ impl IVFIndex { .get_or_insert_with_key(cache_key, || async { info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - self.load_partition_entry(partition_id).await + self.load_partition_entry(partition_id, metrics.io_stats()) + .await }) .await?; Ok(entry as Arc) @@ -973,11 +1179,18 @@ impl IVFIndex { } info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - Ok(Arc::new(self.load_partition_entry(partition_id).await?)) + Ok(Arc::new( + self.load_partition_entry(partition_id, metrics.io_stats()) + .await?, + )) } } - async fn load_partition_entry(&self, partition_id: usize) -> Result> { + async fn load_partition_entry( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result> { let schema = Arc::new(self.reader.schema().as_ref().into()); let batch = match self.reader.metadata().num_rows { 0 => RecordBatch::new_empty(schema), @@ -986,8 +1199,17 @@ impl IVFIndex { if row_range.is_empty() { RecordBatch::new_empty(schema) } else { - let batches = self - .reader + // When I/O is being measured, read through a reader whose + // scheduler also records into the per-query sink (a cheap + // clone sharing all cached metadata, no file re-open). + // Otherwise borrow the shared reader as-is, with no clone. 
+ let reader = match &io_stats { + Some(io_stats) => { + Cow::Owned(self.reader.with_io_stats(io_stats.recorder())) + } + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(row_range), u32::MAX, @@ -1006,15 +1228,19 @@ impl IVFIndex { self.sub_index_metadata[partition_id].clone(), )?; let idx = S::load(batch)?; - let storage = self.load_partition_storage(partition_id).await?; + let storage = self.load_partition_storage(partition_id, io_stats).await?; Ok(PartitionEntry { index: idx, storage, }) } - pub async fn load_partition_storage(&self, partition_id: usize) -> Result { - self.storage.load_partition(partition_id).await + pub async fn load_partition_storage( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result { + self.storage.load_partition(partition_id, io_stats).await } /// preprocess the query vector given the partition id. @@ -1023,7 +1249,7 @@ impl IVFIndex { #[instrument(level = "debug", skip(self))] pub fn preprocess_query(&self, partition_id: usize, query: &Query) -> Result { Self::preprocess_partition_query( - self.distance_type, + self.use_query_residual, self.use_residual_scratch, partition_id, self.ivf.centroid(partition_id).as_ref(), @@ -1036,7 +1262,7 @@ impl IVFIndex { let (sub_index_type, quantization_type) = self.sub_index_type(); IvfStateEntryBox(Arc::new(IvfIndexState:: { index_file_path: self.index_path.clone(), - uuid: self.uuid.clone(), + uuid: self.uuid.to_string(), ivf: self.ivf.clone(), aux_ivf: self.storage.ivf().clone(), distance_type: self.distance_type, @@ -1047,6 +1273,7 @@ impl IVFIndex { cache_key_prefix: self.index_cache.prefix().to_string(), index_file_size: self.reader.metadata().file_size(), aux_file_size: self.storage.reader().metadata().file_size(), + rq_search_cache: rabit_search_cache_cell(self.rq_search_cache.clone()), })) } } @@ -1061,10 +1288,6 @@ impl Index for IVFIndex) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { 
futures::stream::iter(0..self.ivf.num_partitions()) .map(Ok) @@ -1149,7 +1372,7 @@ impl Index for IVFIndex VectorIndex for IVFInd let part_entry = self.load_partition(partition_id, true, metrics).await?; pre_filter.wait_for_ready().await?; - let residual_centroid = if self.use_residual_scratch { - Some(self.ivf.centroid(partition_id).ok_or_else(|| { - Error::index(format!("partition centroid {partition_id} does not exist")) - })?) - } else { - None - }; - let query = self.preprocess_query(partition_id, query)?; + let partition_centroid = self.ivf.centroid(partition_id); + let rq_search_cache = self.rq_search_cache.clone(); + let raw_query_context = self.prepare_rq_raw_query_context(&query.key)?; + let query = Self::preprocess_partition_query( + self.use_query_residual, + self.use_residual_scratch, + partition_id, + partition_centroid.as_ref(), + query, + )?; let scratch_pool = self.scratch_pool.clone(); + let use_query_residual = self.use_query_residual; + let use_residual_scratch = self.use_residual_scratch; let (batch, local_metrics) = spawn_cpu(move || { let param = (&query).into(); let refine_factor = query.refine_factor.unwrap_or(1) as usize; @@ -1228,7 +1455,16 @@ impl VectorIndex for IVFInd .ok_or(Error::internal( "failed to downcast partition entry".to_string(), ))?; - let residual = residual_centroid.as_deref().map(QueryResidual::Centroid); + let rotated_partition_centroid = + rotated_partition_centroid_slice(rq_search_cache.as_deref(), partition_id); + let residual = Self::query_context_for_scratch( + use_query_residual, + use_residual_scratch, + partition_id, + partition_centroid.as_ref(), + rotated_partition_centroid, + raw_query_context.as_deref(), + )?; let batch = scratch_pool.with_scratch(|scratch| { part.index.search_with_scratch( query.key, @@ -1257,8 +1493,9 @@ impl VectorIndex for IVFInd pre_filter: Arc, metrics: &dyn MetricsCollector, ) -> Result { + let raw_query_context = self.prepare_rq_raw_query_context(&query.key)?; Ok(Box::new( - 
self.prepare_partition(partition_id, query, pre_filter, metrics) + self.prepare_partition(partition_id, query, pre_filter, metrics, raw_query_context) .await?, )) } @@ -1273,7 +1510,7 @@ impl VectorIndex for IVFInd .map_err(|_| Error::internal("failed to downcast prepared partition search"))?; self.scratch_pool.with_scratch(|scratch| { Self::run_prepared_partition_search( - self.distance_type, + self.use_query_residual, self.use_residual_scratch, *prepared, metrics, @@ -1321,12 +1558,14 @@ impl VectorIndex for IVFInd } let prepare_parallelism = get_num_compute_intensive_cpus().max(1); + let raw_query_context = self.prepare_rq_raw_query_context(&query.key)?; if control.is_none() && S::supports_global_topk_heap() { let heap_capacity = query.k * query.refine_factor.unwrap_or(1) as usize; pre_filter.wait_for_ready().await?; let prepare_index = self.clone(); let prepare_metrics = metrics.clone(); + let prepare_raw_query_context = raw_query_context.clone(); let prepared = stream::iter(start_idx..end_idx) .map(move |idx| { let part_id = partitions.value(idx); @@ -1335,6 +1574,7 @@ impl VectorIndex for IVFInd let index = prepare_index.clone(); let pre_filter = pre_filter.clone(); let metrics = prepare_metrics.clone(); + let raw_query_context = prepare_raw_query_context.clone(); async move { index .prepare_partition_without_prefilter_wait( @@ -1342,6 +1582,7 @@ impl VectorIndex for IVFInd &query, pre_filter, metrics.as_ref(), + raw_query_context, ) .await } @@ -1350,7 +1591,7 @@ impl VectorIndex for IVFInd .try_collect::>() .await?; - let distance_type = self.distance_type; + let use_query_residual = self.use_query_residual; let use_residual_scratch = self.use_residual_scratch; let search_metrics = metrics.clone(); let scratch_pool = self.scratch_pool.clone(); @@ -1359,7 +1600,7 @@ impl VectorIndex for IVFInd scratch_pool.with_scratch(|scratch| -> DataFusionResult<()> { for prepared in prepared { Self::accumulate_prepared_partition_search( - distance_type, + 
use_query_residual, use_residual_scratch, prepared, &mut heap, @@ -1386,6 +1627,7 @@ impl VectorIndex for IVFInd let prepare_index = self.clone(); let prepare_metrics = metrics.clone(); + let prepare_raw_query_context = raw_query_context.clone(); tokio::spawn(async move { let prepare_stream = stream::iter(start_idx..end_idx) .map(move |idx| { @@ -1395,6 +1637,7 @@ impl VectorIndex for IVFInd let index = prepare_index.clone(); let pre_filter = pre_filter.clone(); let metrics = prepare_metrics.clone(); + let raw_query_context = prepare_raw_query_context.clone(); async move { index .prepare_partition( @@ -1402,6 +1645,7 @@ impl VectorIndex for IVFInd &query, pre_filter, metrics.as_ref(), + raw_query_context, ) .await } @@ -1417,7 +1661,7 @@ impl VectorIndex for IVFInd } }); - let distance_type = self.distance_type; + let use_query_residual = self.use_query_residual; let use_residual_scratch = self.use_residual_scratch; let search_metrics = metrics.clone(); let batch_tx_for_search = batch_tx.clone(); @@ -1445,7 +1689,7 @@ impl VectorIndex for IVFInd let batch = { Self::run_prepared_partition_search( - distance_type, + use_query_residual, use_residual_scratch, prepared, search_metrics.as_ref(), @@ -1572,6 +1816,10 @@ impl VectorIndex for IVFInd fn metric_type(&self) -> DistanceType { self.distance_type } + + fn open_io_stats(&self) -> Option { + Some(self.open_io_stats) + } } pub type IvfFlatIndex = IVFIndex; @@ -1584,6 +1832,7 @@ async fn reconstruct_typed( object_store: Arc, file_metadata_cache: &LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> Result> { let io_parallelism = object_store.io_parallelism(); @@ -1639,13 +1888,16 @@ async fn reconstruct_typed( state.aux_ivf.clone(), state.metadata.clone(), state.distance_type, - None, + frag_reuse_index, ); + let rq_search_cache = IVFIndex::::rq_search_cache_from_state(state, &storage)?; + let parsed_uuid = Uuid::parse_str(&state.uuid) + .map_err(|e| Error::index(format!("Invalid UUID in 
IvfIndexState: {e}")))?; let index = IVFIndex::::from_cached_state( to_local_path(&index_path), index_path.to_string(), - state.uuid.clone(), + parsed_uuid, state.ivf.clone(), index_reader, storage, @@ -1653,6 +1905,7 @@ async fn reconstruct_typed( state.distance_type, index_cache, io_parallelism, + rq_search_cache, ); Ok(Arc::new(index)) } @@ -1675,7 +1928,10 @@ mod tests { use itertools::Itertools; use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::bq::{ - RQBuildParams, RQRotationType, storage::RabitQuantizationMetadata, + RQBuildParams, RQRotationType, + ex_dot::{blocked_ex_code_bytes, padded_query_len}, + storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator}, + transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}, }; use lance_index::vector::storage::VectorStore; @@ -1701,6 +1957,7 @@ mod tests { use lance_index::IndexType; use lance_index::progress::IndexBuildProgress; use lance_index::vector::DIST_COL; + use lance_index::vector::hnsw::builder::HnswBuildParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_index::vector::pq::PQBuildParams; @@ -1713,7 +1970,6 @@ mod tests { }; use lance_index::{INDEX_AUXILIARY_FILE_NAME, metrics::NoOpMetricsCollector}; use lance_index::{optimize::OptimizeOptions, scalar::IndexReader}; - use lance_index::{scalar::IndexWriter, vector::hnsw::builder::HnswBuildParams}; use lance_io::{ object_store::ObjectStore, scheduler::{ScanScheduler, SchedulerConfig}, @@ -1726,12 +1982,67 @@ mod tests { use rand::distr::uniform::SampleUniform; use rand::{Rng, SeedableRng, rngs::StdRng}; use rstest::rstest; + use uuid::Uuid; const NUM_ROWS: usize = 512; const DIM: usize = 32; lance_testing::define_stage_event_progress!(RecordingProgress, IndexBuildProgress, Result<()>); + #[test] + fn test_rotated_partition_centroid_slice_borrows_cache() { + let cache = super::RabitSearchCache { + rotated_centroids: vec![1.0, 2.0, 3.0, 
4.0, 5.0, 6.0], + code_dim: 2, + }; + + let centroid = super::rotated_partition_centroid_slice(Some(&cache), 1).unwrap(); + + assert_eq!(centroid, &[3.0, 4.0]); + assert_eq!(centroid.as_ptr(), cache.rotated_centroids[2..].as_ptr()); + assert!(super::rotated_partition_centroid_slice(Some(&cache), 3).is_none()); + assert!(super::rotated_partition_centroid_slice(None, 0).is_none()); + } + + #[test] + fn test_rabit_ex_scratch_len_uses_num_bits() { + // Block-aligned dims read the rotated query in place. + let dim = 960; + for num_bits in [1, 3, 5, 7, 9] { + assert_eq!(super::rabit_ex_scratch_len(dim, num_bits), 0); + } + + // Unaligned multi-bit queries add one padded query copy. + let dim = 968; + assert_eq!(super::rabit_ex_scratch_len(dim, 1), 0); + assert_eq!(super::rabit_ex_scratch_len(dim, 7), padded_query_len(dim)); + } + + #[test] + fn test_rabit_u8_scratch_len_includes_ex_fastscan_tables() { + let dim = 960; + + assert_eq!(super::rabit_u8_scratch_len(dim, 1), dim * 4); + assert_eq!(super::rabit_u8_scratch_len(dim, 3), dim * 8); + assert_eq!(super::rabit_u8_scratch_len(dim, 5), dim * 16); + assert_eq!(super::rabit_u8_scratch_len(dim, 7), dim * 4); + assert_eq!(super::rabit_u8_scratch_len(dim, 9), dim * 32); + } + + #[test] + fn test_rabit_query_scratch_capacity_does_not_preallocate_u32() { + let dim = 960; + let max_partition_len = 4096; + + let capacity = super::rabit_query_scratch_capacity(dim, max_partition_len, 5); + + assert_eq!(capacity.distances, max_partition_len); + assert_eq!(capacity.query_f32, dim + dim * 4); + assert_eq!(capacity.u16, max_partition_len); + assert_eq!(capacity.u8, dim * 16); + assert_eq!(capacity.u32, 0); + } + async fn generate_test_dataset( test_uri: &str, range: Range, @@ -1790,11 +2101,11 @@ mod tests { vectors } - async fn get_rq_metadata( + async fn open_rq_aux_reader( dataset: &Dataset, scheduler: Arc, index_uuid: &str, - ) -> RabitQuantizationMetadata { + ) -> FileReader { let index_path = dataset .indices_dir() 
.join(index_uuid) @@ -1803,7 +2114,7 @@ mod tests { .open_file(&index_path, &CachedFileSize::unknown()) .await .unwrap(); - let reader = FileReader::try_open( + FileReader::try_open( file_scheduler, None, Arc::::default(), @@ -1811,7 +2122,15 @@ mod tests { FileReaderOptions::default(), ) .await - .unwrap(); + .unwrap() + } + + async fn get_rq_metadata( + dataset: &Dataset, + scheduler: Arc, + index_uuid: &str, + ) -> RabitQuantizationMetadata { + let reader = open_rq_aux_reader(dataset, scheduler, index_uuid).await; let metadata = reader.schema().metadata.get(STORAGE_METADATA_KEY).unwrap(); let metadata_entries: Vec = serde_json::from_str(metadata).unwrap(); serde_json::from_str(&metadata_entries[0]).unwrap() @@ -2108,11 +2427,12 @@ mod tests { ) -> VectorIndexTestContext { let stats_json = dataset.index_statistics(index_name).await.unwrap(); let stats: serde_json::Value = serde_json::from_str(&stats_json).unwrap(); - let uuid = stats["indices"][0]["uuid"] + let uuid_str = stats["indices"][0]["uuid"] .as_str() .expect("Index uuid should be present"); + let uuid = Uuid::parse_str(uuid_str).expect("uuid in stats should be a valid UUID"); let index = dataset - .open_vector_index(column, uuid, &NoOpMetricsCollector) + .open_vector_index(column, &uuid, &NoOpMetricsCollector) .await .unwrap(); @@ -2428,7 +2748,7 @@ mod tests { async fn load_partition_row_ids(index: &IvfPq, partition_idx: usize) -> Vec { index .storage - .load_partition(partition_idx) + .load_partition(partition_idx, None) .await .unwrap() .row_ids() @@ -3337,6 +3657,147 @@ mod tests { assert!(result.num_rows() > 0); } + #[rstest] + #[case::flat("IVF_HNSW_FLAT")] + #[case::pq("IVF_HNSW_PQ")] + #[case::sq("IVF_HNSW_SQ")] + #[tokio::test] + async fn test_merge_existing_hnsw_segments_rebuilds_graph(#[case] expected_index_type: &str) { + let test_dir = TempStrDir::default(); + let base_uri = test_dir.as_str(); + let (schema, batches) = make_two_fragment_batches(); + let dataset_uri = 
format!("{}/merge_hnsw_rebuilds_graph", base_uri); + let mut dataset = write_dataset_from_batches(&dataset_uri, schema, batches).await; + + let fragments = dataset.get_fragments(); + assert!(fragments.len() >= 2); + let params = match expected_index_type { + "IVF_HNSW_FLAT" => VectorIndexParams::ivf_hnsw( + DistanceType::L2, + prepare_global_ivf(&dataset, "vector").await, + HnswBuildParams::default(), + ), + "IVF_HNSW_PQ" => { + let (ivf_params, pq_params) = prepare_global_ivf_pq(&dataset, "vector").await; + VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + ivf_params, + HnswBuildParams::default(), + pq_params, + ) + } + "IVF_HNSW_SQ" => VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + prepare_global_ivf(&dataset, "vector").await, + HnswBuildParams::default(), + SQBuildParams::default(), + ), + other => panic!("unexpected HNSW index type {other}"), + }; + let mut segments = Vec::new(); + + for fragment in fragments.iter().take(2) { + let segment = dataset + .create_index_builder(&["vector"], IndexType::Vector, ¶ms) + .name("vector_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + segments.push(segment); + } + + let merged = dataset + .merge_existing_index_segments(segments) + .await + .unwrap(); + dataset + .commit_existing_index_segments("vector_idx", "vector", vec![merged]) + .await + .unwrap(); + + let stats = dataset.index_statistics("vector_idx").await.unwrap(); + let stats: serde_json::Value = serde_json::from_str(&stats).unwrap(); + assert_eq!(stats["index_type"].as_str().unwrap(), expected_index_type); + assert_eq!( + stats["indices"][0]["sub_index"]["index_type"] + .as_str() + .unwrap(), + "HNSW" + ); + + let query_batch = dataset + .scan() + .project(&["vector"] as &[&str]) + .unwrap() + .limit(Some(4), None) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let q = query_batch["vector"].as_fixed_size_list().value(0); + let result = dataset + .scan() + 
.project(&["_rowid"] as &[&str]) + .unwrap() + .nearest("vector", q.as_ref(), 5) + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert!(result.num_rows() > 0); + } + + #[tokio::test] + async fn test_merge_existing_hnsw_segments_rejects_mismatched_build_params() { + let test_dir = TempStrDir::default(); + let base_uri = test_dir.as_str(); + let (schema, batches) = make_two_fragment_batches(); + let dataset_uri = format!("{}/merge_hnsw_rejects_mismatched_params", base_uri); + let mut dataset = write_dataset_from_batches(&dataset_uri, schema, batches).await; + + let fragments = dataset.get_fragments(); + assert!(fragments.len() >= 2); + + let ivf_params = prepare_global_ivf(&dataset, "vector").await; + let default_params = VectorIndexParams::ivf_hnsw( + DistanceType::L2, + ivf_params.clone(), + HnswBuildParams::default(), + ); + let custom_params = VectorIndexParams::ivf_hnsw( + DistanceType::L2, + ivf_params, + HnswBuildParams::default().num_edges(16), + ); + + let first_segment = dataset + .create_index_builder(&["vector"], IndexType::Vector, &default_params) + .name("vector_idx".to_string()) + .fragments(vec![fragments[0].id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + let second_segment = dataset + .create_index_builder(&["vector"], IndexType::Vector, &custom_params) + .name("vector_idx".to_string()) + .fragments(vec![fragments[1].id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + + let error = dataset + .merge_existing_index_segments(vec![first_segment, second_segment]) + .await + .unwrap_err(); + assert!( + error + .to_string() + .contains("HNSW build parameters mismatch while merging index segments"), + "{error}" + ); + } + #[tokio::test] async fn test_merge_index_metadata_reports_progress() { const INDEX_NAME: &str = "vector_idx"; @@ -3966,6 +4427,55 @@ mod tests { test_remap(params.clone(), nlist, recall_requirement).await; } + #[rstest] + #[case::l2(DistanceType::L2, 9)] + #[case::cosine(DistanceType::Cosine, 9)] + // 
ex_bits=3 and ex_bits=5 have no FastScan support and use the bit-plane + // repack, so these searches go through the exact ex-dot rerank kernels + // end to end. + #[case::l2_plane_repack_3bit(DistanceType::L2, 4)] + #[case::l2_plane_repack_5bit(DistanceType::L2, 6)] + #[tokio::test] + async fn test_build_ivf_rq_multi_bit_persists_split_codes_and_searches( + #[case] distance_type: DistanceType, + #[case] num_bits: u8, + ) { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let (mut dataset, vectors) = generate_test_dataset::(test_uri, 0.0..1.0).await; + + let ivf_params = IvfBuildParams::new(4); + let rq_params = RQBuildParams::with_rotation_type(num_bits, RQRotationType::Fast); + let params = VectorIndexParams::with_ivf_rq_params(distance_type, ivf_params, rq_params); + dataset + .create_index(&["vector"], IndexType::Vector, None, ¶ms, true) + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + let obj_store = Arc::new(ObjectStore::local()); + let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); + let index_uuid = indices[0].uuid.to_string(); + let rq_meta = get_rq_metadata(&dataset, scheduler.clone(), &index_uuid).await; + assert_eq!(rq_meta.num_bits, num_bits); + assert_eq!(rq_meta.query_estimator, RabitQueryEstimator::RawQuery); + + let reader = open_rq_aux_reader(&dataset, scheduler, &index_uuid).await; + let schema = reader.schema(); + let ex_field = schema.field(RABIT_BLOCKED_EX_CODE_COLUMN).unwrap(); + let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { + panic!("RQ ex-code field should be FixedSizeList"); + }; + let expected_ex_code_bytes = + blocked_ex_code_bytes(rq_meta.rotated_dim(), num_bits - 1) as i32; + assert_eq!(ex_code_bytes, expected_ex_code_bytes); + assert!(schema.field(EX_ADD_FACTORS_COLUMN).is_some()); + assert!(schema.field(EX_SCALE_FACTORS_COLUMN).is_some()); + + test_recall::(params, 4, 0.5, 
"vector", &dataset, vectors).await; + } + #[rstest] #[case::fast(RQRotationType::Fast)] #[case::matrix(RQRotationType::Matrix)] @@ -4241,11 +4751,7 @@ mod tests { let indices = dataset.load_indices_by_name("vector_idx").await.unwrap(); assert_eq!(indices.len(), 1); // v1 index should be replaced by v3 index let index = dataset - .open_vector_index( - "vector", - indices[0].uuid.to_string().as_str(), - &NoOpMetricsCollector, - ) + .open_vector_index("vector", &indices[0].uuid, &NoOpMetricsCollector) .await .unwrap(); let v3_index = index.as_any().downcast_ref::(); @@ -4661,7 +5167,10 @@ mod tests { STORAGE_METADATA_KEY.to_owned(), serde_json::to_string(&vec![pq_metadata])?, ); - writer.finish_with_metadata(metadata).await?; + for (key, value) in metadata { + writer.add_schema_metadata(key, value); + } + writer.finish().await?; // Build new IndexMetadata with the new UUID and file sizes. let new_files = @@ -5702,11 +6211,9 @@ mod tests { // Try serialized store first let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - return Some( - stored_codec - .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .expect("deserialization should succeed"), - ); + return stored_codec + .deserialize(&bytes::Bytes::copy_from_slice(bytes)) + .hit(); } drop(guard); // Fall through to passthrough diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index 1f1a89d80c8..6e335cddc80 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -17,8 +17,8 @@ use arrow_select::take::take; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use lance_arrow::FixedSizeListArrayExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_core::utils::tokio::spawn_cpu; use lance_core::{ROW_ID, ROW_ID_FIELD}; @@ -71,17 +71,29 @@ pub 
struct PQIndex { } impl DeepSizeOf for PQIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.pq.deep_size_of_children(context) + self .code .as_ref() - .map(|code| code.get_array_memory_size()) + .map(|code| { + if context.mark_seen(Arc::as_ptr(code) as *const () as usize) { + (code.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + } else { + 0 + } + }) .unwrap_or(0) + self .row_ids .as_ref() - .map(|row_ids| row_ids.get_array_memory_size()) + .map(|row_ids| { + if context.mark_seen(Arc::as_ptr(row_ids) as *const () as usize) { + (row_ids.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + } else { + 0 + } + }) .unwrap_or(0) } } @@ -168,10 +180,6 @@ impl Index for PQIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn index_type(&self) -> IndexType { IndexType::Vector } @@ -899,6 +907,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let is_empty_threads = Arc::new(Mutex::new(Vec::new())); let pre_filter = Arc::new(TestPreFilter::with_thread_capture( diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index d0a2934552c..ce0d29d550b 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -688,11 +688,7 @@ async fn migrate_indices(dataset: &Dataset, indices: &mut [IndexMetadata]) -> Re let idx_field = dataset.schema().field_by_id(index.fields[0]).ok_or_else(|| Error::internal(format!("Index with uuid {} referred to field with id {} which did not exist in dataset", index.uuid, index.fields[0])))?; // We need to calculate the fragments covered by the index let idx = dataset - .open_generic_index( - &idx_field.name, - &index.uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_generic_index(&idx_field.name, &index.uuid, &NoOpMetricsCollector) .await?; 
index.fragment_bitmap = Some(idx.calculate_included_frags().await?); } diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index b242cd5b3dd..dc898534c89 100644 --- a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -904,13 +904,42 @@ impl<'a> TransactionRebase<'a> { match &other_transaction.operation { Operation::Append { .. } | Operation::Clone { .. } - | Operation::Delete { .. } - | Operation::Update { .. } - | Operation::Merge { .. } | Operation::UpdateConfig { .. } | Operation::ReserveFragments { .. } | Operation::Project { .. } | Operation::UpdateBases { .. } => Ok(()), + Operation::Merge { .. } => { + // Merge rewrites the whole fragment list; always conflict + // (symmetric with check_merge_txn). + Err(self.retryable_conflict_err(other_transaction, other_version)) + } + Operation::Update { + updated_fragments, + removed_fragment_ids, + .. + } + | Operation::Delete { + updated_fragments, + deleted_fragment_ids: removed_fragment_ids, + .. + } => { + // A concurrent Update/Delete that changed one of our target + // fragments makes our positional column file stale; conflict so + // the committer rebuilds (lance otherwise accepts it silently). + for replacement in replacements { + let touches_our_fragment = updated_fragments + .iter() + .map(|f| f.id) + .chain(removed_fragment_ids.iter().copied()) + .any(|id| id == replacement.0); + if touches_our_fragment { + return Err( + self.retryable_conflict_err(other_transaction, other_version) + ); + } + } + Ok(()) + } Operation::CreateIndex { new_indices, .. } => { // A data replacement only conflicts if it is updating the field that // is being indexed. 
@@ -3258,7 +3287,7 @@ mod tests { ( "DataReplacement vs Rewrite on different fragment", Operation::DataReplacement { - replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01)], + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], }, Operation::Rewrite { groups: vec![RewriteGroup { @@ -3270,6 +3299,80 @@ mod tests { }, Compatible, ), + // A concurrent Update/Delete on a fragment we replace a column in must + // conflict, else the stale positional file is applied silently. + ( + "DataReplacement vs Update on same fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Update { + updated_fragments: vec![Fragment::new(0)], + removed_fragment_ids: vec![], + new_fragments: vec![], + fields_modified: vec![], + merged_generations: Vec::new(), + fields_for_preserving_frag_bitmap: vec![], + update_mode: None, + inserted_rows_filter: None, + updated_fragment_offsets: None, + }, + Retryable, + ), + ( + "DataReplacement vs Update on different fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Update { + updated_fragments: vec![Fragment::new(1)], + removed_fragment_ids: vec![], + new_fragments: vec![], + fields_modified: vec![], + merged_generations: Vec::new(), + fields_for_preserving_frag_bitmap: vec![], + update_mode: None, + inserted_rows_filter: None, + updated_fragment_offsets: None, + }, + Compatible, + ), + ( + "DataReplacement vs Delete on same fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Delete { + deleted_fragment_ids: vec![], + updated_fragments: vec![Fragment::new(0)], + predicate: "a > 0".to_string(), + }, + Retryable, + ), + ( + "DataReplacement vs Delete that removes the fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, 
data_file_frag0_fields01.clone())], + }, + Operation::Delete { + deleted_fragment_ids: vec![0], + updated_fragments: vec![], + predicate: "a > 0".to_string(), + }, + Retryable, + ), + // Merge rewrites the whole fragment list -> always conflicts. + ( + "DataReplacement vs Merge", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01)], + }, + Operation::Merge { + fragments: vec![Fragment::new(0)], + schema: lance_core::datatypes::Schema::default(), + }, + Retryable, + ), ]; for (description, op1, op2, expected) in cases { diff --git a/rust/lance/src/io/commit/external_manifest.rs b/rust/lance/src/io/commit/external_manifest.rs index df2b84a4878..850d10f9a23 100644 --- a/rust/lance/src/io/commit/external_manifest.rs +++ b/rust/lance/src/io/commit/external_manifest.rs @@ -4,10 +4,14 @@ /// Keep the tests in `lance` crate because it has dependency on [Dataset]. #[cfg(test)] mod test { + use std::ops::Range; use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::{collections::HashMap, time::Duration}; use async_trait::async_trait; + use bytes::Bytes; + use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt, future::join_all}; use lance_core::{Error, Result}; use lance_table::io::commit::external_manifest::{ @@ -15,7 +19,12 @@ mod test { }; use lance_table::io::commit::{CommitHandler, ManifestNamingScheme}; use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; - use object_store::{ObjectStoreExt, local::LocalFileSystem, path::Path}; + use object_store::memory::InMemory; + use object_store::{ + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore as OSObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, + PutResult, RenameOptions, Result as OSResult, local::LocalFileSystem, path::Path, + }; use tokio::sync::Mutex; use crate::dataset::builder::DatasetBuilder; @@ -365,6 +374,32 @@ mod test { 
assert_eq!(ds.version().version, 6); assert_eq!(ds.count_rows(None).await.unwrap(), 60); + { + inner_store.lock().await.remove(&(ds.base.to_string(), 6)); + } + assert!( + handler + .version_exists( + &ds.base, + 6, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + .await + .unwrap() + ); + assert!( + !handler + .version_exists( + &ds.base, + 7, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + .await + .unwrap() + ); + // Open without external store handler again, should see the newly sync'd commit let ds = DatasetBuilder::from_uri(ds_uri).load().await.unwrap(); assert_eq!(ds.version().version, 6); @@ -394,4 +429,288 @@ mod test { .collect::>(); assert!(unexpected_entries.is_empty(), "{:?}", unexpected_entries); } + + /// S3's `CopyObject` API has a hard 5 GB cap on the source object size. + /// Above that, callers must use multipart copy (`UploadPartCopy`) instead. + /// `lance-table::io::commit::external_manifest` calls + /// `object_store.copy(staging, final)` unconditionally on the manifest + /// commit path — which fails for manifests >5 GB. + /// + /// This wrapper enforces that S3-side cap on top of any inner store, so + /// the regression can be reproduced in-process without S3. + /// + /// It also lets the test override `head().size` for a chosen path, so the + /// staging file can *appear* to be 14 GB without actually putting that + /// many bytes into the inner store. + const S3_COPY_OBJECT_CAP_BYTES: u64 = 5 * 1024 * 1024 * 1024; + + #[derive(Debug)] + struct CopyCapStore { + inner: Arc, + /// path → fake size returned by head(); overrides the inner store. + head_size_overrides: Arc>>, + /// Counts calls to `copy_opts` (the fast path). Tests use this to + /// assert which branch of `copy_size_aware` was taken — succeeding + /// alone is not enough, since the slow path can also succeed for + /// small files. 
+ copy_calls: AtomicUsize, + /// Counts calls to `put_multipart_opts` (the slow read+rewrite path). + put_multipart_calls: AtomicUsize, + } + + impl CopyCapStore { + fn new(inner: Arc) -> Self { + Self { + inner, + head_size_overrides: Arc::new(Mutex::new(HashMap::new())), + copy_calls: AtomicUsize::new(0), + put_multipart_calls: AtomicUsize::new(0), + } + } + + async fn override_size(&self, path: &Path, size: u64) { + self.head_size_overrides + .lock() + .await + .insert(path.to_string(), size); + } + + async fn effective_size(&self, location: &Path, real: u64) -> u64 { + self.head_size_overrides + .lock() + .await + .get(&location.to_string()) + .copied() + .unwrap_or(real) + } + + fn copy_calls(&self) -> usize { + self.copy_calls.load(Ordering::SeqCst) + } + + fn put_multipart_calls(&self) -> usize { + self.put_multipart_calls.load(Ordering::SeqCst) + } + } + + impl std::fmt::Display for CopyCapStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "CopyCapStore({})", self.inner) + } + } + + #[async_trait] + impl OSObjectStore for CopyCapStore { + async fn put_opts( + &self, + location: &Path, + bytes: PutPayload, + opts: PutOptions, + ) -> OSResult { + self.inner.put_opts(location, bytes, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> OSResult> { + self.put_multipart_calls.fetch_add(1, Ordering::SeqCst); + self.inner.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> OSResult { + // `head()` is a default method on `ObjectStore` that delegates to + // `get_opts(location, GetOptions { head: true, .. })`. To make a + // staging file *appear* to be 14 GB without holding 14 GB in + // memory, we override the size in the returned ObjectMeta here. 
+ let mut res = self.inner.get_opts(location, options).await?; + let overridden = self.effective_size(location, res.meta.size).await; + res.meta.size = overridden; + Ok(res) + } + + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> OSResult> { + self.inner.get_ranges(location, ranges).await + } + + // `head` and `delete` are default methods on `ObjectStore`, derived + // from `get_opts`/`delete_stream`. We override `head` indirectly by + // overriding `get_opts` below — it returns size based on the + // overrides table for the chosen path. + fn delete_stream( + &self, + locations: BoxStream<'static, OSResult>, + ) -> BoxStream<'static, OSResult> { + self.inner.delete_stream(locations) + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, OSResult> { + self.inner.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, OSResult> { + self.inner.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> OSResult { + self.inner.list_with_delimiter(prefix).await + } + + async fn copy_opts(&self, from: &Path, to: &Path, opts: CopyOptions) -> OSResult<()> { + // Mimic S3's CopyObject 5 GB hard cap: read the (possibly-overridden) + // size of the source via head() and reject if it crosses the cap. + let meta = self.head(from).await?; + if meta.size >= S3_COPY_OBJECT_CAP_BYTES { + return Err(object_store::Error::Generic { + store: "S3", + source: format!( + "EntityTooLarge: ProposedSize {} exceeds CopyObject 5GB cap", + meta.size + ) + .into(), + }); + } + self.copy_calls.fetch_add(1, Ordering::SeqCst); + self.inner.copy_opts(from, to, opts).await + } + + async fn rename_opts(&self, from: &Path, to: &Path, opts: RenameOptions) -> OSResult<()> { + self.inner.rename_opts(from, to, opts).await + } + } + + /// Repro for the manifest >5 GB bug. 
+ /// + /// Drives `ExternalManifestStore::put` (the default impl) against a + /// staging file whose `head().size` is reported as 14 GB. That `put` + /// calls `object_store.copy(staging, final)` unconditionally — which + /// our `CopyCapStore` wrapper rejects with the same `EntityTooLarge` + /// error S3 returns in production. + /// + /// Today this test is RED: the copy step fails on >5 GB. + /// After `copy_size_aware` lands, it should turn GREEN by falling back + /// to a multipart-equivalent path (option 1: read+rewrite via + /// `ObjectWriter`). + #[tokio::test] + async fn manifest_commit_succeeds_when_staging_exceeds_5gb_copy_cap() { + let inner: Arc = Arc::new(InMemory::new()); + let capped = Arc::new(CopyCapStore::new(inner)); + + // Write a small staging file, then lie about its size so the + // CopyObject cap fires without holding 14 GB in memory. + let base_path = Path::from("repro"); + let staging_path = Path::from("repro/_versions/1.manifest.staging-abcd"); + let body = b"fake manifest body"; + capped + .put(&staging_path, PutPayload::from_static(body)) + .await + .expect("seed staging file"); + capped + .override_size(&staging_path, 14_961_429_442) // matches the production failure + .await; + + // Spin up an ExternalManifestStore and drive `put` (the same code + // path the failing CTAS hits via ExternalManifestCommitHandler). + let external = SleepyExternalManifestStore::new(); + let head_meta = capped.head(&staging_path).await.unwrap(); + + let location = external + .put( + &base_path, + 1, + &staging_path, + head_meta.size, + head_meta.e_tag, + capped.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .expect( + "manifest commit should succeed for a >5 GB staging file via multipart-aware copy", + ); + + // Branch-taken assertions: the slow read+rewrite path was used. 
+ assert_eq!( + capped.copy_calls(), + 0, + "CopyObject must not be attempted for >5 GiB sources" + ); + assert!( + capped.put_multipart_calls() >= 1, + "read+rewrite path must initiate a multipart upload" + ); + + // End-state assertions: final manifest exists with the original + // bytes, and the staging file was deleted. + let final_get = capped + .inner + .get(&location.path) + .await + .expect("final manifest must exist on the inner store") + .bytes() + .await + .unwrap(); + assert_eq!(final_get.as_ref(), body); + let staging_after = capped.inner.head(&staging_path).await; + assert!( + matches!(staging_after, Err(object_store::Error::NotFound { .. })), + "staging file must be cleaned up after commit, got: {:?}", + staging_after + ); + } + + /// Counterpart to manifest_commit_succeeds_when_staging_exceeds_5gb_copy_cap. + /// Confirms that for staging files BELOW the 5 GB cap, the fast-path + /// server-side `copy()` is still used — i.e. we haven't accidentally + /// regressed every commit to read+rewrite. + #[tokio::test] + async fn manifest_commit_uses_fast_copy_for_small_staging() { + let inner: Arc = Arc::new(InMemory::new()); + let capped = Arc::new(CopyCapStore::new(inner)); + + let base_path = Path::from("repro"); + let staging_path = Path::from("repro/_versions/1.manifest.staging-abcd"); + capped + .put( + &staging_path, + PutPayload::from_static(b"small manifest body"), + ) + .await + .expect("seed staging file"); + // No size override — the staging file's real size is ~20 bytes, + // well below the 5 GB cap, so copy_size_aware must take the fast + // path. 
+ + let external = SleepyExternalManifestStore::new(); + let head_meta = capped.head(&staging_path).await.unwrap(); + + external + .put( + &base_path, + 1, + &staging_path, + head_meta.size, + head_meta.e_tag, + capped.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .expect("small manifest commit must succeed via fast-path copy"); + + // The branch-taken assertion: fast path was used, slow path was not. + assert!( + capped.copy_calls() >= 1, + "small-file commit must use server-side CopyObject" + ); + assert_eq!( + capped.put_multipart_calls(), + 0, + "small-file commit must NOT initiate a multipart upload" + ); + } } diff --git a/rust/lance/src/io/commit/namespace_manifest.rs b/rust/lance/src/io/commit/namespace_manifest.rs index 0587ff96ad4..f4f012adcca 100644 --- a/rust/lance/src/io/commit/namespace_manifest.rs +++ b/rust/lance/src/io/commit/namespace_manifest.rs @@ -14,27 +14,77 @@ use lance_table::io::commit::{ManifestLocation, ManifestNamingScheme}; use object_store::ObjectStore as OSObjectStore; use object_store::path::Path; +use lance_namespace::error::NamespaceError; + +use crate::dataset::branch_location::BranchLocation; + +/// Whether `e` says the requested chain (table or branch) does not exist, as +/// opposed to a failure talking to the namespace. +fn is_chain_not_found(e: &lance_core::Error) -> bool { + if let lance_core::Error::Namespace { source, .. } = e + && let Some(ns_err) = source.downcast_ref::() + { + return matches!( + ns_err, + NamespaceError::TableNotFound { .. } | NamespaceError::TableBranchNotFound { .. } + ); + } + false +} + #[derive(Debug)] pub struct LanceNamespaceExternalManifestStore { namespace_client: Arc, table_id: Vec, + /// Object-store path of the table root (the main branch). The base path the + /// trait methods receive is resolved against this to derive which branch a + /// request targets, so a single store serves every branch of the table. 
+ table_root: Path, } impl LanceNamespaceExternalManifestStore { - pub fn new(namespace_client: Arc, table_id: Vec) -> Self { + pub fn new( + namespace_client: Arc, + table_id: Vec, + table_root: Path, + ) -> Self { Self { namespace_client, table_id, + table_root, } } + + /// Build a store for the table rooted at `table_uri`, resolving the root + /// path from the uri without initializing an object store. + pub fn for_table_uri( + namespace_client: Arc, + table_id: Vec, + table_uri: &str, + ) -> Result { + let table_root = lance_io::object_store::ObjectStore::extract_path_from_uri( + Arc::new(lance_io::object_store::ObjectStoreRegistry::default()), + table_uri, + )?; + Ok(Self::new(namespace_client, table_id, table_root)) + } + + /// Derive the branch targeted by `base` (the table root for main, or a + /// branch chain produced by `BranchLocation::find_branch`). The branch + /// path layout is owned by [`BranchLocation`]; this store never parses or + /// constructs it directly. + fn branch_for_base(&self, base: &str) -> Result> { + BranchLocation::branch_of(self.table_root.as_ref(), base) + } } #[async_trait] impl ExternalManifestStore for LanceNamespaceExternalManifestStore { - async fn get(&self, _base_uri: &str, version: u64) -> Result { + async fn get(&self, base_uri: &str, version: u64) -> Result { let request = DescribeTableVersionRequest { id: Some(self.table_id.clone()), version: Some(version as i64), + branch: self.branch_for_base(base_uri)?, ..Default::default() }; @@ -47,15 +97,24 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { Ok(response.version.manifest_path) } - async fn get_latest_version(&self, _base_uri: &str) -> Result> { + async fn get_latest_version(&self, base_uri: &str) -> Result> { let request = ListTableVersionsRequest { id: Some(self.table_id.clone()), descending: Some(true), limit: Some(1), + branch: self.branch_for_base(base_uri)?, ..Default::default() }; - let response = 
self.namespace_client.list_table_versions(request).await?; + let response = match self.namespace_client.list_table_versions(request).await { + Ok(response) => response, + // A chain that does not exist yet (e.g. probing a branch location + // before the branch is created) has no latest version; the + // ExternalManifestStore contract reports that as None, not an + // error, so existence checks can treat it as a missing dataset. + Err(e) if is_chain_not_found(&e) => return Ok(None), + Err(e) => return Err(e), + }; if response.versions.is_empty() { return Ok(None); @@ -73,7 +132,7 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { /// Put the manifest to the namespace store. async fn put( &self, - _base_path: &Path, + base_path: &Path, version: u64, staging_path: &Path, size: u64, @@ -94,6 +153,7 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { manifest_size: Some(size as i64), e_tag: e_tag.clone(), naming_scheme: Some(naming_scheme_str.to_string()), + branch: self.branch_for_base(base_path.as_ref())?, ..Default::default() }; @@ -146,3 +206,93 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { )) } } + +#[cfg(test)] +mod tests { + use super::*; + use lance_namespace::models::ListTableVersionsResponse; + + /// A namespace whose list_table_versions always fails with the configured + /// error, to pin how get_latest_version classifies failures. 
+ #[derive(Debug)] + struct FailingNamespace { + error: fn() -> lance_core::Error, + } + + #[async_trait] + impl LanceNamespace for FailingNamespace { + fn namespace_id(&self) -> String { + "failing".to_string() + } + + async fn list_table_versions( + &self, + _request: ListTableVersionsRequest, + ) -> Result { + Err((self.error)()) + } + } + + fn store_with(error: fn() -> lance_core::Error) -> LanceNamespaceExternalManifestStore { + LanceNamespaceExternalManifestStore::new( + Arc::new(FailingNamespace { error }), + vec!["t".to_string()], + Path::parse("data/t.lance").unwrap(), + ) + } + + /// A chain that does not exist (missing table or branch) has no latest + /// version; everything else is a real failure and must propagate so an + /// outage is never mistaken for an absent dataset. + #[tokio::test] + async fn test_get_latest_version_error_classification() { + use lance_namespace::error::NamespaceError; + + let absent = [ + store_with(|| { + NamespaceError::TableNotFound { + message: "missing table".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::TableBranchNotFound { + message: "missing branch".to_string(), + } + .into() + }), + ]; + for store in absent { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + matches!(latest, Ok(None)), + "a missing chain must read as no latest version, got: {:?}", + latest + ); + } + + let failures = [ + store_with(|| { + NamespaceError::Internal { + message: "server error".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::Throttling { + message: "slow down".to_string(), + } + .into() + }), + store_with(|| lance_core::Error::io("connection reset".to_string())), + ]; + for store in failures { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + latest.is_err(), + "a real failure must propagate, got: {:?}", + latest + ); + } + } +} diff --git a/rust/lance/src/io/exec.rs b/rust/lance/src/io/exec.rs index f06f575de68..a477d60d56d 
100644 --- a/rust/lance/src/io/exec.rs +++ b/rust/lance/src/io/exec.rs @@ -7,6 +7,8 @@ #[cfg(feature = "substrait")] pub mod ann_proto; +pub mod count_from_mask; +pub mod count_pushdown; mod filter; pub mod filtered_read; #[cfg(feature = "substrait")] diff --git a/rust/lance/src/io/exec/ann_proto.rs b/rust/lance/src/io/exec/ann_proto.rs index 235098cab55..c57ad4ca1b7 100644 --- a/rust/lance/src/io/exec/ann_proto.rs +++ b/rust/lance/src/io/exec/ann_proto.rs @@ -16,10 +16,11 @@ use arrow_array::RecordBatch; use arrow_schema::{Field, Schema as ArrowSchema}; use lance_core::{Error, Result}; use lance_index::pb as index_pb; -use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, Query}; +use lance_index::vector::{ApproxMode, DEFAULT_QUERY_PARALLELISM, Query}; use lance_linalg::distance::DistanceType; use lance_table::format::IndexMetadata; use lance_table::format::pb as table_pb; +use uuid::Uuid; use crate::Dataset; use crate::pb; @@ -79,6 +80,22 @@ fn query_vector_from_ipc_bytes(bytes: &[u8]) -> Result { Ok(batches[0].column(0).clone()) } +fn approx_mode_to_proto(mode: ApproxMode) -> pb::VectorApproxMode { + match mode { + ApproxMode::Fast => pb::VectorApproxMode::Fast, + ApproxMode::Normal => pb::VectorApproxMode::Normal, + ApproxMode::Accurate => pb::VectorApproxMode::Accurate, + } +} + +fn approx_mode_from_proto(value: i32) -> ApproxMode { + match pb::VectorApproxMode::try_from(value).unwrap_or(pb::VectorApproxMode::Normal) { + pb::VectorApproxMode::Fast => ApproxMode::Fast, + pb::VectorApproxMode::Normal => ApproxMode::Normal, + pb::VectorApproxMode::Accurate => ApproxMode::Accurate, + } +} + pub fn query_to_proto(query: &Query) -> Result { let query_vector_arrow_ipc = query_vector_to_ipc_bytes(query.key.as_ref())?; @@ -100,6 +117,7 @@ pub fn query_to_proto(query: &Query) -> Result { use_index: query.use_index, dist_q_c: Some(query.dist_q_c), query_parallelism: Some(query.query_parallelism), + approx_mode: approx_mode_to_proto(query.approx_mode) as i32, }) } @@ 
-129,6 +147,7 @@ pub fn query_from_proto(proto: pb::VectorQueryProto) -> Result { use_index: proto.use_index, query_parallelism: proto.query_parallelism.unwrap_or(DEFAULT_QUERY_PARALLELISM), dist_q_c: proto.dist_q_c.unwrap_or(0.0), + approx_mode: approx_mode_from_proto(proto.approx_mode), }) } @@ -146,7 +165,7 @@ pub async fn ann_ivf_partition_exec_to_proto( Ok(pb::AnnIvfPartitionExecProto { query: Some(query), table: Some(table), - index_uuids: exec.index_uuids.clone(), + index_uuids: exec.index_uuids.iter().map(Uuid::to_string).collect(), }) } @@ -168,7 +187,17 @@ pub async fn ann_ivf_partition_exec_from_proto( )); } - ANNIvfPartitionExec::try_new(dataset, proto.index_uuids, query) + let index_uuids: Vec = proto + .index_uuids + .iter() + .map(|s| Uuid::parse_str(s)) + .collect::>() + .map_err(|e| { + Error::invalid_input_source( + format!("Invalid UUID in AnnIvfPartitionExecProto: {e}").into(), + ) + })?; + ANNIvfPartitionExec::try_new(dataset, index_uuids, query) } // ============================================================================= @@ -324,6 +353,7 @@ mod tests { use_index: true, query_parallelism: -1, dist_q_c: 0.42, + approx_mode: ApproxMode::Accurate, }; let proto = query_to_proto(&query).unwrap(); @@ -341,6 +371,7 @@ mod tests { assert_eq!(query.use_index, back.use_index); assert_eq!(query.query_parallelism, back.query_parallelism); assert_eq!(query.dist_q_c, back.dist_q_c); + assert_eq!(query.approx_mode, back.approx_mode); assert_eq!(query.key.len(), back.key.len()); assert_eq!(query.key.data_type(), back.key.data_type()); } @@ -362,12 +393,19 @@ mod tests { use_index: false, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let proto = query_to_proto(&query).unwrap(); let back = query_from_proto(proto).unwrap(); assert!(back.metric_type.is_none()); assert!(!back.use_index); + assert_eq!(back.approx_mode, ApproxMode::Normal); + + let mut proto = query_to_proto(&query).unwrap(); + 
proto.approx_mode = i32::MAX; + let back = query_from_proto(proto).unwrap(); + assert_eq!(back.approx_mode, ApproxMode::Normal); } async fn make_vector_dataset() -> (Arc, tempfile::TempDir) { @@ -433,11 +471,12 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let exec = ANNIvfPartitionExec::try_new( dataset.clone(), - indices.iter().map(|idx| idx.uuid.to_string()).collect(), + indices.iter().map(|idx| idx.uuid).collect(), query, ) .unwrap(); @@ -480,6 +519,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; // Use a TestMemoryExec as a mock input child (provides the KNN_PARTITION_SCHEMA) @@ -543,6 +583,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let input: Arc = TestMemoryExec::try_new_exec( diff --git a/rust/lance/src/io/exec/count_from_mask.rs b/rust/lance/src/io/exec/count_from_mask.rs new file mode 100644 index 00000000000..0b7aeb11111 --- /dev/null +++ b/rust/lance/src/io/exec/count_from_mask.rs @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Execute-time half of the count-from-mask category of aggregate pushdown. +//! +//! [`CountFromMaskExec`] computes a `COUNT(*)`-style aggregate's partial +//! state directly from index/manifest metadata, without scanning column +//! data. Conceptually: +//! +//! ```text +//! result = | fragments_allow ∩ optional_prefilter_mask − deletion_mask | +//! ``` +//! +//! Its output schema matches what `AggregateExec(AggregateMode::Partial)` +//! would produce for the same `COUNT` aggregates, so a downstream +//! `AggregateExec(Final)` can combine the result unchanged. +//! +//! This is one of four categories of aggregate acceleration we plan to +//! 
support; the others (mask-to-answer, zone-aware, dimension-keyed) each +//! need additional plumbing — see the corresponding design issue. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Array, BinaryArray, Int64Array, RecordBatch}; +use arrow_schema::{Schema, SchemaRef}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + execution_plan::{Boundedness, EmissionType}, + metrics::{ExecutionPlanMetricsSet, MetricsSet}, +}; +use datafusion_physical_expr::EquivalenceProperties; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; +use futures::{StreamExt, TryStreamExt, stream}; +use lance_core::{Error, Result}; +use lance_select::{RowAddrMask, RowAddrSelection, RowAddrTreeMap}; +use lance_table::format::Fragment; +use roaring::RoaringBitmap; +use tracing::instrument; + +use super::utils::InstrumentedRecordBatchStreamAdapter; +use crate::Dataset; +use crate::dataset::rowids::load_row_id_sequences; +use crate::index::prefilter::DatasetPreFilter; + +/// An execution node that computes a `COUNT(*)`-style aggregate from an +/// optional row-address mask supplied by an upstream scalar-index search, +/// combined with the dataset's deletion mask and an optional restriction to +/// a fragment subset. +/// +/// The node returns one record batch with one row whose columns are the +/// partial-state representation of each `COUNT` in `aggregate_funcs` — i.e. +/// the same shape an `AggregateExec(Partial)` would emit. +#[derive(Debug)] +pub struct CountFromMaskExec { + dataset: Arc, + /// One per output column. Used only for `state_fields()` to build the + /// output schema; the actual count is computed identically for all of + /// them since every entry is a non-distinct `COUNT()`. + aggregate_funcs: Vec>, + /// Optional [`super::scalar_index::ScalarIndexExec`] producing the row- + /// address mask to count. 
+ prefilter_input: Option>, + /// Restrict the count to this fragment subset. `None` means "every + /// fragment in the dataset." The optimizer rule uses this to scope the + /// pushdown branch of a partial-coverage split plan to the indexed + /// fragments only — the uncovered fragments are handled by a parallel + /// scan branch. + restrict_to_fragments: Option, + schema: SchemaRef, + properties: Arc, + metrics: ExecutionPlanMetricsSet, +} + +impl DisplayAs for CountFromMaskExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "CountFromMask") + } + DisplayFormatType::TreeRender => write!(f, "CountFromMask"), + } + } +} + +impl CountFromMaskExec { + /// Build a new node. + /// + /// `aggregate_funcs` must be a non-empty set of non-distinct `COUNT` + /// aggregates (the optimizer rule guarantees this). `prefilter_input`, + /// if present, must produce a single batch in the scalar-index result + /// schema; that mask is intersected with the dataset's covered + /// fragments and the active deletion mask. + pub fn try_new( + dataset: Arc, + aggregate_funcs: Vec>, + prefilter_input: Option>, + ) -> Result { + Self::try_new_restricted(dataset, aggregate_funcs, prefilter_input, None) + } + + /// Like [`Self::try_new`] but scopes the count to a fragment subset + /// rather than the whole dataset. The optimizer rule uses this for the + /// pushdown branch of a partial-coverage split plan, so the count only + /// covers the fragments the prefilter's index can answer for. 
+ pub fn try_new_restricted( + dataset: Arc, + aggregate_funcs: Vec>, + prefilter_input: Option>, + restrict_to_fragments: Option, + ) -> Result { + if aggregate_funcs.is_empty() { + return Err(Error::invalid_input( + "CountFromMaskExec requires at least one aggregate".to_string(), + )); + } + + let state_fields = aggregate_funcs + .iter() + .map(|agg| agg.state_fields()) + .collect::>>() + .map_err(|e| Error::invalid_input(e.to_string()))? + .into_iter() + .flatten() + .collect::>(); + let state_fields_owned: Vec = + state_fields.iter().map(|f| f.as_ref().clone()).collect(); + let schema: SchemaRef = Arc::new(Schema::new(state_fields_owned)); + + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::RoundRobinBatch(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + + Ok(Self { + dataset, + aggregate_funcs, + prefilter_input, + restrict_to_fragments, + schema, + properties, + metrics: ExecutionPlanMetricsSet::new(), + }) + } + + /// Drain `prefilter_input` (a [`super::scalar_index::ScalarIndexExec`]) + /// to produce the row-address mask it serialized. + async fn load_prefilter( + prefilter_input: Arc, + context: Arc, + ) -> Result { + let mut stream = prefilter_input.execute(0, context).map_err(Error::from)?; + let batch = stream + .try_next() + .await + .map_err(Error::from)? + .ok_or_else(|| { + Error::internal( + "CountFromMaskExec: prefilter input produced no batches".to_string(), + ) + })?; + // Drain any remaining batches so the upstream sees a clean shutdown. 
+ while stream.try_next().await.map_err(Error::from)?.is_some() {} + + let result_col = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: prefilter result column has type {:?}, expected Binary", + batch.column(0).data_type() + )) + })?; + RowAddrMask::from_arrow(result_col) + } + + /// Fold the prefilter, fragment allow list, and deletion mask into a + /// single `AllowList`-shaped [`RowAddrMask`] suitable for counting. + fn combine_masks( + fragments_allow: RowAddrTreeMap, + prefilter: Option, + deletion_mask: Option>, + ) -> RowAddrMask { + let base = RowAddrMask::AllowList(fragments_allow); + let after_prefilter = match prefilter { + None => base, + Some(prefilter) => base & prefilter, + }; + match deletion_mask { + None => after_prefilter, + Some(deletion_mask) => after_prefilter & (*deletion_mask).clone(), + } + } + + /// Count the rows selected by `mask`, looking up `Full`-marker fragments + /// in the manifest so we never need to materialize a + /// `RoaringBitmap::full()`. + fn count_from_mask(mask: &RowAddrMask, dataset: &Dataset) -> Result { + let allow = mask.allow_list().ok_or_else(|| { + Error::internal("CountFromMaskExec: combined mask is not an AllowList".to_string()) + })?; + let frag_map: HashMap = dataset + .fragments() + .iter() + .map(|f| (f.id as u32, f)) + .collect(); + let mut count = 0i64; + for (frag_id, sel) in allow.iter() { + match sel { + RowAddrSelection::Full => { + // The fragment is in the allow list with no deletions + // touching it — its row count is the physical row count. 
+ let frag = frag_map.get(frag_id).ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: fragment {} not found in manifest", + frag_id + )) + })?; + let n = frag.physical_rows.ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: physical_rows missing for fragment {}", + frag_id + )) + })?; + count += n as i64; + } + RowAddrSelection::Partial(bitmap) => { + count += bitmap.len() as i64; + } + } + } + Ok(count) + } + + /// Row-address-space fragments-allow list: concrete `[0..physical_rows)` + /// ranges per covered fragment. + /// + /// Concrete ranges, not `Full` markers: subtracting a `BlockList` from a + /// `Full` entry materializes a `RoaringBitmap::full()` (2^32) per fragment, + /// which is slow and throws off `len()`. + fn address_fragments_allow( + dataset: &Dataset, + fragments_covered: &RoaringBitmap, + ) -> Result { + let frag_map: HashMap = dataset + .fragments() + .iter() + .map(|f| (f.id as u32, f)) + .collect(); + let mut fragments_allow = RowAddrTreeMap::new(); + for frag_id in fragments_covered.iter() { + let frag = frag_map.get(&frag_id).ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: fragment {} not in manifest", + frag_id + )) + })?; + let physical = frag.physical_rows.ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: physical_rows missing for fragment {}", + frag_id + )) + })?; + let mut bitmap = RoaringBitmap::new(); + bitmap.insert_range(0u32..(physical as u32)); + fragments_allow.insert_bitmap(frag_id, bitmap); + } + Ok(fragments_allow) + } + + /// Live (non-deleted) row count of the covered fragments, from fragment + /// metadata. Used for an unfiltered count: no prefilter to intersect, so no + /// need to build the stable-id universe. 
+ async fn count_live_rows(dataset: &Dataset, fragments_covered: &RoaringBitmap) -> Result { + let frags = dataset + .get_fragments() + .into_iter() + .filter(|f| fragments_covered.contains(f.id() as u32)); + let counts = stream::iter(frags) + .map(|f| async move { f.count_rows(None).await }) + .buffer_unordered(dataset.object_store.as_ref().io_parallelism()) + .try_collect::>() + .await?; + Ok(counts.iter().sum::() as i64) + } + + /// Count universe in stable-id space: live stable row ids whose current home + /// is in `fragments_covered`. Staying in stable-id space lets it intersect + /// the index prefilter directly; deletions are already folded in, so the + /// caller passes no separate deletion mask. + async fn stable_id_universe( + dataset: &Arc, + fragments_covered: RoaringBitmap, + ) -> Result { + // create_restricted_deletion_mask gives a live-id allow list restricted + // to `fragments_covered`. It returns None only with no deletions and full + // coverage — then the universe is every stable id, loaded below. + if let Some(fut) = DatasetPreFilter::create_restricted_deletion_mask( + dataset.clone(), + fragments_covered.clone(), + ) { + let mask = fut.await?; + return mask.allow_list().cloned().ok_or_else(|| { + Error::internal( + "CountFromMaskExec: stable-row-id deletion mask must be an AllowList" + .to_string(), + ) + }); + } + Self::load_stable_id_universe(dataset, &fragments_covered).await + } + + /// Every stable row id in the covered fragments, from their row-id sequences + /// (metadata, not column data). Only used with no deletions and full coverage. 
+ async fn load_stable_id_universe( + dataset: &Dataset, + fragments_covered: &RoaringBitmap, + ) -> Result { + let frags: Vec = dataset + .fragments() + .iter() + .filter(|f| fragments_covered.contains(f.id as u32)) + .cloned() + .collect(); + let mut sequences = load_row_id_sequences(dataset, &frags); + let mut universe = RowAddrTreeMap::new(); + while let Some((_frag_id, sequence)) = sequences.try_next().await? { + universe |= RowAddrTreeMap::from(sequence.as_ref()); + } + Ok(universe) + } + + #[instrument(name = "count_from_mask", skip_all, level = "debug")] + async fn do_execute( + dataset: Arc, + aggregate_funcs_len: usize, + prefilter_input: Option>, + restrict_to_fragments: Option, + context: Arc, + schema: SchemaRef, + ) -> Result { + let prefilter = match prefilter_input { + None => None, + Some(input) => Some(Self::load_prefilter(input, context.clone()).await?), + }; + + // Anchor the deletion mask against either every dataset fragment or + // the caller-supplied restricted subset. + let dataset_fragments: RoaringBitmap = + dataset.fragments().iter().map(|f| f.id as u32).collect(); + let fragments_covered = match restrict_to_fragments { + Some(restrict) => dataset_fragments & restrict, + None => dataset_fragments, + }; + + // Under stable row ids the prefilter and deletion masks are in stable-id + // space, so the universe must be too (see `stable_id_universe`); the + // default path builds it in row-address space. + let count = if dataset.manifest.uses_stable_row_ids() { + match prefilter { + // No prefilter: just the live row count, from metadata. + None => Self::count_live_rows(&dataset, &fragments_covered).await?, + Some(prefilter) => { + let universe = Self::stable_id_universe(&dataset, fragments_covered).await?; + let combined = Self::combine_masks(universe, Some(prefilter), None); + Self::count_from_mask(&combined, dataset.as_ref())? 
+ } + } + } else { + let fragments_allow = Self::address_fragments_allow(&dataset, &fragments_covered)?; + // Load the deletion mask for the covered fragments. + let deletion_mask = + match DatasetPreFilter::create_deletion_mask(dataset.clone(), fragments_covered) { + Some(fut) => Some(fut.await?), + None => None, + }; + let combined = Self::combine_masks(fragments_allow, prefilter, deletion_mask); + Self::count_from_mask(&combined, dataset.as_ref())? + }; + + // Every aggregate is the same non-distinct COUNT shape — emit the + // count once per output column. + let arrays: Vec> = (0..aggregate_funcs_len) + .map(|_| Arc::new(Int64Array::from(vec![count])) as Arc) + .collect(); + Ok(RecordBatch::try_new(schema, arrays)?) + } +} + +impl ExecutionPlan for CountFromMaskExec { + fn name(&self) -> &str { + "CountFromMaskExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec<&Arc> { + match &self.prefilter_input { + Some(input) => vec![input], + None => vec![], + } + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + let prefilter_input = match children.len() { + 0 => None, + 1 => Some(children.into_iter().next().unwrap()), + n => { + return Err(datafusion::error::DataFusionError::Internal(format!( + "CountFromMaskExec accepts 0 or 1 children, got {}", + n + ))); + } + }; + Ok(Arc::new(Self { + dataset: self.dataset.clone(), + aggregate_funcs: self.aggregate_funcs.clone(), + prefilter_input, + restrict_to_fragments: self.restrict_to_fragments.clone(), + schema: self.schema.clone(), + properties: self.properties.clone(), + metrics: self.metrics.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::error::Result { + let schema = self.schema.clone(); + let batch_fut = Self::do_execute( + self.dataset.clone(), + self.aggregate_funcs.len(), + self.prefilter_input.clone(), + 
self.restrict_to_fragments.clone(), + context, + schema.clone(), + ); + let stream = futures::stream::iter(vec![batch_fut]) + .then(|fut| async move { fut.await.map_err(|err| err.into()) }) + .boxed(); + Ok(Box::pin(InstrumentedRecordBatchStreamAdapter::new( + schema, + stream, + partition, + &self.metrics, + ))) + } + + fn partition_statistics( + &self, + _partition: Option, + ) -> datafusion::error::Result { + Ok(datafusion::physical_plan::Statistics { + num_rows: datafusion::common::stats::Precision::Exact(1), + ..datafusion::physical_plan::Statistics::new_unknown(&self.schema) + }) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn supports_limit_pushdown(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use std::{ops::Bound, sync::Arc}; + + use arrow::datatypes::{Int64Type, UInt64Type}; + use datafusion::common::DFSchema; + use datafusion::execution::TaskContext; + use datafusion::functions_aggregate; + use datafusion::logical_expr::lit; + use datafusion::physical_expr::execution_props::ExecutionProps; + use datafusion::physical_plan::ExecutionPlan; + use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter; + use datafusion::scalar::ScalarValue; + use futures::TryStreamExt; + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::gen_batch; + use lance_index::IndexType; + use lance_index::scalar::{ + SargableQuery, ScalarIndexParams, + expression::{ScalarIndexExpr, ScalarIndexSearch}, + }; + use lance_select::result::IndexExprResultWireFormat; + use lance_select::{RowAddrMask, RowAddrTreeMap}; + + use super::*; + use crate::Dataset; + use crate::index::DatasetIndexExt; + use crate::io::exec::scalar_index::ScalarIndexExec; + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; + + /// Build an `AggregateFunctionExpr` matching `COUNT(*)`. 
+ fn count_star_expr(input_schema: &SchemaRef) -> Arc { + let expr = functions_aggregate::count::count(lit(1)); + let df_schema = DFSchema::try_from(input_schema.as_ref().clone()).unwrap(); + let (agg_expr, _filter, _order_by) = create_aggregate_expr_and_maybe_filter( + &expr, + &df_schema, + input_schema.as_ref(), + &ExecutionProps::default(), + ) + .unwrap(); + agg_expr + } + + struct Fixture { + dataset: Arc, + _tmp: TempStrDir, + } + + /// 4 fragments × 10 rows, ascending `ordered` column with a BTree index. + async fn make_fixture() -> Fixture { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Fixture { + dataset: Arc::new(dataset), + _tmp: tmp, + } + } + + fn input_schema() -> SchemaRef { + Arc::new(Schema::new(vec![arrow_schema::Field::new( + "ordered", + arrow_schema::DataType::UInt64, + false, + )])) + } + + async fn run(plan: CountFromMaskExec) -> i64 { + let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 1); + batches[0] + .column(0) + .as_any() + .downcast_ref::>() + .expect("count partial state should be Int64") + .value(0) + } + + #[tokio::test] + async fn try_new_rejects_empty_aggregate_funcs() { + let fixture = make_fixture().await; + let err = CountFromMaskExec::try_new(fixture.dataset, vec![], None).unwrap_err(); + assert!(err.to_string().contains("at least one aggregate"), "{err}"); + } + + #[tokio::test] + async fn count_from_mask_mixes_full_and_partial() { + // Synthesize an AllowList containing one Full-marker fragment and + // one Partial bitmap; verify the Full fragment falls 
back to + // physical_rows from the manifest and Partial falls back to + // bitmap.len(). + let fixture = make_fixture().await; + let mut tm = RowAddrTreeMap::new(); + // Fragment 0: full (10 physical rows). + tm.insert_fragment(0); + // Fragment 1: partial with explicit row addrs. + let row_addr_for = |frag_id: u32, offset: u32| ((frag_id as u64) << 32) | offset as u64; + tm.insert(row_addr_for(1, 0)); + tm.insert(row_addr_for(1, 1)); + tm.insert(row_addr_for(1, 2)); + + let mask = RowAddrMask::AllowList(tm); + let count = CountFromMaskExec::count_from_mask(&mask, fixture.dataset.as_ref()).unwrap(); + assert_eq!(count, 10 + 3); + } + + #[tokio::test] + async fn execute_count_no_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + None, + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 40); // 4 fragments × 10 rows + } + + #[tokio::test] + async fn execute_count_with_allow_list_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + + // `ordered < 25` matches 25 rows across the four fragments. 
+ let prefilter_expr = ScalarIndexExpr::Query(ScalarIndexSearch { + column: "ordered".to_string(), + index_name: "ordered_idx".to_string(), + index_type: "BTree".to_string(), + query: Arc::new(SargableQuery::Range( + Bound::Unbounded, + Bound::Excluded(ScalarValue::UInt64(Some(25))), + )), + needs_recheck: false, + fragment_bitmap: None, + }); + let prefilter: Arc = Arc::new(ScalarIndexExec::new( + fixture.dataset.clone(), + prefilter_expr, + IndexExprResultWireFormat::default(), + )); + + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + Some(prefilter), + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 25); + } + + #[tokio::test] + async fn execute_count_with_block_list_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + + // NOT(ordered < 25) is a block list of those 25 rows — 40 − 25 = 15. + let prefilter_expr = + ScalarIndexExpr::Not(Box::new(ScalarIndexExpr::Query(ScalarIndexSearch { + column: "ordered".to_string(), + index_name: "ordered_idx".to_string(), + index_type: "BTree".to_string(), + query: Arc::new(SargableQuery::Range( + Bound::Unbounded, + Bound::Excluded(ScalarValue::UInt64(Some(25))), + )), + needs_recheck: false, + fragment_bitmap: None, + }))); + let prefilter: Arc = Arc::new(ScalarIndexExec::new( + fixture.dataset.clone(), + prefilter_expr, + IndexExprResultWireFormat::default(), + )); + + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + Some(prefilter), + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 15); + } + + #[tokio::test] + async fn execute_count_respects_deletions() { + let fixture = make_fixture().await; + let mut dataset = (*fixture.dataset).clone(); + // Delete the first ten rows of the dataset (which live in fragment 0). 
+ dataset.delete("ordered < 10").await.unwrap(); + let dataset = Arc::new(dataset); + + let schema = input_schema(); + let plan = + CountFromMaskExec::try_new(dataset, vec![count_star_expr(&schema)], None).unwrap(); + let count = run(plan).await; + assert_eq!(count, 30); + } +} diff --git a/rust/lance/src/io/exec/count_pushdown.rs b/rust/lance/src/io/exec/count_pushdown.rs new file mode 100644 index 00000000000..d5d90b5881a --- /dev/null +++ b/rust/lance/src/io/exec/count_pushdown.rs @@ -0,0 +1,832 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Physical optimizer rule that rewrites a `COUNT(*)`-style aggregate into +//! [`CountFromMaskExec`], answering the count from index metadata and the +//! deletion mask without scanning column data. +//! +//! This is the "count-from-mask" category of aggregate pushdown — one of +//! four planned. The other categories (mask-to-answer, zone-aware, +//! dimension-keyed) will each need their own rule and exec; the surrounding +//! infrastructure (the `fragment_bitmap` plumbed on each `ScalarIndexSearch`, +//! the `IndexInformationProvider::fragment_bitmap` lookup) is general +//! enough to be reused. +//! +//! Two rewritten shapes are emitted depending on whether the scalar index +//! backing the filter covers every dataset fragment. +//! +//! **Full coverage** (index ⊇ dataset, or no filter at all): +//! +//! ```text +//! AggregateExec(Final, aggs=[count(...)], group_by=[]) +//! └── CountFromMaskExec { prefilter_input = index_input } +//! ``` +//! +//! **Partial coverage** (index ⊊ dataset — typically appended fragments): +//! +//! ```text +//! AggregateExec(Final, aggs=[count(...)], group_by=[]) +//! └── UnionExec +//! ├── CountFromMaskExec(restrict_to_fragments = indexed) +//! └── AggregateExec(Partial) +//! └── FilteredReadExec(fragments = unindexed, full_filter = …) +//! ``` +//! +//! [`CountFromMaskExec`] emits partial-state, so the outer +//! 
`AggregateExec(Final)` performs the final combine in either shape. +//! +//! If the prefilter's index coverage is unknown (any leaf is missing +//! `fragment_bitmap`, e.g. constructed outside scanner planning), the rule +//! refuses to fire and leaves the existing scan path in place. + +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::config::ConfigOptions; +use datafusion::error::Result as DFResult; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +#[allow(deprecated)] +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::{ + ExecutionPlan, ExecutionPlanProperties, + aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, + coalesce_partitions::CoalescePartitionsExec, + projection::ProjectionExec, + repartition::RepartitionExec, + union::UnionExec, +}; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; +use datafusion_physical_expr::expressions::{Column, Literal}; +use lance_index::scalar::expression::ScalarIndexExpr; +use log::warn; +use roaring::RoaringBitmap; + +use super::count_from_mask::CountFromMaskExec; +use super::filtered_read::{FilteredReadExec, FilteredReadOptions}; +use super::scalar_index::ScalarIndexExec; + +/// Physical optimizer rule that rewrites a `COUNT(*)`-style aggregate into +/// [`CountFromMaskExec`], optionally splitting into a parallel scan branch +/// when the index has partial coverage of the dataset. +/// +/// Only fires when the shape is verifiably safe; everything outside that +/// envelope (GROUP BY, residual filters, scan ranges, etc.) is left alone +/// for the normal scan path. 
+#[derive(Debug)] +pub struct CountPushdown; + +impl PhysicalOptimizerRule for CountPushdown { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> DFResult> { + Ok(plan + .transform_down(|plan| { + let Some(agg) = plan.as_any().downcast_ref::() else { + return Ok(Transformed::no(plan)); + }; + if let Some(rewritten) = try_rewrite(agg)? { + return Ok(Transformed::yes(rewritten)); + } + Ok(Transformed::no(plan)) + })? + .data) + } + + fn name(&self) -> &str { + "count_pushdown" + } + + fn schema_check(&self) -> bool { + true + } +} + +fn try_rewrite(agg: &AggregateExec) -> DFResult>> { + // We can accelerate Single (Lance scanner shape) and Partial (the shape + // DataFusion's SQL planner emits at the leaf of an aggregate pipeline); + // both produce results we know how to compute from the index. We will + // never accelerate Final or FinalPartitioned — those combine an existing + // partial stream, and the value of this rule is replacing the work that + // produces the partial stream. + let mode = match agg.mode() { + AggregateMode::Single => AggregateMode::Single, + AggregateMode::Partial => AggregateMode::Partial, + _ => return Ok(None), + }; + if !agg.group_expr().is_empty() { + return Ok(None); + } + if agg.aggr_expr().is_empty() { + return Ok(None); + } + + // Every aggregate must be a `COUNT()` shape (i.e. COUNT(*) / + // COUNT(1) / etc.) with no per-aggregate `FILTER (WHERE ...)`. This rule + // is scoped to the count-from-mask category only; other aggregate + // categories (mask-to-answer, zone-aware, dimension-keyed) will need + // their own rules with their own gates. + for (af, filter) in agg.aggr_expr().iter().zip(agg.filter_expr().iter()) { + if !is_count_star(af) { + return Ok(None); + } + if filter.is_some() { + return Ok(None); + } + } + + // The input must be a FilteredReadExec we can prove is safe to skip. 
+ // DataFusion's SQL planner inserts a few row-preserving wrappers above + // the leaf — a `RepartitionExec` for parallelism, an empty + // `ProjectionExec` once the count expression has been resolved to need + // no columns, and `CoalesceBatchesExec` here and there. Walk through + // those to reach the FilteredReadExec. + let Some(filtered_read) = strip_row_preserving_wrappers(agg.children()[0]) else { + return Ok(None); + }; + + let options = filtered_read.options(); + // A refine filter is a residual the index couldn't fully evaluate — it + // needs column data to apply, which we can't. + if options.refine_filter.is_some() { + return Ok(None); + } + // A full_filter without an index_input means the filter is evaluated by + // scanning every row; not pushdownable. + if options.full_filter.is_some() && filtered_read.index_input().is_none() { + return Ok(None); + } + // LIMIT/OFFSET would change the count. + if options.scan_range_before_filter.is_some() || options.scan_range_after_filter.is_some() { + return Ok(None); + } + // We rely on the deletion mask being applied; with_deleted_rows changes + // that contract. Surfacing as a warning because it shouldn't normally + // pair with an aggregate plan — if we see it, the planner produced a + // shape we could in principle accelerate but currently can't. + if options.with_deleted_rows { + warn!( + "count_pushdown: skipped because the FilteredReadExec was built \ + with with_deleted_rows; the count will be computed via a full \ + scan." + ); + return Ok(None); + } + // Same story for an explicit fragment subset: legitimate, but unexpected + // alongside an aggregate, and we lose the pushdown opportunity. + if options.fragments.is_some() { + warn!( + "count_pushdown: skipped because the FilteredReadExec was scoped \ + to an explicit fragment subset; the count will be computed via a \ + full scan. Intersecting that subset into the coverage logic would \ + let this query be answered from index metadata." 
+ ); + return Ok(None); + } + + let dataset = filtered_read.dataset().clone(); + let dataset_fragments: RoaringBitmap = + dataset.fragments().iter().map(|f| f.id as u32).collect(); + let prefilter_input = filtered_read.index_input().cloned(); + + // If there is a prefilter, inspect its ScalarIndexExpr leaves: + // - Refuse to fire if any leaf is inexact (`needs_recheck`). The + // prefilter's serialized batch carries an Exact/AtMost/AtLeast + // discriminant but `load_prefilter` only reads the row-address mask + // and would treat AtMost as Exact, silently overcounting (and + // symmetrically AtLeast would undercount). + // - Compute the index's fragment coverage from leaf `fragment_bitmap`s. + // `None` means at least one leaf has no bitmap and we can't reason + // about coverage synchronously — refuse to fire. + let index_coverage = match &prefilter_input { + None => None, + Some(input) => { + let scalar_exec = input + .as_any() + .downcast_ref::() + .ok_or_else(|| { + datafusion::error::DataFusionError::Internal( + "count_pushdown: FilteredReadExec.index_input is not a ScalarIndexExec" + .to_string(), + ) + })?; + if scalar_exec.expr().needs_recheck() { + return Ok(None); + } + let Some(coverage) = collect_coverage(scalar_exec.expr()) else { + return Ok(None); + }; + Some(coverage) + } + }; + + let aggr_exprs: Vec> = agg.aggr_expr().to_vec(); + + // Decide on the plan shape. Three cases: + // + // 1. No prefilter (no filter at all): single pushdown branch over every + // dataset fragment. Always safe. + // 2. Prefilter + index covers every dataset fragment: single pushdown + // branch, prefilter feeds in directly. + // 3. Prefilter + index covers a strict subset: split into pushdown over + // indexed fragments + parallel scan over unindexed fragments. + let (partial_stream, partial_state_schema): (Arc, _) = match index_coverage { + None => { + // No prefilter at all (verified above): nothing to restrict. 
+ let exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + None, + )?; + let schema = exec.schema(); + (Arc::new(exec), schema) + } + Some(coverage) if (&dataset_fragments - &coverage).is_empty() => { + // Prefilter exists and the index covers every dataset fragment — + // safe to push the whole count down. + let exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + None, + )?; + let schema = exec.schema(); + (Arc::new(exec), schema) + } + Some(coverage) => { + // Split plan: CountFromMaskExec for the indexed fragments, a + // normal scan + AggregateExec(Partial) for the rest. + let uncovered = &dataset_fragments - &coverage; + let pushdown_exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + Some(&dataset_fragments & &coverage), + )?; + let partial_state_schema = pushdown_exec.schema(); + let pushdown_branch: Arc = Arc::new(pushdown_exec); + let scan_branch = + build_scan_branch(filtered_read, options, &uncovered, aggr_exprs.clone())?; + let union: Arc = + UnionExec::try_new(vec![pushdown_branch, scan_branch])?; + (union, partial_state_schema) + } + }; + + match mode { + AggregateMode::Partial => { + // Caller's parent is already an AggregateExec(Final) that knows + // how to consume multi-partition partial state — substitute our + // partial stream and we're done. + Ok(Some(partial_stream)) + } + AggregateMode::Single => { + // The original AggregateExec(Single) produced final output in one + // step; our exec emits partial state, so add a Final on top to + // recover the original output schema. Final expects a single + // partition of partial-state rows, so coalesce when we have a + // union producing multiple partitions. 
+ let final_input: Arc = + if partial_stream.output_partitioning().partition_count() > 1 { + Arc::new(CoalescePartitionsExec::new(partial_stream)) + } else { + partial_stream + }; + // `AggregateExec::try_new` requires one + // `Option>` per aggregate expression for + // the optional per-aggregate `FILTER (WHERE ...)` clause. We + // rejected any aggregate carrying a filter back at the gate, so + // every slot is `None` here. + let filters: Vec>> = + (0..aggr_exprs.len()).map(|_| None).collect(); + let final_agg = AggregateExec::try_new( + AggregateMode::Final, + PhysicalGroupBy::default(), + aggr_exprs, + filters, + final_input, + partial_state_schema, + )?; + Ok(Some(Arc::new(final_agg))) + } + _ => unreachable!("mode was checked at the top of try_rewrite"), + } +} + +/// Build the scan branch of a partial-coverage split: a `FilteredReadExec` +/// restricted to the uncovered fragments (no `index_input`, the original +/// `full_filter` applied per row) wrapped in `AggregateExec(Partial)` so its +/// partial state can be unioned with the pushdown branch. +fn build_scan_branch( + filtered_read: &FilteredReadExec, + options: &FilteredReadOptions, + uncovered: &RoaringBitmap, + aggr_exprs: Vec>, +) -> DFResult> { + let dataset = filtered_read.dataset().clone(); + let uncovered_fragments: Vec<_> = dataset + .manifest() + .fragments + .iter() + .filter(|f| uncovered.contains(f.id as u32)) + .cloned() + .collect(); + let mut scan_options = options.clone(); + scan_options.fragments = Some(Arc::new(uncovered_fragments)); + let scan = FilteredReadExec::try_new(dataset, scan_options, None)?; + let scan: Arc = Arc::new(scan); + let scan_schema = scan.schema(); + // Per-aggregate `FILTER (WHERE ...)` placeholders — see the matching + // comment in `try_rewrite`; we've already rejected any aggregate that + // carried a filter, so every slot is `None`. 
+ let filters: Vec>> = + (0..aggr_exprs.len()).map(|_| None).collect(); + let partial = AggregateExec::try_new( + AggregateMode::Partial, + PhysicalGroupBy::default(), + aggr_exprs, + filters, + scan, + scan_schema, + )?; + Ok(Arc::new(partial)) +} + +/// Walk through row-preserving wrappers (`RepartitionExec`, +/// `CoalesceBatchesExec`, and identity-or-empty `ProjectionExec`) that +/// DataFusion's planner inserts between an `AggregateExec` and the leaf, and +/// return the underlying `FilteredReadExec` if one is reached. +/// +/// "Row-preserving" here means the wrapper changes neither the number of rows +/// nor the predicate applied to them — it may reshape partitions, batches, or +/// drop unused columns, but the row population at the bottom is what reaches +/// the aggregate. That's all the rule needs from these layers, so it's safe to +/// look past them. +fn strip_row_preserving_wrappers(plan: &Arc) -> Option<&FilteredReadExec> { + let mut current: &dyn ExecutionPlan = plan.as_ref(); + loop { + if let Some(filtered_read) = current.as_any().downcast_ref::() { + return Some(filtered_read); + } + let next: &Arc = + if let Some(inner) = current.as_any().downcast_ref::() { + inner.input() + } else if let Some(inner) = { + #[allow(deprecated)] + current.as_any().downcast_ref::() + } { + inner.input() + } else if let Some(inner) = current.as_any().downcast_ref::() { + inner.input() + } else if let Some(proj) = current.as_any().downcast_ref::() { + // Only walk through projections that are row-preserving: every + // output expression is a direct column reference back to the + // input. (Empty projections trivially qualify — DataFusion uses + // one when a `COUNT(*)`'s argument no longer needs any actual + // columns.) 
+ let input_schema = proj.input().schema(); + let identity = proj.expr().iter().all(|projection_expr| { + projection_expr + .expr + .as_any() + .downcast_ref::() + .is_some_and(|c| c.name() == input_schema.field(c.index()).name()) + }); + if !identity { + return None; + } + proj.input() + } else { + return None; + }; + current = next.as_ref(); + } +} + +/// Walk a `ScalarIndexExpr` and intersect the per-leaf `fragment_bitmap`. +/// +/// Returns `None` if any leaf is missing a bitmap (coverage unknown). All +/// three combinators (`And`, `Or`, `Not`) reduce to "every leaf must cover the +/// fragment for us to give a definitive answer about it" — i.e. intersection. +fn collect_coverage(expr: &ScalarIndexExpr) -> Option { + match expr { + ScalarIndexExpr::Not(inner) => collect_coverage(inner), + ScalarIndexExpr::And(lhs, rhs) | ScalarIndexExpr::Or(lhs, rhs) => { + let l = collect_coverage(lhs)?; + let r = collect_coverage(rhs)?; + Some(l & r) + } + ScalarIndexExpr::Query(search) => search.fragment_bitmap.clone(), + } +} + +/// Returns `true` if `af` is `COUNT()` with no DISTINCT. +fn is_count_star(af: &Arc) -> bool { + if af.fun().name() != "count" { + return false; + } + if af.is_distinct() { + return false; + } + let args = af.expressions(); + if args.len() != 1 { + return false; + } + let Some(lit) = args[0].as_any().downcast_ref::() else { + return false; + }; + // `COUNT(NULL)` would always return 0; rule it out so we don't accidentally + // produce a wrong answer if the planner ever lets it through. 
+ !lit.value().is_null() +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::datatypes::{Int64Type, UInt64Type}; + use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion}; + use datafusion::physical_plan::{ExecutionPlan, displayable}; + use futures::TryStreamExt; + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::gen_batch; + use lance_index::IndexType; + use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; + + use super::*; + use crate::Dataset; + use crate::dataset::scanner::AggregateExpr; + use crate::index::DatasetIndexExt; + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; + + struct Fixture { + dataset: Arc, + _tmp: TempStrDir, + } + + /// 4 fragments × 10 rows, ascending `ordered` column with a BTree index. + async fn make_fixture() -> Fixture { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Fixture { + dataset: Arc::new(dataset), + _tmp: tmp, + } + } + + fn plan_contains_pushdown(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found + } + + fn plan_contains_union(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found + } + + async fn run_count( + scanner: &mut crate::dataset::scanner::Scanner, + ) -> (Arc, i64) { + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = 
scanner.create_plan().await.unwrap(); + let stream = datafusion::physical_plan::execute_stream( + plan.clone(), + Arc::new(datafusion::execution::TaskContext::default()), + ) + .unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + batches.len(), + 1, + "count plan emitted {} batches", + batches.len() + ); + let count = batches[0] + .column(0) + .as_any() + .downcast_ref::>() + .expect("count column should be Int64") + .value(0); + (plan, count) + } + + #[tokio::test] + async fn rule_fires_on_unfiltered_count_star() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 40); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in plan: {}", + displayable(plan.as_ref()).indent(true) + ); + assert!( + !plan_contains_union(&plan), + "no union expected for unfiltered count, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_fires_when_filter_fully_indexed() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + scanner.filter("ordered < 25").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 25); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in plan: {}", + displayable(plan.as_ref()).indent(true) + ); + assert!( + !plan_contains_union(&plan), + "no union expected when index covers every fragment, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_emits_split_plan_for_partial_index_coverage() { + // Build index over 4 fragments, then append a 5th — the index now + // covers a strict subset of the dataset. The rule must split into a + // pushdown branch over the indexed fragments and a scan branch over + // the rest, then sum the partials. 
+ use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + let extra = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(10), + lance_datagen::BatchCount::from(1), + ); + let dataset = Dataset::write( + extra, + tmp.as_str(), + Some(WriteParams { + mode: crate::dataset::WriteMode::Append, + max_rows_per_file: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("ordered < 100").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + // 5 fragments × 10 rows, all match `< 100`. + assert_eq!(count, 50); + assert!( + plan_contains_pushdown(&plan), + "expected pushdown branch in split plan: {}", + displayable(plan.as_ref()).indent(true) + ); + assert!( + plan_contains_union(&plan), + "expected UnionExec for partial-coverage split, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_fires_with_stable_row_ids() { + // Unfiltered count, stable row ids, with a deletion. 
+ use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset_with_params( + tmp.as_str(), + FragmentCount::from(2), + FragmentRowCount::from(10), + Some(WriteParams { + max_rows_per_file: 10, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset.delete("ordered = 0").await.unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 19); + assert!( + plan_contains_pushdown(&plan), + "rule should fire under stable row IDs, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_fires_with_stable_row_ids_and_filter() { + // Indexed filter, stable row ids, deletions spread across fragments -- + // the case the pre-fix code got wrong (dropped rows in fragments > 0). + use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset_with_params( + tmp.as_str(), + FragmentCount::from(3), + FragmentRowCount::from(10), + Some(WriteParams { + max_rows_per_file: 10, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + // Delete one row from fragment 1 and one from fragment 2. + dataset + .delete("ordered = 15 OR ordered = 25") + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + // Matches every row across all three fragments; with the two deletions + // the live count is 28. 
+ scanner.filter("ordered >= 0").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 28); + assert!( + plan_contains_pushdown(&plan), + "rule should fire under stable row IDs with a filter, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_when_filter_needs_refine() { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .col("unindexed", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("unindexed > 5").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 34); + assert!( + !plan_contains_pushdown(&plan), + "rule should not fire with non-indexed filter, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_when_index_is_inexact() { + // Zonemap-style indices return AtMost (over-approximation) and set + // ScalarIndexSearch.needs_recheck = true. CountFromMaskExec + // ignores the discriminant on the prefilter batch, so firing the + // rule against an inexact index would silently overcount. The rule + // must refuse — and the scan path with its recheck still answers + // correctly. 
+ let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::ZoneMap, + None, + &ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap), + true, + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("ordered < 25").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 25); + assert!( + !plan_contains_pushdown(&plan), + "rule must not fire when the index produces inexact (needs_recheck) results, \ + got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_count_with_group_by() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + scanner + .aggregate( + AggregateExpr::builder() + .group_by("ordered") + .count_star() + .build(), + ) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "rule should not fire for GROUP BY: {}", + displayable(plan.as_ref()).indent(true) + ); + } +} diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs index c491e3f194b..b50cba1f9ce 100644 --- a/rust/lance/src/io/exec/filtered_read.rs +++ b/rust/lance/src/io/exec/filtered_read.rs @@ -997,6 +997,10 @@ impl FilteredReadStream { base_batch_stream.boxed() }; + // Clone so the finally handler can record a final snapshot even when + // no output batches were produced (inspect_ok never fires in that case). 
+ let global_metrics_final = global_metrics.clone(); + let scan_scheduler_final = scan_scheduler.clone(); let batch_stream = batch_stream .inspect_ok(move |batch| { partition_metrics_clone @@ -1005,6 +1009,9 @@ impl FilteredReadStream { global_metrics.io_metrics.record(&scan_scheduler); }) .finally(move || { + global_metrics_final + .io_metrics + .record(&scan_scheduler_final); partition_metrics.baseline_metrics.done(); }) .map_err(|e: lance_core::Error| DataFusionError::External(e.into())) @@ -1732,7 +1739,13 @@ impl FilteredReadExec { // Second, multiple partitions all share the same underlying task stream (see get_stream) let running_stream_lock = self.running_stream.clone(); let dataset = self.dataset.clone(); - let options = self.options.clone(); + let target_partitions = context.session_config().target_partitions(); + let mut options = self.options.clone(); + if let FilteredReadThreadingMode::OnePartitionMultipleThreads(n) = options.threading_mode { + options.threading_mode = FilteredReadThreadingMode::OnePartitionMultipleThreads( + n.min(target_partitions).max(1), + ); + } let batch_size_bytes = options .file_reader_options .as_ref() @@ -3716,6 +3729,55 @@ mod tests { assert!(iops > 0, "Should have recorded IO operations"); } + // Reproduces a bug where bytes_read (and iops/requests) stay at 0 when a filter matches + // no rows. io_metrics.record is only called inside inspect_ok on the output batch stream, + // so when the filter produces zero output batches, the I/O that did occur is never counted. 
+ #[tokio::test] + async fn test_io_metrics_recorded_when_filter_matches_no_rows() { + let fixture = TestFixture::new().await; + // not_indexed values in the fixture go up to ~400; this filter matches nothing + let filter_plan = fixture.filter_plan("not_indexed > 10000", false).await; + let options = + FilteredReadOptions::basic_full_read(&fixture.dataset).with_filter_plan(filter_plan); + let filtered_read = + Arc::new(FilteredReadExec::try_new(fixture.dataset.clone(), options, None).unwrap()); + + let batches = filtered_read + .execute(0, Arc::new(TaskContext::default())) + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + batches.iter().map(|b| b.num_rows()).sum::(), + 0, + "filter should match no rows" + ); + + let metrics = filtered_read.metrics().unwrap(); + + let rows_scanned = metrics + .sum_by_name("rows_scanned") + .map(|v| v.as_usize()) + .unwrap_or(0); + assert!( + rows_scanned > 0, + "rows_scanned ({}) should be > 0: data was read even though filter matched nothing", + rows_scanned + ); + + let bytes_read = metrics + .sum_by_name("bytes_read") + .map(|v| v.as_usize()) + .unwrap_or(0); + assert!( + bytes_read > 0, + "bytes_read ({}) should be > 0: io_metrics.record is only called when output batches \ + are produced, so bytes_read stays 0 even though I/O occurred", + bytes_read + ); + } + /// Test that direct execution gives the same result as get_plan + execute_with_plan #[test_log::test(tokio::test)] async fn test_plan_round_trip() { @@ -3798,4 +3860,37 @@ mod tests { assert_eq!(result1.column(i).as_ref(), result3.column(i).as_ref()); } } + + /// Verify that executing with target_partitions=1 produces the same results as the default + /// context and does not panic. This is a regression guard for the parallelism cap. 
+ #[test_log::test(tokio::test)] + async fn test_target_partitions_cap_produces_correct_results() { + use datafusion::prelude::SessionConfig; + + let fixture = TestFixture::new().await; + + let options = FilteredReadOptions::basic_full_read(&fixture.dataset); + let plan = + FilteredReadExec::try_new(fixture.dataset.clone(), options.clone(), None).unwrap(); + + // Execute with default context (high thread count) + let default_ctx = Arc::new(TaskContext::default()); + let stream = plan.execute(0, default_ctx).unwrap(); + let schema = stream.schema(); + let batches = stream.try_collect::>().await.unwrap(); + let default_result = concat_batches(&schema, &batches).unwrap(); + + // Execute fresh plan with target_partitions=1 + let plan2 = FilteredReadExec::try_new(fixture.dataset.clone(), options, None).unwrap(); + let low_ctx = Arc::new( + TaskContext::default() + .with_session_config(SessionConfig::default().with_target_partitions(1)), + ); + let stream2 = plan2.execute(0, low_ctx).unwrap(); + let schema2 = stream2.schema(); + let batches2 = stream2.try_collect::>().await.unwrap(); + let capped_result = concat_batches(&schema2, &batches2).unwrap(); + + assert_eq!(default_result.num_rows(), capped_result.num_rows()); + } } diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index b72bf0a2205..add864c0ea9 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -34,10 +34,13 @@ use lance_datafusion::utils::{ExecutionPlanMetricsSetExt, MetricsExt, PARTITIONS use lance_table::format::IndexMetadata; use super::PreFilterSource; -use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter}; +use super::utils::{IndexMetrics, build_prefilter}; use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::{Dataset, index::DatasetIndexInternalExt}; -use lance_index::metrics::MetricsCollector; +use lance_index::metrics::{ + AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC, 
AND_CANDIDATES_SEEN_METRIC, AND_FULL_SCORES_METRIC, + FREQS_COLLECTED_METRIC, MetricsCollector, +}; use lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -61,15 +64,16 @@ async fn open_fts_segment( segment: &IndexMetadata, metrics: &IndexMetrics, ) -> Result> { - let uuid = segment.uuid.to_string(); - let index = dataset.open_generic_index(column, &uuid, metrics).await?; + let index = dataset + .open_scalar_index(column, &segment.uuid, metrics) + .await?; let inverted = index .as_any() .downcast_ref::() .ok_or_else(|| { Error::invalid_input(format!( "Index for column {} and segment {} is not an inverted index", - column, uuid + column, segment.uuid )) })?; Ok(Arc::new(inverted.clone())) @@ -158,6 +162,10 @@ fn default_text_tokenizer() -> Box { pub struct FtsIndexMetrics { index_metrics: IndexMetrics, partitions_searched: Count, + and_candidates_seen: Count, + and_candidates_pruned_before_return: Count, + and_full_scores: Count, + freqs_collected: Count, baseline_metrics: BaselineMetrics, } @@ -166,6 +174,11 @@ impl FtsIndexMetrics { Self { index_metrics: IndexMetrics::new(metrics, partition), partitions_searched: metrics.new_count(PARTITIONS_SEARCHED_METRIC, partition), + and_candidates_seen: metrics.new_count(AND_CANDIDATES_SEEN_METRIC, partition), + and_candidates_pruned_before_return: metrics + .new_count(AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC, partition), + and_full_scores: metrics.new_count(AND_FULL_SCORES_METRIC, partition), + freqs_collected: metrics.new_count(FREQS_COLLECTED_METRIC, partition), baseline_metrics: BaselineMetrics::new(metrics, partition), } } @@ -187,6 +200,22 @@ impl MetricsCollector for FtsIndexMetrics { fn record_comparisons(&self, num_comparisons: usize) { self.index_metrics.record_comparisons(num_comparisons); } + + fn record_and_candidates_seen(&self, num_candidates: usize) 
{ + self.and_candidates_seen.add(num_candidates); + } + + fn record_and_candidates_pruned_before_return(&self, num_candidates: usize) { + self.and_candidates_pruned_before_return.add(num_candidates); + } + + fn record_and_full_scores(&self, num_scores: usize) { + self.and_full_scores.add(num_scores); + } + + fn record_freqs_collected(&self, num_collections: usize) { + self.freqs_collected.add(num_collections); + } } #[derive(Debug)] @@ -699,11 +728,6 @@ impl FlatMatchFilterExec { metrics_set: ExecutionPlanMetricsSet, ) -> DataFusionResult { let metrics = Arc::new(FtsIndexMetrics::new(&metrics_set, partition)); - let elapsed_compute = metrics.baseline_metrics.elapsed_compute().clone(); - // Time the one-shot setup (tokenizer load + query tokenization) so it's - // attributed to this node's elapsed_compute. The helper itself only - // times per-batch work. - let setup_start = std::time::Instant::now(); let column = query .column .clone() @@ -724,44 +748,45 @@ impl FlatMatchFilterExec { None => Self::load_tokenizer(&dataset, &column, &metrics.index_metrics).await?, }; let query_tokens = Arc::new(collect_query_tokens(&query.terms, &mut tokenizer)); - elapsed_compute.add_duration(setup_start.elapsed()); - let helper = InstrumentedChildInputStream::new( - input, - schema, - move |batch| { - // Clone per-batch so the work runs *inside* the async block - // (i.e., during the helper's timed in_flight poll, not during - // its untimed input-pulling phase). 
- let column = column.clone(); - let query_tokens = query_tokens.clone(); - let mut tokenizer = tokenizer.box_clone(); - async move { - let text_column = batch.column_by_name(&column).ok_or_else(|| { - DataFusionError::Execution(format!("Column {} not found in batch", column,)) - })?; - let predicate = match text_column.data_type() { - DataType::Utf8 => { - Self::find_matches::(text_column, &mut tokenizer, &query_tokens) - } - DataType::LargeUtf8 => { - Self::find_matches::(text_column, &mut tokenizer, &query_tokens) - } - _ => { - return Err(DataFusionError::Execution(format!( - "Column {} is not a string", - column, - ))); - } - }; - Ok(arrow::compute::filter_record_batch(&batch, &predicate)?) - } - }, - 1, - partition, - &metrics_set, - ); - Ok(Box::pin(helper)) + let baseline = BaselineMetrics::new(&metrics_set, partition); + let elapsed_compute = baseline.elapsed_compute().clone(); + let stream = input.then(move |batch_result| { + let column = column.clone(); + let query_tokens = query_tokens.clone(); + let mut tokenizer = tokenizer.box_clone(); + let elapsed_compute = elapsed_compute.clone(); + async move { + let batch = batch_result?; + let _t = elapsed_compute.timer(); + let text_column = batch.column_by_name(&column).ok_or_else(|| { + DataFusionError::Execution(format!("Column {} not found in batch", column,)) + })?; + let predicate = match text_column.data_type() { + DataType::Utf8 => { + Self::find_matches::(text_column, &mut tokenizer, &query_tokens) + } + DataType::LargeUtf8 => { + Self::find_matches::(text_column, &mut tokenizer, &query_tokens) + } + _ => { + return Err(DataFusionError::Execution(format!( + "Column {} is not a string", + column, + ))); + } + }; + Ok(arrow::compute::filter_record_batch(&batch, &predicate)?) 
+ } + }); + let stream = stream.map(move |batch| { + let poll = baseline.record_poll(std::task::Poll::Ready(Some(batch))); + match poll { + std::task::Poll::Ready(Some(b)) => b, + _ => unreachable!("record_poll preserves Ready(Some) input"), + } + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) } } @@ -974,6 +999,14 @@ impl ExecutionPlan for FlatMatchQueryExec { vec![&self.unindexed_input] } + fn required_input_distribution(&self) -> Vec { + // `execute()` only reads `unindexed_input.execute(partition)` for the single + // output partition, so the input must be coalesced to one partition. Without + // this, EnforceDistribution may round-robin the scan across `target_partitions` + // and only partition 0 is consumed, silently dropping the other fragments. + vec![Distribution::SinglePartition] + } + fn with_new_children( self: Arc, mut children: Vec>, @@ -1014,11 +1047,8 @@ impl ExecutionPlan for FlatMatchQueryExec { // so it can attribute the spawn_cpu tokenize work and synchronous // scoring back onto this node's `elapsed_compute`. Sharing the same // `Time` handle that's already inside the FtsIndexMetrics avoids - // registering a duplicate metric. Cloned once for use during setup - // timing (below) and again moved into the async block for the - // streaming-phase call. + // registering a duplicate metric. let elapsed_compute = metrics.baseline_metrics.elapsed_compute().clone(); - let elapsed_compute_for_stream = elapsed_compute.clone(); let column = query.column.ok_or(DataFusionError::Execution(format!( "column not set for MatchQuery {}", @@ -1028,9 +1058,6 @@ impl ExecutionPlan for FlatMatchQueryExec { document_input(self.unindexed_input.execute(partition, context)?, &column)?; let stream = stream::once(async move { - // Time the one-shot setup (load segments / open indices / build - // scorer / acquire tokenizer) and attribute it to elapsed_compute. 
- let setup_start = std::time::Instant::now(); let segments = match preset_segments { Some(segments) => Some(segments), None => load_segments(&ds, &column).await?, @@ -1067,7 +1094,6 @@ impl ExecutionPlan for FlatMatchQueryExec { preset_base_scorer.map(|s| (*s).clone()), ), }; - elapsed_compute.add_duration(setup_start.elapsed()); flat_bm25_search_stream_with_metrics( unindexed_input, @@ -1076,7 +1102,7 @@ impl ExecutionPlan for FlatMatchQueryExec { tokenizer, base_scorer, target_batch_size, - Some(elapsed_compute_for_stream), + Some(elapsed_compute), ) .await }) @@ -2199,7 +2225,7 @@ mod tests { .unwrap() .unwrap(); let index = dataset - .open_generic_index("text", &index_meta.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index("text", &index_meta.uuid, &NoOpMetricsCollector) .await .unwrap(); let inverted_index = index.as_any().downcast_ref::().unwrap(); @@ -2262,7 +2288,7 @@ mod tests { .unwrap() .unwrap(); let index = dataset - .open_generic_index("text", &index_meta.uuid.to_string(), &NoOpMetricsCollector) + .open_generic_index("text", &index_meta.uuid, &NoOpMetricsCollector) .await .unwrap(); let inverted_index = index.as_any().downcast_ref::().unwrap(); diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 71239b4e34b..01125ac1617 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -17,7 +17,6 @@ use arrow_array::{ cast::AsArray, }; use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use arrow_select::concat::concat_batches; use datafusion::physical_plan::PlanProperties; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ @@ -58,6 +57,7 @@ use lance_linalg::distance::DistanceType; use lance_linalg::kernels::normalize_arrow; use lance_table::format::IndexMetadata; use tokio::sync::Notify; +use uuid::Uuid; use crate::dataset::Dataset; use crate::index::DatasetIndexInternalExt; @@ -67,8 +67,8 @@ use crate::{Error, Result}; use lance_arrow::*; 
use super::utils::{ - FilteredRowIdsToPrefilter, IndexMetrics, InstrumentedChildInputStream, - InstrumentedRecordBatchStreamAdapter, PreFilterSource, SelectionVectorToPrefilter, + FilteredRowIdsToPrefilter, IndexMetrics, InstrumentedRecordBatchStreamAdapter, PreFilterSource, + SelectionVectorToPrefilter, }; pub const QUERY_INDEX_COL: &str = "query_index"; @@ -156,6 +156,7 @@ pub struct KNNVectorDistanceExec { pub upper_bound: Option, pub column: String, pub distance_type: DistanceType, + retain_vector: bool, input_schema: SchemaRef, output_schema: SchemaRef, @@ -171,10 +172,11 @@ pub struct KnnBatchParams { pub lower_bound: Option, pub upper_bound: Option, pub distance_type: DistanceType, + pub retain_vector: bool, } struct BatchKnnConfig { - input_schema: SchemaRef, + stored_schema: SchemaRef, output_schema: SchemaRef, column: String, query: ArrayRef, @@ -183,6 +185,7 @@ struct BatchKnnConfig { lower_bound: Option, upper_bound: Option, distance_type: DistanceType, + retain_vector: bool, } impl DisplayAs for KNNVectorDistanceExec { @@ -215,6 +218,120 @@ impl DisplayAs for KNNVectorDistanceExec { } impl KNNVectorDistanceExec { + fn remove_field_path_from_fields( + fields: &[Arc], + path: &[String], + ) -> DataFusionResult>> { + if path.is_empty() { + return Ok(fields.to_vec()); + } + let mut removed = false; + let mut new_fields = Vec::with_capacity(fields.len()); + for field in fields { + if field.name() != &path[0] { + new_fields.push(field.clone()); + continue; + } + removed = true; + if path.len() == 1 { + continue; + } + match field.data_type() { + DataType::Struct(children) => { + let child_fields = children.iter().cloned().collect::>(); + let projected_children = + Self::remove_field_path_from_fields(&child_fields, &path[1..])?; + if projected_children.is_empty() { + continue; + } + let updated = Field::new( + field.name(), + DataType::Struct(projected_children.into()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()); + 
new_fields.push(Arc::new(updated)); + } + _ => { + return Err(DataFusionError::Internal(format!( + "batch KNN cannot remove nested path '{}': '{}' is not a struct", + path.join("."), + field.name() + ))); + } + } + } + if !removed { + return Err(DataFusionError::Internal(format!( + "batch KNN expected vector column '{}' in scan batch schema", + path.join(".") + ))); + } + Ok(new_fields) + } + + fn remove_vector_from_schema(schema: &Schema, column: &str) -> DataFusionResult { + let path = lance_core::datatypes::parse_field_path(column).map_err(|err| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {err}" + )) + })?; + let fields = schema.fields().iter().cloned().collect::>(); + let updated_fields = Self::remove_field_path_from_fields(&fields, &path)?; + Ok(Schema::new_with_metadata( + updated_fields, + schema.metadata().clone(), + )) + } + + fn remove_vector_from_batch( + batch: &RecordBatch, + column: &str, + ) -> DataFusionResult { + let slim_schema = Self::remove_vector_from_schema(batch.schema().as_ref(), column)?; + batch + .project_by_schema(&slim_schema) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + fn resolve_vector_column(batch: &RecordBatch, column: &str) -> DataFusionResult { + if let Some(col) = batch.column_by_name(column) { + return Ok(col.clone()); + } + let parts = lance_core::datatypes::parse_field_path(column).map_err(|e| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {e}" + )) + })?; + if parts.is_empty() { + return Err(DataFusionError::Internal(format!( + "batch KNN has invalid empty vector column path '{column}'" + ))); + } + let mut current = batch.column_by_name(&parts[0]).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected vector column '{column}' in scan batch (missing root field '{}')", + parts[0] + )) + })?; + for part in &parts[1..] 
{ + let struct_array = current + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected struct while resolving '{column}', but parent of '{part}' was not a struct" + )) + })?; + current = struct_array.column_by_name(part).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected vector column '{column}' in scan batch (missing nested field '{part}')" + )) + })?; + } + Ok(current) + } + /// Create a new [`KNNVectorDistanceExec`] node. /// /// Returns an error if the preconditions are not met. @@ -235,6 +352,7 @@ impl KNNVectorDistanceExec { lower_bound: None, upper_bound: None, distance_type, + retain_vector: false, }, ) } @@ -252,6 +370,7 @@ impl KNNVectorDistanceExec { lower_bound, upper_bound, distance_type, + retain_vector, } = params; if query_count == 0 { return Err(Error::invalid_input( @@ -286,13 +405,19 @@ impl KNNVectorDistanceExec { "batch KNN cannot run when the input already contains reserved column '{QUERY_INDEX_COL}'" ))); } - let input_schema = Arc::new(input_schema); + + let stored_schema = if is_batch && !retain_vector { + Arc::new(Self::remove_vector_from_schema(&input_schema, column)?) + } else { + Arc::new(input_schema) + }; + let output_schema = if is_batch { - input_schema + stored_schema .as_ref() .try_with_column_at(0, query_index_field())? 
} else { - input_schema.as_ref().clone() + stored_schema.as_ref().clone() }; let output_schema = Arc::new(output_schema.try_with_column(Field::new( DIST_COL, @@ -329,19 +454,230 @@ impl KNNVectorDistanceExec { upper_bound, column: column.to_string(), distance_type, - input_schema, + retain_vector, + input_schema: stored_schema, output_schema, properties, metrics: ExecutionPlanMetricsSet::new(), }) } + fn take_vector_row(vectors: &dyn Array, row_index: u32) -> DataFusionResult { + let indices = UInt32Array::from_iter([Some(row_index)]); + arrow_select::take::take(vectors, &indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + fn take_slim_batch_field( + results: &[BatchKnnCandidate], + field_name: &str, + ) -> DataFusionResult { + Self::take_slim_batch_field_if_present(results, field_name)?.ok_or_else(|| { + DataFusionError::Internal(format!("column '{field_name}' missing from slim batch")) + }) + } + + fn take_slim_batch_field_if_present( + results: &[BatchKnnCandidate], + field_name: &str, + ) -> DataFusionResult> { + use std::collections::HashMap; + + type SlimBatchGroup = (Arc, Vec<(usize, u32)>); + let mut groups: HashMap<*const RecordBatch, SlimBatchGroup> = HashMap::new(); + for (result_index, candidate) in results.iter().enumerate() { + let BatchKnnExtra::WithSlimBatch { + slim_batch, + row_index, + .. 
+ } = &candidate.extra + else { + return Err(DataFusionError::Internal( + "batch KNN expected slim batch in candidate heap".to_string(), + )); + }; + groups + .entry(Arc::as_ptr(slim_batch)) + .or_insert_with(|| (Arc::clone(slim_batch), Vec::new())) + .1 + .push((result_index, *row_index)); + } + + let mut ordered: Vec> = vec![None; results.len()]; + for (_, (slim_batch, entries)) in groups { + let indices = + UInt32Array::from_iter(entries.iter().map(|(_, row_index)| Some(*row_index))); + let taken = arrow_select::take::take_record_batch(slim_batch.as_ref(), &indices) + .map_err(|e| { + DataFusionError::ArrowError(Box::new(e), Some("take top-k rows".to_string())) + })?; + let Some(column) = taken.column_by_name(field_name) else { + continue; + }; + for (offset, (result_index, _)) in entries.iter().enumerate() { + ordered[*result_index] = Some(column.slice(offset, 1)); + } + } + if ordered.iter().all(Option::is_none) { + return Ok(None); + } + if ordered.iter().any(Option::is_none) { + return Err(DataFusionError::Internal(format!( + "column '{field_name}' inconsistently present in slim batches" + ))); + } + + let row_arrays: Vec<&dyn Array> = ordered + .iter() + .map(|array| { + array + .as_ref() + .expect("every result mapped from slim batch") + .as_ref() + }) + .collect(); + arrow::compute::concat(&row_arrays) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + .map(Some) + } + + fn build_struct_column_for_path( + field: &Field, + path: &[String], + leaf_column: ArrayRef, + slim_column: Option<&dyn Array>, + ) -> DataFusionResult { + if path.is_empty() { + return Ok(leaf_column); + } + let DataType::Struct(children) = field.data_type() else { + return Err(DataFusionError::Internal(format!( + "batch KNN expected struct field '{}' while rebuilding nested vector path '{}'", + field.name(), + path.join(".") + ))); + }; + let slim_struct = slim_column + .map(|column| { + column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + 
DataFusionError::Internal(format!( + "batch KNN expected slim column '{}' to be a struct while rebuilding nested vector path '{}'", + field.name(), + path.join(".") + )) + }) + }) + .transpose()?; + let mut columns = Vec::with_capacity(children.len()); + for child in children.iter() { + if child.name() == &path[0] { + if path.len() == 1 { + columns.push(leaf_column.clone()); + } else { + let child_slim_column = slim_struct + .and_then(|struct_array| struct_array.column_by_name(child.name())); + columns.push(Self::build_struct_column_for_path( + child, + &path[1..], + leaf_column.clone(), + child_slim_column.map(|column| column.as_ref()), + )?); + } + } else if let Some(column) = + slim_struct.and_then(|struct_array| struct_array.column_by_name(child.name())) + { + columns.push(column.clone()); + } else { + columns.push(arrow_array::new_null_array( + child.data_type(), + leaf_column.len(), + )); + } + } + let struct_array = arrow_array::StructArray::try_new( + children.clone(), + columns, + slim_struct.and_then(|struct_array| struct_array.nulls().cloned()), + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + Ok(Arc::new(struct_array)) + } + + fn take_retained_vector_column( + results: &[BatchKnnCandidate], + field: &Field, + field_path: &[String], + ) -> DataFusionResult { + let vector_rows: Vec<&dyn Array> = results + .iter() + .map(|candidate| { + let BatchKnnExtra::WithSlimBatch { + vector_row: Some(vector_row), + .. 
+ } = &candidate.extra + else { + return Err(DataFusionError::Internal( + "batch KNN expected vector rows in candidate heap".to_string(), + )); + }; + Ok(vector_row.as_ref()) + }) + .collect::>>()?; + let leaf_column = arrow::compute::concat(&vector_rows) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + if field_path.len() <= 1 { + Ok(leaf_column) + } else { + let slim_column = Self::take_slim_batch_field_if_present(results, field.name())?; + Self::build_struct_column_for_path( + field, + &field_path[1..], + leaf_column, + slim_column.as_deref(), + ) + } + } + + fn assemble_batch_output( + results: &[BatchKnnCandidate], + stored_schema: &Schema, + column: &str, + retain_vector: bool, + ) -> DataFusionResult { + let field_path = lance_core::datatypes::parse_field_path(column).map_err(|e| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {e}" + )) + })?; + let mut columns: Vec = Vec::with_capacity(stored_schema.fields().len()); + for field in stored_schema.fields() { + if field.name() == ROW_ID { + let row_ids = + UInt64Array::from_iter(results.iter().map(|candidate| Some(candidate.row_id))); + columns.push(Arc::new(row_ids)); + } else if retain_vector && !field_path.is_empty() && field.name() == &field_path[0] { + columns.push(Self::take_retained_vector_column( + results, + field, + &field_path, + )?); + } else { + columns.push(Self::take_slim_batch_field(results, field.name())?); + } + } + RecordBatch::try_new(Arc::new(stored_schema.clone()), columns) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + async fn execute_batch( input: SendableRecordBatchStream, config: BatchKnnConfig, ) -> DataFusionResult { let BatchKnnConfig { - input_schema, + stored_schema, output_schema, column, query, @@ -350,8 +686,10 @@ impl KNNVectorDistanceExec { lower_bound, upper_bound, distance_type, + retain_vector, } = config; let query_dim = query.len() / query_count; + let needs_slim_batch = 
stored_schema.fields().iter().any(|f| f.name() != ROW_ID); let mut heaps = (0..query_count) .map(|_| BinaryHeap::::with_capacity(k)) .collect::>(); @@ -373,6 +711,13 @@ impl KNNVectorDistanceExec { .as_primitive::() .clone(); + let mut slim_batch: Option> = None; + let vectors = if retain_vector { + Some(Self::resolve_vector_column(&batch, &column)?) + } else { + None + }; + for (query_index, heap) in heaps.iter_mut().enumerate().take(query_count) { let key = query.slice(query_index * query_dim, query_dim); let with_distances = compute_distance(key, distance_type, &column, batch.clone()) @@ -397,20 +742,42 @@ impl KNNVectorDistanceExec { } let query_index = query_index as i32; let row_id = row_ids.value(row_index); - let row_index = row_index as u32; + if !would_enter_heap(heap, k, distance, row_id, query_index) { + continue; + } + + let extra = if retain_vector || needs_slim_batch { + let row_index = row_index as u32; + if slim_batch.is_none() { + let slim = Self::remove_vector_from_batch(&batch, &column)?; + slim_batch = Some(Arc::new(slim)); + } + let slim_batch = slim_batch.as_ref().expect("slim batch"); + let vector_row = if retain_vector { + Some(Self::take_vector_row( + vectors.as_ref().expect("vectors"), + row_index, + )?) 
+ } else { + None + }; + BatchKnnExtra::WithSlimBatch { + slim_batch: Arc::clone(slim_batch), + row_index, + vector_row, + } + } else { + BatchKnnExtra::RowIdOnly + }; let candidate = BatchKnnCandidate { query_index, distance, row_id, - batch: batch.clone(), - row_index, + extra, }; if heap.len() < k { heap.push(candidate); - } else if heap - .peek() - .is_some_and(|worst| candidate.cmp(worst).is_lt()) - { + } else { heap.pop(); heap.push(candidate); } @@ -435,20 +802,14 @@ impl KNNVectorDistanceExec { let mut query_indices = Int32Builder::with_capacity(results.len()); let mut distances = Float32Builder::with_capacity(results.len()); - let mut row_batches = Vec::with_capacity(results.len()); - for result in results { + for result in &results { query_indices.append_value(result.query_index); distances.append_value(result.distance); - let indices = UInt32Array::from(vec![result.row_index]); - row_batches.push( - arrow_select::take::take_record_batch(&result.batch, &indices).map_err(|e| { - DataFusionError::ArrowError(Box::new(e), Some("take top-k row".to_string())) - })?, - ); } - let output = concat_batches(&input_schema, &row_batches) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let output = + Self::assemble_batch_output(&results, stored_schema.as_ref(), &column, retain_vector)?; + output .try_with_column_at(0, query_index_field(), Arc::new(query_indices.finish())) .and_then(|batch| { @@ -500,6 +861,7 @@ impl ExecutionPlan for KNNVectorDistanceExec { lower_bound: self.lower_bound, upper_bound: self.upper_bound, distance_type: self.distance_type, + retain_vector: self.retain_vector, }, )?)) } @@ -514,7 +876,7 @@ impl ExecutionPlan for KNNVectorDistanceExec { let stream = stream::once(Self::execute_batch( input_stream, BatchKnnConfig { - input_schema: self.input_schema.clone(), + stored_schema: self.input_schema.clone(), output_schema: self.output_schema.clone(), column: self.column.clone(), query: self.query.clone(), @@ -523,6 +885,7 @@ impl 
ExecutionPlan for KNNVectorDistanceExec { lower_bound: self.lower_bound, upper_bound: self.upper_bound, distance_type: self.distance_type, + retain_vector: self.retain_vector, }, )); let schema = self.schema(); @@ -533,43 +896,35 @@ impl ExecutionPlan for KNNVectorDistanceExec { &self.metrics, )) as SendableRecordBatchStream); } - let input_schema = self.input.schema(); let key = self.query.clone(); let column = self.column.clone(); let dt = self.distance_type; let schema = self.schema(); // Empty batches don't have a vector column to score; filter them out - // before reaching the helper so the transform always sees real work. - let filtered_input = Box::pin(RecordBatchStreamAdapter::new( - input_schema, - input_stream.try_filter(|batch| future::ready(batch.num_rows() > 0)), - )) as SendableRecordBatchStream; - - // Mirror of the helper's elapsed_compute counter; used to attribute - // wall-clock from the spawn_blocking distance kernel back onto the - // node's `elapsed_compute` metric. - let elapsed_compute = BaselineMetrics::new(&self.metrics, partition) - .elapsed_compute() - .clone(); - - let stream = InstrumentedChildInputStream::new( - filtered_input, - schema, - move |batch| { + // before the transform so it always sees real work. + let filtered_input = input_stream.try_filter(|batch| future::ready(batch.num_rows() > 0)); + + let baseline = BaselineMetrics::new(&self.metrics, partition); + let elapsed_compute = baseline.elapsed_compute().clone(); + + let stream = filtered_input + .map(move |batch_result| { let key = key.clone(); let column = column.clone(); let elapsed_compute = elapsed_compute.clone(); async move { + let batch = batch_result?; // Time around the .await to capture the spawn_blocking // distance work, which otherwise runs while this future is - // Pending and is missed by the helper's own poll timer. + // Pending and is missed by a poll-time timer. 
let start = Instant::now(); let batch = compute_distance(key, dt, &column, batch) .await .map_err(|e| DataFusionError::External(Box::new(e)))?; elapsed_compute.add_duration(start.elapsed()); + let _t = elapsed_compute.timer(); let distances = batch[DIST_COL].as_primitive::(); let distance_values = distances.values(); let mask = BooleanArray::from_iter((0..distances.len()).map(|row_index| { @@ -578,45 +933,56 @@ impl ExecutionPlan for KNNVectorDistanceExec { arrow::compute::filter_record_batch(&batch, &mask) .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) } - }, - get_num_compute_intensive_cpus(), - partition, - &self.metrics, - ); - Ok(Box::pin(stream) as SendableRecordBatchStream) + }) + .buffer_unordered(get_num_compute_intensive_cpus()); + + let stream = stream.map(move |batch| { + let poll = baseline.record_poll(std::task::Poll::Ready(Some(batch))); + match poll { + std::task::Poll::Ready(Some(b)) => b, + _ => unreachable!("record_poll preserves Ready(Some) input"), + } + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) } fn partition_statistics(&self, partition: Option) -> DataFusionResult { let inner_stats = self.input.partition_statistics(partition)?; - let schema = self.input.schema(); - let dist_stats = inner_stats + let input_schema = self.input.schema(); + let input_stats_by_name = inner_stats .column_statistics .iter() - .zip(schema.fields()) - .find(|(_, field)| field.name() == &self.column) - .map(|(stats, _)| ColumnStatistics { + .zip(input_schema.fields()) + .map(|(stats, field)| (field.name().as_str(), stats.clone())) + .collect::>(); + let vector_root = lance_core::datatypes::parse_field_path(&self.column) + .ok() + .and_then(|parts| parts.first().cloned()) + .unwrap_or_else(|| self.column.clone()); + let dist_stats = input_stats_by_name + .get(vector_root.as_str()) + .map(|stats| ColumnStatistics { null_count: stats.null_count, ..Default::default() }) .unwrap_or_default(); - let column_statistics = inner_stats - 
.column_statistics - .into_iter() - .zip(schema.fields()) - .filter(|(_, field)| field.name() != DIST_COL) - .map(|(stats, _)| stats) + let column_statistics = self + .output_schema + .fields() + .iter() + .map(|field| { + if field.name() == QUERY_INDEX_COL { + ColumnStatistics::default() + } else if field.name() == DIST_COL { + dist_stats.clone() + } else { + input_stats_by_name + .get(field.name().as_str()) + .cloned() + .unwrap_or_default() + } + }) .collect::>(); - let column_statistics = if self.is_batch { - std::iter::once(ColumnStatistics::default()) - .chain(column_statistics) - .chain(std::iter::once(dist_stats)) - .collect::>() - } else { - column_statistics - .into_iter() - .chain(std::iter::once(dist_stats)) - .collect::>() - }; Ok(Statistics { num_rows: inner_stats.num_rows, column_statistics, @@ -637,11 +1003,9 @@ impl ExecutionPlan for KNNVectorDistanceExec { } fn required_input_distribution(&self) -> Vec { - if self.is_batch { - vec![Distribution::SinglePartition] - } else { - vec![Distribution::UnspecifiedDistribution] - } + // Both batch and non-batch modes execute a single input partition at a time, + // so all input must be coalesced to one partition before distance computation. 
+ vec![Distribution::SinglePartition] } } @@ -650,8 +1014,37 @@ struct BatchKnnCandidate { query_index: i32, distance: f32, row_id: u64, - batch: RecordBatch, - row_index: u32, + extra: BatchKnnExtra, +} + +#[derive(Clone)] +enum BatchKnnExtra { + RowIdOnly, + WithSlimBatch { + slim_batch: Arc, + row_index: u32, + vector_row: Option, + }, +} + +fn would_enter_heap( + heap: &BinaryHeap, + k: usize, + distance: f32, + row_id: u64, + query_index: i32, +) -> bool { + if heap.len() < k { + return true; + } + let worst = heap.peek().expect("heap non-empty when len >= k"); + let probe = BatchKnnCandidate { + query_index, + distance, + row_id, + extra: BatchKnnExtra::RowIdOnly, + }; + probe.cmp(worst).is_lt() } impl PartialEq for BatchKnnCandidate { @@ -659,7 +1052,6 @@ impl PartialEq for BatchKnnCandidate { self.query_index == other.query_index && self.distance == other.distance && self.row_id == other.row_id - && self.row_index == other.row_index } } @@ -677,7 +1069,6 @@ impl Ord for BatchKnnCandidate { .total_cmp(&other.distance) .then_with(|| self.row_id.cmp(&other.row_id)) .then_with(|| self.query_index.cmp(&other.query_index)) - .then_with(|| self.row_index.cmp(&other.row_index)) } } @@ -719,7 +1110,7 @@ pub fn new_knn_exec( ) -> Result> { let ivf_node = ANNIvfPartitionExec::try_new( dataset.clone(), - indices.iter().map(|idx| idx.uuid.to_string()).collect_vec(), + indices.iter().map(|idx| idx.uuid).collect_vec(), query.clone(), )?; @@ -765,7 +1156,7 @@ pub struct ANNIvfPartitionExec { pub query: Query, /// The UUIDs of the indices to search. 
- pub index_uuids: Vec, + pub index_uuids: Vec, pub properties: Arc, @@ -773,7 +1164,7 @@ pub struct ANNIvfPartitionExec { } impl ANNIvfPartitionExec { - pub fn try_new(dataset: Arc, index_uuids: Vec, query: Query) -> Result { + pub fn try_new(dataset: Arc, index_uuids: Vec, query: Query) -> Result { let dataset_schema = dataset.schema(); get_vector_type(dataset_schema, &query.column)?; if index_uuids.is_empty() { @@ -875,10 +1266,11 @@ impl ExecutionPlan for ANNIvfPartitionExec { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> DataFusionResult { let timer = Instant::now(); + let target_partitions = context.session_config().target_partitions(); let query = self.query.clone(); let ds = self.dataset.clone(); let metrics = Arc::new(AnnPartitionMetrics::new(&self.metrics, partition)); @@ -914,7 +1306,7 @@ impl ExecutionPlan for ANNIvfPartitionExec { dist_q_c_list_builder.append_value(dist_q_c.iter()); let dist_q_c_col = dist_q_c_list_builder.finish(); - let uuid_col = StringArray::from(vec![uuid.as_str()]); + let uuid_col = StringArray::from(vec![uuid.to_string()]); let batch = RecordBatch::try_new( KNN_PARTITION_SCHEMA.clone(), vec![ @@ -927,8 +1319,11 @@ impl ExecutionPlan for ANNIvfPartitionExec { Ok::<_, DataFusionError>(batch) } }) - .buffered(self.index_uuids.len()) + .buffered(self.index_uuids.len().min(target_partitions).max(1)) .finally(move || { + // Partition ranking reads centroids from memory, so this is + // typically zero; flushed for symmetry with ANNSubIndex. 
+ metrics_clone.index_metrics.flush_io(); metrics_clone.baseline_metrics.done(); metrics_clone .baseline_metrics @@ -1148,8 +1543,14 @@ impl PartitionSearchControl for LatePartitionSearchControl { } } -fn effective_query_parallelism(query: &Query, index: &dyn VectorIndex) -> usize { - let cpu_pool_size = get_num_compute_intensive_cpus(); +fn effective_query_parallelism( + query: &Query, + index: &dyn VectorIndex, + target_partitions: usize, +) -> usize { + let cpu_pool_size = get_num_compute_intensive_cpus() + .min(target_partitions) + .max(1); effective_query_parallelism_for( query, cpu_pool_size, @@ -1211,6 +1612,7 @@ impl ANNIvfSubIndexExec { .boxed() } + #[allow(clippy::too_many_arguments)] fn late_search( index: Arc, query: Query, @@ -1219,6 +1621,7 @@ impl ANNIvfSubIndexExec { prefilter: Arc, metrics: Arc, state: Arc, + target_partitions: usize, ) -> impl Stream> { let stream = futures::stream::once(async move { let max_nprobes = query @@ -1288,7 +1691,8 @@ impl ANNIvfSubIndexExec { let state_clone = state.clone(); - let query_parallelism = effective_query_parallelism(&query, index.as_ref()); + let query_parallelism = + effective_query_parallelism(&query, index.as_ref(), target_partitions); if query_parallelism <= 1 { return stream::once(async move { let prefilter: Arc = prefilter; @@ -1359,6 +1763,7 @@ impl ANNIvfSubIndexExec { stream.flatten() } + #[allow(clippy::too_many_arguments)] fn initial_search( index: Arc, query: Query, @@ -1367,10 +1772,12 @@ impl ANNIvfSubIndexExec { prefilter: Arc, metrics: Arc, state: Arc, + target_partitions: usize, ) -> impl Stream> { let minimum_nprobes = query.minimum_nprobes.min(partitions.len()); - let query_parallelism = effective_query_parallelism(&query, index.as_ref()); + let query_parallelism = + effective_query_parallelism(&query, index.as_ref(), target_partitions); if query_parallelism <= 1 { metrics.partitions_searched.add(minimum_nprobes); return stream::once(async move { @@ -1502,6 +1909,7 @@ impl ExecutionPlan 
for ANNIvfSubIndexExec { ) -> DataFusionResult { let input_stream = self.input.execute(partition, context.clone())?; let schema = self.schema(); + let target_partitions = context.session_config().target_partitions(); let query = self.query.clone(); let ds = self.dataset.clone(); let column = self.query.column.clone(); @@ -1540,7 +1948,11 @@ impl ExecutionPlan for ANNIvfSubIndexExec { Arc::new(part_id.unwrap().as_primitive::().clone()); let dist_q_c = Arc::new(dist_q_c.unwrap().as_primitive::().clone()); - let uuid = uuid.unwrap().to_string(); + let uuid = Uuid::parse_str(uuid.unwrap()).map_err(|e| { + DataFusionError::Execution(format!( + "Invalid UUID in __index_uuid column: {e}" + )) + })?; Ok((partitions, dist_q_c, uuid)) }) .collect_vec(); @@ -1593,6 +2005,7 @@ impl ExecutionPlan for ANNIvfSubIndexExec { pre_filter.clone(), metrics.clone(), state.clone(), + target_partitions, ); let late_search = Self::late_search( raw_index.clone(), @@ -1602,6 +2015,7 @@ impl ExecutionPlan for ANNIvfSubIndexExec { pre_filter, metrics, state, + target_partitions, ); DataFusionResult::Ok(early_search.chain(late_search).boxed()) } @@ -1611,6 +2025,9 @@ impl ExecutionPlan for ANNIvfSubIndexExec { // will not start until the early search is complete across all deltas. .try_flatten_unordered(None) .finally(move || { + // Publish the exact index-file I/O measured for this query + // (cache misses only) to the iops/requests/bytes_read gauges. 
+ metrics_clone.index_metrics.flush_io(); metrics_clone .baseline_metrics .elapsed_compute() @@ -1876,12 +2293,13 @@ mod tests { use arrow::datatypes::Float32Type; use arrow_array::{ ArrayRef, FixedSizeListArray, Float32Array, Int32Array, RecordBatchIterator, StringArray, + StructArray, }; use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; use async_trait::async_trait; use datafusion::error::Result as DataFusionResult; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tempfile::TempStrDir; use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts}; use lance_datafusion::utils::FIND_PARTITIONS_ELAPSED_METRIC; @@ -1918,6 +2336,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), } } @@ -1943,6 +2362,36 @@ mod tests { assert_eq!(effective_query_parallelism_for(&query, 16, 1), 16); } + #[test] + fn test_effective_query_parallelism_respects_target_partitions() { + // effective_query_parallelism caps cpu_pool_size at target_partitions before + // passing it to effective_query_parallelism_for, so the ceiling is + // min(cpu_pool_size, target_partitions). 
+ let mut query = base_query(); + let cpu_pool_size = 16; + + // use-all-cpus mode: capped at target_partitions + query.query_parallelism = -1; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 1), + 4 + ); + + // auto mode: auto_parallelism also clamped to the reduced cpu_pool_size + query.query_parallelism = 0; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 8), + 4 + ); + + // explicit parallelism > target_partitions: clamped down + query.query_parallelism = 16; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 1), + 4 + ); + } + #[derive(Debug, DeepSizeOf)] struct ThreadCapturingIndex { thread_name: Arc>>, @@ -1967,10 +2416,6 @@ mod tests { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn statistics(&self) -> Result { Ok(serde_json::json!({})) } @@ -2091,10 +2536,6 @@ mod tests { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn statistics(&self) -> Result { Ok(serde_json::json!({})) } @@ -2444,6 +2885,7 @@ mod tests { empty_prefilter().await, prepared_metrics(), state, + usize::MAX, ) .try_collect::>() .await @@ -2492,6 +2934,7 @@ mod tests { empty_prefilter().await, prepared_metrics(), state.clone(), + usize::MAX, ) .try_collect::>() .await @@ -2629,6 +3072,254 @@ mod tests { ); } + #[test] + fn test_batch_partition_statistics_aligns_with_output_schema() { + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ), + ROW_ID_FIELD.clone(), + ])); + let batch = RecordBatch::new_empty(schema); + let input: Arc = Arc::new(TestingExec::new(vec![batch])); + let query = Arc::new(Float32Array::from(vec![0.0, 1.0, 2.0, 3.0])) as ArrayRef; + let plan = KNNVectorDistanceExec::try_new_batch( + input, + "vec", + query, + KnnBatchParams { + is_batch: true, + query_count: 1, 
+ k: 2, + lower_bound: None, + upper_bound: None, + distance_type: DistanceType::L2, + retain_vector: false, + }, + ) + .unwrap(); + let stats = plan.partition_statistics(None).unwrap(); + assert_eq!( + stats.column_statistics.len(), + plan.schema().fields().len(), + "partition stats must align with output schema" + ); + let schema = plan.schema(); + let query_index_pos = schema + .column_with_name(QUERY_INDEX_COL) + .expect("query_index must exist") + .0; + let dist_pos = schema + .column_with_name(DIST_COL) + .expect("distance must exist") + .0; + assert_eq!( + stats.column_statistics[query_index_pos], + ColumnStatistics::default(), + ); + assert_eq!( + stats.column_statistics[dist_pos].null_count, + stats.column_statistics[schema.column_with_name("i").unwrap().0].null_count, + "distance null-count should be derived from vector/input nullability and remain aligned" + ); + } + + #[test] + fn test_remove_vector_from_schema_nested_path() { + let payload_field = ArrowField::new( + "payload", + DataType::Struct( + vec![ + ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ), + ArrowField::new("tag", DataType::Utf8, true), + ] + .into(), + ), + true, + ); + let schema = ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field, + ROW_ID_FIELD.clone(), + ]); + let without_vec = + KNNVectorDistanceExec::remove_vector_from_schema(&schema, "payload.vec").unwrap(); + let payload = without_vec.field_with_name("payload").unwrap(); + let DataType::Struct(children) = payload.data_type() else { + panic!("payload should remain struct"); + }; + assert!(children.iter().all(|f| f.name() != "vec")); + assert!(children.iter().any(|f| f.name() == "tag")); + } + + #[test] + fn test_take_vector_row_copies_single_row() { + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..12).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let row = 
KNNVectorDistanceExec::take_vector_row(&vectors, 2).unwrap(); + assert_eq!(row.len(), 1); + assert_eq!( + row.to_data().offset(), + 0, + "take/copy should not retain row offset into the full input buffer" + ); + } + + #[test] + fn test_resolve_vector_column_supports_escaped_nested_path() { + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..8).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let payload = StructArray::from(vec![(Arc::new(vec_field), Arc::new(vectors) as ArrayRef)]); + let batch = RecordBatch::try_new(schema, vec![Arc::new(payload)]).unwrap(); + let vector = + KNNVectorDistanceExec::resolve_vector_column(&batch, "payload.`vec.with.dot`").unwrap(); + assert_eq!(vector.len(), 2); + } + + #[test] + fn test_remove_vector_from_batch_nested_keeps_siblings() { + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let tag_field = ArrowField::new("tag", DataType::Utf8, true); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone(), tag_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..8).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let tags = StringArray::from(vec!["a", "b"]); + let payload = StructArray::from(vec![ + (Arc::new(vec_field), Arc::new(vectors) as ArrayRef), + (Arc::new(tag_field), Arc::new(tags) as ArrayRef), + ]); + let batch = RecordBatch::try_new(schema, 
vec![Arc::new(payload)]).unwrap(); + + let slim = + KNNVectorDistanceExec::remove_vector_from_batch(&batch, "payload.`vec.with.dot`") + .unwrap(); + let payload = slim.column_by_name("payload").unwrap().as_struct(); + assert!(payload.column_by_name("vec.with.dot").is_none()); + assert!(payload.column_by_name("tag").is_some()); + } + + #[test] + fn test_assemble_batch_output_retained_nested_vector_keeps_sibling_values() { + let vec_field = ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let tag_field = ArrowField::new("tag", DataType::Utf8, true); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone(), tag_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field, ROW_ID_FIELD.clone()])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..12).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let tags = StringArray::from(vec!["a", "b", "c"]); + let payload = StructArray::from(vec![ + (Arc::new(vec_field), Arc::new(vectors) as ArrayRef), + (Arc::new(tag_field), Arc::new(tags) as ArrayRef), + ]); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(payload) as ArrayRef, + Arc::new(UInt64Array::from(vec![10, 11, 12])) as ArrayRef, + ], + ) + .unwrap(); + let slim_batch = Arc::new( + KNNVectorDistanceExec::remove_vector_from_batch(&batch, "payload.vec").unwrap(), + ); + let vectors = KNNVectorDistanceExec::resolve_vector_column(&batch, "payload.vec").unwrap(); + let results = [2, 0] + .into_iter() + .map(|row_index| BatchKnnCandidate { + query_index: 0, + distance: row_index as f32, + row_id: 10 + row_index as u64, + extra: BatchKnnExtra::WithSlimBatch { + slim_batch: Arc::clone(&slim_batch), + row_index, + vector_row: Some( + KNNVectorDistanceExec::take_vector_row(vectors.as_ref(), row_index) + .unwrap(), + ), + }, + }) + .collect::>(); + + let 
output = KNNVectorDistanceExec::assemble_batch_output( + &results, + schema.as_ref(), + "payload.vec", + true, + ) + .unwrap(); + + let payload = output.column_by_name("payload").unwrap().as_struct(); + let tags = payload.column_by_name("tag").unwrap().as_string::(); + assert!(tags.is_valid(0)); + assert!(tags.is_valid(1)); + assert_eq!(tags.value(0), "c"); + assert_eq!(tags.value(1), "a"); + let vectors = payload.column_by_name("vec").unwrap(); + assert_eq!(vectors.len(), 2); + } + #[tokio::test] async fn test_multivector_score() { let query = Query { @@ -2645,6 +3336,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; async fn multivector_scoring( @@ -2869,6 +3561,128 @@ mod tests { assert_find_partitions_elapsed_recorded(&stats); } + /// The ANN operators report the exact index-file I/O performed for a query + /// (bytes_read / iops), measured only on cache misses. A cold search loads + /// partitions from storage and reports non-zero I/O; an immediately + /// following warm search serves every partition from the index cache and + /// reports zero -- which is the cache-effectiveness signal the metric adds. + #[tokio::test] + async fn test_io_metrics_cold_vs_warm() { + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + let run = |holder: &StatsHolder| { + let setter = holder.get_setter(); + async { + fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .scan_stats_callback(setter) + .project(&Vec::::new()) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap() + } + }; + + // Cold: a freshly opened dataset has an empty index cache, so the + // sub-index search must read partitions (and their quantization storage) + // from disk. Those reads flow through the per-query I/O sink. 
+ let cold_holder = StatsHolder::default(); + run(&cold_holder).await; + let cold = cold_holder.consume(); + assert!( + cold.parts_loaded > 0, + "cold search should load partitions, got parts_loaded={}", + cold.parts_loaded + ); + assert!( + cold.bytes_read > 0, + "cold search should report index-file I/O, got bytes_read={}", + cold.bytes_read + ); + assert!( + cold.iops > 0, + "cold search should report index-file IOPS, got iops={}", + cold.iops + ); + + // Warm: the same query on the same dataset finds every partition it + // needs already cached, so no index-file I/O is performed. + let warm_holder = StatsHolder::default(); + run(&warm_holder).await; + let warm = warm_holder.consume(); + assert_eq!( + warm.parts_loaded, 0, + "warm search should not reload partitions, got parts_loaded={}", + warm.parts_loaded + ); + assert_eq!( + warm.bytes_read, 0, + "warm search should report no index-file I/O, got bytes_read={}", + warm.bytes_read + ); + } + + /// The new I/O metrics must actually surface in `EXPLAIN ANALYZE` text on + /// the ANN operators: non-zero on a cold query (partition reads on + /// `ANNSubIndex`, index-open reads on `ANNIvfPartition`) and zero on a warm + /// query (everything served from the index cache). + #[tokio::test] + async fn test_io_metrics_visible_in_explain_analyze() { + // Returns the value of `metric=` from the analyzed-plan line for `node`. + fn node_metric<'a>(plan: &'a str, node: &str, metric: &str) -> &'a str { + let line = plan + .lines() + .find(|l| l.trim_start().starts_with(node)) + .unwrap_or_else(|| panic!("plan missing node {node}:\n{plan}")); + let after = line + .split_once(&format!("{metric}=")) + .unwrap_or_else(|| panic!("node {node} line missing {metric}=:\n{line}")) + .1; + after.split([',', ']']).next().unwrap().trim() + } + + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + // Cold: a freshly opened dataset must show real index-file I/O. 
+ let cold = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + // Sub-index partition reads. + assert_ne!(node_metric(&cold, "ANNSubIndex", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNSubIndex", "iops"), "0"); + // Index-open reads (centroids/metadata) now attributed to the partition + // operator -- the value this part of the change adds. + assert_ne!(node_metric(&cold, "ANNIvfPartition", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNIvfPartition", "iops"), "0"); + + // Warm: same query, everything cache-resident -> zero index-file I/O. + let warm = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + assert_eq!(node_metric(&warm, "ANNSubIndex", "bytes_read"), "0"); + assert_eq!(node_metric(&warm, "ANNIvfPartition", "bytes_read"), "0"); + } + #[rstest] #[tokio::test] async fn test_no_prefilter_results(#[values(1, 20)] num_deltas: usize) { diff --git a/rust/lance/src/io/exec/optimizer.rs b/rust/lance/src/io/exec/optimizer.rs index f031e10ce19..72488f3a14e 100644 --- a/rust/lance/src/io/exec/optimizer.rs +++ b/rust/lance/src/io/exec/optimizer.rs @@ -171,9 +171,16 @@ impl PhysicalOptimizerRule for SimplifyProjection { pub fn get_physical_optimizer() -> PhysicalOptimizer { PhysicalOptimizer::with_rules(vec![ + // Rewrite `COUNT(*)`-style aggregates into CountFromMaskExec so they + // can be answered without scanning column data. Runs before the + // generic rules so they don't see the rewritten subtree. 
+ Arc::new(crate::io::exec::count_pushdown::CountPushdown), Arc::new(crate::io::exec::optimizer::CoalesceTake), Arc::new(crate::io::exec::optimizer::SimplifyProjection), // Push down limit into FilteredReadExec and other Execs via with_fetch() Arc::new(datafusion::physical_optimizer::limit_pushdown::LimitPushdown::new()), + // Insert exchange nodes (RepartitionExec, CoalescePartitionsExec) where needed + // to satisfy distribution requirements as exec nodes migrate to multi-partition output. + Arc::new(datafusion::physical_optimizer::enforce_distribution::EnforceDistribution::new()), ]) } diff --git a/rust/lance/src/io/exec/rowids.rs b/rust/lance/src/io/exec/rowids.rs index cb8280ea874..837d0b81fa3 100644 --- a/rust/lance/src/io/exec/rowids.rs +++ b/rust/lance/src/io/exec/rowids.rs @@ -10,7 +10,7 @@ use datafusion::common::ColumnStatistics; use datafusion::common::stats::Precision; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::SendableRecordBatchStream; -use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_plan::Statistics; @@ -29,8 +29,6 @@ use crate::Dataset; use crate::dataset::rowids::get_row_id_index; use crate::utils::future::SharedPrerequisite; -use super::utils::InstrumentedChildInputStream; - /// Add a `_rowaddr` column to a stream of record batches that have a `_rowid`. 
/// /// It's generally more efficient to scan the `_rowaddr` column, but this can be @@ -191,33 +189,41 @@ impl AddRowAddrExec { let rowid_pos = self.rowid_pos; let rowaddr_pos = self.rowaddr_pos; let output_schema = self.output_schema.clone(); - let stream = InstrumentedChildInputStream::new( - input_stream, + let baseline = BaselineMetrics::new(&self.metrics, partition); + let elapsed_compute = baseline.elapsed_compute().clone(); + let stream = input_stream.then(move |batch_result| { + let output_schema = output_schema.clone(); + let index_prereq = index_prereq.clone(); + let elapsed_compute = elapsed_compute.clone(); + async move { + let batch = batch_result?; + index_prereq.wait_ready().await?; + let row_id_index = index_prereq.get_ready(); + let index_ref = row_id_index.as_deref(); + + let _t = elapsed_compute.timer(); + let row_addr = Self::compute_row_addrs(batch.column(rowid_pos), index_ref)?; + + let mut columns = Vec::with_capacity(batch.num_columns() + 1); + let existing_columns = batch.columns(); + columns.extend_from_slice(&existing_columns[..rowaddr_pos]); + columns.push(row_addr); + columns.extend_from_slice(&existing_columns[rowaddr_pos..]); + + Ok(RecordBatch::try_new(output_schema.clone(), columns)?) 
+ } + }); + let stream = stream.map(move |batch| { + let poll = baseline.record_poll(std::task::Poll::Ready(Some(batch))); + match poll { + std::task::Poll::Ready(Some(b)) => b, + _ => unreachable!("record_poll preserves Ready(Some) input"), + } + }); + Ok(Box::pin(RecordBatchStreamAdapter::new( self.output_schema.clone(), - move |batch| { - let output_schema = output_schema.clone(); - let index_prereq = index_prereq.clone(); - async move { - index_prereq.wait_ready().await?; - let row_id_index = index_prereq.get_ready(); - let index_ref = row_id_index.as_deref(); - - let row_addr = Self::compute_row_addrs(batch.column(rowid_pos), index_ref)?; - - let mut columns = Vec::with_capacity(batch.num_columns() + 1); - let existing_columns = batch.columns(); - columns.extend_from_slice(&existing_columns[..rowaddr_pos]); - columns.push(row_addr); - columns.extend_from_slice(&existing_columns[rowaddr_pos..]); - - Ok(RecordBatch::try_new(output_schema.clone(), columns)?) - } - }, - 1, - partition, - &self.metrics, - ); - Ok(Box::pin(stream)) + stream, + ))) } } diff --git a/rust/lance/src/io/exec/scalar_index.rs b/rust/lance/src/io/exec/scalar_index.rs index ade4995fb4b..ee05ce7a86f 100644 --- a/rust/lance/src/io/exec/scalar_index.rs +++ b/rust/lance/src/io/exec/scalar_index.rs @@ -3,9 +3,7 @@ use std::sync::{Arc, LazyLock}; -use super::utils::{ - IndexMetrics, InstrumentedChildInputStream, InstrumentedRecordBatchStreamAdapter, -}; +use super::utils::{IndexMetrics, InstrumentedRecordBatchStreamAdapter}; use crate::{ Dataset, dataset::rowids::load_row_id_sequences, @@ -22,7 +20,7 @@ use datafusion::{ physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, execution_plan::{Boundedness, EmissionType}, - metrics::{ExecutionPlanMetricsSet, MetricsSet}, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, stream::RecordBatchStreamAdapter, }, scalar::ScalarValue, @@ -116,6 +114,7 @@ impl ScalarIndexExec { &self.dataset } + /// The 
parsed scalar-index expression this node will evaluate. pub fn expr(&self) -> &ScalarIndexExpr { &self.expr } @@ -257,14 +256,40 @@ impl ExecutionPlan for ScalarIndexExec { pub static INDEX_LOOKUP_SCHEMA: LazyLock = LazyLock::new(|| Arc::new(Schema::new(vec![ROW_ID_FIELD.clone()]))); +/// A single scalar-index lookup used by [`MapIndexExec`]. +/// +/// `column` identifies a column whose values will be probed against the +/// index named `index_name`. Multiple lookups are intersected with logical +/// AND semantics inside `MapIndexExec`. +#[derive(Debug, Clone)] +pub struct IndexLookup { + pub column: String, + pub index_name: String, +} + +impl IndexLookup { + pub fn new(column: impl Into, index_name: impl Into) -> Self { + Self { + column: column.into(), + index_name: index_name.into(), + } + } +} + /// An execution node that translates index values into row addresses /// -/// This can be combined with TakeExec to perform an "indexed take" +/// This can be combined with TakeExec to perform an "indexed take". +/// +/// Multiple `(column, index_name)` lookups can be supplied: the operator +/// expects one input column per lookup (in matching order) and emits the +/// row addresses where every column's value is present in its respective +/// index — that is, the AND of the per-column index probes. This lets a +/// composite-key join trim the candidate row set with every available +/// scalar index before the downstream take. 
#[derive(Debug)] pub struct MapIndexExec { dataset: Arc, - column_name: String, - index_name: String, + lookups: Vec, input: Arc, properties: Arc, metrics: ExecutionPlanMetricsSet, @@ -276,19 +301,49 @@ impl DisplayAs for MapIndexExec { DisplayFormatType::Default | DisplayFormatType::Verbose | DisplayFormatType::TreeRender => { - write!(f, "IndexedLookup") + write!(f, "IndexedLookup")?; + if self.lookups.len() > 1 { + let cols = self + .lookups + .iter() + .map(|l| l.column.as_str()) + .collect::>() + .join(", "); + write!(f, " [{cols}]")?; + } + Ok(()) } } } } impl MapIndexExec { + /// Convenience constructor for the common single-column case. pub fn new( dataset: Arc, column_name: String, index_name: String, input: Arc, ) -> Self { + Self::new_multi( + dataset, + vec![IndexLookup::new(column_name, index_name)], + input, + ) + } + + /// Build a `MapIndexExec` that probes one or more scalar indices and + /// emits the AND of their results. `lookups` must be non-empty and + /// `input` must produce one column per lookup, in the same order. + pub fn new_multi( + dataset: Arc, + lookups: Vec, + input: Arc, + ) -> Self { + debug_assert!( + !lookups.is_empty(), + "MapIndexExec requires at least one index lookup" + ); let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(INDEX_LOOKUP_SCHEMA.clone()), Partitioning::RoundRobinBatch(1), @@ -297,8 +352,7 @@ impl MapIndexExec { )); Self { dataset, - column_name, - index_name, + lookups, input, properties, metrics: ExecutionPlanMetricsSet::new(), @@ -309,24 +363,30 @@ impl MapIndexExec { input: datafusion::physical_plan::SendableRecordBatchStream, partition: usize, dataset: Arc, - column_name: String, - index_name: String, + lookups: Vec, index_metrics: Arc, metrics_set: ExecutionPlanMetricsSet, ) -> datafusion::error::Result { - // Time the one-shot setup (fragment bitmap + deletion mask) so it's - // attributed to this node's elapsed_compute. The helper itself only - // times per-batch work. 
- let elapsed_compute = datafusion::physical_plan::metrics::MetricBuilder::new(&metrics_set) - .elapsed_compute(partition); - let setup_start = std::time::Instant::now(); - let fragment_bitmap = scalar_index_fragment_bitmap(&dataset, &column_name, &index_name) - .await? - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "IndexedLookupExec: index '{index_name}' on column '{column_name}' disappeared after planning" - )) - })?; + // A row can be found by the composite probe only if it lives in a + // fragment covered by *every* index in `lookups`; restrict the + // deletion mask to that intersection so we only filter deletes we + // could actually see. + let mut fragment_bitmap: Option = None; + for lookup in &lookups { + let bm = scalar_index_fragment_bitmap(&dataset, &lookup.column, &lookup.index_name) + .await? + .ok_or_else(|| { + datafusion::error::DataFusionError::Internal(format!( + "IndexedLookupExec: index '{}' on column '{}' disappeared after planning", + lookup.index_name, lookup.column, + )) + })?; + fragment_bitmap = Some(match fragment_bitmap { + None => bm, + Some(acc) => acc & bm, + }); + } + let fragment_bitmap = fragment_bitmap.expect("MapIndexExec built with no lookups"); let deletion_mask_fut = DatasetPreFilter::create_restricted_deletion_mask(dataset.clone(), fragment_bitmap); let deletion_mask = if let Some(fut) = deletion_mask_fut { @@ -334,53 +394,73 @@ impl MapIndexExec { } else { None }; - elapsed_compute.add_duration(setup_start.elapsed()); - let helper = InstrumentedChildInputStream::new( - input, + let baseline = BaselineMetrics::new(&metrics_set, partition); + let elapsed_compute = baseline.elapsed_compute().clone(); + let stream = input.then(move |batch_result| { + let lookups = lookups.clone(); + let dataset = dataset.clone(); + let deletion_mask = deletion_mask.clone(); + let metrics = index_metrics.clone(); + let elapsed_compute = elapsed_compute.clone(); + async move { + let batch = batch_result?; + // Timer 
spans `map_batch`'s `.await` on purpose: that await is + // the per-batch sargable index evaluation, which is the work + // we want attributed here. + let _t = elapsed_compute.timer(); + Self::map_batch(lookups, dataset, deletion_mask, batch, metrics).await + } + }); + let stream = stream.map(move |batch| { + let poll = baseline.record_poll(std::task::Poll::Ready(Some(batch))); + match poll { + std::task::Poll::Ready(Some(b)) => b, + _ => unreachable!("record_poll preserves Ready(Some) input"), + } + }); + Ok(Box::pin(RecordBatchStreamAdapter::new( INDEX_LOOKUP_SCHEMA.clone(), - move |batch| { - let column_name = column_name.clone(); - let index_name = index_name.clone(); - let dataset = dataset.clone(); - let deletion_mask = deletion_mask.clone(); - let metrics = index_metrics.clone(); - Self::map_batch( - column_name, - index_name, - dataset, - deletion_mask, - batch, - metrics, - ) - }, - 1, - partition, - &metrics_set, - ); - Ok(Box::pin(helper)) + stream, + ))) + } + + /// Build the AND-of-IsIn `ScalarIndexExpr` describing this batch's + /// composite lookup: each input column contributes one `IsIn` query + /// against its matching index. 
+ fn build_query( + lookups: &[IndexLookup], + batch: &RecordBatch, + ) -> datafusion::error::Result { + let per_column = lookups.iter().enumerate().map(|(idx, lookup)| { + let column = batch.column(idx); + let values = (0..column.len()) + .map(|row| ScalarValue::try_from_array(column, row)) + .collect::>>()?; + Ok::<_, datafusion::error::DataFusionError>(ScalarIndexExpr::Query(ScalarIndexSearch { + column: lookup.column.clone(), + index_name: lookup.index_name.clone(), + // Internal IndexedLookup-style query — type is unknown at this layer + index_type: String::new(), + query: Arc::new(SargableQuery::IsIn(values)), + needs_recheck: false, + fragment_bitmap: None, + })) + }); + + per_column + .reduce(|lhs, rhs| Ok(ScalarIndexExpr::And(Box::new(lhs?), Box::new(rhs?)))) + .expect("MapIndexExec built with no lookups") } async fn map_batch( - column_name: String, - index_name: String, + lookups: Vec, dataset: Arc, deletion_mask: Option>, batch: RecordBatch, metrics: Arc, ) -> datafusion::error::Result { - let index_vals = batch.column(0); - let index_vals = (0..index_vals.len()) - .map(|idx| ScalarValue::try_from_array(index_vals, idx)) - .collect::>>()?; - let query = ScalarIndexExpr::Query(ScalarIndexSearch { - column: column_name, - index_name, - // Internal IndexedLookup-style query — type is unknown at this layer - index_type: String::new(), - query: Arc::new(SargableQuery::IsIn(index_vals)), - needs_recheck: false, - }); + let query = Self::build_query(&lookups, &batch)?; let query_result = query.evaluate(dataset.as_ref(), metrics.as_ref()).await?; if !query_result.is_exact() { todo!("Support for non-exact query results as input for merge_insert") @@ -430,10 +510,9 @@ impl ExecutionPlan for MapIndexExec { "MapIndexExec requires exactly one child".to_string(), )) } else { - Ok(Arc::new(Self::new( + Ok(Arc::new(Self::new_multi( self.dataset.clone(), - self.column_name.clone(), - self.index_name.clone(), + self.lookups.clone(), children.into_iter().next().unwrap(), 
))) } @@ -449,8 +528,7 @@ impl ExecutionPlan for MapIndexExec { input, partition, self.dataset.clone(), - self.column_name.clone(), - self.index_name.clone(), + self.lookups.clone(), Arc::new(IndexMetrics::new(&self.metrics, partition)), self.metrics.clone(), ); @@ -848,6 +926,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); let fragments = dataset.fragments().clone(); @@ -892,6 +971,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); let verify = async |plan: ScalarIndexExec, schema: Arc| { @@ -943,6 +1023,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); // These plans aren't even valid but it appears we defer all work (even validation) until diff --git a/rust/lance/src/io/exec/scan.rs b/rust/lance/src/io/exec/scan.rs index 15d8d181eab..3ec63ce04cc 100644 --- a/rust/lance/src/io/exec/scan.rs +++ b/rust/lance/src/io/exec/scan.rs @@ -342,7 +342,11 @@ impl LanceStream { // TODO: Ideally this will eventually get tied into datafusion as a # of partitions. This will let // us fully fuse decode into the first half of the plan. Currently there is likely to be a thread // transfer between the two steps. 
- .try_buffered(get_num_compute_intensive_cpus()) + .try_buffered( + get_num_compute_intensive_cpus() + .min(config.parallelism_cap.unwrap_or(usize::MAX)) + .max(1), + ) .stream_in_current_span() .boxed(); @@ -371,9 +375,13 @@ impl LanceStream { let fragment_readahead = config .fragment_readahead .unwrap_or(LEGACY_DEFAULT_FRAGMENT_READAHEAD); + let batch_readahead = config + .batch_readahead + .min(config.parallelism_cap.unwrap_or(usize::MAX)) + .max(1); debug!( "Scanning v1 dataset with frag_readahead={} and batch_readahead={}", - fragment_readahead, config.batch_readahead + fragment_readahead, batch_readahead ); let file_fragments = fragments @@ -410,7 +418,7 @@ impl LanceStream { // We must be waiting to finish a file before moving onto thenext. That's an issue. .try_flatten() // We buffer up to `batch_readahead` batches across all streams. - .try_buffered(config.batch_readahead) + .try_buffered(batch_readahead) .stream_in_current_span() .boxed() } else { @@ -443,7 +451,7 @@ impl LanceStream { tasks .try_flatten_unordered(config.fragment_readahead) // We buffer up to `batch_readahead` batches across all streams. - .try_buffer_unordered(config.batch_readahead) + .try_buffer_unordered(batch_readahead) .stream_in_current_span() .boxed() }; @@ -508,6 +516,9 @@ pub struct LanceScanConfig { pub with_make_deletions_null: bool, pub ordered_output: bool, pub file_reader_options: Option, + /// Upper bound on frag_parallelism and CPU decode concurrency. Set from + /// DataFusion's `target_partitions` session config in `LanceScanExec::execute`. 
+ pub parallelism_cap: Option, } // This is mostly for testing purposes, end users are unlikely to create this @@ -526,6 +537,7 @@ impl Default for LanceScanConfig { with_make_deletions_null: false, ordered_output: false, file_reader_options: None, + parallelism_cap: None, } } } @@ -690,13 +702,17 @@ impl ExecutionPlan for LanceScanExec { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> Result { let dataset = self.dataset.clone(); let fragments = self.fragments.clone(); let range = self.range.clone(); let projection = self.projection.clone(); - let config = self.config.clone(); + let target_partitions = context.session_config().target_partitions(); + let config = LanceScanConfig { + parallelism_cap: Some(target_partitions), + ..self.config.clone() + }; let metrics = self.metrics.clone(); let lance_fut_stream = stream::once(async move { @@ -750,6 +766,9 @@ impl ExecutionPlan for LanceScanExec { #[cfg(test)] mod tests { use datafusion::execution::TaskContext; + use datafusion::prelude::SessionConfig; + use futures::TryStreamExt; + use lance_datagen::gen_batch; use crate::utils::test::NoContextTestFixture; @@ -772,4 +791,47 @@ mod tests { scan.execute(0, Arc::new(TaskContext::default())).unwrap(); } + + /// Verify that executing with target_partitions=1 produces the same row count as the + /// default context. Regression guard for the parallelism cap. 
+ #[tokio::test] + async fn test_target_partitions_cap_produces_correct_results() { + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::{Dimension, array}; + + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; + + let tmp = TempStrDir::default(); + let dataset = gen_batch() + .col("x", array::step::()) + .col( + "v", + array::rand_vec::(Dimension::from(4)), + ) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(100), + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let scan = LanceScanExec::new( + dataset.clone(), + dataset.fragments().clone(), + None, + Arc::new(dataset.schema().clone()), + LanceScanConfig::default(), + ); + + let low_ctx = Arc::new( + TaskContext::default() + .with_session_config(SessionConfig::default().with_target_partitions(1)), + ); + let stream = scan.execute(0, low_ctx).unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 400); + } } diff --git a/rust/lance/src/io/exec/take.rs b/rust/lance/src/io/exec/take.rs index 977a9c88dce..c3642cdb043 100644 --- a/rust/lance/src/io/exec/take.rs +++ b/rust/lance/src/io/exec/take.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; +use std::task::Poll; use arrow::array::AsArray; use arrow::compute::{TakeOptions, concat_batches}; @@ -27,6 +28,7 @@ use lance_arrow::RecordBatchExt; use lance_core::datatypes::{Field, OnMissing, Projection}; use lance_core::error::{DataFusionResult, LanceOptionExt}; use lance_core::utils::address::RowAddress; +use lance_core::utils::futures::FinallyStreamExt; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::{ROW_ADDR, ROW_ID}; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; @@ -353,10 +355,6 @@ impl TakeStream { (None, None) => {} } - self.metrics - .baseline_metrics - 
.record_output(new_data.num_rows()); - self.metrics.batches_processed.add(1); Ok(batch.merge_with_schema(&new_data, self.output_schema.as_ref())?) } @@ -364,8 +362,10 @@ impl TakeStream { self: Arc, input: S, ) -> impl Stream> { - let scan_scheduler = self.scan_scheduler.clone(); - let metrics = self.metrics.clone(); + let result_scan_scheduler = self.scan_scheduler.clone(); + let final_scan_scheduler = self.scan_scheduler.clone(); + let result_metrics = self.metrics.clone(); + let final_metrics = self.metrics.clone(); let batches = input .enumerate() .map(move |(batch_index, batch)| { @@ -378,8 +378,24 @@ impl TakeStream { }) .boxed(); batches - .inspect_ok(move |_| metrics.io_metrics.record(&scan_scheduler)) .try_buffered(get_num_compute_intensive_cpus()) + .map(move |result| { + if result.is_ok() { + result_metrics.batches_processed.add(1); + } + result_metrics.io_metrics.record(&result_scan_scheduler); + match result_metrics + .baseline_metrics + .record_poll(Poll::Ready(Some(result))) + { + Poll::Ready(Some(result)) => result, + _ => unreachable!("record_poll returned a different poll state"), + } + }) + .finally(move || { + final_metrics.baseline_metrics.done(); + final_metrics.io_metrics.record(&final_scan_scheduler); + }) } } @@ -839,6 +855,80 @@ mod tests { } } + #[tokio::test(flavor = "current_thread")] + async fn test_take_records_output_and_io_metrics() { + use datafusion::physical_plan::metrics::MetricValue; + use lance_datafusion::utils::{BYTES_READ_METRIC, IOPS_METRIC, REQUESTS_METRIC}; + let TestFixture { + dataset, + _tmp_dir_guard, + } = test_fixture().await; + + let row_addrs = UInt64Array::from(vec![0_u64, 1, 2, 3, 4]); + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + ROW_ADDR, + DataType::UInt64, + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(row_addrs)]).unwrap(); + let stream = futures::stream::iter(vec![Ok(batch)]); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + let 
input = Arc::new(OneShotExec::new(stream)); + + let projection = dataset + .empty_projection() + .union_column("s", OnMissing::Error) + .unwrap(); + + let take_exec = TakeExec::try_new(dataset, input, projection) + .unwrap() + .unwrap(); + + let stream = take_exec + .execute(0, Arc::new(TaskContext::default())) + .unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 5); + + let metrics = take_exec.metrics().unwrap(); + + let output_batches: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBatches(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let output_bytes: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBytes(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let gauge = |name: &str| -> usize { + metrics + .iter_gauges() + .find_map(|(metric_name, gauge)| { + (metric_name.as_ref() == name).then(|| gauge.value()) + }) + .unwrap_or(0) + }; + + let bytes_read = gauge(BYTES_READ_METRIC); + let iops = gauge(IOPS_METRIC); + let requests = gauge(REQUESTS_METRIC); + + assert_eq!(metrics.output_rows(), Some(5)); + assert_eq!(metrics.find_count("batches_processed").unwrap().value(), 1); + assert!( + output_batches > 0 && output_bytes > 0 && bytes_read > 0 && iops > 0 && requests > 0, + "expected positive TakeExec metrics, got output_batches={output_batches}, output_bytes={output_bytes}, bytes_read={bytes_read}, iops={iops}, requests={requests}" + ); + } + #[tokio::test] async fn test_take_order() { let TestFixture { diff --git a/rust/lance/src/io/exec/utils.rs b/rust/lance/src/io/exec/utils.rs index 5def0fb254d..6e2d50d3736 100644 --- a/rust/lance/src/io/exec/utils.rs +++ b/rust/lance/src/io/exec/utils.rs @@ -6,9 +6,10 @@ use lance_datafusion::utils::{ IOPS_METRIC, PARTS_LOADED_METRIC, REQUESTS_METRIC, }; use lance_index::metrics::MetricsCollector; -use lance_io::scheduler::ScanScheduler; +use 
lance_io::scheduler::{IoStats, ScanScheduler, ScanStats}; use lance_table::format::IndexMetadata; use pin_project::pin_project; +use std::future::Future; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -30,7 +31,6 @@ use lance_core::utils::futures::{Capacity, SharedStreamExt}; use lance_core::{ROW_ID, Result}; use lance_index::prefilter::FilterLoader; use lance_select::{RowAddrMask, RowAddrTreeMap, result::IndexExprResult}; -use std::future::Future; use crate::Dataset; use crate::index::prefilter::DatasetPreFilter; @@ -300,7 +300,7 @@ where /// applies a per-batch async transform. /// /// `elapsed_compute` measures only the time spent driving the transform -/// futures — never the time spent polling the child input — so wrapping a +/// futures -- never the time spent polling the child input -- so wrapping a /// chain of nodes does not double-count child CPU. `output_rows` and /// `output_batches` are recorded as the transform produces batches. /// @@ -363,7 +363,7 @@ where let this = self.get_mut(); // Fill in-flight transforms up to `concurrency` from the input. - // Polling the input does NOT count toward `elapsed_compute`. + // Polling the input does not count toward `elapsed_compute`. while !this.input_done && this.in_flight.len() < this.concurrency { match this.input.poll_next_unpin(cx) { Poll::Ready(Some(Ok(batch))) => { @@ -379,7 +379,7 @@ where } } - // Drive in-flight transforms; their poll time IS counted. + // Drive in-flight transforms; their poll time is counted. if !this.in_flight.is_empty() { let timer = this.baseline_metrics.elapsed_compute().timer(); let poll = this.in_flight.poll_next_unpin(cx); @@ -391,13 +391,9 @@ where } return this.baseline_metrics.record_poll(Poll::Ready(Some(result))); } - // Unreachable: `FuturesUnordered::poll_next` returns - // `Ready(None)` only when empty, and we just checked - // `!is_empty` above. A panic here is preferable to a - // silent infinite loop if the invariant ever breaks. 
- Poll::Ready(None) => { - unreachable!("FuturesUnordered yielded None while non-empty") - } + // `FuturesUnordered::poll_next` returns `Ready(None)` only + // when empty, and we just checked `!is_empty` above. + Poll::Ready(None) => unreachable!("non-empty transform queue yielded None"), Poll::Pending => return Poll::Pending, } } @@ -506,12 +502,17 @@ impl IoMetrics { } pub fn record(&self, scan_scheduler: &ScanScheduler) { - let current_stats = scan_scheduler.stats(); + self.record_stats(scan_scheduler.stats()); + } - // Use set_max to ensure gauge always shows the highest value seen - self.iops.set_max(current_stats.iops as usize); - self.requests.set_max(current_stats.requests as usize); - self.bytes_read.set_max(current_stats.bytes_read as usize); + /// Record a snapshot of cumulative I/O statistics. + /// + /// Uses `set_max` because the underlying counters are cumulative; the gauge + /// always reflects the highest (i.e. final) value seen. + pub fn record_stats(&self, stats: ScanStats) { + self.iops.set_max(stats.iops as usize); + self.requests.set_max(stats.requests as usize); + self.bytes_read.set_max(stats.bytes_read as usize); } } @@ -520,6 +521,12 @@ pub struct IndexMetrics { indices_loaded: Count, parts_loaded: Count, index_comparisons: Count, + /// Per-query sink that accumulates exact index-file I/O as partitions are + /// loaded from storage. Shared by all clones of this `IndexMetrics`, so + /// concurrent partition loads all funnel into the same counters. Published + /// to `io_metrics` for display via [`IndexMetrics::flush_io`]. 
+ io_stats: IoStats, + io_metrics: IoMetrics, } impl IndexMetrics { @@ -528,8 +535,18 @@ impl IndexMetrics { indices_loaded: metrics.new_count(INDICES_LOADED_METRIC, partition), parts_loaded: metrics.new_count(PARTS_LOADED_METRIC, partition), index_comparisons: metrics.new_count(INDEX_COMPARISONS_METRIC, partition), + io_stats: IoStats::new(), + io_metrics: IoMetrics::new(metrics, partition), } } + + /// Publish the I/O accumulated in the per-query sink to the displayed + /// `iops`/`requests`/`bytes_read` metrics. Call once when the operator's + /// stream finishes; the sink only accumulates on cache misses, so a fully + /// cache-resident query publishes zeros. + pub fn flush_io(&self) { + self.io_metrics.record_stats(self.io_stats.snapshot()); + } } impl MetricsCollector for IndexMetrics { @@ -542,6 +559,9 @@ impl MetricsCollector for IndexMetrics { fn record_comparisons(&self, num_comparisons: usize) { self.index_comparisons.add(num_comparisons); } + fn io_stats(&self) -> Option { + Some(self.io_stats.clone()) + } } #[cfg(test)] @@ -579,11 +599,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); let n_batches: usize = 3; - // child_delay is intentionally several times larger than transform_delay - // so the assertion tolerates significant `std::thread::sleep` overshoot - // on busy CI runners (we've seen ~2-3x overshoot on macOS Actions). let child_delay = Duration::from_millis(150); - let transform_delay = Duration::from_millis(30); let counter = Arc::new(AtomicUsize::new(0)); let s = schema.clone(); @@ -607,10 +623,7 @@ mod tests { let stream = InstrumentedChildInputStream::new( child, schema, - move |batch| async move { - std::thread::sleep(transform_delay); - Ok(batch) - }, + move |batch| async move { Ok(batch) }, 1, 0, &metrics, @@ -625,16 +638,10 @@ mod tests { .expect("elapsed_compute should be recorded"); let elapsed = Duration::from_nanos(elapsed_ns as u64); - // Expect ~ transform_delay * n. 
The upper bound is set generously to - // absorb sleep overshoot on slow CI (~4-5x per call) while still - // cleanly rejecting any version that double-counts child poll time, - // which would yield ~ (transform_delay + child_delay) * n. - let upper = Duration::from_millis(400); - assert!( - elapsed >= transform_delay * (n_batches as u32 - 1), - "elapsed_compute={:?} too low; transform time was not measured", - elapsed, - ); + // The transform is immediate, so `elapsed_compute` should stay well + // below even one child poll delay. A version that double-counts child + // input time would include roughly `child_delay * n_batches`. + let upper = child_delay; assert!( elapsed < upper, "elapsed_compute={:?} >= {:?}; child input time was double-counted", diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs index 284e10a9b6f..729cf2ffbe7 100644 --- a/rust/lance/src/lib.rs +++ b/rust/lance/src/lib.rs @@ -90,7 +90,7 @@ pub mod pb { include!(concat!(env!("OUT_DIR"), "/lance.pb.rs")); } -pub use blob::{BlobArrayBuilder, blob_field}; +pub use blob::{BlobArrayBuilder, BlobFieldOptions, blob_field, blob_field_with_options}; pub use dataset::Dataset; use lance_index::vector::DIST_COL; diff --git a/rust/lance/src/session.rs b/rust/lance/src/session.rs index b032cbaa15e..8d5e9717570 100644 --- a/rust/lance/src/session.rs +++ b/rust/lance/src/session.rs @@ -4,8 +4,8 @@ use std::collections::HashMap; use std::sync::Arc; -use deepsize::DeepSizeOf; -use lance_core::cache::{CacheBackend, LanceCache}; +use lance_core::cache::{CacheBackend, CacheKeyIterator, LanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_index::IndexType; use lance_io::object_store::ObjectStoreRegistry; @@ -56,7 +56,7 @@ pub struct Session { } impl DeepSizeOf for Session { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut size = 0; // Measure 
the actual cache contents through the wrapper types size += self.index_cache.deep_size_of_children(context); @@ -209,6 +209,44 @@ impl Session { pub async fn index_cache_stats(&self) -> lance_core::cache::CacheStats { self.index_cache.0.stats().await } + + /// Return an iterator over keys currently held by the index cache. + /// + /// Returns `None` when the index cache backend does not support key + /// inventory. + /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.index_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn index_cache_keys(&self) -> Option> { + self.index_cache.0.keys().await + } + + /// Return an iterator over keys currently held by the metadata cache. + /// + /// Returns `None` when the metadata cache backend does not support key + /// inventory. + /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.metadata_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn metadata_cache_keys(&self) -> Option> { + self.metadata_cache.0.keys().await + } } impl Default for Session { @@ -224,10 +262,23 @@ impl Default for Session { #[cfg(test)] mod tests { use super::*; - use lance_core::cache::UnsizedCacheKey; + use lance_core::cache::{CacheKey, UnsizedCacheKey}; use lance_index::vector::VectorIndex; use std::borrow::Cow; + struct TestKey(&'static str); + impl CacheKey for TestKey { + type ValueType = Vec; + + fn key(&self) -> Cow<'_, str> { + Cow::Borrowed(self.0) + } + + fn type_name() -> &'static str { + "TestVec" + } + } + struct TestUnsizedKey(&'static str); impl UnsizedCacheKey for TestUnsizedKey { type ValueType = dyn VectorIndex; @@ -251,4 +302,41 @@ mod tests { .is_none() ); } + + #[tokio::test] + async fn test_session_cache_keys() { + let session = 
Session::new(10_000, 10_000, Default::default()); + + session + .index_cache + .insert_with_key(&TestKey("index-key"), Arc::new(vec![1])) + .await; + session + .metadata_cache + .0 + .insert_with_key(&TestKey("metadata-key"), Arc::new(vec![2])) + .await; + + let index_keys = session + .index_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(index_keys.len(), 1); + assert_eq!(index_keys[0].prefix(), ""); + assert_eq!(index_keys[0].key(), "index-key"); + assert_eq!(index_keys[0].type_name(), "TestVec"); + + let metadata_keys = session + .metadata_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(metadata_keys.len(), 1); + assert_eq!(metadata_keys[0].prefix(), ""); + assert_eq!(metadata_keys[0].key(), "metadata-key"); + assert_eq!(metadata_keys[0].type_name(), "TestVec"); + + assert_ne!(index_keys, metadata_keys); + } } diff --git a/rust/lance/src/session/caches.rs b/rust/lance/src/session/caches.rs index 82dc755f6c0..a2dda6069ab 100644 --- a/rust/lance/src/session/caches.rs +++ b/rust/lance/src/session/caches.rs @@ -12,7 +12,7 @@ use std::{borrow::Cow, ops::Deref}; -use deepsize::{Context, DeepSizeOf}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{ cache::{CacheKey, LanceCache}, utils::deletion::DeletionVector, diff --git a/rust/lance/src/session/index_caches.rs b/rust/lance/src/session/index_caches.rs index 3ae777880aa..e93b7208b09 100644 --- a/rust/lance/src/session/index_caches.rs +++ b/rust/lance/src/session/index_caches.rs @@ -12,8 +12,8 @@ use std::{borrow::Cow, ops::Deref, sync::Arc}; -use deepsize::{Context, DeepSizeOf}; use lance_core::cache::{CacheKey, LanceCache}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_index::frag_reuse::FragReuseIndex; use lance_table::format::IndexMetadata; use uuid::Uuid; @@ -63,14 +63,14 @@ impl Deref for DSIndexCache { impl DSIndexCache { /// Create an index-specific cache with the given UUID prefix. 
- pub fn for_index(&self, uuid: &str, fri_uuid: Option<&Uuid>) -> LanceCache { + pub fn for_index(&self, uuid: &Uuid, fri_uuid: Option<&Uuid>) -> LanceCache { if let Some(fri_uuid) = fri_uuid { // If a FRI UUID is provided, use it to create a more specific cache key. let cache_key = format!("{}-{}", uuid, fri_uuid); self.0.with_key_prefix(&cache_key) } else { // Otherwise, just use the index UUID as the key prefix. - self.0.with_key_prefix(uuid) + self.0.with_key_prefix(&uuid.to_string()) } } } @@ -79,7 +79,7 @@ impl DSIndexCache { #[derive(Debug)] pub struct FragReuseIndexKey<'a> { - pub uuid: &'a str, + pub uuid: &'a Uuid, } impl CacheKey for FragReuseIndexKey<'_> { @@ -131,7 +131,7 @@ impl DeepSizeOf for ProstAny { /// what they are. These we cache. #[derive(Debug)] pub struct ScalarIndexDetailsKey<'a> { - pub uuid: &'a str, + pub uuid: &'a Uuid, } impl CacheKey for ScalarIndexDetailsKey<'_> { diff --git a/rust/lance/src/session/index_extension.rs b/rust/lance/src/session/index_extension.rs index f5e7741441f..301213c6f06 100644 --- a/rust/lance/src/session/index_extension.rs +++ b/rust/lance/src/session/index_extension.rs @@ -3,10 +3,11 @@ use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_index::{IndexParams, IndexType, vector::VectorIndex}; +use uuid::Uuid; use crate::Dataset; @@ -35,7 +36,7 @@ pub trait VectorIndexExtension: IndexExtension { // if we wrap into an Arc, the mutable reference is lost dataset: &Dataset, column: &str, - uuid: &str, + uuid: &Uuid, params: &dyn IndexParams, ) -> Result<()>; @@ -44,7 +45,7 @@ pub trait VectorIndexExtension: IndexExtension { &self, dataset: Arc, column: &str, - uuid: &str, + uuid: &Uuid, reader: PreviousFileReader, ) -> Result>; } @@ -69,7 +70,7 @@ mod test { use arrow_array::{Float32Array, RecordBatch, UInt32Array}; use arrow_schema::Schema; use 
datafusion::execution::SendableRecordBatchStream; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; use lance_file::previous::writer::{ FileWriter as PreviousFileWriter, FileWriterOptions as PreviousFileWriterOptions, }; @@ -95,7 +96,7 @@ mod test { struct MockIndex; impl DeepSizeOf for MockIndex { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -110,10 +111,6 @@ mod test { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } @@ -230,7 +227,7 @@ mod test { } impl DeepSizeOf for MockIndexExtension { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { todo!() } } @@ -259,7 +256,7 @@ mod test { &self, dataset: &Dataset, _column: &str, - uuid: &str, + uuid: &Uuid, _params: &dyn IndexParams, ) -> Result<()> { let store = dataset.object_store.clone(); @@ -305,7 +302,7 @@ mod test { &self, _dataset: Arc, _column: &str, - _uuid: &str, + _uuid: &Uuid, _reader: PreviousFileReader, ) -> Result> { self.load_index_called @@ -391,7 +388,7 @@ mod test { let idx = ds_without_extension.load_indices().await.unwrap(); assert_eq!(idx.len(), 1); // get the index uuid - let index_uuid = idx.first().unwrap().uuid.to_string(); + let index_uuid = idx.first().unwrap().uuid; // trying to open the index should fail as there is no extension loader assert!( diff --git a/rust/lance/tests/count_pushdown/mod.rs b/rust/lance/tests/count_pushdown/mod.rs new file mode 100644 index 00000000000..aaa3f5f539e --- /dev/null +++ b/rust/lance/tests/count_pushdown/mod.rs @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! End-to-end integration tests for [`CountPushdown`] when DataFusion +//! 
does the planning (i.e. an aggregate built from SQL through the public +//! [`LanceTableProvider`] surface), as opposed to going through +//! `Scanner::create_plan`. +//! +//! The plan shape DataFusion produces for a SQL aggregate differs from the +//! scanner's: it emits `AggregateExec(Final) → CoalescePartitionsExec → +//! AggregateExec(Partial) → LanceTableScan` rather than a single +//! `AggregateExec(Single)`. This file pins that down so future aggregate- +//! pushdown categories can be added here with the test scaffolding already +//! in place. + +use std::sync::Arc; + +use arrow::datatypes::UInt64Type; +use arrow_array::types::Int64Type; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::{ExecutionPlan, displayable}; +use datafusion::prelude::SessionContext; +use futures::TryStreamExt; +use lance::Dataset; +use lance::datafusion::LanceTableProvider; +use lance::dataset::WriteParams; +use lance::index::DatasetIndexExt; +use lance::io::exec::count_from_mask::CountFromMaskExec; +use lance::io::exec::count_pushdown::CountPushdown; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{BatchCount, RowCount, array, gen_batch}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; + +/// Build a 4-fragment, 10-row-per-fragment dataset with a BTREE index on `x`. 
+async fn make_indexed_dataset() -> (Arc, TempStrDir) { + let tmp = TempStrDir::default(); + let reader = gen_batch() + .col("x", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(4)); + let mut dataset = Dataset::write( + reader, + tmp.as_str(), + Some(WriteParams { + max_rows_per_file: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + (Arc::new(dataset), tmp) +} + +/// Build a `SessionContext` configured with the Lance physical optimizer rule +/// for aggregate pushdown, then register `dataset` under the name `t`. +fn lance_aware_context(dataset: Arc) -> SessionContext { + let state = SessionStateBuilder::new() + .with_default_features() + .with_physical_optimizer_rule(Arc::new(CountPushdown)) + .build(); + let ctx = SessionContext::new_with_state(state); + ctx.register_table( + "t", + Arc::new(LanceTableProvider::new(dataset, false, false)), + ) + .unwrap(); + ctx +} + +fn plan_contains_pushdown(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found +} + +async fn execute_count(plan: Arc) -> i64 { + let stream = datafusion::physical_plan::execute_stream( + plan, + Arc::new(datafusion::execution::TaskContext::default()), + ) + .unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + let total: i64 = batches + .iter() + .map(|b| { + b.column(0) + .as_any() + .downcast_ref::>() + .expect("count column should be Int64") + .iter() + .map(|v| v.unwrap_or(0)) + .sum::() + }) + .sum(); + total +} + +#[tokio::test] +async fn sql_count_star_with_indexed_filter() { + // SELECT COUNT(*) FROM t WHERE x < 25 + // + // The rule should fire on DataFusion's `AggregateExec(Partial)` node at + // the leaf of the aggregate pipeline, 
replacing the column scan with + // `CountFromMaskExec` while the outer `AggregateExec(Final)` keeps + // doing the cross-partition combine. + let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx + .sql("SELECT COUNT(*) FROM t WHERE x < 25") + .await + .unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in SQL plan, got:\n{}", + displayable(plan.as_ref()).indent(true) + ); + assert_eq!(execute_count(plan).await, 25); +} + +#[tokio::test] +async fn sql_unfiltered_count_star_uses_statistics() { + // SELECT COUNT(*) FROM t with no filter is answered by DataFusion + // statically from LanceTableProvider's row-count statistic — never + // reaches an `AggregateExec` for our rule to look at. Pin that + // behaviour: the rule should not fire, and the answer is correct. + let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx.sql("SELECT COUNT(*) FROM t").await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "unfiltered COUNT(*) should be resolved from statistics, got:\n{}", + displayable(plan.as_ref()).indent(true) + ); + assert_eq!(execute_count(plan).await, 40); +} + +#[tokio::test] +async fn sql_count_distinct_does_not_fire_yet() { + // SELECT COUNT(DISTINCT x) FROM t WHERE x < 25 + // + // `is_count_star` rejects distinct, so this rule never fires for + // distinct counts — they belong to the mask-to-answer category and will + // need their own rule (e.g. over a bitmap-index dictionary). This test + // pins the not-firing behaviour and the scaffold for the future test. 
+ let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx + .sql("SELECT COUNT(DISTINCT x) FROM t WHERE x < 25") + .await + .unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "CountFromMaskExec must not fire for COUNT(DISTINCT) yet: \n{}", + displayable(plan.as_ref()).indent(true) + ); + // Correctness via the scan path: values 0..25 are all distinct. + assert_eq!(execute_count(plan).await, 25); +} diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs index 81c2535dd9c..7a6d3e71ca4 100644 --- a/rust/lance/tests/integration_tests.rs +++ b/rust/lance/tests/integration_tests.rs @@ -3,6 +3,7 @@ // NOTE: we only create one integration test binary, to keep compilation overhead down. +mod count_pushdown; #[cfg(feature = "slow_tests")] mod query; #[cfg(feature = "slow_tests")]