From e2ca69c1d1be71d4a209b2db8102a4965a6b1997 Mon Sep 17 00:00:00 2001 From: Roxanne0321 Date: Thu, 14 May 2026 16:32:01 +0800 Subject: [PATCH 01/13] feat(sindi): use sparse vector datacell and remove sparse index Keep legacy SINDI rerank deserialization compatible while moving rerank storage to SparseVectorDataCell. Signed-off-by: Roxanne0321 Assisted-by: GitHub Copilot:GPT-5.4 --- docs/docs/en/src/advanced/introspection.md | 2 +- docs/docs/en/src/advanced/search_allocator.md | 2 +- docs/docs/zh/src/advanced/introspection.md | 2 +- docs/docs/zh/src/advanced/search_allocator.md | 2 +- include/vsag/constants.h | 1 - include/vsag/index.h | 10 +- src/algorithm/inner_index_interface.cpp | 2 +- src/algorithm/inner_index_interface.h | 6 +- src/algorithm/sindi/sindi.cpp | 204 +++++++++-- src/algorithm/sindi/sindi.h | 4 +- src/algorithm/sindi/sindi_test.cpp | 101 ++++-- src/algorithm/sparse_distance.h | 70 ++++ src/algorithm/sparse_index.cpp | 333 ------------------ src/algorithm/sparse_index.h | 130 ------- src/algorithm/sparse_index_parameters.cpp | 36 -- src/algorithm/sparse_index_parameters.h | 38 -- src/constants.cpp | 1 - src/datacell/flatten_interface.h | 10 + src/datacell/sparse_vector_datacell.h | 12 +- src/datacell/sparse_vector_datacell.inl | 54 ++- src/datacell/sparse_vector_datacell_test.cpp | 18 + src/factory/index_creators.cpp | 8 - tests/test_sparse_index.cpp | 94 ----- 23 files changed, 404 insertions(+), 736 deletions(-) create mode 100644 src/algorithm/sparse_distance.h delete mode 100644 src/algorithm/sparse_index.cpp delete mode 100644 src/algorithm/sparse_index.h delete mode 100644 src/algorithm/sparse_index_parameters.cpp delete mode 100644 src/algorithm/sparse_index_parameters.h delete mode 100644 tests/test_sparse_index.cpp diff --git a/docs/docs/en/src/advanced/introspection.md b/docs/docs/en/src/advanced/introspection.md index c23e62c4d1..e55d2a7e99 100644 --- a/docs/docs/en/src/advanced/introspection.md +++ b/docs/docs/en/src/advanced/introspection.md @@ -45,7 +45,7 @@ Two overloads are provided: // Dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW) auto r = index->CalDistanceById(query_ptr, ids, count, /*calculate_precise_distance=*/true); -// Sparse vector indexes (SINDI, SparseIndex) — wrap the query in a Dataset +// Sparse vector indexes (SINDI) — wrap the query in a Dataset auto query_ds = vsag::Dataset::Make(); query_ds->NumElements(1)->SparseVectors(/* ... */); auto r = index->CalDistanceById(query_ds, ids, count, /*calculate_precise_distance=*/true); diff --git a/docs/docs/en/src/advanced/search_allocator.md b/docs/docs/en/src/advanced/search_allocator.md index 5a4b76ebf0..6a542946c0 100644 --- a/docs/docs/en/src/advanced/search_allocator.md +++ b/docs/docs/en/src/advanced/search_allocator.md @@ -43,7 +43,7 @@ index falls back to the allocator that was attached to its owning `Resource`. > **Availability.** `Index::SearchWithRequest` has a default implementation that returns an > *unsupported* error. Only HGraph, IVF, BruteForce and WARP implement it today > (`src/algorithm/{hgraph,ivf,brute_force,warp}.cpp`). For indexes that do not yet override -> `SearchWithRequest` (HNSW, DiskANN, SINDI, Pyramid, SparseIndex), use the legacy `SearchParam` +> `SearchWithRequest` (HNSW, DiskANN, SINDI, Pyramid), use the legacy `SearchParam` > path described below. ## Legacy API — `SearchParam::allocator` *(deprecated)* diff --git a/docs/docs/zh/src/advanced/introspection.md b/docs/docs/zh/src/advanced/introspection.md index f4eb5a6787..3d5cb7a423 100644 --- a/docs/docs/zh/src/advanced/introspection.md +++ b/docs/docs/zh/src/advanced/introspection.md @@ -41,7 +41,7 @@ if (not index->CheckFeature(vsag::SUPPORT_DELETE_BY_ID)) { // 稠密向量索引(HGraph、BruteForce、IVF、DiskANN、HNSW) auto r = index->CalDistanceById(query_ptr, ids, count, /*calculate_precise_distance=*/true); -// 稀疏向量索引(SINDI、SparseIndex)—— 用 Dataset 封装查询 +// 稀疏向量索引(SINDI)—— 用 Dataset 封装查询 auto query_ds = vsag::Dataset::Make(); query_ds->NumElements(1)->SparseVectors(/* ... */); auto r = index->CalDistanceById(query_ds, ids, count, /*calculate_precise_distance=*/true); diff --git a/docs/docs/zh/src/advanced/search_allocator.md b/docs/docs/zh/src/advanced/search_allocator.md index cd5365f169..d0f8ee7ba4 100644 --- a/docs/docs/zh/src/advanced/search_allocator.md +++ b/docs/docs/zh/src/advanced/search_allocator.md @@ -37,7 +37,7 @@ auto result = index->SearchWithRequest(req).value(); > **可用性。** `Index::SearchWithRequest` 默认实现会返回 *不支持* 错误。目前只有 HGraph、 > IVF、BruteForce、WARP 实现了它(`src/algorithm/{hgraph,ivf,brute_force,warp}.cpp`)。对于 -> 尚未 override 的索引(HNSW、DiskANN、SINDI、Pyramid、SparseIndex),请使用下文的旧版 +> 尚未 override 的索引(HNSW、DiskANN、SINDI、Pyramid),请使用下文的旧版 > `SearchParam` 路径。 ## 旧版 API —— `SearchParam::allocator`(已弃用) diff --git a/include/vsag/constants.h b/include/vsag/constants.h index ea5a3c3cf9..7430f29bd9 100644 --- a/include/vsag/constants.h +++ b/include/vsag/constants.h @@ -21,7 +21,6 @@ extern const char* const INDEX_DISKANN; extern const char* const INDEX_HNSW; extern const char* const INDEX_FRESH_HNSW; extern const char* const INDEX_PYRAMID; -extern const char* const INDEX_SPARSE; extern const char* const INDEX_SINDI; extern const char* const INDEX_BRUTE_FORCE; extern const char* const INDEX_IVF; diff --git a/include/vsag/index.h b/include/vsag/index.h index d4f7ea9950..213c1de5cf 100644 --- a/include/vsag/index.h +++ b/include/vsag/index.h @@ -51,7 +51,7 @@ struct MergeUnit { IdMapFunction id_map_func = nullptr; }; -enum class IndexType { HNSW, DISKANN, HGRAPH, IVF, PYRAMID, BRUTEFORCE, SPARSE, SINDI, WARP }; +enum class IndexType { HNSW, DISKANN, HGRAPH, IVF, PYRAMID, BRUTEFORCE, SINDI, WARP }; #define DATA_FLAG_FLOAT32_VECTOR 0x01 #define DATA_FLAG_INT8_VECTOR 0x02 @@ -462,7 +462,7 @@ class Index { * * Suitable for dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW). * The query must be a contiguous float32 array with dimension matching the index. - * For sparse vector indexes (SINDI, SparseIndex), this overload is not applicable; + * For sparse vector indexes (SINDI), this overload is not applicable; * use CalcDistanceById(DatasetPtr, int64_t, bool) instead. * * @param vector The embedding of the query (float32 array for dense vectors). @@ -483,7 +483,7 @@ class Index { /** * @brief Calculate the distance between the query and the vector of the given ID. * - * Suitable for sparse vector indexes (SINDI, SparseIndex) where vectors + * Suitable for sparse vector indexes (SINDI) where vectors * cannot be represented as a simple float pointer. The Dataset should * contain sparse vectors via GetSparseVectors(). * For dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW), @@ -509,7 +509,7 @@ class Index { * * Suitable for dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW). * The query must be a contiguous float32 array. For sparse vector indexes - * (SINDI, SparseIndex), this overload is not applicable; use + * (SINDI), this overload is not applicable; use * CalDistanceById(DatasetPtr, const int64_t*, int64_t, bool) instead. * * @param query is the embedding of query (float32 array for dense vectors). @@ -532,7 +532,7 @@ class Index { /** * @brief Calculate the distance between the query and the vector of the given ID for batch. * - * Suitable for sparse vector indexes (SINDI, SparseIndex) where vectors + * Suitable for sparse vector indexes (SINDI) where vectors * cannot be represented as a simple float pointer. The Dataset should * contain sparse vectors via GetSparseVectors(). * For dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW), diff --git a/src/algorithm/inner_index_interface.cpp b/src/algorithm/inner_index_interface.cpp index e2580c8ccf..b46bed9fa7 100644 --- a/src/algorithm/inner_index_interface.cpp +++ b/src/algorithm/inner_index_interface.cpp @@ -413,7 +413,7 @@ InnerIndexInterface::GetVectorByIds(const int64_t* ids, Allocator* allocator = has_specified_allocator ? specified_allocator : allocator_; DatasetPtr vectors = Dataset::Make(); - if (GetIndexType() == IndexType::SINDI or GetIndexType() == IndexType::SPARSE) { + if (GetIndexType() == IndexType::SINDI) { auto* sparse_vectors = static_cast(allocator->Allocate(sizeof(SparseVector) * count)); if (sparse_vectors == nullptr) { diff --git a/src/algorithm/inner_index_interface.h b/src/algorithm/inner_index_interface.h index b1c87bf96b..964213bf55 100644 --- a/src/algorithm/inner_index_interface.h +++ b/src/algorithm/inner_index_interface.h @@ -75,7 +75,7 @@ class InnerIndexInterface { /** * @brief Calculate distance by ID using DatasetPtr. * - * Suitable for sparse vector indexes (SINDI, SparseIndex) where vectors + * Suitable for sparse vector indexes (SINDI) where vectors * cannot be represented as a simple float pointer. The Dataset should * contain sparse vectors via GetSparseVectors(). * For dense vector indexes, this overload is also available via default @@ -102,7 +102,7 @@ class InnerIndexInterface { * * Suitable for dense vector indexes (HGraph, BruteForce, IVF, DiskANN, HNSW). * The query must be a contiguous float32 array with dimension matching the index. - * For sparse vector indexes (SINDI, SparseIndex), this overload is not applicable. + * For sparse vector indexes (SINDI), this overload is not applicable. * * Default implementation throws exception; dense indexes must override. * @@ -143,7 +143,7 @@ class InnerIndexInterface { /** * @brief Calculate distances by IDs (batch) using DatasetPtr. * - * Suitable for sparse vector indexes (SINDI, SparseIndex) where vectors + * Suitable for sparse vector indexes (SINDI) where vectors * cannot be represented as a simple float pointer. The Dataset should * contain sparse vectors via GetSparseVectors(). * For dense vector indexes, this overload is also available via default diff --git a/src/algorithm/sindi/sindi.cpp b/src/algorithm/sindi/sindi.cpp index 7134fc7be8..7427612a22 100644 --- a/src/algorithm/sindi/sindi.cpp +++ b/src/algorithm/sindi/sindi.cpp @@ -15,15 +15,125 @@ #include "sindi.h" +#include + #include "analyzer/analyzer.h" +#include "algorithm/sparse_distance.h" +#include "datacell/sparse_vector_datacell_parameter.h" #include "impl/heap/standard_heap.h" #include "index_feature_list.h" +#include "io/memory_io_parameter.h" +#include "quantization/sparse_quantization/sparse_quantizer.h" +#include "quantization/sparse_quantization/sparse_quantizer_parameter.h" #include "storage/serialization.h" #include "utils/util_functions.h" #include "vsag/allocator.h" #include "vsag_exception.h" namespace vsag { + +namespace { + +constexpr const char* SINDI_RERANK_FLAT_FORMAT_KEY = "sindi_rerank_flat_format"; +constexpr int64_t SINDI_RERANK_FLAT_FORMAT_DATACELL = 2; + +float +cal_distance_by_id_unsafe(const FlattenInterfacePtr& flat, + Vector& sorted_ids, + Vector& sorted_vals, + uint32_t inner_id) { + bool need_release{false}; + const auto* codes = flat->GetCodesById(inner_id, need_release); + auto len = *reinterpret_cast(codes); + const auto* entries = reinterpret_cast(codes + sizeof(uint32_t)); + float sum = 0.0F; + uint32_t i = 0; + uint32_t j = 0; + while (i < sorted_ids.size() && j < len) { + if (sorted_ids[i] < entries[j].id) { + i++; + } else if (sorted_ids[i] > entries[j].id) { + j++; + } else { + sum += sorted_vals[i] * entries[j].val; + i++; + j++; + } + } + auto distance = 1 - sum; + if (need_release) { + flat->Release(codes); + } + return distance; +} + +DatasetPtr +collect_results(const DistHeapPtr& results, Allocator* allocator) { + auto [result, dists, ids] = + create_fast_dataset(static_cast(results->Size()), allocator); + if (results->Empty()) { + result->Dim(0)->NumElements(1); + return result; + } + + for (auto j = static_cast(results->Size() - 1); j >= 0; --j) { + dists[j] = results->Top().first; + ids[j] = results->Top().second; + results->Pop(); + } + return result; +} + +FlattenInterfacePtr +create_rerank_flat(const IndexCommonParam& common_param) { + auto rerank_param = std::make_shared(); + rerank_param->io_parameter = std::make_shared(); + rerank_param->quantizer_parameter = std::make_shared(); + return FlattenInterface::MakeInstance(rerank_param, common_param); +} + +void +deserialize_legacy_rerank_flat(StreamReader& reader, + const FlattenInterfacePtr& flat, + Allocator* allocator) { + int64_t cur_element_count = 0; + StreamReader::ReadObj(reader, cur_element_count); + flat->Resize(cur_element_count); + std::vector ids; + std::vector vals; + for (int64_t i = 0; i < cur_element_count; ++i) { + uint32_t len = 0; + StreamReader::ReadObj(reader, len); + ids.resize(len); + vals.resize(len); + reader.Read(reinterpret_cast(ids.data()), + static_cast(len) * sizeof(uint32_t)); + reader.Read(reinterpret_cast(vals.data()), + static_cast(len) * sizeof(float)); + SparseVector vector; + vector.len_ = len; + vector.ids_ = ids.data(); + vector.vals_ = vals.data(); + flat->InsertVector(&vector, i); + } + LabelTable legacy_label_table(allocator); + legacy_label_table.Deserialize(reader); +} + +void +deserialize_rerank_flat(StreamReader& reader, + const FlattenInterfacePtr& flat, + Allocator* allocator, + bool has_datacell_format) { + if (has_datacell_format) { + flat->Deserialize(reader); + return; + } + deserialize_legacy_rerank_flat(reader, flat, allocator); +} + +} // namespace + ParamPtr SINDI::CheckAndMappingExternalParam(const JsonType& external_param, const IndexCommonParam& common_param) { @@ -50,9 +160,7 @@ SINDI::SINDI(const SINDIParameterPtr& param, const IndexCommonParam& common_para std::make_shared(term_id_limit_, common_param.allocator_.get()); } if (use_reorder_) { - SparseIndexParameterPtr rerank_param = std::make_shared(); - rerank_param->need_sort = true; - rerank_flat_index_ = std::make_shared(rerank_param, common_param); + rerank_flat_ = create_rerank_flat(common_param); } } @@ -184,12 +292,7 @@ SINDI::Add(const DatasetPtr& base, AddMode mode) { // high precision part if (use_reorder_) { - auto single_base = Dataset::Make(); - single_base->NumElements(1) - ->SparseVectors(sparse_vectors + i) - ->Ids(ids + i) - ->Owner(false); - rerank_flat_index_->Add(single_base); + rerank_flat_->InsertVector(sparse_vectors + i, cur_element_count_ - 1); } } if (window_changed) { @@ -209,16 +312,15 @@ SINDI::UpdateVector(int64_t id, const DatasetPtr& new_base, bool force_update) { // Note: // 1. we only check whether the old vector is a subset of the new vector // 2. we do not actually update the vector - auto check_and_cleanup = [this, id, &new_base](InnerIndexInterface* index) -> bool { + uint32_t inner_id; + { + std::scoped_lock rlock(this->global_mutex_); + inner_id = this->label_table_->GetIdByLabel(id); + } + const auto& new_sv = *new_base->GetSparseVectors(); + auto check_and_cleanup = [this, inner_id, &new_sv](auto&& get_sparse_vector) -> bool { SparseVector old_sv; - uint32_t inner_id; - { - std::scoped_lock rlock(this->global_mutex_); - inner_id = this->label_table_->GetIdByLabel(id); - } - index->GetSparseVectorByInnerId(inner_id, &old_sv, this->allocator_); - - const auto& new_sv = *new_base->GetSparseVectors(); + get_sparse_vector(inner_id, &old_sv, this->allocator_); bool ret = is_subset_of_sparse_vector(old_sv, new_sv); this->allocator_->Deallocate(old_sv.vals_); @@ -227,12 +329,20 @@ SINDI::UpdateVector(int64_t id, const DatasetPtr& new_base, bool force_update) { }; if (use_reorder_) { - if (not check_and_cleanup(rerank_flat_index_.get())) { + if (not check_and_cleanup([this](InnerIdType inner_id, + SparseVector* data, + Allocator* allocator) { + rerank_flat_->GetSparseVectorByInnerId(inner_id, data, allocator); + })) { return false; } } - return check_and_cleanup(this); + return check_and_cleanup([this](InnerIdType inner_id, + SparseVector* data, + Allocator* allocator) { + this->GetSparseVectorByInnerId(inner_id, data, allocator); + }); } DatasetPtr @@ -347,14 +457,12 @@ SINDI::search_impl(const SparseTermComputerPtr& computer, float cur_heap_top = std::numeric_limits::max(); auto candidate_size = heap.size(); auto high_precise_heap = std::make_shared>(allocator_, -1); - auto [sorted_ids, sorted_vals] = rerank_flat_index_->sort_sparse_vector( - original_query ? *original_query : computer->raw_query_); + auto [sorted_ids, sorted_vals] = + sort_sparse_vector(original_query ? *original_query : computer->raw_query_, allocator_); for (auto i = 0; i < candidate_size; i++) { auto inner_id = heap.top().second; - auto high_precise_distance = rerank_flat_index_->CalDistanceByIdUnsafe( - sorted_ids, - sorted_vals, - inner_id); // TODO(ZXY): use flat to replace rerank_flat_index_ + auto high_precise_distance = + cal_distance_by_id_unsafe(rerank_flat_, sorted_ids, sorted_vals, inner_id); auto label = label_table_->GetLabelById(inner_id); if constexpr (mode == KNN_SEARCH) { if (high_precise_distance < cur_heap_top or high_precise_heap->Size() < k) { @@ -377,7 +485,7 @@ SINDI::search_impl(const SparseTermComputerPtr& computer, heap.pop(); } - return rerank_flat_index_->collect_results(high_precise_heap); + return collect_results(high_precise_heap, allocator_); } // low precision @@ -463,8 +571,8 @@ SINDI::cal_memory_usage() { for (auto& window : window_term_list_) { memory += window->GetMemoryUsage(); } - if (this->rerank_flat_index_ != nullptr) { - memory += this->rerank_flat_index_->GetMemoryUsage(); + if (this->rerank_flat_ != nullptr) { + memory += this->rerank_flat_->GetMemoryUsage(); } memory += sizeof(QuantizationParams); @@ -493,7 +601,7 @@ SINDI::Serialize(StreamWriter& writer) const { label_table_->Serialize(writer); if (use_reorder_) { - rerank_flat_index_->Serialize(writer); + rerank_flat_->Serialize(writer); } if (remap_term_ids_ && term_id_mapper_) { @@ -503,6 +611,9 @@ SINDI::Serialize(StreamWriter& writer) const { JsonType jsonify_basic_info; auto metadata = std::make_shared(); jsonify_basic_info[INDEX_PARAM].SetString(this->create_param_ptr_->ToString()); + if (use_reorder_) { + jsonify_basic_info[SINDI_RERANK_FLAT_FORMAT_KEY].SetInt(SINDI_RERANK_FLAT_FORMAT_DATACELL); + } metadata->Set("basic_info", jsonify_basic_info); auto footer = std::make_shared