From 831f844fa21173e72f25ea8883fd478a2fb1e097 Mon Sep 17 00:00:00 2001 From: himy2u Date: Mon, 22 Jun 2026 21:08:04 -0400 Subject: [PATCH] fix(config): call .strip() correctly in vector store db_uri validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The condition in _validate_vector_store_db_uri() compared the bound method reference store.db_uri.strip (without parentheses) to an empty string. A method reference is always truthy, so the comparison was always False — meaning a whitespace-only db_uri was never replaced with the default and was forwarded unchanged to Path.resolve(), silently producing a wrong path. Fix: add parentheses to call the method and get the stripped string. Fixes #2381 Tests added: - Parametrized regression tests for empty, whitespace-only, tab, newline, and mixed-whitespace URIs (all must fall back to the configured default). - Tests for None / omitted db_uri (existing degenerate case). - Tests for valid relative and absolute URIs (must resolve correctly). --- .../patch-20260623010541725400.json | 4 + .../config/models/graph_rag_config.py | 2 +- .../test_vector_store_db_uri_validation.py | 122 ++++++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 .semversioner/next-release/patch-20260623010541725400.json create mode 100644 tests/unit/config/test_vector_store_db_uri_validation.py diff --git a/.semversioner/next-release/patch-20260623010541725400.json b/.semversioner/next-release/patch-20260623010541725400.json new file mode 100644 index 0000000000..acc5e3fd54 --- /dev/null +++ b/.semversioner/next-release/patch-20260623010541725400.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix vector store db_uri whitespace validation \u2014 .strip was referenced as a method object instead of being called, causing whitespace-only URIs to bypass the default fallback" +} diff --git a/packages/graphrag/graphrag/config/models/graph_rag_config.py b/packages/graphrag/graphrag/config/models/graph_rag_config.py index 1b753d58fc..eb32b3e5a7 100644 --- a/packages/graphrag/graphrag/config/models/graph_rag_config.py +++ b/packages/graphrag/graphrag/config/models/graph_rag_config.py @@ -270,7 +270,7 @@ def _validate_vector_store_db_uri(self) -> None: """Validate the vector store configuration.""" store = self.vector_store if store.type == VectorStoreType.LanceDB: - if not store.db_uri or store.db_uri.strip == "": + if not store.db_uri or store.db_uri.strip() == "": store.db_uri = graphrag_config_defaults.vector_store.db_uri store.db_uri = str(Path(store.db_uri).resolve()) diff --git a/tests/unit/config/test_vector_store_db_uri_validation.py b/tests/unit/config/test_vector_store_db_uri_validation.py new file mode 100644 index 0000000000..ddcf59a95f --- /dev/null +++ b/tests/unit/config/test_vector_store_db_uri_validation.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Unit tests for VectorStore db_uri validation in GraphRagConfig. + +Regression tests for: + https://github.com/microsoft/graphrag/issues/2381 + +The bug: `store.db_uri.strip` (without parentheses) is a bound method +reference, which is always truthy. The condition `store.db_uri.strip == ""` +was therefore always False, meaning a whitespace-only db_uri was never +replaced with the default, and the subsequent `Path(store.db_uri).resolve()` +would silently succeed with a meaningless path like `Path(" ").resolve()`. + +Fix: `store.db_uri.strip()` (with parentheses) calls the method and returns +the stripped string, which is then correctly compared to "". +""" + +from pathlib import Path + +import pytest + +from graphrag.config.defaults import graphrag_config_defaults +from graphrag.config.models.graph_rag_config import GraphRagConfig + +from tests.unit.config.utils import ( + DEFAULT_COMPLETION_MODELS, + DEFAULT_EMBEDDING_MODELS, +) + + +def _make_config(db_uri: str | None) -> GraphRagConfig: + """Construct a minimal GraphRagConfig with the given vector_store db_uri.""" + overrides: dict = { + "completion_models": DEFAULT_COMPLETION_MODELS, + "embedding_models": DEFAULT_EMBEDDING_MODELS, + } + if db_uri is not None: + overrides["vector_store"] = {"db_uri": db_uri} + return GraphRagConfig(**overrides) + + +class TestVectorStoreDbUriValidation: + """Tests for _validate_vector_store_db_uri in GraphRagConfig. + + Covers three categories: + 1. Degenerate inputs that should be replaced by the default URI. + 2. Valid non-empty inputs that should be resolved to an absolute path. + 3. The specific regression case: whitespace-only strings must be + treated as empty and replaced by the default, not forwarded to + Path.resolve() as-is. + """ + + # ------------------------------------------------------------------ + # Degenerate inputs -- must fall back to the configured default + # ------------------------------------------------------------------ + + @pytest.mark.parametrize( + "degenerate_uri", + [ + "", # empty string + " ", # whitespace only (the regression case from #2381) + "\t", # tab-only + "\n", # newline-only + " \t \n", # mixed whitespace + ], + ids=[ + "empty_string", + "spaces_only", + "tab_only", + "newline_only", + "mixed_whitespace", + ], + ) + def test_degenerate_db_uri_falls_back_to_default( + self, degenerate_uri: str + ) -> None: + """A blank or whitespace-only db_uri must be replaced with the project default. + + Before the fix, `store.db_uri.strip == ""` compared a bound method + to a string (always False), so whitespace-only URIs were never + normalised and were passed directly to Path.resolve(), producing + a silently wrong absolute path. + """ + config = _make_config(db_uri=degenerate_uri) + expected_default = str( + Path(graphrag_config_defaults.vector_store.db_uri).resolve() + ) + assert config.vector_store.db_uri == expected_default, ( + f"Expected db_uri to be normalised to the default '{expected_default}' " + f"when the supplied value is {degenerate_uri!r}, " + f"but got {config.vector_store.db_uri!r}." + ) + + def test_none_db_uri_falls_back_to_default(self) -> None: + """Omitting db_uri entirely must also produce the default.""" + config = _make_config(db_uri=None) + expected_default = str( + Path(graphrag_config_defaults.vector_store.db_uri).resolve() + ) + assert config.vector_store.db_uri == expected_default + + # ------------------------------------------------------------------ + # Valid inputs -- must be resolved to an absolute path + # ------------------------------------------------------------------ + + def test_valid_relative_db_uri_is_resolved_to_absolute(self) -> None: + """A valid relative path must be resolved to an absolute path.""" + config = _make_config(db_uri="output/lancedb") + result = config.vector_store.db_uri + assert Path(result).is_absolute(), ( + f"Expected an absolute path but got: {result!r}" + ) + assert result.endswith("lancedb"), ( + f"Expected path to end with 'lancedb' but got: {result!r}" + ) + + def test_valid_absolute_db_uri_is_preserved(self, tmp_path: Path) -> None: + """An already-absolute path must pass through Path.resolve unchanged.""" + absolute_uri = str(tmp_path / "lancedb") + config = _make_config(db_uri=absolute_uri) + assert config.vector_store.db_uri == str(Path(absolute_uri).resolve())