-
Notifications
You must be signed in to change notification settings - Fork 7.6k
perf(memory): vectorize intra-batch dedup cosine similarity #6323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
HumphreySun98
wants to merge
3
commits into
crewAIInc:main
Choose a base branch
from
HumphreySun98:perf/vectorize-intra-batch-dedup
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+219
−1
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,147 @@ | ||
| """Tests for EncodingFlow.intra_batch_dedup (vectorized cosine dedup). | ||
|
|
||
| The vectorized implementation must reproduce the exact drop decisions of the | ||
| original scalar O(n^2) algorithm, which is retained as ``_dedup_scalar``. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
|
|
||
| from crewai.memory.encoding_flow import EncodingFlow, ItemState | ||
| from crewai.memory.types import MemoryConfig | ||
|
|
||
|
|
||
| def _make_flow(threshold: float = 0.98) -> EncodingFlow: | ||
| config = MemoryConfig() | ||
| config.batch_dedup_threshold = threshold | ||
| return EncodingFlow( | ||
| storage=MagicMock(), llm=MagicMock(), embedder=MagicMock(), config=config | ||
| ) | ||
|
|
||
|
|
||
| def _run_vectorized(items: list[ItemState], threshold: float = 0.98) -> EncodingFlow: | ||
| flow = _make_flow(threshold) | ||
| flow.state.items = items | ||
| flow.intra_batch_dedup() | ||
| return flow | ||
|
|
||
|
|
||
| def test_drops_identical_keeps_distinct() -> None: | ||
| items = [ | ||
| ItemState(content="a", embedding=[0.5] * 8), | ||
| ItemState(content="b", embedding=[0.5] * 8), # identical to a -> dropped | ||
| ItemState(content="c", embedding=[1.0] + [0.0] * 7), # orthogonal -> kept | ||
| ] | ||
| flow = _run_vectorized(items) | ||
| assert [it.dropped for it in items] == [False, True, False] | ||
| assert flow.state.items_dropped_dedup == 1 | ||
|
|
||
|
|
||
| def test_first_occurrence_wins() -> None: | ||
| items = [ItemState(content=str(i), embedding=[0.5] * 8) for i in range(4)] | ||
| flow = _run_vectorized(items) | ||
| # Only the first survives; the rest are dropped against it. | ||
| assert [it.dropped for it in items] == [False, True, True, True] | ||
| assert flow.state.items_dropped_dedup == 3 | ||
|
|
||
|
|
||
| def test_items_without_embeddings_never_dropped() -> None: | ||
| items = [ | ||
| ItemState(content="a", embedding=[0.5] * 8), | ||
| ItemState(content="no-emb", embedding=[]), # never participates | ||
| ItemState(content="b", embedding=[0.5] * 8), # dup of a | ||
| ] | ||
| flow = _run_vectorized(items) | ||
| assert [it.dropped for it in items] == [False, False, True] | ||
| assert flow.state.items_dropped_dedup == 1 | ||
|
|
||
|
|
||
| def test_pre_dropped_item_is_skipped() -> None: | ||
| """A pre-dropped item must neither be re-counted nor suppress others.""" | ||
| a = ItemState(content="a", embedding=[0.5] * 8) | ||
| a.dropped = True # already dropped upstream | ||
| items = [ | ||
| a, | ||
| ItemState(content="b", embedding=[0.5] * 8), # same vector as the dropped a | ||
| ItemState(content="c", embedding=[0.5] * 8), # dup of b | ||
| ] | ||
| flow = _run_vectorized(items) | ||
| # 'a' stays dropped (not re-counted); 'b' becomes the surviving original; | ||
| # 'c' is dropped against 'b'. | ||
| assert items[0].dropped is True | ||
| assert items[1].dropped is False | ||
| assert items[2].dropped is True | ||
| assert flow.state.items_dropped_dedup == 1 # only 'c' is a new drop | ||
|
|
||
|
|
||
| def test_falls_back_to_scalar_when_numpy_unavailable() -> None: | ||
| """If numpy can't be imported, dedup still works via the scalar path.""" | ||
| items = [ | ||
| ItemState(content="a", embedding=[0.5] * 8), | ||
| ItemState(content="b", embedding=[0.5] * 8), # dup -> dropped | ||
| ItemState(content="c", embedding=[1.0] + [0.0] * 7), # distinct -> kept | ||
| ] | ||
| flow = _make_flow() | ||
| flow.state.items = items | ||
| # Make `import numpy` raise ImportError inside intra_batch_dedup. | ||
| with patch.dict("sys.modules", {"numpy": None}): | ||
| flow.intra_batch_dedup() | ||
| assert [it.dropped for it in items] == [False, True, False] | ||
| assert flow.state.items_dropped_dedup == 1 | ||
|
|
||
|
|
||
| def test_ragged_embeddings_warn_and_fall_back( | ||
| caplog: pytest.LogCaptureFixture, | ||
| ) -> None: | ||
| """Variable-length embeddings (an encoding bug) must warn and still dedup | ||
| correctly via the scalar fallback (len mismatch -> 0 similarity).""" | ||
| items = [ | ||
| ItemState(content="a", embedding=[0.5] * 8), | ||
| ItemState(content="b", embedding=[0.5] * 8), # dup of a -> dropped | ||
| ItemState(content="c", embedding=[0.5] * 4), # ragged -> never matches | ||
| ] | ||
| with caplog.at_level(logging.WARNING, logger="crewai.memory.encoding_flow"): | ||
| flow = _run_vectorized(items) | ||
|
|
||
| assert any("ragged embeddings" in r.message for r in caplog.records) | ||
| assert [it.dropped for it in items] == [False, True, False] | ||
| assert flow.state.items_dropped_dedup == 1 | ||
|
|
||
|
|
||
| def test_matches_scalar_reference_on_clustered_data() -> None: | ||
| """Vectorized dedup must match the scalar reference exactly on clustered | ||
| embeddings (intra-cluster ~1.0, inter-cluster low), across many trials. | ||
| """ | ||
| rng = np.random.default_rng(0) | ||
| dim = 32 | ||
| threshold = 0.98 | ||
|
|
||
| for _ in range(25): | ||
| n_clusters = int(rng.integers(1, 5)) | ||
| centers = rng.normal(size=(n_clusters, dim)) | ||
| embeddings: list[list[float]] = [] | ||
| for _ in range(int(rng.integers(2, 12))): | ||
| c = centers[int(rng.integers(0, n_clusters))] | ||
| vec = c + rng.normal(scale=1e-3, size=dim) # tiny noise -> sim ~1.0 | ||
| embeddings.append(vec.tolist()) | ||
|
|
||
| vec_items = [ | ||
| ItemState(content=str(i), embedding=e) for i, e in enumerate(embeddings) | ||
| ] | ||
| scalar_items = [ | ||
| ItemState(content=str(i), embedding=e) for i, e in enumerate(embeddings) | ||
| ] | ||
|
|
||
| _run_vectorized(vec_items, threshold) | ||
|
|
||
| scalar_flow = _make_flow(threshold) | ||
| scalar_flow._dedup_scalar(scalar_items, threshold) | ||
|
|
||
| assert [it.dropped for it in vec_items] == [ | ||
| it.dropped for it in scalar_items | ||
| ] |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Building the full pairwise cosine-similarity matrix introduces an O(m^2) memory footprint that can exhaust memory and crash the process for large batches. This is a denial-of-service risk if an attacker can supply or influence very large batches.
Vulnerable code:
Impact: For batch size m,
simsrequires mm8 bytes (float64), quickly leading to OOM (e.g., m=50,000 -> ~20 GB). The previous scalar algorithm used O(1) extra memory.Remediation:
if m*m*8 > MAX_BYTES: fallback).For more details, see the finding in Corridor.
Provide feedback: Reply with whether this is a valid vulnerability or false positive to help improve Corridor's accuracy.