From 639cb12a9d0e12eec223f3fe54098d0c471b42b2 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Sat, 27 Jun 2026 22:38:33 -0400 Subject: [PATCH 1/5] Match batch deanonymization with existing traversal --- CHANGELOG.md | 2 + presidio-anonymizer/README.md | 21 +++ .../presidio_anonymizer/__init__.py | 2 + .../batch_deanonymize_engine.py | 99 +++++++++++ .../tests/test_batch_deanonymize_engine.py | 164 ++++++++++++++++++ 5 files changed, 288 insertions(+) create mode 100644 presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py create mode 100644 presidio-anonymizer/tests/test_batch_deanonymize_engine.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 53557a3fb4..8e37b037eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ## [unreleased] ### Anonymizer +#### Added +- Added `BatchDeanonymizeEngine` to complement `BatchAnonymizerEngine` for batch deanonymization over lists and nested dictionaries. #### Fixed - Custom operator `validate()` no longer calls the user-supplied lambda with a dummy `"PII"` value. Previously, stateful lambdas (e.g. those accumulating a token-to-original-value map for de-anonymization) would receive a spurious invocation during validation, inserting a junk entry (`{"TOKEN_1": "PII"}`) into the map and skewing all subsequent token counters. The return-type contract is now enforced in `operate()` when the lambda runs on real data. Fixes [#2024](https://github.com/microsoft/presidio/issues/2024). diff --git a/presidio-anonymizer/README.md b/presidio-anonymizer/README.md index bba95d2abf..7f582e6cd8 100644 --- a/presidio-anonymizer/README.md +++ b/presidio-anonymizer/README.md @@ -167,6 +167,27 @@ result = engine.anonymize( print(result) ``` +Batch deanonymization works the same way for lists and nested dictionaries: +```python +from presidio_anonymizer import BatchDeanonymizeEngine +from presidio_anonymizer.entities import OperatorConfig, OperatorResult + +engine = BatchDeanonymizeEngine() + +results = engine.deanonymize_list( + texts=[ + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + ], + entities_list=[ + [OperatorResult(start=11, end=55, entity_type="PERSON")], + [OperatorResult(start=11, end=55, entity_type="PERSON")], + ], + operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, +) + +print(results) +``` This example take the output of the AnonymizerEngine with encrypted PII entities, and decrypt it back to the original text: ```python diff --git a/presidio-anonymizer/presidio_anonymizer/__init__.py b/presidio-anonymizer/presidio_anonymizer/__init__.py index 11b09fb7df..d2529938b3 100644 --- a/presidio-anonymizer/presidio_anonymizer/__init__.py +++ b/presidio-anonymizer/presidio_anonymizer/__init__.py @@ -4,6 +4,7 @@ from .anonymizer_engine import AnonymizerEngine from .batch_anonymizer_engine import BatchAnonymizerEngine +from .batch_deanonymize_engine import BatchDeanonymizeEngine from .deanonymize_engine import DeanonymizeEngine from .entities import ( ConflictResolutionStrategy, @@ -25,6 +26,7 @@ "AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine", + "BatchDeanonymizeEngine", "InvalidParamError", "ConflictResolutionStrategy", "PIIEntity", diff --git a/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py new file mode 100644 index 0000000000..7f04d402e6 --- /dev/null +++ b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py @@ -0,0 +1,99 @@ +import collections +from typing import Any, Dict, Iterable, List, Optional, Union + +from presidio_anonymizer.deanonymize_engine import DeanonymizeEngine +from presidio_anonymizer.entities import ( + DictRecognizerResult, + OperatorConfig, + OperatorResult, +) + + +class BatchDeanonymizeEngine: + """ + BatchDeanonymizeEngine class. + + A class that provides functionality to deanonymize in batches. + :param deanonymize_engine: An instance of the DeanonymizeEngine class. + """ + + def __init__(self, deanonymize_engine: Optional[DeanonymizeEngine] = None): + self.deanonymize_engine = deanonymize_engine or DeanonymizeEngine() + + def deanonymize_list( + self, + texts: List[Optional[Union[str, bool, int, float]]], + entities_list: List[List[OperatorResult]], + operators: Dict[str, OperatorConfig], + **kwargs, + ) -> List[Union[str, Any]]: + """ + Deanonymize a list of strings. + + :param texts: List containing the texts to be deanonymized. + Items with a `type` not in `(str, bool, int, float)` will be left + unchanged. + :param entities_list: A list of lists of OperatorResult, the output of + DeanonymizeEngine.deanonymize on each text in the list. + :param operators: Operators to define the deanonymization type. + :param kwargs: Additional kwargs for the `DeanonymizeEngine.deanonymize` method + """ + return_list = [] + if not entities_list: + entities_list = [[] for _ in range(len(texts))] + for text, entities in zip(texts, entities_list): + if type(text) in (str, bool, int, float): + res = self.deanonymize_engine.deanonymize( + text=str(text), entities=entities, operators=operators, **kwargs + ) + return_list.append(res.text) + else: + return_list.append(text) + + return return_list + + def deanonymize_dict( + self, + analyzer_results: Iterable[DictRecognizerResult], + operators: Dict[str, OperatorConfig], + **kwargs, + ) -> Dict[str, Any]: + """ + Deanonymize values in a dictionary. + + :param analyzer_results: Iterator of `DictRecognizerResult` + containing the output of the AnalyzerEngine.analyze_dict on the input text. + :param operators: Operators to define the deanonymization type. + :param kwargs: Additional kwargs for the `DeanonymizeEngine.deanonymize` method + """ + + return_dict = {} + for result in analyzer_results: + if isinstance(result.value, dict): + resp = self.deanonymize_dict( + analyzer_results=result.recognizer_results, + operators=operators, + **kwargs, + ) + return_dict[result.key] = resp + + elif isinstance(result.value, str): + resp = self.deanonymize_engine.deanonymize( + text=result.value, + entities=result.recognizer_results, + operators=operators, + **kwargs, + ) + return_dict[result.key] = resp.text + + elif isinstance(result.value, collections.abc.Iterable): + deanonymize_response = self.deanonymize_list( + texts=result.value, + entities_list=result.recognizer_results, + operators=operators, + **kwargs, + ) + return_dict[result.key] = deanonymize_response + else: + return_dict[result.key] = result.value + return return_dict diff --git a/presidio-anonymizer/tests/test_batch_deanonymize_engine.py b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py new file mode 100644 index 0000000000..89372fc3c7 --- /dev/null +++ b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py @@ -0,0 +1,164 @@ +# ruff: noqa: D101,D102,D103,I001 + +import pytest + +from presidio_anonymizer import BatchDeanonymizeEngine +from presidio_anonymizer.deanonymize_engine import DeanonymizeEngine +from presidio_anonymizer.entities import ( + DictRecognizerResult, + EngineResult, + OperatorConfig, + OperatorResult, +) +from presidio_anonymizer.operators import Decrypt + + +ENCRYPTED_TEXT = "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=" +DECRYPTED_TEXT = "My name is Chloë" +ENTITY = OperatorResult(start=11, end=55, entity_type="PERSON") +OPERATORS = {"DEFAULT": OperatorConfig(Decrypt.NAME, {"key": "WmZq4t7w!z%C&F)J"})} + + +@pytest.fixture(scope="module") +def engine(): + return BatchDeanonymizeEngine() + + +def test_package_root_exports_batch_deanonymize_engine(): + from presidio_anonymizer import ( + BatchDeanonymizeEngine as ExportedBatchDeanonymizeEngine, + ) + + assert ExportedBatchDeanonymizeEngine is BatchDeanonymizeEngine + + +def test_given_analyzer_result_we_deanonymize_list_correctly(engine): + texts = [ENCRYPTED_TEXT, ENCRYPTED_TEXT] + entities_list = [[ENTITY], [ENTITY]] + + deanonymize_results = engine.deanonymize_list( + texts=texts, entities_list=entities_list, operators=OPERATORS + ) + + assert deanonymize_results == [DECRYPTED_TEXT, DECRYPTED_TEXT] + + +def test_given_analyzer_result_we_deanonymize_dict_correctly(engine): + analyzer_results = [ + DictRecognizerResult( + key="name", value=ENCRYPTED_TEXT, recognizer_results=[ENTITY] + ) + ] + + deanonymize_results = engine.deanonymize_dict( + analyzer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"name": DECRYPTED_TEXT} + + +def test_given_nested_analyzer_result_we_deanonymize_dict_correctly(engine): + analyzer_results = [ + DictRecognizerResult( + key="customer", + value={"profile": {"name": ENCRYPTED_TEXT}}, + recognizer_results=[ + DictRecognizerResult( + key="profile", + value={"name": ENCRYPTED_TEXT}, + recognizer_results=[ + DictRecognizerResult( + key="name", + value=ENCRYPTED_TEXT, + recognizer_results=[ENTITY], + ) + ], + ) + ], + ) + ] + + deanonymize_results = engine.deanonymize_dict( + analyzer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"customer": {"profile": {"name": DECRYPTED_TEXT}}} + + +def test_given_empty_entities_we_return_text_unchanged(engine): + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], entities_list=[], operators=OPERATORS + ) + + assert deanonymize_results == [ENCRYPTED_TEXT] + + +def test_given_exact_type_list_values_we_route_through_injected_engine(): + deanonymize_engine = RecordingDeanonymizeEngine() + engine = BatchDeanonymizeEngine(deanonymize_engine=deanonymize_engine) + + sentinel = object() + deanonymize_results = engine.deanonymize_list( + texts=[True, 7, 1.5, sentinel], + entities_list=[[], [], [], []], + operators=OPERATORS, + ) + + assert deanonymize_results == ["custom::True", "custom::7", "custom::1.5", sentinel] + assert deanonymize_engine.calls == [ + ("True", [], OPERATORS), + ("7", [], OPERATORS), + ("1.5", [], OPERATORS), + ] + + +def test_given_non_string_list_value_we_return_item_unchanged(engine): + analyzer_results = [ + DictRecognizerResult( + key="items", + value=[ENCRYPTED_TEXT, ["nested", 123], object()], + recognizer_results=[[ENTITY], [], []], + ) + ] + + deanonymize_results = engine.deanonymize_dict( + analyzer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results["items"][0] == DECRYPTED_TEXT + assert deanonymize_results["items"][1] == ["nested", 123] + assert type(deanonymize_results["items"][2]) is object + + +def test_given_scalar_dict_value_we_return_value_unchanged(engine): + analyzer_results = [ + DictRecognizerResult(key="id", value=123, recognizer_results=[]) + ] + + deanonymize_results = engine.deanonymize_dict( + analyzer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"id": 123} + + +class RecordingDeanonymizeEngine(DeanonymizeEngine): + def __init__(self): + super().__init__() + self.calls = [] + + def deanonymize(self, text, entities, operators): + self.calls.append((text, entities, operators)) + return EngineResult(text=f"custom::{text}") + + +def test_given_custom_deanonymizer_we_use_injected_engine(): + deanonymize_engine = RecordingDeanonymizeEngine() + engine = BatchDeanonymizeEngine(deanonymize_engine=deanonymize_engine) + + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], entities_list=[[ENTITY]], operators=OPERATORS + ) + + assert deanonymize_results == [f"custom::{ENCRYPTED_TEXT}"] + assert deanonymize_engine.calls == [(ENCRYPTED_TEXT, [ENTITY], OPERATORS)] From a6b72e980a8304efe3688f90b9790785f1766377 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Sat, 27 Jun 2026 22:56:20 -0400 Subject: [PATCH 2/5] Keep README examples exercised by CI --- presidio-anonymizer/tests/test_readme.py | 28 ++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/presidio-anonymizer/tests/test_readme.py b/presidio-anonymizer/tests/test_readme.py index 456356d68f..b66a6c820a 100644 --- a/presidio-anonymizer/tests/test_readme.py +++ b/presidio-anonymizer/tests/test_readme.py @@ -1,7 +1,8 @@ def test_readme(): + """Exercise the README anonymizer example.""" # Tests that the readme code snippet doesn't fail from presidio_anonymizer import AnonymizerEngine - from presidio_anonymizer.entities import RecognizerResult, OperatorConfig + from presidio_anonymizer.entities import OperatorConfig, RecognizerResult # Initialize the engine with logger. engine = AnonymizerEngine() @@ -22,8 +23,9 @@ def test_readme(): def test_readme_decrypt(): + """Exercise the README single-item deanonymize example.""" from presidio_anonymizer import DeanonymizeEngine - from presidio_anonymizer.entities import OperatorResult, OperatorConfig + from presidio_anonymizer.entities import OperatorConfig, OperatorResult # Initialize the engine with logger. engine = DeanonymizeEngine() @@ -39,3 +41,25 @@ def test_readme_decrypt(): ) print(result) + + +def test_readme_batch_decrypt(): + """Exercise the README batch deanonymize example.""" + from presidio_anonymizer import BatchDeanonymizeEngine + from presidio_anonymizer.entities import OperatorConfig, OperatorResult + + engine = BatchDeanonymizeEngine() + + results = engine.deanonymize_list( + texts=[ + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + ], + entities_list=[ + [OperatorResult(start=11, end=55, entity_type="PERSON")], + [OperatorResult(start=11, end=55, entity_type="PERSON")], + ], + operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, + ) + + assert results == ["My name is Chloë", "My name is Chloë"] From 77c215668719ef1ea0efe737f6d862fff7c22cf6 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Sat, 27 Jun 2026 23:04:36 -0400 Subject: [PATCH 3/5] Align batch deanonymize dict API with documented usage --- .../presidio_anonymizer/batch_deanonymize_engine.py | 10 +++++----- .../tests/test_batch_deanonymize_engine.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py index 7f04d402e6..6eb517f4f4 100644 --- a/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py +++ b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py @@ -54,24 +54,24 @@ def deanonymize_list( def deanonymize_dict( self, - analyzer_results: Iterable[DictRecognizerResult], + anonymizer_results: Iterable[DictRecognizerResult], operators: Dict[str, OperatorConfig], **kwargs, ) -> Dict[str, Any]: """ Deanonymize values in a dictionary. - :param analyzer_results: Iterator of `DictRecognizerResult` - containing the output of the AnalyzerEngine.analyze_dict on the input text. + :param anonymizer_results: Iterator of `DictRecognizerResult` + containing the output of batch anonymization on the input text. :param operators: Operators to define the deanonymization type. :param kwargs: Additional kwargs for the `DeanonymizeEngine.deanonymize` method """ return_dict = {} - for result in analyzer_results: + for result in anonymizer_results: if isinstance(result.value, dict): resp = self.deanonymize_dict( - analyzer_results=result.recognizer_results, + anonymizer_results=result.recognizer_results, operators=operators, **kwargs, ) diff --git a/presidio-anonymizer/tests/test_batch_deanonymize_engine.py b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py index 89372fc3c7..fac4bf9b63 100644 --- a/presidio-anonymizer/tests/test_batch_deanonymize_engine.py +++ b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py @@ -51,7 +51,7 @@ def test_given_analyzer_result_we_deanonymize_dict_correctly(engine): ] deanonymize_results = engine.deanonymize_dict( - analyzer_results=analyzer_results, operators=OPERATORS + anonymizer_results=analyzer_results, operators=OPERATORS ) assert deanonymize_results == {"name": DECRYPTED_TEXT} @@ -79,7 +79,7 @@ def test_given_nested_analyzer_result_we_deanonymize_dict_correctly(engine): ] deanonymize_results = engine.deanonymize_dict( - analyzer_results=analyzer_results, operators=OPERATORS + anonymizer_results=analyzer_results, operators=OPERATORS ) assert deanonymize_results == {"customer": {"profile": {"name": DECRYPTED_TEXT}}} @@ -122,7 +122,7 @@ def test_given_non_string_list_value_we_return_item_unchanged(engine): ] deanonymize_results = engine.deanonymize_dict( - analyzer_results=analyzer_results, operators=OPERATORS + anonymizer_results=analyzer_results, operators=OPERATORS ) assert deanonymize_results["items"][0] == DECRYPTED_TEXT @@ -136,7 +136,7 @@ def test_given_scalar_dict_value_we_return_value_unchanged(engine): ] deanonymize_results = engine.deanonymize_dict( - analyzer_results=analyzer_results, operators=OPERATORS + anonymizer_results=analyzer_results, operators=OPERATORS ) assert deanonymize_results == {"id": 123} From 1764f029d6a8913e8b91a75463d2d9c334d10841 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Sat, 27 Jun 2026 23:08:59 -0400 Subject: [PATCH 4/5] Keep batch deanonymize docs in contract order --- presidio-anonymizer/README.md | 43 ++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/presidio-anonymizer/README.md b/presidio-anonymizer/README.md index 7f582e6cd8..52fe828a05 100644 --- a/presidio-anonymizer/README.md +++ b/presidio-anonymizer/README.md @@ -167,27 +167,6 @@ result = engine.anonymize( print(result) ``` -Batch deanonymization works the same way for lists and nested dictionaries: -```python -from presidio_anonymizer import BatchDeanonymizeEngine -from presidio_anonymizer.entities import OperatorConfig, OperatorResult - -engine = BatchDeanonymizeEngine() - -results = engine.deanonymize_list( - texts=[ - "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", - "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", - ], - entities_list=[ - [OperatorResult(start=11, end=55, entity_type="PERSON")], - [OperatorResult(start=11, end=55, entity_type="PERSON")], - ], - operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, -) - -print(results) -``` This example take the output of the AnonymizerEngine with encrypted PII entities, and decrypt it back to the original text: ```python @@ -211,6 +190,28 @@ print(result) ``` +Batch deanonymization works the same way for lists and nested dictionaries: +```python +from presidio_anonymizer import BatchDeanonymizeEngine +from presidio_anonymizer.entities import OperatorConfig, OperatorResult + +engine = BatchDeanonymizeEngine() + +results = engine.deanonymize_list( + texts=[ + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + ], + entities_list=[ + [OperatorResult(start=11, end=55, entity_type="PERSON")], + [OperatorResult(start=11, end=55, entity_type="PERSON")], + ], + operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, +) + +print(results) +``` + ### As docker service: In folder presidio/presidio-anonymizer run: From 8931072269ca2d06b31567af80a13d9f58a78177 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Sun, 28 Jun 2026 00:11:19 -0400 Subject: [PATCH 5/5] Keep batch deanonymization from truncating or over-forwarding --- .../batch_deanonymize_engine.py | 16 +++++---------- .../tests/test_batch_deanonymize_engine.py | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py index 6eb517f4f4..01bb4ab806 100644 --- a/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py +++ b/presidio-anonymizer/presidio_anonymizer/batch_deanonymize_engine.py @@ -25,7 +25,6 @@ def deanonymize_list( texts: List[Optional[Union[str, bool, int, float]]], entities_list: List[List[OperatorResult]], operators: Dict[str, OperatorConfig], - **kwargs, ) -> List[Union[str, Any]]: """ Deanonymize a list of strings. @@ -33,18 +32,18 @@ def deanonymize_list( :param texts: List containing the texts to be deanonymized. Items with a `type` not in `(str, bool, int, float)` will be left unchanged. - :param entities_list: A list of lists of OperatorResult, the output of - DeanonymizeEngine.deanonymize on each text in the list. + :param entities_list: A list of lists of OperatorResult used as the + deanonymization input spans for each text in the list. :param operators: Operators to define the deanonymization type. - :param kwargs: Additional kwargs for the `DeanonymizeEngine.deanonymize` method """ return_list = [] if not entities_list: entities_list = [[] for _ in range(len(texts))] - for text, entities in zip(texts, entities_list): + for index, text in enumerate(texts): + entities = entities_list[index] if index < len(entities_list) else [] if type(text) in (str, bool, int, float): res = self.deanonymize_engine.deanonymize( - text=str(text), entities=entities, operators=operators, **kwargs + text=str(text), entities=entities, operators=operators ) return_list.append(res.text) else: @@ -56,7 +55,6 @@ def deanonymize_dict( self, anonymizer_results: Iterable[DictRecognizerResult], operators: Dict[str, OperatorConfig], - **kwargs, ) -> Dict[str, Any]: """ Deanonymize values in a dictionary. @@ -64,7 +62,6 @@ def deanonymize_dict( :param anonymizer_results: Iterator of `DictRecognizerResult` containing the output of batch anonymization on the input text. :param operators: Operators to define the deanonymization type. - :param kwargs: Additional kwargs for the `DeanonymizeEngine.deanonymize` method """ return_dict = {} @@ -73,7 +70,6 @@ def deanonymize_dict( resp = self.deanonymize_dict( anonymizer_results=result.recognizer_results, operators=operators, - **kwargs, ) return_dict[result.key] = resp @@ -82,7 +78,6 @@ def deanonymize_dict( text=result.value, entities=result.recognizer_results, operators=operators, - **kwargs, ) return_dict[result.key] = resp.text @@ -91,7 +86,6 @@ def deanonymize_dict( texts=result.value, entities_list=result.recognizer_results, operators=operators, - **kwargs, ) return_dict[result.key] = deanonymize_response else: diff --git a/presidio-anonymizer/tests/test_batch_deanonymize_engine.py b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py index fac4bf9b63..692e383c09 100644 --- a/presidio-anonymizer/tests/test_batch_deanonymize_engine.py +++ b/presidio-anonymizer/tests/test_batch_deanonymize_engine.py @@ -93,6 +93,16 @@ def test_given_empty_entities_we_return_text_unchanged(engine): assert deanonymize_results == [ENCRYPTED_TEXT] +def test_given_short_entities_list_we_keep_trailing_texts(engine): + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT, "plain trailing text"], + entities_list=[[ENTITY]], + operators=OPERATORS, + ) + + assert deanonymize_results == [DECRYPTED_TEXT, "plain trailing text"] + + def test_given_exact_type_list_values_we_route_through_injected_engine(): deanonymize_engine = RecordingDeanonymizeEngine() engine = BatchDeanonymizeEngine(deanonymize_engine=deanonymize_engine) @@ -162,3 +172,13 @@ def test_given_custom_deanonymizer_we_use_injected_engine(): assert deanonymize_results == [f"custom::{ENCRYPTED_TEXT}"] assert deanonymize_engine.calls == [(ENCRYPTED_TEXT, [ENTITY], OPERATORS)] + + +def test_given_unsupported_kwargs_then_batch_api_rejects_them(engine): + with pytest.raises(TypeError, match="unexpected keyword argument"): + engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], + entities_list=[[ENTITY]], + operators=OPERATORS, + language="en", + )