diff --git a/CHANGELOG.md b/CHANGELOG.md index 53557a3fb..8e37b037e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ## [unreleased] ### Anonymizer +#### Added +- Added `BatchDeanonymizeEngine` to complement `BatchAnonymizerEngine` for batch deanonymization over lists and nested dictionaries. #### Fixed - Custom operator `validate()` no longer calls the user-supplied lambda with a dummy `"PII"` value. Previously, stateful lambdas (e.g. those accumulating a token-to-original-value map for de-anonymization) would receive a spurious invocation during validation, inserting a junk entry (`{"TOKEN_1": "PII"}`) into the map and skewing all subsequent token counters. The return-type contract is now enforced in `operate()` when the lambda runs on real data. Fixes [#2024](https://github.com/microsoft/presidio/issues/2024). diff --git a/presidio-anonymizer/README.md b/presidio-anonymizer/README.md index bba95d2ab..52fe828a0 100644 --- a/presidio-anonymizer/README.md +++ b/presidio-anonymizer/README.md @@ -190,6 +190,28 @@ print(result) ``` +Batch deanonymization works the same way for lists and nested dictionaries: +```python +from presidio_anonymizer import BatchDeanonymizeEngine +from presidio_anonymizer.entities import OperatorConfig, OperatorResult + +engine = BatchDeanonymizeEngine() + +results = engine.deanonymize_list( + texts=[ + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + ], + entities_list=[ + [OperatorResult(start=11, end=55, entity_type="PERSON")], + [OperatorResult(start=11, end=55, entity_type="PERSON")], + ], + operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, +) + +print(results) +``` + ### As docker service: In folder presidio/presidio-anonymizer run: diff --git a/presidio-anonymizer/presidio_anonymizer/__init__.py b/presidio-anonymizer/presidio_anonymizer/__init__.py index 
class BatchDeanonymizeEngine:
    """
    Deanonymize many texts in a single call.

    Thin batch wrapper around a `DeanonymizeEngine`: each supported item of a
    list, or each value of a (possibly nested) dictionary, is passed to the
    engine's `deanonymize` method one at a time.

    :param deanonymize_engine: An instance of the DeanonymizeEngine class.
        When omitted, a default `DeanonymizeEngine()` is created.
    """

    def __init__(
        self, deanonymize_engine: Optional["DeanonymizeEngine"] = None
    ):
        # Compare against None instead of relying on truthiness, so that an
        # injected engine is never discarded just because it evaluates falsy.
        if deanonymize_engine is None:
            deanonymize_engine = DeanonymizeEngine()
        self.deanonymize_engine = deanonymize_engine

    def deanonymize_list(
        self,
        texts: List[Optional[Union[str, bool, int, float]]],
        entities_list: List[List["OperatorResult"]],
        operators: Dict[str, "OperatorConfig"],
    ) -> List[Union[str, Any]]:
        """
        Deanonymize a list of strings.

        :param texts: Iterable containing the texts to be deanonymized.
            Items with a `type` not in `(str, bool, int, float)` will be left
            unchanged. It is only iterated, so it does not need to support
            `len()`.
        :param entities_list: A list of lists of OperatorResult used as the
            deanonymization input spans for each text in the list. May be
            empty or `None`; texts without a matching entry are processed
            with an empty list of spans.
        :param operators: Operators to define the deanonymization type.
        :return: A list with one element per input item, in input order:
            the `text` of the engine result for supported items, the original
            object otherwise.
        """
        # Materialise the spans once so they can be indexed even when the
        # caller supplied a non-indexable iterable. Empty / None means
        # "no spans for any text".
        spans_per_text = list(entities_list) if entities_list else []
        span_count = len(spans_per_text)

        return_list = []
        for index, text in enumerate(texts):
            # Texts beyond the end of the spans list get no spans at all.
            entities = spans_per_text[index] if index < span_count else []
            # Exact type match (not isinstance): instances of subclasses of
            # these scalar types are passed through untouched.
            if type(text) in (str, bool, int, float):
                res = self.deanonymize_engine.deanonymize(
                    text=str(text), entities=entities, operators=operators
                )
                return_list.append(res.text)
            else:
                return_list.append(text)

        return return_list

    def deanonymize_dict(
        self,
        anonymizer_results: Iterable["DictRecognizerResult"],
        operators: Dict[str, "OperatorConfig"],
    ) -> Dict[str, Any]:
        """
        Deanonymize values in a dictionary.

        :param anonymizer_results: Iterator of `DictRecognizerResult`
            containing the output of batch anonymization on the input text.
            Each item is read through its `key`, `value` and
            `recognizer_results` attributes.
        :param operators: Operators to define the deanonymization type.
        :return: A dictionary mapping every `key` to its processed value:
            dict values are handled recursively, strings are deanonymized,
            other iterables go through `deanonymize_list`, and anything else
            is returned unchanged.
        """
        return_dict = {}
        for result in anonymizer_results:
            value = result.value
            if isinstance(value, dict):
                # For nested dictionaries `recognizer_results` holds the
                # nested per-key results, so recurse with them.
                return_dict[result.key] = self.deanonymize_dict(
                    anonymizer_results=result.recognizer_results,
                    operators=operators,
                )
            elif isinstance(value, str):
                # Must be tested before the generic Iterable branch,
                # because strings are iterable too.
                resp = self.deanonymize_engine.deanonymize(
                    text=value,
                    entities=result.recognizer_results,
                    operators=operators,
                )
                return_dict[result.key] = resp.text
            elif isinstance(value, collections.abc.Iterable):
                return_dict[result.key] = self.deanonymize_list(
                    texts=value,
                    entities_list=result.recognizer_results,
                    operators=operators,
                )
            else:
                # Non-iterable scalars (numbers, None, ...) pass through.
                return_dict[result.key] = value
        return return_dict
test_given_analyzer_result_we_deanonymize_list_correctly(engine): + texts = [ENCRYPTED_TEXT, ENCRYPTED_TEXT] + entities_list = [[ENTITY], [ENTITY]] + + deanonymize_results = engine.deanonymize_list( + texts=texts, entities_list=entities_list, operators=OPERATORS + ) + + assert deanonymize_results == [DECRYPTED_TEXT, DECRYPTED_TEXT] + + +def test_given_analyzer_result_we_deanonymize_dict_correctly(engine): + analyzer_results = [ + DictRecognizerResult( + key="name", value=ENCRYPTED_TEXT, recognizer_results=[ENTITY] + ) + ] + + deanonymize_results = engine.deanonymize_dict( + anonymizer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"name": DECRYPTED_TEXT} + + +def test_given_nested_analyzer_result_we_deanonymize_dict_correctly(engine): + analyzer_results = [ + DictRecognizerResult( + key="customer", + value={"profile": {"name": ENCRYPTED_TEXT}}, + recognizer_results=[ + DictRecognizerResult( + key="profile", + value={"name": ENCRYPTED_TEXT}, + recognizer_results=[ + DictRecognizerResult( + key="name", + value=ENCRYPTED_TEXT, + recognizer_results=[ENTITY], + ) + ], + ) + ], + ) + ] + + deanonymize_results = engine.deanonymize_dict( + anonymizer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"customer": {"profile": {"name": DECRYPTED_TEXT}}} + + +def test_given_empty_entities_we_return_text_unchanged(engine): + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], entities_list=[], operators=OPERATORS + ) + + assert deanonymize_results == [ENCRYPTED_TEXT] + + +def test_given_short_entities_list_we_keep_trailing_texts(engine): + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT, "plain trailing text"], + entities_list=[[ENTITY]], + operators=OPERATORS, + ) + + assert deanonymize_results == [DECRYPTED_TEXT, "plain trailing text"] + + +def test_given_exact_type_list_values_we_route_through_injected_engine(): + deanonymize_engine = 
RecordingDeanonymizeEngine() + engine = BatchDeanonymizeEngine(deanonymize_engine=deanonymize_engine) + + sentinel = object() + deanonymize_results = engine.deanonymize_list( + texts=[True, 7, 1.5, sentinel], + entities_list=[[], [], [], []], + operators=OPERATORS, + ) + + assert deanonymize_results == ["custom::True", "custom::7", "custom::1.5", sentinel] + assert deanonymize_engine.calls == [ + ("True", [], OPERATORS), + ("7", [], OPERATORS), + ("1.5", [], OPERATORS), + ] + + +def test_given_non_string_list_value_we_return_item_unchanged(engine): + analyzer_results = [ + DictRecognizerResult( + key="items", + value=[ENCRYPTED_TEXT, ["nested", 123], object()], + recognizer_results=[[ENTITY], [], []], + ) + ] + + deanonymize_results = engine.deanonymize_dict( + anonymizer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results["items"][0] == DECRYPTED_TEXT + assert deanonymize_results["items"][1] == ["nested", 123] + assert type(deanonymize_results["items"][2]) is object + + +def test_given_scalar_dict_value_we_return_value_unchanged(engine): + analyzer_results = [ + DictRecognizerResult(key="id", value=123, recognizer_results=[]) + ] + + deanonymize_results = engine.deanonymize_dict( + anonymizer_results=analyzer_results, operators=OPERATORS + ) + + assert deanonymize_results == {"id": 123} + + +class RecordingDeanonymizeEngine(DeanonymizeEngine): + def __init__(self): + super().__init__() + self.calls = [] + + def deanonymize(self, text, entities, operators): + self.calls.append((text, entities, operators)) + return EngineResult(text=f"custom::{text}") + + +def test_given_custom_deanonymizer_we_use_injected_engine(): + deanonymize_engine = RecordingDeanonymizeEngine() + engine = BatchDeanonymizeEngine(deanonymize_engine=deanonymize_engine) + + deanonymize_results = engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], entities_list=[[ENTITY]], operators=OPERATORS + ) + + assert deanonymize_results == [f"custom::{ENCRYPTED_TEXT}"] + assert 
deanonymize_engine.calls == [(ENCRYPTED_TEXT, [ENTITY], OPERATORS)] + + +def test_given_unsupported_kwargs_then_batch_api_rejects_them(engine): + with pytest.raises(TypeError, match="unexpected keyword argument"): + engine.deanonymize_list( + texts=[ENCRYPTED_TEXT], + entities_list=[[ENTITY]], + operators=OPERATORS, + language="en", + ) diff --git a/presidio-anonymizer/tests/test_readme.py b/presidio-anonymizer/tests/test_readme.py index 456356d68..b66a6c820 100644 --- a/presidio-anonymizer/tests/test_readme.py +++ b/presidio-anonymizer/tests/test_readme.py @@ -1,7 +1,8 @@ def test_readme(): + """Exercise the README anonymizer example.""" # Tests that the readme code snippet doesn't fail from presidio_anonymizer import AnonymizerEngine - from presidio_anonymizer.entities import RecognizerResult, OperatorConfig + from presidio_anonymizer.entities import OperatorConfig, RecognizerResult # Initialize the engine with logger. engine = AnonymizerEngine() @@ -22,8 +23,9 @@ def test_readme(): def test_readme_decrypt(): + """Exercise the README single-item deanonymize example.""" from presidio_anonymizer import DeanonymizeEngine - from presidio_anonymizer.entities import OperatorResult, OperatorConfig + from presidio_anonymizer.entities import OperatorConfig, OperatorResult # Initialize the engine with logger. 
engine = DeanonymizeEngine() @@ -39,3 +41,25 @@ def test_readme_decrypt(): ) print(result) + + +def test_readme_batch_decrypt(): + """Exercise the README batch deanonymize example.""" + from presidio_anonymizer import BatchDeanonymizeEngine + from presidio_anonymizer.entities import OperatorConfig, OperatorResult + + engine = BatchDeanonymizeEngine() + + results = engine.deanonymize_list( + texts=[ + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=", + ], + entities_list=[ + [OperatorResult(start=11, end=55, entity_type="PERSON")], + [OperatorResult(start=11, end=55, entity_type="PERSON")], + ], + operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})}, + ) + + assert results == ["My name is Chloë", "My name is Chloë"]