Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file.
## [unreleased]

### Anonymizer
#### Added
- Added `BatchDeanonymizeEngine` to complement `BatchAnonymizerEngine` for batch deanonymization over lists and nested dictionaries.
#### Fixed
- Custom operator `validate()` no longer calls the user-supplied lambda with a dummy `"PII"` value. Previously, stateful lambdas (e.g. those accumulating a token-to-original-value map for de-anonymization) would receive a spurious invocation during validation, inserting a junk entry (`{"TOKEN_1": "PII"}`) into the map and skewing all subsequent token counters. The return-type contract is now enforced in `operate()` when the lambda runs on real data. Fixes [#2024](https://github.com/microsoft/presidio/issues/2024).

Expand Down
22 changes: 22 additions & 0 deletions presidio-anonymizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,28 @@ print(result)

```

Batch deanonymization works the same way for lists and nested dictionaries:
```python
from presidio_anonymizer import BatchDeanonymizeEngine
from presidio_anonymizer.entities import OperatorConfig, OperatorResult

engine = BatchDeanonymizeEngine()

# `texts` and `entities_list` are matched by position: the spans at index i
# describe the text at index i.
results = engine.deanonymize_list(
texts=[
"My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=",
"My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=",
],
entities_list=[
# Characters 11-55 cover the 44-character encrypted value that follows
# the 11-character prefix "My name is ".
[OperatorResult(start=11, end=55, entity_type="PERSON")],
[OperatorResult(start=11, end=55, entity_type="PERSON")],
],
operators={"DEFAULT": OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})},
)

# Expected output: ['My name is Chloë', 'My name is Chloë']
print(results)
```

### As docker service:

In folder presidio/presidio-anonymizer run:
Expand Down
2 changes: 2 additions & 0 deletions presidio-anonymizer/presidio_anonymizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .anonymizer_engine import AnonymizerEngine
from .batch_anonymizer_engine import BatchAnonymizerEngine
from .batch_deanonymize_engine import BatchDeanonymizeEngine
from .deanonymize_engine import DeanonymizeEngine
from .entities import (
ConflictResolutionStrategy,
Expand All @@ -25,6 +26,7 @@
"AnonymizerEngine",
"DeanonymizeEngine",
"BatchAnonymizerEngine",
"BatchDeanonymizeEngine",
"InvalidParamError",
"ConflictResolutionStrategy",
"PIIEntity",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import collections
from typing import Any, Dict, Iterable, List, Optional, Union

from presidio_anonymizer.deanonymize_engine import DeanonymizeEngine
from presidio_anonymizer.entities import (
DictRecognizerResult,
OperatorConfig,
OperatorResult,
)


class BatchDeanonymizeEngine:
    """
    Deanonymize collections of texts in a single call.

    Thin batch wrapper around a ``DeanonymizeEngine``: list items and
    (possibly nested) dictionary values are handed to the wrapped engine's
    ``deanonymize`` method one at a time.

    :param deanonymize_engine: Engine used for each individual item. A new
        ``DeanonymizeEngine`` is created when none is supplied.
    """

    def __init__(self, deanonymize_engine: Optional[DeanonymizeEngine] = None):
        self.deanonymize_engine = deanonymize_engine or DeanonymizeEngine()

    def deanonymize_list(
        self,
        texts: List[Optional[Union[str, bool, int, float]]],
        entities_list: List[List[OperatorResult]],
        operators: Dict[str, OperatorConfig],
    ) -> List[Union[str, Any]]:
        """
        Deanonymize a list of strings.

        :param texts: Iterable containing the texts to be deanonymized.
            Items with a `type` not in `(str, bool, int, float)` will be left
            unchanged. Any iterable is accepted, including generators.
        :param entities_list: A list of lists of OperatorResult used as the
            deanonymization input spans for each text in the list, matched to
            `texts` by position. May be empty, `None`, or shorter than
            `texts`; texts without a matching entry are processed with an
            empty list of spans.
        :param operators: Operators to define the deanonymization type.
        :return: A list with one item per input item, in the same order.
            Supported items are replaced by the `text` of the engine result
            (they are converted with `str()` before being passed to the
            engine); all other items are returned as-is.
        """
        # Pair texts with their spans lazily instead of indexing. This avoids
        # calling len() on `texts`, which fails for generators and iterators,
        # while still tolerating a missing or short `entities_list`.
        entities_iter = iter(entities_list or ())

        return_list = []
        for text in texts:
            # Advance once per text, whatever its type, so positions stay
            # aligned with `texts`.
            entities = next(entities_iter, [])
            # Exact type match on purpose: instances of subclasses of these
            # types are passed through unchanged.
            if type(text) in (str, bool, int, float):
                res = self.deanonymize_engine.deanonymize(
                    text=str(text), entities=entities, operators=operators
                )
                return_list.append(res.text)
            else:
                return_list.append(text)

        return return_list

    def deanonymize_dict(
        self,
        anonymizer_results: Iterable[DictRecognizerResult],
        operators: Dict[str, OperatorConfig],
    ) -> Dict[str, Any]:
        """
        Deanonymize values in a dictionary.

        Each result is handled according to the type of its `value`:
        a dict is processed recursively, a string is deanonymized directly,
        any other iterable is processed with `deanonymize_list`, and every
        remaining value (for example a number) is returned unchanged.

        :param anonymizer_results: Iterator of `DictRecognizerResult`
            containing the output of batch anonymization on the input text.
        :param operators: Operators to define the deanonymization type.
        :return: A dictionary mapping each result's `key` to its
            deanonymized value.
        """
        return_dict = {}
        for result in anonymizer_results:
            value = result.value

            if isinstance(value, dict):
                # For nested dictionaries `recognizer_results` holds the
                # results of the nested keys, so recurse over those.
                return_dict[result.key] = self.deanonymize_dict(
                    anonymizer_results=result.recognizer_results,
                    operators=operators,
                )

            # Strings are iterable too, so this check must come before the
            # generic Iterable branch below.
            elif isinstance(value, str):
                resp = self.deanonymize_engine.deanonymize(
                    text=value,
                    entities=result.recognizer_results,
                    operators=operators,
                )
                return_dict[result.key] = resp.text

            elif isinstance(value, collections.abc.Iterable):
                return_dict[result.key] = self.deanonymize_list(
                    texts=value,
                    entities_list=result.recognizer_results,
                    operators=operators,
                )

            else:
                return_dict[result.key] = value

        return return_dict
184 changes: 184 additions & 0 deletions presidio-anonymizer/tests/test_batch_deanonymize_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# ruff: noqa: D101,D102,D103,I001

import pytest

from presidio_anonymizer import BatchDeanonymizeEngine
from presidio_anonymizer.deanonymize_engine import DeanonymizeEngine
from presidio_anonymizer.entities import (
DictRecognizerResult,
EngineResult,
OperatorConfig,
OperatorResult,
)
from presidio_anonymizer.operators import Decrypt


# Shared fixtures for the tests below.
# Input text whose last 44 characters are the encrypted value.
ENCRYPTED_TEXT = "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0="
# Result the tests expect after decrypting the span described by ENTITY.
DECRYPTED_TEXT = "My name is Chloë"
# Span covering the encrypted value: it starts right after the 11-character
# prefix "My name is " and ends at the end of ENCRYPTED_TEXT (index 55).
ENTITY = OperatorResult(start=11, end=55, entity_type="PERSON")
# Operator configuration passed to every deanonymize call in this module.
OPERATORS = {"DEFAULT": OperatorConfig(Decrypt.NAME, {"key": "WmZq4t7w!z%C&F)J"})}


@pytest.fixture(scope="module")
def engine():
    """Provide one default-configured batch engine shared by the module."""
    batch_engine = BatchDeanonymizeEngine()
    return batch_engine


def test_package_root_exports_batch_deanonymize_engine():
    """The name imported from the package root is the engine class itself."""
    from presidio_anonymizer import BatchDeanonymizeEngine as root_export

    assert root_export is BatchDeanonymizeEngine


def test_given_analyzer_result_we_deanonymize_list_correctly(engine):
    """Every encrypted text in the list is decrypted using its own spans."""
    batch_size = 2

    restored = engine.deanonymize_list(
        texts=[ENCRYPTED_TEXT] * batch_size,
        entities_list=[[ENTITY] for _ in range(batch_size)],
        operators=OPERATORS,
    )

    assert restored == [DECRYPTED_TEXT] * batch_size


def test_given_analyzer_result_we_deanonymize_dict_correctly(engine):
    """A string value in a flat dictionary is decrypted under its key."""
    name_field = DictRecognizerResult(
        key="name",
        value=ENCRYPTED_TEXT,
        recognizer_results=[ENTITY],
    )

    restored = engine.deanonymize_dict(
        anonymizer_results=[name_field],
        operators=OPERATORS,
    )

    assert restored == {"name": DECRYPTED_TEXT}


def test_given_nested_analyzer_result_we_deanonymize_dict_correctly(engine):
    """Nested dictionaries are walked down to the encrypted leaf value."""
    # Build the result tree from the innermost level outwards.
    name_leaf = DictRecognizerResult(
        key="name",
        value=ENCRYPTED_TEXT,
        recognizer_results=[ENTITY],
    )
    profile_node = DictRecognizerResult(
        key="profile",
        value={"name": ENCRYPTED_TEXT},
        recognizer_results=[name_leaf],
    )
    customer_node = DictRecognizerResult(
        key="customer",
        value={"profile": {"name": ENCRYPTED_TEXT}},
        recognizer_results=[profile_node],
    )

    restored = engine.deanonymize_dict(
        anonymizer_results=[customer_node],
        operators=OPERATORS,
    )

    expected = {"customer": {"profile": {"name": DECRYPTED_TEXT}}}
    assert restored == expected


def test_given_empty_entities_we_return_text_unchanged(engine):
    """With no spans supplied, the text comes back exactly as given."""
    untouched = [ENCRYPTED_TEXT]

    restored = engine.deanonymize_list(
        texts=[ENCRYPTED_TEXT],
        entities_list=[],
        operators=OPERATORS,
    )

    assert restored == untouched


def test_given_short_entities_list_we_keep_trailing_texts(engine):
    """Texts beyond the end of the spans list are kept, not dropped."""
    batch = [ENCRYPTED_TEXT, "plain trailing text"]
    spans_for_first_only = [[ENTITY]]

    restored = engine.deanonymize_list(
        texts=batch,
        entities_list=spans_for_first_only,
        operators=OPERATORS,
    )

    assert restored == [DECRYPTED_TEXT, "plain trailing text"]


def test_given_exact_type_list_values_we_route_through_injected_engine():
    """bool, int and float items reach the engine as strings; others do not."""
    recorder = RecordingDeanonymizeEngine()
    batch_engine = BatchDeanonymizeEngine(deanonymize_engine=recorder)
    opaque = object()

    restored = batch_engine.deanonymize_list(
        texts=[True, 7, 1.5, opaque],
        entities_list=[[] for _ in range(4)],
        operators=OPERATORS,
    )

    # The opaque object is returned as-is and never handed to the engine.
    expected_output = ["custom::True", "custom::7", "custom::1.5", opaque]
    expected_calls = [
        ("True", [], OPERATORS),
        ("7", [], OPERATORS),
        ("1.5", [], OPERATORS),
    ]
    assert restored == expected_output
    assert recorder.calls == expected_calls


def test_given_non_string_list_value_we_return_item_unchanged(engine):
    """Inside a list value, only the string item is decrypted."""
    items_field = DictRecognizerResult(
        key="items",
        value=[ENCRYPTED_TEXT, ["nested", 123], object()],
        recognizer_results=[[ENTITY], [], []],
    )

    restored = engine.deanonymize_dict(
        anonymizer_results=[items_field],
        operators=OPERATORS,
    )

    items = restored["items"]
    assert items[0] == DECRYPTED_TEXT
    assert items[1] == ["nested", 123]
    assert type(items[2]) is object


def test_given_scalar_dict_value_we_return_value_unchanged(engine):
    """A non-string, non-iterable dictionary value is returned as-is."""
    id_field = DictRecognizerResult(key="id", value=123, recognizer_results=[])

    restored = engine.deanonymize_dict(
        anonymizer_results=[id_field],
        operators=OPERATORS,
    )

    assert restored == {"id": 123}


class RecordingDeanonymizeEngine(DeanonymizeEngine):
    """Test double that logs each call and returns a recognisable result."""

    def __init__(self):
        super().__init__()
        # One (text, entities, operators) tuple per deanonymize() call.
        self.calls = []

    def deanonymize(self, text, entities, operators):
        """Record the arguments and return the text with a marker prefix."""
        call = (text, entities, operators)
        self.calls.append(call)
        marked_text = f"custom::{text}"
        return EngineResult(text=marked_text)


def test_given_custom_deanonymizer_we_use_injected_engine():
    """The batch engine delegates to the engine passed to its constructor."""
    recorder = RecordingDeanonymizeEngine()
    batch_engine = BatchDeanonymizeEngine(deanonymize_engine=recorder)

    restored = batch_engine.deanonymize_list(
        texts=[ENCRYPTED_TEXT],
        entities_list=[[ENTITY]],
        operators=OPERATORS,
    )

    expected_call = (ENCRYPTED_TEXT, [ENTITY], OPERATORS)
    assert restored == [f"custom::{ENCRYPTED_TEXT}"]
    assert recorder.calls == [expected_call]


def test_given_unsupported_kwargs_then_batch_api_rejects_them(engine):
    """Keyword arguments outside the signature raise a TypeError."""
    call_kwargs = {
        "texts": [ENCRYPTED_TEXT],
        "entities_list": [[ENTITY]],
        "operators": OPERATORS,
        "language": "en",  # not a parameter of deanonymize_list
    }

    with pytest.raises(TypeError, match="unexpected keyword argument"):
        engine.deanonymize_list(**call_kwargs)
28 changes: 26 additions & 2 deletions presidio-anonymizer/tests/test_readme.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
def test_readme():
"""Exercise the README anonymizer example."""
# Tests that the readme code snippet doesn't fail
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from presidio_anonymizer.entities import OperatorConfig, RecognizerResult

# Initialize the engine with logger.
engine = AnonymizerEngine()
Expand All @@ -22,8 +23,9 @@ def test_readme():


def test_readme_decrypt():
"""Exercise the README single-item deanonymize example."""
from presidio_anonymizer import DeanonymizeEngine
from presidio_anonymizer.entities import OperatorResult, OperatorConfig
from presidio_anonymizer.entities import OperatorConfig, OperatorResult

# Initialize the engine with logger.
engine = DeanonymizeEngine()
Expand All @@ -39,3 +41,25 @@ def test_readme_decrypt():
)

print(result)


def test_readme_batch_decrypt():
    """Exercise the README batch deanonymize example."""
    from presidio_anonymizer import BatchDeanonymizeEngine
    from presidio_anonymizer.entities import OperatorConfig, OperatorResult

    decrypt_config = OperatorConfig("decrypt", {"key": "WmZq4t7w!z%C&F)J"})
    batch_engine = BatchDeanonymizeEngine()

    restored = batch_engine.deanonymize_list(
        texts=[
            "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=",
            "My name is S184CMt9Drj7QaKQ21JTrpYzghnboTF9pn/neN8JME0=",
        ],
        entities_list=[
            [OperatorResult(start=11, end=55, entity_type="PERSON")],
            [OperatorResult(start=11, end=55, entity_type="PERSON")],
        ],
        operators={"DEFAULT": decrypt_config},
    )

    expected = ["My name is Chloë", "My name is Chloë"]
    assert restored == expected