Skip to content

Commit 7029893

Browse files
committed
Align entity types with model labels
1 parent 286d9a1 commit 7029893

6 files changed

Lines changed: 23 additions & 33 deletions

File tree

src/EntityFactory.php

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,13 @@ public function fromDecodedOutput(array $entities, string $sourceText): array
4141
/**
4242
* Create entities from invalid JSON output prefixes.
4343
*
44-
* The upstream CLI includes raw entity text in the JSON response without
45-
* escaping it first. This fallback uses the byte offsets and ignores the
46-
* echoed text field so quoted entity text can still be handled safely.
47-
*
4844
* @return array<int, Entity>
4945
*/
5046
public function fromOutputPrefixes(string $output, string $sourceText): array
5147
{
48+
// The upstream CLI prints entity text directly into a JSON-like response
49+
// without escaping it first. Read only the fields before the text value,
50+
// then rehydrate entity text from the original byte offsets.
5251
preg_match_all(
5352
'/\{\s*"entity_group"\s*:\s*"(?P<type>[^"]+)"\s*,\s*"start"\s*:\s*(?P<start>-?\d+)\s*,\s*"end"\s*:\s*(?P<end>-?\d+)\s*,\s*"score"\s*:\s*(?P<score>-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s*,\s*"text"\s*:\s*"/',
5453
$output,

src/EntityType.php

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,12 @@
77
*/
88
enum EntityType: string
99
{
10-
case Person = 'person';
11-
case Email = 'email';
12-
case Phone = 'phone';
13-
case Address = 'address';
14-
case CreditCard = 'credit_card';
15-
case Date = 'date';
16-
case IpAddress = 'ip_address';
17-
case Url = 'url';
18-
19-
case PrivatePerson = 'private_person';
20-
case PrivateEmail = 'private_email';
21-
case PrivatePhone = 'private_phone';
10+
case AccountNumber = 'account_number';
2211
case PrivateAddress = 'private_address';
23-
case PrivateCreditCard = 'private_credit_card';
2412
case PrivateDate = 'private_date';
25-
case PrivateIpAddress = 'private_ip_address';
13+
case PrivateEmail = 'private_email';
14+
case PrivatePerson = 'private_person';
15+
case PrivatePhone = 'private_phone';
2616
case PrivateUrl = 'private_url';
17+
case Secret = 'secret';
2718
}

tests/Feature/ClassifierTest.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
expect($entities)->toHaveCount(1)
1616
->and($entities[0])->toBeInstanceOf(Entity::class)
17-
->and($entities[0]->type)->toBe('email')
17+
->and($entities[0]->type)->toBe('private_email')
1818
->and($entities[0]->text)->toBe('jdoe@example.com')
1919
->and($entities[0]->start)->toBe(20)
2020
->and($entities[0]->end)->toBe(36)
@@ -26,7 +26,7 @@
2626
$this->setFakePrivacyFilterEnvironment([
2727
'PRIVACY_FILTER_FAKE_MODE' => 'raw-text',
2828
'PRIVACY_FILTER_FAKE_NEEDLE' => 'John "JD" Doe',
29-
'PRIVACY_FILTER_FAKE_TYPE' => 'person',
29+
'PRIVACY_FILTER_FAKE_TYPE' => 'private_person',
3030
]);
3131

3232
$entities = (new Classifier(
@@ -36,7 +36,7 @@
3636
))->entities('Contact John "JD" Doe today.');
3737

3838
expect($entities)->toHaveCount(1)
39-
->and($entities[0]->type)->toBe('person')
39+
->and($entities[0]->type)->toBe('private_person')
4040
->and($entities[0]->text)->toBe('John "JD" Doe');
4141
});
4242

tests/Fixtures/privacy-filter

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
$text = stream_get_contents(STDIN);
55
$needle = getenv('PRIVACY_FILTER_FAKE_NEEDLE') ?: 'jdoe@example.com';
6-
$type = getenv('PRIVACY_FILTER_FAKE_TYPE') ?: 'email';
6+
$type = getenv('PRIVACY_FILTER_FAKE_TYPE') ?: 'private_email';
77
$mode = getenv('PRIVACY_FILTER_FAKE_MODE') ?: 'json';
88
$start = strpos($text, $needle);
99

tests/Unit/EntityFactoryTest.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
it('creates entities from decoded output', function () {
77
$entities = (new EntityFactory)->fromDecodedOutput([
88
[
9-
'entity_group' => 'email',
9+
'entity_group' => 'private_email',
1010
'start' => 20,
1111
'end' => 36,
1212
'score' => 0.9876,
@@ -15,18 +15,18 @@
1515
], 'Contact John Doe at jdoe@example.com.');
1616

1717
expect($entities)->toHaveCount(1)
18-
->and($entities[0]->type)->toBe('email')
18+
->and($entities[0]->type)->toBe('private_email')
1919
->and($entities[0]->text)->toBe('jdoe@example.com');
2020
});
2121

2222
it('creates entities from output prefixes', function () {
2323
$entities = (new EntityFactory)->fromOutputPrefixes(
24-
"[\n {\"entity_group\": \"person\", \"start\": 8, \"end\": 21, \"score\": 0.9876, \"text\": \"John \"JD\" Doe\"}\n]\n",
24+
"[\n {\"entity_group\": \"private_person\", \"start\": 8, \"end\": 21, \"score\": 0.9876, \"text\": \"John \"JD\" Doe\"}\n]\n",
2525
'Contact John "JD" Doe today.',
2626
);
2727

2828
expect($entities)->toHaveCount(1)
29-
->and($entities[0]->type)->toBe('person')
29+
->and($entities[0]->type)->toBe('private_person')
3030
->and($entities[0]->text)->toBe('John "JD" Doe');
3131
});
3232

tests/Unit/EntityTest.php

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66

77
it('creates an entity from a decoded payload', function () {
88
$entity = Entity::from([
9-
'entity_group' => 'email',
9+
'entity_group' => 'private_email',
1010
'start' => 20,
1111
'end' => 36,
1212
'score' => 0.9876,
1313
'text' => 'ignored@example.com',
1414
], 'Contact John Doe at jdoe@example.com.');
1515

16-
expect($entity->type)->toBe('email')
16+
expect($entity->type)->toBe('private_email')
1717
->and($entity->text)->toBe('jdoe@example.com')
1818
->and($entity->start)->toBe(20)
1919
->and($entity->end)->toBe(36)
@@ -22,19 +22,19 @@
2222

2323
it('creates an entity from alternate type keys', function () {
2424
$entity = Entity::from([
25-
'label' => 'person',
25+
'label' => 'private_person',
2626
'start' => 8,
2727
'end' => 16,
2828
'score' => 0.9876,
2929
], 'Contact John Doe today.');
3030

31-
expect($entity->type)->toBe('person')
31+
expect($entity->type)->toBe('private_person')
3232
->and($entity->text)->toBe('John Doe');
3333
});
3434

3535
it('exposes entity details as arrays and json', function () {
3636
$entity = new Entity(
37-
type: 'email',
37+
type: 'private_email',
3838
start: 20,
3939
end: 36,
4040
score: 0.9876,
@@ -43,7 +43,7 @@
4343

4444
expect($entity->length())->toBe(16)
4545
->and($entity->toArray())->toBe([
46-
'type' => 'email',
46+
'type' => 'private_email',
4747
'start' => 20,
4848
'end' => 36,
4949
'score' => 0.9876,
@@ -78,7 +78,7 @@
7878

7979
it('throws an exception when the decoded payload is invalid', function () {
8080
Entity::from([
81-
'entity_group' => 'email',
81+
'entity_group' => 'private_email',
8282
'start' => 20,
8383
'end' => 200,
8484
'score' => 0.9876,

0 commit comments

Comments
 (0)