diff --git a/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql new file mode 100644 index 000000000000..c832c3279fbe --- /dev/null +++ b/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11212.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql new file mode 100644 index 000000000000..c832c3279fbe --- /dev/null +++ b/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11212.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql index 6b01929401aa..274d95f1e0ac 100644 --- a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql @@ -29,3 +29,8 @@ CREATE INDEX worksheet_entity_name_index ON worksheet_entity (name); -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), -- which exceeds MySQL's 3072-byte index key limit (utf8mb4), and the table is small -- enough that the reindex cursor sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. 
"code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql index 27aef87ddc27..7ba0e1f78e86 100644 --- a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql @@ -30,3 +30,8 @@ CREATE INDEX IF NOT EXISTS worksheet_entity_name_index ON worksheet_entity (name -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), too -- wide to fit a btree index row, and the table is small enough that the reindex cursor -- sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. 
diff --git a/ingestion/tests/integration/auto_classification/databases/conftest.py b/ingestion/tests/integration/auto_classification/databases/conftest.py index a93267d6824c..18cfdb05092f 100644 --- a/ingestion/tests/integration/auto_classification/databases/conftest.py +++ b/ingestion/tests/integration/auto_classification/databases/conftest.py @@ -3,29 +3,7 @@ import pytest from testcontainers.postgres import PostgresContainer -from _openmetadata_testutils.factories.metadata.generated.schema.api.classification.create_classification import ( - CreateClassificationRequestFactory, -) -from _openmetadata_testutils.factories.metadata.generated.schema.api.classification.create_tag import ( - CreateTagRequestFactory, -) -from _openmetadata_testutils.factories.metadata.generated.schema.type.recognizer import ( - RecognizerFactory, -) from _openmetadata_testutils.helpers.docker import try_bind -from metadata.generated.schema.api.classification.createClassification import ( - CreateClassificationRequest, -) -from metadata.generated.schema.api.classification.createTag import CreateTagRequest -from metadata.generated.schema.entity.classification.classification import ( - Classification, - ConflictResolution, -) -from metadata.generated.schema.entity.classification.tag import Tag -from metadata.generated.schema.type.piiEntity import PIIEntity -from metadata.generated.schema.type.predefinedRecognizer import Name -from metadata.generated.schema.type.recognizer import Recognizer -from metadata.ingestion.ometa.ometa_api import OpenMetadata @pytest.fixture(scope="module") @@ -38,486 +16,3 @@ def postgres_container(): with try_bind(container, 5432, 5432) if not os.getenv("CI") else container as container: yield container - - -@pytest.fixture(scope="session") -def credit_card_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="credit_card_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.CreditCardRecognizer, - ) - - 
-@pytest.fixture(scope="session") -def aba_routing_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="aba_routing_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AbaRoutingRecognizer, - ) - - -@pytest.fixture(scope="session") -def crypto_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="crypto_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.CryptoRecognizer, - ) - - -@pytest.fixture(scope="session") -def date_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="date_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.DateRecognizer, - ) - - -@pytest.fixture(scope="session") -def email_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="email_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EmailRecognizer, - ) - - -@pytest.fixture(scope="session") -def iban_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="iban_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.IbanRecognizer, - ) - - -@pytest.fixture(scope="session") -def ip_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="ip_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.IpRecognizer, - ) - - -@pytest.fixture(scope="session") -def nhs_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="nhs_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.NhsRecognizer, - ) - - -@pytest.fixture(scope="session") -def medical_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="medical_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.MedicalLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def phone_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="phone_recognizer", - 
recognizerConfig__type="predefined", - recognizerConfig__name=Name.PhoneRecognizer, - ) - - -@pytest.fixture(scope="session") -def sg_fin_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="sg_fin_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SgFinRecognizer, - ) - - -@pytest.fixture(scope="session") -def url_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="url_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UrlRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_bank_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_bank_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsBankRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_itin_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_itin_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsItinRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_ssn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_ssn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsSsnRecognizer, - ) - - -@pytest.fixture(scope="session") -def es_nif_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="es_nif_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EsNifRecognizer, - ) - - -@pytest.fixture(scope="session") 
-def pii_spacy_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="spacy_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SpacyRecognizer, - recognizerConfig__supportedEntities=[ - PIIEntity.PERSON, - ], - ) - - -@pytest.fixture(scope="session") -def non_pii_spacy_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="spacy_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SpacyRecognizer, - recognizerConfig__supportedEntities=[ - PIIEntity.LOCATION, - PIIEntity.DATE_TIME, - ], - ) - - -@pytest.fixture(scope="session") -def au_abn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_abn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuAbnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_acn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_acn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuAcnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_tfn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_tfn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuTfnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_medicare_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_medicare_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuMedicareRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_driver_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_driver_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItDriverLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_fiscal_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_fiscal_code_recognizer", - recognizerConfig__type="predefined", - 
recognizerConfig__name=Name.ItFiscalCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_vat_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_vat_code_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItVatCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_identity_card_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_identity_card_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItIdentityCardRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_pan_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_pan_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InPanRecognizer, - ) - - -@pytest.fixture(scope="session") -def pl_pesel_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="pl_pesel_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.PlPeselRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_aadhaar_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_aadhaar_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InAadhaarRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_vehicle_registration_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_vehicle_registration_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InVehicleRegistrationRecognizer, - ) - - -@pytest.fixture(scope="session") -def sg_uen_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="sg_uen_recognizer", - recognizerConfig__type="predefined", - 
recognizerConfig__name=Name.SgUenRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_voter_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_voter_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InVoterRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def fi_personal_identity_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="fi_personal_identity_code_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.FiPersonalIdentityCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def es_nie_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="es_nie_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EsNieRecognizer, - ) - - -@pytest.fixture(scope="session") -def uk_nino_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="uk_nino_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UkNinoRecognizer, - ) - - -@pytest.fixture(scope="session") -def person_column_name_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="person_column_name_recognizer", - recognizerConfig__type="pattern", - recognizerConfig__patterns__0__regex=r"^.*(user|client|person|first|last|maiden|nick).*(name).*$", - for_column_name=True, - ) - - -@pytest.fixture(scope="session") -def pii_classification(metadata: OpenMetadata[Classification, CreateClassificationRequest]) -> Classification: - create_classification_request = CreateClassificationRequestFactory.create( - fqn="PII", - autoClassificationConfig__conflictResolution=ConflictResolution.highest_priority.value, - ) - entity = 
metadata.create_or_update(create_classification_request) - - return entity # noqa: RET504 - - -@pytest.fixture(scope="session") -def sensitive_pii_tag( - metadata: OpenMetadata[Tag, CreateTagRequest], - pii_classification: Classification, - credit_card_recognizer: Recognizer, - aba_routing_recognizer: Recognizer, - crypto_recognizer: Recognizer, - email_recognizer: Recognizer, - iban_recognizer: Recognizer, - nhs_recognizer: Recognizer, - medical_license_recognizer: Recognizer, - sg_fin_recognizer: Recognizer, - us_bank_recognizer: Recognizer, - us_itin_recognizer: Recognizer, - us_license_recognizer: Recognizer, - us_passport_recognizer: Recognizer, - us_ssn_recognizer: Recognizer, - es_nif_recognizer: Recognizer, - pii_spacy_recognizer: Recognizer, - au_abn_recognizer: Recognizer, - au_acn_recognizer: Recognizer, - au_tfn_recognizer: Recognizer, - au_medicare_recognizer: Recognizer, - it_driver_license_recognizer: Recognizer, - it_fiscal_code_recognizer: Recognizer, - it_vat_code_recognizer: Recognizer, - it_identity_card_recognizer: Recognizer, - it_passport_recognizer: Recognizer, - in_pan_recognizer: Recognizer, - pl_pesel_recognizer: Recognizer, - in_aadhaar_recognizer: Recognizer, - sg_uen_recognizer: Recognizer, - in_voter_recognizer: Recognizer, - in_passport_recognizer: Recognizer, - fi_personal_identity_code_recognizer: Recognizer, - es_nie_recognizer: Recognizer, - uk_nino_recognizer: Recognizer, - person_column_name_recognizer: Recognizer, -) -> Tag: - create_tag_request: CreateTagRequest = CreateTagRequestFactory.create( - tag_name="Sensitive", - tag_classification=pii_classification.fullyQualifiedName.root, - autoClassificationPriority=100, - recognizers=[ - credit_card_recognizer, - aba_routing_recognizer, - crypto_recognizer, - email_recognizer, - iban_recognizer, - nhs_recognizer, - medical_license_recognizer, - sg_fin_recognizer, - us_bank_recognizer, - us_itin_recognizer, - us_license_recognizer, - us_passport_recognizer, - us_ssn_recognizer, - 
es_nif_recognizer, - pii_spacy_recognizer, - au_abn_recognizer, - au_acn_recognizer, - au_tfn_recognizer, - au_medicare_recognizer, - it_driver_license_recognizer, - it_fiscal_code_recognizer, - it_vat_code_recognizer, - it_identity_card_recognizer, - it_passport_recognizer, - in_pan_recognizer, - pl_pesel_recognizer, - in_aadhaar_recognizer, - sg_uen_recognizer, - in_voter_recognizer, - in_passport_recognizer, - fi_personal_identity_code_recognizer, - es_nie_recognizer, - uk_nino_recognizer, - person_column_name_recognizer, - ], - ) - return metadata.create_or_update(create_tag_request) - - -@pytest.fixture(scope="session") -def non_sensitive_pii_tag( - metadata: OpenMetadata[Tag, CreateTagRequest], - pii_classification: Classification, - date_recognizer: Recognizer, - phone_recognizer: Recognizer, - non_pii_spacy_recognizer: Recognizer, -) -> Tag: - create_tag_request: CreateTagRequest = CreateTagRequestFactory.create( - tag_name="NonSensitive", - tag_classification=pii_classification.fullyQualifiedName.root, - autoClassificationPriority=80, - recognizers=[ - date_recognizer, - phone_recognizer, - non_pii_spacy_recognizer, - ], - ) - return metadata.create_or_update(create_tag_request) diff --git a/ingestion/tests/integration/auto_classification/databases/init.sql b/ingestion/tests/integration/auto_classification/databases/init.sql index 3ce571c34f4b..31f8bebb10f7 100644 --- a/ingestion/tests/integration/auto_classification/databases/init.sql +++ b/ingestion/tests/integration/auto_classification/databases/init.sql @@ -10,17 +10,18 @@ CREATE TABLE example_table ( DWH_X20 VARCHAR(255), timestamp BIGINT, version VARCHAR(50), - order_date DATE + order_date DATE, + academic_year_code INTEGER ); -- Insert sample data -INSERT INTO example_table (NHS_number, DWH_X10, user_name, address, DWH_X20, timestamp, version, order_date) +INSERT INTO example_table (NHS_number, DWH_X10, user_name, address, DWH_X20, timestamp, version, order_date, academic_year_code) VALUES - 
('999-064-3601', 'harsha@gmail.com', 'Harsha', '2240 W Ina Rd', '4242-4242-4242-4242', 1760000000123, 'v1', '2018-01-05'), - ('999-468-5678', 'suresh@gmail.com', 'Suresh', '7192 Kalanianaole Hwy', '5555-5555-5555-4444', 1760000000131, 'v1.0', '2018-01-09'), - ('999-813-4595', 'stelle@gmail.com', 'Stelle', '5900 N Cannon Ave', '4000-0566-5566-5556', 1760000000149, 'v1.1', '2018-01-12'), - ('999-313-2993', 'peter@gmail.com', 'Peter', '4350 Main St', '2223-0031-2200-3222', 1760000000156, 'v2', '2018-01-22'), - ('999-911-7562', 'teddy@gmail.com', 'Theodore', '903 W Main St', '5200-8282-8282-8210', 1760000000164, 'v3', '2018-01-26'), - ('999-595-6195', 'akash@gmail.com', 'Akash', '2220 Coit Rd', '5105-1051-0510-5100', 1760000000172, 'v1', '2018-01-28'), - ('999-056-4418', 'mary@gmail.com', 'Mary', '7 Southside Dr', '5328-7101-2269-1668', 1760000000180, 'V1', '2018-01-29'), - ('999-329-1099', 'chirag@gmail.com', 'Chirag', '2929 S 25th Ave', '4801-8451-4627-0484', 1760000000198, 'v4', '2018-01-31'); + ('999-064-3601', 'harsha@gmail.com', 'Harsha', '2240 W Ina Rd', '4242-4242-4242-4242', 1760000000123, 'v1', '2018-01-05', 1999), + ('999-468-5678', 'suresh@gmail.com', 'Suresh', '7192 Kalanianaole Hwy', '5555-5555-5555-4444', 1760000000131, 'v1.0', '2018-01-09', 2000), + ('999-813-4595', 'stelle@gmail.com', 'Stelle', '5900 N Cannon Ave', '4000-0566-5566-5556', 1760000000149, 'v1.1', '2018-01-12', 2001), + ('999-313-2993', 'peter@gmail.com', 'Peter', '4350 Main St', '2223-0031-2200-3222', 1760000000156, 'v2', '2018-01-22', 2002), + ('999-911-7562', 'teddy@gmail.com', 'Theodore', '903 W Main St', '5200-8282-8282-8210', 1760000000164, 'v3', '2018-01-26', 2003), + ('999-595-6195', 'akash@gmail.com', 'Akash', '2220 Coit Rd', '5105-1051-0510-5100', 1760000000172, 'v1', '2018-01-28', 2004), + ('999-056-4418', 'mary@gmail.com', 'Mary', '7 Southside Dr', '5328-7101-2269-1668', 1760000000180, 'V1', '2018-01-29', 2005), + ('999-329-1099', 'chirag@gmail.com', 'Chirag', '2929 S 25th 
Ave', '4801-8451-4627-0484', 1760000000198, 'v4', '2018-01-31', 2006); diff --git a/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py b/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py index a0a68413318c..27efa849af7e 100644 --- a/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py +++ b/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py @@ -17,10 +17,6 @@ ProfilerConfiguration, SampleDataIngestionConfig, ) -from metadata.generated.schema.entity.classification.classification import ( - Classification, -) -from metadata.generated.schema.entity.classification.tag import Tag from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, @@ -176,9 +172,6 @@ def _set_global_profiler_config(metadata: OpenMetadata, store: bool): def test_store_sample_data_when_global_config_enabled( db_service: DatabaseService, metadata: OpenMetadata, - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, load_metadata: MetadataWorkflow, run_workflow, autoclassification_config, @@ -201,9 +194,6 @@ def test_store_sample_data_when_global_config_enabled( def test_no_sample_data_when_global_config_disabled( db_service: DatabaseService, metadata: OpenMetadata, - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, load_metadata: MetadataWorkflow, run_workflow, autoclassification_config, diff --git a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py index ee700c9218aa..f66901115ec4 100644 --- a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py +++ 
b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py @@ -7,10 +7,6 @@ from metadata.generated.schema.api.services.createDatabaseService import ( CreateDatabaseServiceRequest, ) -from metadata.generated.schema.entity.classification.classification import ( - Classification, -) -from metadata.generated.schema.entity.classification.tag import Tag from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, ) @@ -118,9 +114,6 @@ def autoclassification_config(db_service, bot_workflow_config, sink_config): @pytest.fixture(scope="module") def run_autoclassification( - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, run_workflow, load_metadata: MetadataWorkflow, autoclassification_config, @@ -134,6 +127,7 @@ def test_it_returns_the_expected_classifications( run_autoclassification: AutoClassificationWorkflow, ) -> None: ( + academic_year_code_column, address_column, customer_id_column, dwh_x10_column, @@ -187,3 +181,12 @@ def test_it_returns_the_expected_classifications( reason=Contains("Detected by `ValidatedDateRecognizer`", "Patterns matched:"), ), ] + # SpacyRecognizer's DATE_TIME entity flags 4-digit year-like integers + # regardless of column type or semantics. Tracked separately: #29083. 
+ assert academic_year_code_column.tags == [ + IsInstance(TagLabel) + & HasAttributes( + tagFQN=HasAttributes(root="PII.NonSensitive"), + reason=Contains("Detected by `SpacyRecognizer`"), + ), + ] diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java new file mode 100644 index 000000000000..9d87c50b01ae --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.mysql.v11212; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v11212.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v11212: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java new file mode 100644 index 000000000000..490c2376dd45 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.mysql.v1131; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import 
org.openmetadata.service.migration.utils.v1131.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v1131: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java new file mode 100644 index 000000000000..f1df0a134b64 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.postgres.v11212; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v11212.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v11212: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java new file mode 100644 index 000000000000..4e0da26700a7 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java @@ -0,0 +1,25 @@ +package 
org.openmetadata.service.migration.postgres.v1131; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v1131.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v1131: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java new file mode 100644 index 000000000000..3986a2fee93a --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java @@ -0,0 +1,184 @@ +package org.openmetadata.service.migration.utils; + +import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import org.jdbi.v3.core.Handle; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.resources.databases.DatasourceConfig; +import org.openmetadata.service.util.FullyQualifiedName; + +@Slf4j +public class PiiRecognizerMigrationUtil { + private PiiRecognizerMigrationUtil() {} + + private static final String FQN_HASH_COLUMN = "fqnHash"; + private static final String JSON_COLUMN = "json"; + private static final String RECOGNIZERS_FIELD = 
"recognizers"; + private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; + private static final String CONTEXT_FIELD = "context"; + private static final String NAME_FIELD = "name"; + private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; + + private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; + private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; + + private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; + private static final String UPDATE_POSTGRES = + "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG_POSTGRES = + "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; + + private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; + private static final String PERSON_ENTITY = "PERSON"; + + /** + * Context keywords that are too generic for their respective recognizers and cause false-positive + * PII classification (e.g. ACADEMIC_YEAR_CODE being tagged as CVV because "code" is in context). 
+ */ + private static final Map<String, Set<String>> BROAD_KEYWORDS_TO_REMOVE = + Map.of( + "CvvRecognizer", Set.of("code", "security", "verification", "card"), + "UsBankRecognizer", Set.of("check", "save"), + "UsSsnRecognizer", Set.of("social", "security", "id_number"), + "CryptoRecognizer", Set.of("address"), + "PhoneRecognizer", Set.of("call")); + + private static final Set<String> SPACY_PERSON_BROAD_KEYWORDS = Set.of("name"); + + public static void removeBroadPiiContextKeywords(Handle handle, String version) { + LOG.info("{}: removing overly broad context keywords from PII recognizers", version); + boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); + migrateTag(handle, PII_SENSITIVE_FQN, isMySQL, version); + migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL, version); + LOG.info("{}: PII recognizer context keyword cleanup complete", version); + } + + private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL, String version) { + String fqnHash = FullyQualifiedName.buildHash(tagFqn); + String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES; + List<Map<String, Object>> rows = + handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); + if (nullOrEmpty(rows)) { + LOG.warn("{}: tag '{}' not found, skipping PII recognizer keyword cleanup", version, tagFqn); + return; + } + Object jsonValue = rows.getFirst().get(JSON_COLUMN); + if (jsonValue == null) { + LOG.warn( + "{}: tag '{}' has null json, skipping PII recognizer keyword cleanup", version, tagFqn); + return; + } + String jsonStr = jsonValue.toString(); + ObjectNode root; + try { + root = (ObjectNode) JsonUtils.readTree(jsonStr); + } catch (Exception e) { + LOG.warn("{}: failed to parse tag '{}' JSON, skipping: {}", version, tagFqn, e.getMessage()); + return; + } + boolean modified = processRecognizers(root); + if (modified) { + String updateSql = isMySQL ?
UPDATE_MYSQL : UPDATE_POSTGRES; + handle + .createUpdate(updateSql) + .bind(JSON_COLUMN, root.toString()) + .bind(FQN_HASH_COLUMN, fqnHash) + .execute(); + LOG.info("{}: updated PII recognizer context keywords for tag '{}'", version, tagFqn); + } else { + LOG.info("{}: no broad PII context keywords found in tag '{}'", version, tagFqn); + } + } + + private static boolean processRecognizers(ObjectNode root) { + JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); + if (recognizersNode == null || !recognizersNode.isArray()) { + return false; + } + boolean modified = false; + for (JsonNode recognizerNode : recognizersNode) { + if (recognizerNode instanceof ObjectNode recognizer) { + modified |= processRecognizer(recognizer); + } + } + return modified; + } + + private static boolean processRecognizer(ObjectNode recognizer) { + JsonNode nameNode = recognizer.get(NAME_FIELD); + if (nameNode == null) { + return false; + } + String recognizerName = nameNode.asText(); + JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); + if (!(configNode instanceof ObjectNode config)) { + return false; + } + boolean modified = removeFromBroadKeywordsMap(recognizerName, config); + modified |= removeSpacyPersonBroadKeywords(recognizerName, config); + return modified; + } + + private static boolean removeFromBroadKeywordsMap(String recognizerName, ObjectNode config) { + Set toRemove = BROAD_KEYWORDS_TO_REMOVE.get(recognizerName); + if (toRemove == null) { + return false; + } + return removeKeywordsFromContext(config, toRemove, recognizerName); + } + + private static boolean removeSpacyPersonBroadKeywords(String recognizerName, ObjectNode config) { + if (!SPACY_RECOGNIZER.equals(recognizerName)) { + return false; + } + JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); + if (!isPersonRecognizer(entitiesNode)) { + return false; + } + return removeKeywordsFromContext(config, SPACY_PERSON_BROAD_KEYWORDS, recognizerName); + } + + private static boolean 
isPersonRecognizer(JsonNode entitiesNode) { + if (entitiesNode == null || !entitiesNode.isArray()) { + return false; + } + boolean found = false; + for (JsonNode entity : entitiesNode) { + if (PERSON_ENTITY.equals(entity.asText())) { + found = true; + } + } + return found; + } + + private static boolean removeKeywordsFromContext( + ObjectNode config, Set toRemove, String recognizerName) { + JsonNode contextNode = config.get(CONTEXT_FIELD); + if (contextNode == null || !contextNode.isArray()) { + return false; + } + ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); + boolean removed = false; + for (JsonNode keyword : contextNode) { + String kw = keyword.asText(); + if (toRemove.contains(kw)) { + LOG.info("Removing broad keyword '{}' from {} context", kw, recognizerName); + removed = true; + } else { + newContext.add(keyword); + } + } + if (removed) { + config.set(CONTEXT_FIELD, newContext); + } + return removed; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java new file mode 100644 index 000000000000..c10a5ccf3408 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java @@ -0,0 +1,12 @@ +package org.openmetadata.service.migration.utils.v11212; + +import org.jdbi.v3.core.Handle; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; + +public class MigrationUtil { + private MigrationUtil() {} + + public static void removeBroadPiiContextKeywords(Handle handle) { + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v11212"); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java new file mode 100644 index 
000000000000..280133d2c412 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java @@ -0,0 +1,12 @@ +package org.openmetadata.service.migration.utils.v1131; + +import org.jdbi.v3.core.Handle; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; + +public class MigrationUtil { + private MigrationUtil() {} + + public static void removeBroadPiiContextKeywords(Handle handle) { + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v1131"); + } +} diff --git a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json index 825b11101d2a..e18ab08e0a2a 100644 --- a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json +++ b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json @@ -51,8 +51,7 @@ "telephone", "cell", "cellphone", - "mobile", - "call" + "mobile" ] }, "confidenceThreshold": 0.6, @@ -216,850 +215,837 @@ "description": "PII which if lost, compromised, or disclosed without authorization, could result in substantial harm, embarrassment, inconvenience, or unfairness to an individual.", "autoClassificationEnabled": true, "autoClassificationPriority": 100, - "recognizers": - [ - { - "name": "EnglishCreditCardRecognizer", - "displayName": "English Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "en", - "context": [ - "credit", - "card", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment", - "cc_number", - "card_number", - "payment_info" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "recognizers": [ + { + "name": 
"EnglishCreditCardRecognizer", + "displayName": "English Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "en", + "context": [ + "credit", + "card", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment", + "cc_number", + "card_number", + "payment_info" + ] }, - { - "name": "SpanishCreditCardRecognizer", - "displayName": "Spanish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "es", - "context": [ - "tarjeta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpanishCreditCardRecognizer", + "displayName": "Spanish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "es", + "context": [ + "tarjeta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment" + ] }, - { - "name": "ItalianCreditCardRecognizer", - "displayName": "Italian Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "it", - "context": [ - "carta", - "credito", - "visa", 
- "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItalianCreditCardRecognizer", + "displayName": "Italian Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "it", + "context": [ + "carta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "PolishCreditCardRecognizer", - "displayName": "Polish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "pl", - "context": [ - "karta", - "kredytowa", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PolishCreditCardRecognizer", + "displayName": "Polish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "pl", + "context": [ + "karta", + "kredytowa", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "CvvRecognizer", - "displayName": "CVV Recognizer", - "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "patterns": [ - { - "name": 
"cvv_pattern", - "regex": "\\b\\d{3,4}\\b", - "score": 0.5 - } - ], - "context": [ - "cvv", - "cvc", - "security", - "code", - "verification", - "card", - "cvv2", - "cid", - "csc" - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "supportedLanguage": "en" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CvvRecognizer", + "displayName": "CVV Recognizer", + "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "patterns": [ + { + "name": "cvv_pattern", + "regex": "\\b\\d{3,4}\\b", + "score": 0.5 + } + ], + "context": [ + "cvv", + "cvc", + "cvv2", + "cid", + "csc" + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsBankRecognizer", + "displayName": "Us Bank Recognizer", + "description": "Recognizes US bank number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsBankRecognizer", - "displayName": "Us Bank Recognizer", - "description": "Recognizes US bank number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsBankRecognizer", - "supportedLanguage": "en", - "context": [ - "check", - "account", - "acct", - "bank", - "save", - "debit", - "bank_account", - "bank", - "account_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "account", + "acct", + "bank", + "debit", + "bank_account", + "account_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsLicenseRecognizer", + "displayName": "Us License Recognizer", + "description": "Recognizes US driver license using 
regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsLicenseRecognizer", - "displayName": "Us License Recognizer", - "description": "Recognizes US driver license using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsLicenseRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsItinRecognizer", + "displayName": "Us Itin Recognizer", + "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsItinRecognizer", - "displayName": "Us Itin Recognizer", - "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsItinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsPassportRecognizer", + "displayName": "Us Passport Recognizer", + "description": "Recognizes US Passport number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsPassportRecognizer", - "displayName": "Us Passport Recognizer", - "description": "Recognizes US Passport number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsSsnRecognizer", + "displayName": 
"Us Ssn Recognizer", + "description": "Recognize US Social Security Number (SSN) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsSsnRecognizer", - "displayName": "Us Ssn Recognizer", - "description": "Recognize US Social Security Number (SSN) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsSsnRecognizer", - "supportedLanguage": "en", - "context": [ - "social", - "security", - "ssn", - "ssns", - "ssid", - "national_id", - "id_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "ssn", + "ssns", + "ssid", + "national_id" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "NhsRecognizer", + "displayName": "Nhs Recognizer", + "description": "Recognizes NHS number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "NhsRecognizer", - "displayName": "Nhs Recognizer", - "description": "Recognizes NHS number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "NhsRecognizer", - "supportedLanguage": "en", - "context": [ - "nhs", - "national_health_service", - "nhs_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "nhs", + "national_health_service", + "nhs_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UkNinoRecognizer", + "displayName": "Uk Nino Recognizer", + "description": "Recognizes UK National Insurance Number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UkNinoRecognizer", - "displayName": "Uk Nino Recognizer", - "description": "Recognizes UK National Insurance Number using regex.", - "enabled": true, 
- "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UkNinoRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgFinRecognizer", + "displayName": "Sg Fin Recognizer", + "description": "Recognize SG FIN/NRIC number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "SgFinRecognizer", - "displayName": "Sg Fin Recognizer", - "description": "Recognize SG FIN/NRIC number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgFinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAbnRecognizer", + "displayName": "Au Abn Recognizer", + "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAbnRecognizer", - "displayName": "Au Abn Recognizer", - "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAbnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAcnRecognizer", + "displayName": "Au Acn Recognizer", + "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAcnRecognizer", - "displayName": "Au Acn Recognizer", - "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAcnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuTfnRecognizer", + "displayName": "Au Tfn Recognizer", + "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuTfnRecognizer", - "displayName": "Au Tfn Recognizer", - "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuTfnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuMedicareRecognizer", + "displayName": "Au Medicare Recognizer", + "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

The Medicare number is a unique identifier issued by the Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuMedicareRecognizer", - "displayName": "Au Medicare Recognizer", - "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuMedicareRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPanRecognizer", + "displayName": "In Pan Recognizer", + "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPanRecognizer", - "displayName": "In Pan Recognizer", - "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPanRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InAadhaarRecognizer", + "displayName": "In Aadhaar Recognizer", + "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InAadhaarRecognizer", - "displayName": "In Aadhaar Recognizer", - "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InAadhaarRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVehicleRegistrationRecognizer", + "displayName": "In Vehicle Registration Recognizer", + "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InVehicleRegistrationRecognizer", - "displayName": "In Vehicle Registration Recognizer", - "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVehicleRegistrationRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPassportRecognizer", + "displayName": "In Passport Recognizer", + "description": "Recognizes Indian Passport Number.

Indian Passport Number is an eight-digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPassportRecognizer", - "displayName": "In Passport Recognizer", - "description": "Recognizes Indian Passport Number.

Indian Passport Number is a eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNifRecognizer", + "displayName": "Es Nif Recognizer", + "description": "Recognize NIF number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNifRecognizer", - "displayName": "Es Nif Recognizer", - "description": "Recognize NIF number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNifRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNieRecognizer", + "displayName": "Es Nie Recognizer", + "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNieRecognizer", - "displayName": "Es Nie Recognizer", - "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNieRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItDriverLicenseRecognizer", + "displayName": "It Driver License Recognizer", + "description": "Recognizes IT Driver License using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItDriverLicenseRecognizer", - "displayName": "It Driver License Recognizer", - "description": "Recognizes IT Driver License using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItDriverLicenseRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItFiscalCodeRecognizer", + "displayName": "It Fiscal Code Recognizer", + "description": "Recognizes IT Fiscal Code using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItFiscalCodeRecognizer", - "displayName": "It Fiscal Code Recognizer", - "description": "Recognizes IT Fiscal Code using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItFiscalCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItVatCodeRecognizer", - "displayName": "It Vat Code Recognizer", - "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItVatCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { - "name": "ItIdentityCardRecognizer", - "displayName": "It Identity Card Recognizer", - "description": "Recognizes Italian Identity Card number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItIdentityCardRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItVatCodeRecognizer", + "displayName": "It Vat Code Recognizer", + "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItVatCodeRecognizer", + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItIdentityCardRecognizer", + "displayName": "It Identity Card Recognizer", + "description": "Recognizes Italian Identity Card number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItIdentityCardRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItPassportRecognizer", + "displayName": "It Passport Recognizer", + "description": "Recognizes IT Passport number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItPassportRecognizer", - "displayName": "It Passport Recognizer", - "description": "Recognizes IT Passport number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItPassportRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PlPeselRecognizer", + "displayName": "Pl Pesel Recognizer", + "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "PlPeselRecognizer", - "displayName": "Pl Pesel Recognizer", - "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "PlPeselRecognizer", - "supportedLanguage": "pl" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "pl" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CryptoRecognizer", + "displayName": "Crypto Recognizer", + "description": "Recognize common crypto account numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "CryptoRecognizer", - "displayName": "Crypto Recognizer", - "description": "Recognize common crypto account numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CryptoRecognizer", - "context": [ - "crypto", - "bitcoin", - "btc", - "ethereum", - "eth", - "litecoin", - "ltc", - "wallet", - "address" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "context": [ + "crypto", + "bitcoin", + "btc", + "ethereum", + "eth", + "litecoin", + "ltc", + "wallet" + ] }, - { - "name": "EmailRecognizer", - "displayName": "Email Recognizer", - "description": "Recognize email addresses using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EmailRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EmailRecognizer", + "displayName": "Email Recognizer", + "description": "Recognize email addresses using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EmailRecognizer" }, - { - "name": "IbanRecognizer", - "displayName": "Iban Recognizer", - "description": "Recognize IBAN code using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - 
"recognizerConfig": { - "type": "predefined", - "name": "IbanRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IbanRecognizer", + "displayName": "Iban Recognizer", + "description": "Recognize IBAN code using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IbanRecognizer" }, - { - "name": "IpRecognizer", - "displayName": "Ip Recognizer", - "description": "Recognize IP address using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "IpRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IpRecognizer", + "displayName": "Ip Recognizer", + "description": "Recognize IP address using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IpRecognizer" }, - { - "name": "MedicalLicenseRecognizer", - "displayName": "Medical License Recognizer", - "description": "Recognize common Medical license numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "MedicalLicenseRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "MedicalLicenseRecognizer", + "displayName": "Medical License Recognizer", + "description": "Recognize common Medical license numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "MedicalLicenseRecognizer" }, - { - "name": "InVoterRecognizer", - "displayName": "In Voter Recognizer", - "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVoterRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVoterRecognizer", + "displayName": "In Voter Recognizer", + "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card, or Voter ID, is a ten-digit alphanumeric code issued by the Election Commission of India to adult domiciles who have reached the age of 18.

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InVoterRecognizer" }, - { - "name": "AbaRoutingRecognizer", - "displayName": "ABA Routing Recognizer", - "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AbaRoutingRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AbaRoutingRecognizer", + "displayName": "ABA Routing Recognizer", + "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AbaRoutingRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "FiPersonalIdentityCodeRecognizer", + "displayName": "FI Personal Identity Code Recognizer", + "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "FiPersonalIdentityCodeRecognizer", - "displayName": "FI Personal Identity Code Recognizer", - "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "FiPersonalIdentityCodeRecognizer", - "supportedLanguage": "fi" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "fi" }, - { - "name": "SgUenRecognizer", - "displayName": "Singaporean UEN recognizer", - "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgUenRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgUenRecognizer", + "displayName": "Singaporean UEN recognizer", + "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SgUenRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpacyRecognizer", + "displayName": "Recognizer using spaCy NLP model", + "description": "Recognize PII entities using a spaCy NLP model.", + "enabled": true, + "isSystemDefault": true, + 
"recognizerConfig": { + "type": "predefined", "name": "SpacyRecognizer", - "displayName": "Recognizer using spaCy NLP model", - "description": "Recognize PII entities using a spaCy NLP model.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SpacyRecognizer", - "supportedEntities": [ - "PERSON" - ], - "context": [ - "name", - "first_name", - "last_name", - "given_name", - "firstName", - "lastName", - "givenName", - "familyName" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedEntities": [ + "PERSON" + ], + "context": [ + "first_name", + "last_name", + "given_name", + "firstName", + "lastName", + "givenName", + "familyName" + ] }, - { - "displayName": "US SSN column name", - "name": "us_ssn", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_ssn_pattern_0", - "regex": "^.*(ssn|social).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "displayName": "US SSN column name", + "name": "us_ssn", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_ssn_pattern_0", + "regex": "^.*(ssn|social).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Credit card column name", - "name": "credit_card", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - 
"name": "credit_card_pattern_0", - "regex": "^.*(credit).*(card).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Credit card column name", + "name": "credit_card", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "credit_card_pattern_0", + "regex": "^.*(credit).*(card).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "US bank number column name", - "name": "us_bank_number", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_bank_number_pattern_0", - "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", - "score": 0.6 - }, - { - "name": "us_bank_number_pattern_1", - "regex": "\\bbank[_-]?(account|number|num|no)?\\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "US bank number column name", + "name": "us_bank_number", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_bank_number_pattern_0", + "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", + "score": 0.6 + }, + { + "name": "us_bank_number_pattern_1", + "regex": "\\bbank[_-]?(account|number|num|no)?\\b", + "score": 0.6 } - }, - "confidenceThreshold": 0.6, - "target": "column_name" + ], + 
"regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + } }, - { - "displayName": "Iban code column name", - "name": "iban_code", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "iban_code_pattern_0", - "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_1", - "regex": "\bbank[_-]?(account|number|num|no)?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_2", - "regex": "\biban(?:[_]?(number|code))?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_3", - "regex": "\bbank[_]?iban\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_4", - "regex": "\binternational[_]?(account|bank[_]?number)\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Iban code column name", + "name": "iban_code", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "iban_code_pattern_0", + "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_1", + "regex": "\bbank[_-]?(account|number|num|no)?\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_2", + "regex": "\biban(?:[_]?(number|code))?\b", + "score": 0.6 }, - "context": [] + { + "name": "iban_code_pattern_3", + "regex": "\bbank[_]?iban\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_4", + "regex": "\binternational[_]?(account|bank[_]?number)\b", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - 
"displayName": "Email address column name", - "name": "email_address", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "email_address_pattern_0", - "regex": "^(email|e-mail|mail)(.*address)?$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Email address column name", + "name": "email_address", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "email_address_pattern_0", + "regex": "^(email|e-mail|mail)(.*address)?$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Person column name", - "name": "person", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "person_pattern_0", - "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Person column name", + "name": "person", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "person_pattern_0", + "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", + "score": 0.6 + } 
+ ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" - } - ] + "context": [] + }, + "confidenceThreshold": 0.6, + "target": "column_name" + } + ] } ] -} \ No newline at end of file +}