From a0f2e7af678856302e3d02fafb5c37a99d1a1099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Mon, 15 Jun 2026 19:06:28 -0400 Subject: [PATCH 1/8] fix(pii): remove overly broad context keywords from PII recognizers Broad keywords like "code", "security", "address", "name", "social", "check", "save", and "call" caused false-positive PII classification. Example: ACADEMIC_YEAR_CODE was tagged PII.Sensitive because CvvRecognizer has "code" as a context keyword and 1999/2000 match the CVV digit pattern. Removes the broad terms from 6 recognizers in the seed data and adds idempotent data migrations for 1.13.1 and 1.12.13. Migrations skip recognizers the user deleted and are no-ops if keywords are already absent. Fixes #29049 Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../native/1.12.13/mysql/schemaChanges.sql | 4 + .../native/1.12.13/postgres/schemaChanges.sql | 4 + .../native/1.13.1/mysql/schemaChanges.sql | 5 + .../native/1.13.1/postgres/schemaChanges.sql | 5 + .../migration/mysql/v11213/Migration.java | 25 + .../migration/mysql/v1131/Migration.java | 25 + .../migration/postgres/v11213/Migration.java | 25 + .../migration/postgres/v1131/Migration.java | 25 + .../migration/utils/v11213/MigrationUtil.java | 174 ++ .../migration/utils/v1131/MigrationUtil.java | 174 ++ .../data/tags/piiTagsWithRecognizers.json | 1528 ++++++++--------- 11 files changed, 1223 insertions(+), 771 deletions(-) create mode 100644 bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql create mode 100644 bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java create mode 100644 
openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java diff --git a/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql new file mode 100644 index 000000000000..5cf17d3480c0 --- /dev/null +++ b/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql new file mode 100644 index 000000000000..5cf17d3480c0 --- /dev/null +++ b/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. 
diff --git a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql index 6b01929401aa..274d95f1e0ac 100644 --- a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql @@ -29,3 +29,8 @@ CREATE INDEX worksheet_entity_name_index ON worksheet_entity (name); -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), -- which exceeds MySQL's 3072-byte index key limit (utf8mb4), and the table is small -- enough that the reindex cursor sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql index 27aef87ddc27..7ba0e1f78e86 100644 --- a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql @@ -30,3 +30,8 @@ CREATE INDEX IF NOT EXISTS worksheet_entity_name_index ON worksheet_entity (name -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), too -- wide to fit a btree index row, and the table is small enough that the reindex cursor -- sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. 
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java
new file mode 100644
index 000000000000..bda8a4e55c79
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java
@@ -0,0 +1,28 @@
+package org.openmetadata.service.migration.mysql.v11213;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openmetadata.service.migration.api.MigrationProcessImpl;
+import org.openmetadata.service.migration.utils.MigrationFile;
+import org.openmetadata.service.migration.utils.v11213.MigrationUtil;
+
+/** MySQL 1.12.13 data migration: removes overly broad PII recognizer context keywords. */
+@Slf4j
+public class Migration extends MigrationProcessImpl {
+
+  public Migration(MigrationFile migrationFile) {
+    super(migrationFile);
+  }
+
+  /**
+   * Runs the PII context keyword cleanup. Any exception is logged with its stack trace and is
+   * deliberately not rethrown.
+   */
+  @Override
+  public void runDataMigration() {
+    try {
+      MigrationUtil.removeBroadPiiContextKeywords(handle);
+    } catch (Exception e) {
+      LOG.error("v11213: failed to remove broad PII context keywords", e);
+    }
+  }
+}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java
new file mode 100644
index 000000000000..490c2376dd45
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java
@@ -0,0 +1,28 @@
+package org.openmetadata.service.migration.mysql.v1131;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openmetadata.service.migration.api.MigrationProcessImpl;
+import org.openmetadata.service.migration.utils.MigrationFile;
+import org.openmetadata.service.migration.utils.v1131.MigrationUtil;
+
+/** MySQL 1.13.1 data migration: removes overly broad PII recognizer context keywords. */
+@Slf4j
+public class Migration extends MigrationProcessImpl {
+
+  public Migration(MigrationFile migrationFile) {
+    super(migrationFile);
+  }
+
+  /**
+   * Runs the PII context keyword cleanup. Any exception is logged with its stack trace and is
+   * deliberately not rethrown.
+   */
+  @Override
+  public void runDataMigration() {
+    try {
+      MigrationUtil.removeBroadPiiContextKeywords(handle);
+    } catch (Exception e) {
+      LOG.error("v1131: failed to remove broad PII context keywords", e);
+    }
+  }
+}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java
new file mode 100644
index 000000000000..9cf6155b2992
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java
@@ -0,0 +1,28 @@
+package org.openmetadata.service.migration.postgres.v11213;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openmetadata.service.migration.api.MigrationProcessImpl;
+import org.openmetadata.service.migration.utils.MigrationFile;
+import org.openmetadata.service.migration.utils.v11213.MigrationUtil;
+
+/** PostgreSQL 1.12.13 data migration: removes overly broad PII recognizer context keywords. */
+@Slf4j
+public class Migration extends MigrationProcessImpl {
+
+  public Migration(MigrationFile migrationFile) {
+    super(migrationFile);
+  }
+
+  /**
+   * Runs the PII context keyword cleanup. Any exception is logged with its stack trace and is
+   * deliberately not rethrown.
+   */
+  @Override
+  public void runDataMigration() {
+    try {
+      MigrationUtil.removeBroadPiiContextKeywords(handle);
+    } catch (Exception e) {
+      LOG.error("v11213: failed to remove broad PII context keywords", e);
+    }
+  }
+}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java
new file mode 100644
index 000000000000..4e0da26700a7
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java
@@ -0,0 +1,28 @@
+package org.openmetadata.service.migration.postgres.v1131;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openmetadata.service.migration.api.MigrationProcessImpl;
+import org.openmetadata.service.migration.utils.MigrationFile;
+import org.openmetadata.service.migration.utils.v1131.MigrationUtil;
+
+/** PostgreSQL 1.13.1 data migration: removes overly broad PII recognizer context keywords. */
+@Slf4j
+public class Migration extends MigrationProcessImpl {
+
+  public Migration(MigrationFile migrationFile) {
+    super(migrationFile);
+  }
+
+  /**
+   * Runs the PII context keyword cleanup. Any exception is logged with its stack trace and is
+   * deliberately not rethrown.
+   */
+  @Override
+  public void runDataMigration() {
+    try {
+      MigrationUtil.removeBroadPiiContextKeywords(handle);
+    } catch (Exception e) {
+      LOG.error("v1131: failed to remove broad PII context keywords", e);
+    }
+  }
+}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java
new file mode 100644
index 000000000000..1838b476f898
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java
@@ -0,0 +1,206 @@
+package org.openmetadata.service.migration.utils.v11213;
+
+import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import lombok.extern.slf4j.Slf4j;
+import org.jdbi.v3.core.Handle;
+import org.openmetadata.schema.utils.JsonUtils;
+import org.openmetadata.service.resources.databases.DatasourceConfig;
+import org.openmetadata.service.util.FullyQualifiedName;
+
+/**
+ * Data-migration helpers for 1.12.13.
+ *
+ * Strips overly broad context keywords from the recognizers stored in the JSON of the
+ * PII.Sensitive and PII.NonSensitive tags. Tags, recognizers, or keywords that are already absent
+ * are skipped, so running the cleanup again changes nothing.
+ */
+@Slf4j
+public class MigrationUtil {
+  private MigrationUtil() {}
+
+  private static final String FQN_HASH_COLUMN = "fqnHash";
+  private static final String JSON_COLUMN = "json";
+  private static final String RECOGNIZERS_FIELD = "recognizers";
+  private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig";
+  private static final String CONTEXT_FIELD = "context";
+  private static final String NAME_FIELD = "name";
+
+  private static final String PII_SENSITIVE_FQN = "PII.Sensitive";
+  private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive";
+
+  private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash";
+  private static final String UPDATE_POSTGRES =
+      "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash";
+  private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash";
+  private static final String SELECT_TAG_POSTGRES =
+      "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash";
+
+  /** Context keywords to drop, keyed by the value of the recognizer's "name" field. */
+  private static final Map<String, Set<String>> KEYWORDS_TO_REMOVE =
+      Map.of(
+          "CvvRecognizer", Set.of("code", "security", "verification", "card"),
+          "UsBankRecognizer", Set.of("check", "save"),
+          "UsSsnRecognizer", Set.of("social", "security", "id_number"),
+          "CryptoRecognizer", Set.of("address"),
+          "PhoneRecognizer", Set.of("call"));
+
+  private static final String SPACY_RECOGNIZER = "SpacyRecognizer";
+  private static final String PERSON_ENTITY = "PERSON";
+  private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities";
+  private static final Set<String> SPACY_PERSON_KEYWORDS_TO_REMOVE = Set.of("name");
+
+  /**
+   * Removes the broad context keywords from the PII.Sensitive and PII.NonSensitive tags.
+   *
+   * @param handle JDBI handle used to read and update the tag rows
+   */
+  public static void removeBroadPiiContextKeywords(Handle handle) {
+    LOG.info("v11213: removing overly broad context keywords from PII recognizers");
+    boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL());
+    migrateTag(handle, PII_SENSITIVE_FQN, isMySQL);
+    migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL);
+    LOG.info("v11213: PII recognizer context keyword cleanup complete");
+  }
+
+  /** Loads one tag, cleans its recognizers, and writes the JSON back only when it changed. */
+  private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) {
+    String fqnHash = FullyQualifiedName.buildHash(tagFqn);
+    String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES;
+    List<Map<String, Object>> rows =
+        handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list();
+    if (nullOrEmpty(rows)) {
+      LOG.warn("v11213: tag '{}' not found, skipping", tagFqn);
+      return;
+    }
+    // A null JSON value skips only this tag instead of throwing out of the whole migration.
+    Object jsonValue = rows.getFirst().get(JSON_COLUMN);
+    if (jsonValue == null) {
+      LOG.warn("v11213: tag '{}' has no JSON, skipping", tagFqn);
+      return;
+    }
+    JsonNode parsed;
+    try {
+      parsed = JsonUtils.readTree(jsonValue.toString());
+    } catch (Exception e) {
+      // The exception is passed last so the stack trace is logged along with the message.
+      LOG.warn("v11213: failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage(), e);
+      return;
+    }
+    if (!(parsed instanceof ObjectNode root)) {
+      LOG.warn("v11213: tag '{}' JSON is not an object, skipping", tagFqn);
+      return;
+    }
+    boolean modified = processRecognizers(root);
+    if (modified) {
+      String updateSql = isMySQL ? UPDATE_MYSQL : UPDATE_POSTGRES;
+      handle
+          .createUpdate(updateSql)
+          .bind(JSON_COLUMN, root.toString())
+          .bind(FQN_HASH_COLUMN, fqnHash)
+          .execute();
+      LOG.info("v11213: updated PII recognizer context keywords for tag '{}'", tagFqn);
+    } else {
+      LOG.info("v11213: no changes needed for tag '{}'", tagFqn);
+    }
+  }
+
+  /** Cleans every recognizer object under "recognizers"; returns true if any was changed. */
+  private static boolean processRecognizers(ObjectNode root) {
+    JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD);
+    if (recognizersNode == null || !recognizersNode.isArray()) {
+      return false;
+    }
+    boolean modified = false;
+    for (JsonNode recognizerNode : recognizersNode) {
+      if (recognizerNode instanceof ObjectNode recognizer) {
+        modified |= processRecognizer(recognizer);
+      }
+    }
+    return modified;
+  }
+
+  /** Applies both keyword cleanups to one recognizer; returns true if its context changed. */
+  private static boolean processRecognizer(ObjectNode recognizer) {
+    JsonNode nameNode = recognizer.get(NAME_FIELD);
+    if (nameNode == null) {
+      return false;
+    }
+    String recognizerName = nameNode.asText();
+    JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD);
+    if (!(configNode instanceof ObjectNode config)) {
+      return false;
+    }
+    boolean modified = removeFromKeywordsMap(recognizerName, config);
+    modified |= removeSpacyPersonKeyword(recognizerName, config);
+    return modified;
+  }
+
+  /** Removes the keywords registered for this recognizer name in KEYWORDS_TO_REMOVE, if any. */
+  private static boolean removeFromKeywordsMap(String recognizerName, ObjectNode config) {
+    Set<String> toRemove = KEYWORDS_TO_REMOVE.get(recognizerName);
+    if (toRemove == null) {
+      return false;
+    }
+    return removeKeywordsFromContext(config, toRemove, recognizerName);
+  }
+
+  /** Removes "name" from a SpacyRecognizer's context, but only when it supports PERSON. */
+  private static boolean removeSpacyPersonKeyword(String recognizerName, ObjectNode config) {
+    if (!SPACY_RECOGNIZER.equals(recognizerName)) {
+      return false;
+    }
+    JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD);
+    if (!isPersonRecognizer(entitiesNode)) {
+      return false;
+    }
+    return removeKeywordsFromContext(config, SPACY_PERSON_KEYWORDS_TO_REMOVE, recognizerName);
+  }
+
+  /** Returns true when the supportedEntities array contains "PERSON". */
+  private static boolean isPersonRecognizer(JsonNode entitiesNode) {
+    if (entitiesNode == null || !entitiesNode.isArray()) {
+      return false;
+    }
+    for (JsonNode entity : entitiesNode) {
+      if (PERSON_ENTITY.equals(entity.asText())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Rebuilds the "context" array without the given keywords.
+   *
+   * @return true if at least one keyword was removed and the array was replaced
+   */
+  private static boolean removeKeywordsFromContext(
+      ObjectNode config, Set<String> toRemove, String recognizerName) {
+    JsonNode contextNode = config.get(CONTEXT_FIELD);
+    if (contextNode == null || !contextNode.isArray()) {
+      return false;
+    }
+    ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode();
+    boolean removed = false;
+    for (JsonNode keyword : contextNode) {
+      String kw = keyword.asText();
+      if (toRemove.contains(kw)) {
+        LOG.info("v11213: removing keyword '{}' from {} context", kw, recognizerName);
+        removed = true;
+      } else {
+        newContext.add(keyword);
+      }
+    }
+    if (removed) {
+      config.set(CONTEXT_FIELD, newContext);
+    }
+    return removed;
+  }
+}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java
new file mode 100644
index 000000000000..659ae25072c1
--- /dev/null
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java
@@ -0,0 +1,206 @@
+package org.openmetadata.service.migration.utils.v1131;
+
+import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import lombok.extern.slf4j.Slf4j;
+import org.jdbi.v3.core.Handle;
+import org.openmetadata.schema.utils.JsonUtils;
+import org.openmetadata.service.resources.databases.DatasourceConfig;
+import org.openmetadata.service.util.FullyQualifiedName;
+
+/**
+ * Data-migration helpers for 1.13.1.
+ *
+ * Strips overly broad context keywords from the recognizers stored in the JSON of the
+ * PII.Sensitive and PII.NonSensitive tags. Tags, recognizers, or keywords that are already absent
+ * are skipped, so running the cleanup again changes nothing.
+ */
+@Slf4j
+public class MigrationUtil {
+  private MigrationUtil() {}
+
+  private static final String FQN_HASH_COLUMN = "fqnHash";
+  private static final String JSON_COLUMN = "json";
+  private static final String RECOGNIZERS_FIELD = "recognizers";
+  private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig";
+  private static final String CONTEXT_FIELD = "context";
+  private static final String NAME_FIELD = "name";
+
+  private static final String PII_SENSITIVE_FQN = "PII.Sensitive";
+  private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive";
+
+  private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash";
+  private static final String UPDATE_POSTGRES =
+      "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash";
+  private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash";
+  private static final String SELECT_TAG_POSTGRES =
+      "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash";
+
+  /** Context keywords to drop, keyed by the value of the recognizer's "name" field. */
+  private static final Map<String, Set<String>> KEYWORDS_TO_REMOVE =
+      Map.of(
+          "CvvRecognizer", Set.of("code", "security", "verification", "card"),
+          "UsBankRecognizer", Set.of("check", "save"),
+          "UsSsnRecognizer", Set.of("social", "security", "id_number"),
+          "CryptoRecognizer", Set.of("address"),
+          "PhoneRecognizer", Set.of("call"));
+
+  private static final String SPACY_RECOGNIZER = "SpacyRecognizer";
+  private static final String PERSON_ENTITY = "PERSON";
+  private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities";
+  private static final Set<String> SPACY_PERSON_KEYWORDS_TO_REMOVE = Set.of("name");
+
+  /**
+   * Removes the broad context keywords from the PII.Sensitive and PII.NonSensitive tags.
+   *
+   * @param handle JDBI handle used to read and update the tag rows
+   */
+  public static void removeBroadPiiContextKeywords(Handle handle) {
+    LOG.info("v1131: removing overly broad context keywords from PII recognizers");
+    boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL());
+    migrateTag(handle, PII_SENSITIVE_FQN, isMySQL);
+    migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL);
+    LOG.info("v1131: PII recognizer context keyword cleanup complete");
+  }
+
+  /** Loads one tag, cleans its recognizers, and writes the JSON back only when it changed. */
+  private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) {
+    String fqnHash = FullyQualifiedName.buildHash(tagFqn);
+    String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES;
+    List<Map<String, Object>> rows =
+        handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list();
+    if (nullOrEmpty(rows)) {
+      LOG.warn("v1131: tag '{}' not found, skipping", tagFqn);
+      return;
+    }
+    // A null JSON value skips only this tag instead of throwing out of the whole migration.
+    Object jsonValue = rows.getFirst().get(JSON_COLUMN);
+    if (jsonValue == null) {
+      LOG.warn("v1131: tag '{}' has no JSON, skipping", tagFqn);
+      return;
+    }
+    JsonNode parsed;
+    try {
+      parsed = JsonUtils.readTree(jsonValue.toString());
+    } catch (Exception e) {
+      // The exception is passed last so the stack trace is logged along with the message.
+      LOG.warn("v1131: failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage(), e);
+      return;
+    }
+    if (!(parsed instanceof ObjectNode root)) {
+      LOG.warn("v1131: tag '{}' JSON is not an object, skipping", tagFqn);
+      return;
+    }
+    boolean modified = processRecognizers(root);
+    if (modified) {
+      String updateSql = isMySQL ? UPDATE_MYSQL : UPDATE_POSTGRES;
+      handle
+          .createUpdate(updateSql)
+          .bind(JSON_COLUMN, root.toString())
+          .bind(FQN_HASH_COLUMN, fqnHash)
+          .execute();
+      LOG.info("v1131: updated PII recognizer context keywords for tag '{}'", tagFqn);
+    } else {
+      LOG.info("v1131: no changes needed for tag '{}'", tagFqn);
+    }
+  }
+
+  /** Cleans every recognizer object under "recognizers"; returns true if any was changed. */
+  private static boolean processRecognizers(ObjectNode root) {
+    JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD);
+    if (recognizersNode == null || !recognizersNode.isArray()) {
+      return false;
+    }
+    boolean modified = false;
+    for (JsonNode recognizerNode : recognizersNode) {
+      if (recognizerNode instanceof ObjectNode recognizer) {
+        modified |= processRecognizer(recognizer);
+      }
+    }
+    return modified;
+  }
+
+  /** Applies both keyword cleanups to one recognizer; returns true if its context changed. */
+  private static boolean processRecognizer(ObjectNode recognizer) {
+    JsonNode nameNode = recognizer.get(NAME_FIELD);
+    if (nameNode == null) {
+      return false;
+    }
+    String recognizerName = nameNode.asText();
+    JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD);
+    if (!(configNode instanceof ObjectNode config)) {
+      return false;
+    }
+    boolean modified = removeFromKeywordsMap(recognizerName, config);
+    modified |= removeSpacyPersonKeyword(recognizerName, config);
+    return modified;
+  }
+
+  /** Removes the keywords registered for this recognizer name in KEYWORDS_TO_REMOVE, if any. */
+  private static boolean removeFromKeywordsMap(String recognizerName, ObjectNode config) {
+    Set<String> toRemove = KEYWORDS_TO_REMOVE.get(recognizerName);
+    if (toRemove == null) {
+      return false;
+    }
+    return removeKeywordsFromContext(config, toRemove, recognizerName);
+  }
+
+  /** Removes "name" from a SpacyRecognizer's context, but only when it supports PERSON. */
+  private static boolean removeSpacyPersonKeyword(String recognizerName, ObjectNode config) {
+    if (!SPACY_RECOGNIZER.equals(recognizerName)) {
+      return false;
+    }
+    JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD);
+    if (!isPersonRecognizer(entitiesNode)) {
+      return false;
+    }
+    return removeKeywordsFromContext(config, SPACY_PERSON_KEYWORDS_TO_REMOVE, recognizerName);
+  }
+
+  /** Returns true when the supportedEntities array contains "PERSON". */
+  private static boolean isPersonRecognizer(JsonNode entitiesNode) {
+    if (entitiesNode == null || !entitiesNode.isArray()) {
+      return false;
+    }
+    for (JsonNode entity : entitiesNode) {
+      if (PERSON_ENTITY.equals(entity.asText())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Rebuilds the "context" array without the given keywords.
+   *
+   * @return true if at least one keyword was removed and the array was replaced
+   */
+  private static boolean removeKeywordsFromContext(
+      ObjectNode config, Set<String> toRemove, String recognizerName) {
+    JsonNode contextNode = config.get(CONTEXT_FIELD);
+    if (contextNode == null || !contextNode.isArray()) {
+      return false;
+    }
+    ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode();
+    boolean removed = false;
+    for (JsonNode keyword : contextNode) {
+      String kw = keyword.asText();
+      if (toRemove.contains(kw)) {
+        LOG.info("v1131: removing keyword '{}' from {} context", kw, recognizerName);
+        removed = true;
+      } else {
+        newContext.add(keyword);
+      }
+    }
+    if (removed) {
+      config.set(CONTEXT_FIELD, newContext);
+    }
+    return removed;
+  }
+}
diff --git a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json index 825b11101d2a..e18ab08e0a2a 100644 --- a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json +++ b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json @@ -51,8 +51,7 @@ "telephone", "cell", "cellphone", - "mobile", - "call" + "mobile" ] }, "confidenceThreshold": 0.6, @@ -216,850 +215,837 @@ "description": "PII which if lost, compromised, or disclosed without authorization, could result in substantial harm, embarrassment, inconvenience, or unfairness to an individual.", "autoClassificationEnabled": true, "autoClassificationPriority": 100, - "recognizers": - [ - { - "name": "EnglishCreditCardRecognizer", - "displayName": "English Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": 
"predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "en", - "context": [ - "credit", - "card", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment", - "cc_number", - "card_number", - "payment_info" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "recognizers": [ + { + "name": "EnglishCreditCardRecognizer", + "displayName": "English Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "en", + "context": [ + "credit", + "card", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment", + "cc_number", + "card_number", + "payment_info" + ] }, - { - "name": "SpanishCreditCardRecognizer", - "displayName": "Spanish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "es", - "context": [ - "tarjeta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpanishCreditCardRecognizer", + "displayName": "Spanish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "es", + "context": [ + "tarjeta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment" + ] }, - { - "name": 
"ItalianCreditCardRecognizer", - "displayName": "Italian Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "it", - "context": [ - "carta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItalianCreditCardRecognizer", + "displayName": "Italian Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "it", + "context": [ + "carta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "PolishCreditCardRecognizer", - "displayName": "Polish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "pl", - "context": [ - "karta", - "kredytowa", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PolishCreditCardRecognizer", + "displayName": "Polish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "pl", + "context": [ + "karta", + "kredytowa", + 
"visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "CvvRecognizer", - "displayName": "CVV Recognizer", - "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "patterns": [ - { - "name": "cvv_pattern", - "regex": "\\b\\d{3,4}\\b", - "score": 0.5 - } - ], - "context": [ - "cvv", - "cvc", - "security", - "code", - "verification", - "card", - "cvv2", - "cid", - "csc" - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "supportedLanguage": "en" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CvvRecognizer", + "displayName": "CVV Recognizer", + "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "patterns": [ + { + "name": "cvv_pattern", + "regex": "\\b\\d{3,4}\\b", + "score": 0.5 + } + ], + "context": [ + "cvv", + "cvc", + "cvv2", + "cid", + "csc" + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsBankRecognizer", + "displayName": "Us Bank Recognizer", + "description": "Recognizes US bank number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsBankRecognizer", - "displayName": "Us Bank Recognizer", - "description": "Recognizes US bank number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsBankRecognizer", - "supportedLanguage": "en", - "context": [ - "check", - "account", - "acct", - "bank", - "save", - "debit", - "bank_account", - "bank", - "account_number" - ] - }, - 
"confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "account", + "acct", + "bank", + "debit", + "bank_account", + "account_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsLicenseRecognizer", + "displayName": "Us License Recognizer", + "description": "Recognizes US driver license using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsLicenseRecognizer", - "displayName": "Us License Recognizer", - "description": "Recognizes US driver license using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsLicenseRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsItinRecognizer", + "displayName": "Us Itin Recognizer", + "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsItinRecognizer", - "displayName": "Us Itin Recognizer", - "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsItinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsPassportRecognizer", + "displayName": "Us Passport Recognizer", + "description": "Recognizes US Passport number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsPassportRecognizer", - "displayName": "Us Passport Recognizer", - "description": "Recognizes US Passport 
number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsSsnRecognizer", + "displayName": "Us Ssn Recognizer", + "description": "Recognize US Social Security Number (SSN) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsSsnRecognizer", - "displayName": "Us Ssn Recognizer", - "description": "Recognize US Social Security Number (SSN) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsSsnRecognizer", - "supportedLanguage": "en", - "context": [ - "social", - "security", - "ssn", - "ssns", - "ssid", - "national_id", - "id_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "ssn", + "ssns", + "ssid", + "national_id" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "NhsRecognizer", + "displayName": "Nhs Recognizer", + "description": "Recognizes NHS number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "NhsRecognizer", - "displayName": "Nhs Recognizer", - "description": "Recognizes NHS number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "NhsRecognizer", - "supportedLanguage": "en", - "context": [ - "nhs", - "national_health_service", - "nhs_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "nhs", + "national_health_service", + "nhs_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": 
"UkNinoRecognizer", + "displayName": "Uk Nino Recognizer", + "description": "Recognizes UK National Insurance Number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UkNinoRecognizer", - "displayName": "Uk Nino Recognizer", - "description": "Recognizes UK National Insurance Number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UkNinoRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgFinRecognizer", + "displayName": "Sg Fin Recognizer", + "description": "Recognize SG FIN/NRIC number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "SgFinRecognizer", - "displayName": "Sg Fin Recognizer", - "description": "Recognize SG FIN/NRIC number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgFinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAbnRecognizer", + "displayName": "Au Abn Recognizer", + "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAbnRecognizer", - "displayName": "Au Abn Recognizer", - "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAbnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAcnRecognizer", + "displayName": "Au Acn Recognizer", + "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAcnRecognizer", - "displayName": "Au Acn Recognizer", - "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAcnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuTfnRecognizer", + "displayName": "Au Tfn Recognizer", + "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuTfnRecognizer", - "displayName": "Au Tfn Recognizer", - "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuTfnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuMedicareRecognizer", + "displayName": "Au Medicare Recognizer", + "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

The Medicare number is a unique identifier issued by the Australian Government that enables the cardholder to receive rebates for medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuMedicareRecognizer", - "displayName": "Au Medicare Recognizer", - "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuMedicareRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPanRecognizer", + "displayName": "In Pan Recognizer", + "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPanRecognizer", - "displayName": "In Pan Recognizer", - "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPanRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InAadhaarRecognizer", + "displayName": "In Aadhaar Recognizer", + "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InAadhaarRecognizer", - "displayName": "In Aadhaar Recognizer", - "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InAadhaarRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVehicleRegistrationRecognizer", + "displayName": "In Vehicle Registration Recognizer", + "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InVehicleRegistrationRecognizer", - "displayName": "In Vehicle Registration Recognizer", - "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVehicleRegistrationRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPassportRecognizer", + "displayName": "In Passport Recognizer", + "description": "Recognizes Indian Passport Number.

The Indian Passport Number is an eight-character alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPassportRecognizer", - "displayName": "In Passport Recognizer", - "description": "Recognizes Indian Passport Number.

Indian Passport Number is a eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNifRecognizer", + "displayName": "Es Nif Recognizer", + "description": "Recognize NIF number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNifRecognizer", - "displayName": "Es Nif Recognizer", - "description": "Recognize NIF number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNifRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNieRecognizer", + "displayName": "Es Nie Recognizer", + "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNieRecognizer", - "displayName": "Es Nie Recognizer", - "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNieRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItDriverLicenseRecognizer", + "displayName": "It Driver License Recognizer", + "description": "Recognizes IT Driver License using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItDriverLicenseRecognizer", - "displayName": "It Driver License Recognizer", - "description": "Recognizes IT Driver License using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItDriverLicenseRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItFiscalCodeRecognizer", + "displayName": "It Fiscal Code Recognizer", + "description": "Recognizes IT Fiscal Code using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItFiscalCodeRecognizer", - "displayName": "It Fiscal Code Recognizer", - "description": "Recognizes IT Fiscal Code using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItFiscalCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItVatCodeRecognizer", - "displayName": "It Vat Code Recognizer", - "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItVatCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { - "name": "ItIdentityCardRecognizer", - "displayName": "It Identity Card Recognizer", - "description": "Recognizes Italian Identity Card number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItIdentityCardRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItVatCodeRecognizer", + "displayName": "It Vat Code Recognizer", + "description": "Recognizes Italian VAT code using regex and checksum.

For more information about the Italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItVatCodeRecognizer", + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItIdentityCardRecognizer", + "displayName": "It Identity Card Recognizer", + "description": "Recognizes Italian Identity Card number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItIdentityCardRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItPassportRecognizer", + "displayName": "It Passport Recognizer", + "description": "Recognizes IT Passport number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItPassportRecognizer", - "displayName": "It Passport Recognizer", - "description": "Recognizes IT Passport number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItPassportRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PlPeselRecognizer", + "displayName": "Pl Pesel Recognizer", + "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "PlPeselRecognizer", - "displayName": "Pl Pesel Recognizer", - "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "PlPeselRecognizer", - "supportedLanguage": "pl" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "pl" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CryptoRecognizer", + "displayName": "Crypto Recognizer", + "description": "Recognize common crypto account numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "CryptoRecognizer", - "displayName": "Crypto Recognizer", - "description": "Recognize common crypto account numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CryptoRecognizer", - "context": [ - "crypto", - "bitcoin", - "btc", - "ethereum", - "eth", - "litecoin", - "ltc", - "wallet", - "address" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "context": [ + "crypto", + "bitcoin", + "btc", + "ethereum", + "eth", + "litecoin", + "ltc", + "wallet" + ] }, - { - "name": "EmailRecognizer", - "displayName": "Email Recognizer", - "description": "Recognize email addresses using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EmailRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EmailRecognizer", + "displayName": "Email Recognizer", + "description": "Recognize email addresses using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EmailRecognizer" }, - { - "name": "IbanRecognizer", - "displayName": "Iban Recognizer", - "description": "Recognize IBAN code using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - 
"recognizerConfig": { - "type": "predefined", - "name": "IbanRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IbanRecognizer", + "displayName": "Iban Recognizer", + "description": "Recognize IBAN code using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IbanRecognizer" }, - { - "name": "IpRecognizer", - "displayName": "Ip Recognizer", - "description": "Recognize IP address using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "IpRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IpRecognizer", + "displayName": "Ip Recognizer", + "description": "Recognize IP address using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IpRecognizer" }, - { - "name": "MedicalLicenseRecognizer", - "displayName": "Medical License Recognizer", - "description": "Recognize common Medical license numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "MedicalLicenseRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "MedicalLicenseRecognizer", + "displayName": "Medical License Recognizer", + "description": "Recognize common Medical license numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "MedicalLicenseRecognizer" }, - { - "name": "InVoterRecognizer", - "displayName": "In Voter Recognizer", - "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVoterRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVoterRecognizer", + "displayName": "In Voter Recognizer", + "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InVoterRecognizer" }, - { - "name": "AbaRoutingRecognizer", - "displayName": "ABA Routing Recognizer", - "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AbaRoutingRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AbaRoutingRecognizer", + "displayName": "ABA Routing Recognizer", + "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AbaRoutingRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "FiPersonalIdentityCodeRecognizer", + "displayName": "FI Personal Identity Code Recognizer", + "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "FiPersonalIdentityCodeRecognizer", - "displayName": "FI Personal Identity Code Recognizer", - "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "FiPersonalIdentityCodeRecognizer", - "supportedLanguage": "fi" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "fi" }, - { - "name": "SgUenRecognizer", - "displayName": "Singaporean UEN recognizer", - "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgUenRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgUenRecognizer", + "displayName": "Singaporean UEN recognizer", + "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SgUenRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpacyRecognizer", + "displayName": "Recognizer using spaCy NLP model", + "description": "Recognize PII entities using a spaCy NLP model.", + "enabled": true, + "isSystemDefault": true, + 
"recognizerConfig": { + "type": "predefined", "name": "SpacyRecognizer", - "displayName": "Recognizer using spaCy NLP model", - "description": "Recognize PII entities using a spaCy NLP model.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SpacyRecognizer", - "supportedEntities": [ - "PERSON" - ], - "context": [ - "name", - "first_name", - "last_name", - "given_name", - "firstName", - "lastName", - "givenName", - "familyName" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedEntities": [ + "PERSON" + ], + "context": [ + "first_name", + "last_name", + "given_name", + "firstName", + "lastName", + "givenName", + "familyName" + ] }, - { - "displayName": "US SSN column name", - "name": "us_ssn", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_ssn_pattern_0", - "regex": "^.*(ssn|social).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "displayName": "US SSN column name", + "name": "us_ssn", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_ssn_pattern_0", + "regex": "^.*(ssn|social).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Credit card column name", - "name": "credit_card", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - 
"name": "credit_card_pattern_0", - "regex": "^.*(credit).*(card).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Credit card column name", + "name": "credit_card", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "credit_card_pattern_0", + "regex": "^.*(credit).*(card).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "US bank number column name", - "name": "us_bank_number", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_bank_number_pattern_0", - "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", - "score": 0.6 - }, - { - "name": "us_bank_number_pattern_1", - "regex": "\\bbank[_-]?(account|number|num|no)?\\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "US bank number column name", + "name": "us_bank_number", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_bank_number_pattern_0", + "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", + "score": 0.6 + }, + { + "name": "us_bank_number_pattern_1", + "regex": "\\bbank[_-]?(account|number|num|no)?\\b", + "score": 0.6 } - }, - "confidenceThreshold": 0.6, - "target": "column_name" + ], + 
"regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + } }, - { - "displayName": "Iban code column name", - "name": "iban_code", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "iban_code_pattern_0", - "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_1", - "regex": "\bbank[_-]?(account|number|num|no)?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_2", - "regex": "\biban(?:[_]?(number|code))?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_3", - "regex": "\bbank[_]?iban\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_4", - "regex": "\binternational[_]?(account|bank[_]?number)\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Iban code column name", + "name": "iban_code", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "iban_code_pattern_0", + "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_1", + "regex": "\bbank[_-]?(account|number|num|no)?\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_2", + "regex": "\biban(?:[_]?(number|code))?\b", + "score": 0.6 }, - "context": [] + { + "name": "iban_code_pattern_3", + "regex": "\bbank[_]?iban\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_4", + "regex": "\binternational[_]?(account|bank[_]?number)\b", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - 
"displayName": "Email address column name", - "name": "email_address", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "email_address_pattern_0", - "regex": "^(email|e-mail|mail)(.*address)?$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Email address column name", + "name": "email_address", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "email_address_pattern_0", + "regex": "^(email|e-mail|mail)(.*address)?$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Person column name", - "name": "person", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "person_pattern_0", - "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Person column name", + "name": "person", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "person_pattern_0", + "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", + "score": 0.6 + } 
+ ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" - } - ] + "context": [] + }, + "confidenceThreshold": 0.6, + "target": "column_name" + } + ] } ] -} \ No newline at end of file +} From b6423a6eebbe799cc291d39ed7c9d14a47012855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Mon, 15 Jun 2026 19:17:58 -0400 Subject: [PATCH 2/8] refactor(pii): extract shared PiiRecognizerMigrationUtil Move the broad-keyword removal logic into a single shared class so v1131 and v11213 don't duplicate it. Versioned MigrationUtils become one-line delegates; future migrations reuse the same entry point. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../utils/PiiRecognizerMigrationUtil.java | 178 ++++++++++++++++++ .../migration/utils/v11213/MigrationUtil.java | 166 +--------------- .../migration/utils/v1131/MigrationUtil.java | 166 +--------------- 3 files changed, 182 insertions(+), 328 deletions(-) create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java new file mode 100644 index 000000000000..59145e35623f --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java @@ -0,0 +1,178 @@ +package org.openmetadata.service.migration.utils; + +import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import org.jdbi.v3.core.Handle; +import 
org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.resources.databases.DatasourceConfig; +import org.openmetadata.service.util.FullyQualifiedName; + +@Slf4j +public class PiiRecognizerMigrationUtil { + private PiiRecognizerMigrationUtil() {} + + private static final String FQN_HASH_COLUMN = "fqnHash"; + private static final String JSON_COLUMN = "json"; + private static final String RECOGNIZERS_FIELD = "recognizers"; + private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; + private static final String CONTEXT_FIELD = "context"; + private static final String NAME_FIELD = "name"; + private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; + + private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; + private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; + + private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; + private static final String UPDATE_POSTGRES = + "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG_POSTGRES = + "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; + + private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; + private static final String PERSON_ENTITY = "PERSON"; + + /** + * Context keywords that are too generic for their respective recognizers and cause false-positive + * PII classification (e.g. ACADEMIC_YEAR_CODE being tagged as CVV because "code" is in context). 
+ */ + private static final Map> BROAD_KEYWORDS_TO_REMOVE = + Map.of( + "CvvRecognizer", Set.of("code", "security", "verification", "card"), + "UsBankRecognizer", Set.of("check", "save"), + "UsSsnRecognizer", Set.of("social", "security", "id_number"), + "CryptoRecognizer", Set.of("address"), + "PhoneRecognizer", Set.of("call")); + + private static final Set SPACY_PERSON_BROAD_KEYWORDS = Set.of("name"); + + public static void removeBroadPiiContextKeywords(Handle handle) { + LOG.info("Removing overly broad context keywords from PII recognizers"); + boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); + migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); + migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); + LOG.info("PII recognizer context keyword cleanup complete"); + } + + private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { + String fqnHash = FullyQualifiedName.buildHash(tagFqn); + String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES; + List> rows = + handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); + if (nullOrEmpty(rows)) { + LOG.warn("Tag '{}' not found, skipping PII recognizer keyword cleanup", tagFqn); + return; + } + String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); + ObjectNode root; + try { + root = (ObjectNode) JsonUtils.readTree(jsonStr); + } catch (Exception e) { + LOG.warn("Failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); + return; + } + boolean modified = processRecognizers(root); + if (modified) { + String updateSql = isMySQL ? 
UPDATE_MYSQL : UPDATE_POSTGRES; + handle + .createUpdate(updateSql) + .bind(JSON_COLUMN, root.toString()) + .bind(FQN_HASH_COLUMN, fqnHash) + .execute(); + LOG.info("Updated PII recognizer context keywords for tag '{}'", tagFqn); + } else { + LOG.info("No broad PII context keywords found in tag '{}'", tagFqn); + } + } + + private static boolean processRecognizers(ObjectNode root) { + JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); + if (recognizersNode == null || !recognizersNode.isArray()) { + return false; + } + boolean modified = false; + for (JsonNode recognizerNode : recognizersNode) { + if (recognizerNode instanceof ObjectNode recognizer) { + modified |= processRecognizer(recognizer); + } + } + return modified; + } + + private static boolean processRecognizer(ObjectNode recognizer) { + JsonNode nameNode = recognizer.get(NAME_FIELD); + if (nameNode == null) { + return false; + } + String recognizerName = nameNode.asText(); + JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); + if (!(configNode instanceof ObjectNode config)) { + return false; + } + boolean modified = removeFromBroadKeywordsMap(recognizerName, config); + modified |= removeSpacyPersonBroadKeywords(recognizerName, config); + return modified; + } + + private static boolean removeFromBroadKeywordsMap(String recognizerName, ObjectNode config) { + Set toRemove = BROAD_KEYWORDS_TO_REMOVE.get(recognizerName); + if (toRemove == null) { + return false; + } + return removeKeywordsFromContext(config, toRemove, recognizerName); + } + + private static boolean removeSpacyPersonBroadKeywords(String recognizerName, ObjectNode config) { + if (!SPACY_RECOGNIZER.equals(recognizerName)) { + return false; + } + JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); + if (!isPersonRecognizer(entitiesNode)) { + return false; + } + return removeKeywordsFromContext(config, SPACY_PERSON_BROAD_KEYWORDS, recognizerName); + } + + private static boolean isPersonRecognizer(JsonNode entitiesNode) { + 
if (entitiesNode == null || !entitiesNode.isArray()) { + return false; + } + boolean found = false; + for (JsonNode entity : entitiesNode) { + if (PERSON_ENTITY.equals(entity.asText())) { + found = true; + } + } + return found; + } + + private static boolean removeKeywordsFromContext( + ObjectNode config, Set toRemove, String recognizerName) { + JsonNode contextNode = config.get(CONTEXT_FIELD); + if (contextNode == null || !contextNode.isArray()) { + return false; + } + ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); + boolean removed = false; + for (JsonNode keyword : contextNode) { + String kw = keyword.asText(); + if (toRemove.contains(kw)) { + LOG.info("Removing broad keyword '{}' from {} context", kw, recognizerName); + removed = true; + } else { + newContext.add(keyword); + } + } + if (removed) { + config.set(CONTEXT_FIELD, newContext); + } + return removed; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java index 1838b476f898..f047f33f00f5 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java @@ -1,174 +1,12 @@ package org.openmetadata.service.migration.utils.v11213; -import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import java.util.List; -import java.util.Map; -import java.util.Set; -import lombok.extern.slf4j.Slf4j; import org.jdbi.v3.core.Handle; -import org.openmetadata.schema.utils.JsonUtils; -import org.openmetadata.service.resources.databases.DatasourceConfig; -import 
org.openmetadata.service.util.FullyQualifiedName; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; -@Slf4j public class MigrationUtil { private MigrationUtil() {} - private static final String TAG_TABLE = "tag"; - private static final String FQN_HASH_COLUMN = "fqnHash"; - private static final String JSON_COLUMN = "json"; - private static final String RECOGNIZERS_FIELD = "recognizers"; - private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; - private static final String CONTEXT_FIELD = "context"; - private static final String NAME_FIELD = "name"; - - private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; - private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; - - private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; - private static final String UPDATE_POSTGRES = - "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG_POSTGRES = - "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; - - private static final Map> KEYWORDS_TO_REMOVE = - Map.of( - "CvvRecognizer", Set.of("code", "security", "verification", "card"), - "UsBankRecognizer", Set.of("check", "save"), - "UsSsnRecognizer", Set.of("social", "security", "id_number"), - "CryptoRecognizer", Set.of("address"), - "PhoneRecognizer", Set.of("call")); - - private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; - private static final String PERSON_ENTITY = "PERSON"; - private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; - private static final Set SPACY_PERSON_KEYWORDS_TO_REMOVE = Set.of("name"); - public static void removeBroadPiiContextKeywords(Handle handle) { - LOG.info("v11213: removing overly broad context keywords from PII recognizers"); - boolean isMySQL = 
Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); - migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); - migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); - LOG.info("v11213: PII recognizer context keyword cleanup complete"); - } - - private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { - String fqnHash = FullyQualifiedName.buildHash(tagFqn); - String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES; - List> rows = - handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); - if (nullOrEmpty(rows)) { - LOG.warn("v11213: tag '{}' not found, skipping", tagFqn); - return; - } - String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); - ObjectNode root; - try { - root = (ObjectNode) JsonUtils.readTree(jsonStr); - } catch (Exception e) { - LOG.warn("v11213: failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); - return; - } - boolean modified = processRecognizers(root, tagFqn); - if (modified) { - String updateSql = isMySQL ? 
UPDATE_MYSQL : UPDATE_POSTGRES; - handle - .createUpdate(updateSql) - .bind(JSON_COLUMN, root.toString()) - .bind(FQN_HASH_COLUMN, fqnHash) - .execute(); - LOG.info("v11213: updated PII recognizer context keywords for tag '{}'", tagFqn); - } else { - LOG.info("v11213: no changes needed for tag '{}'", tagFqn); - } - } - - private static boolean processRecognizers(ObjectNode root, String tagFqn) { - JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); - if (recognizersNode == null || !recognizersNode.isArray()) { - return false; - } - boolean modified = false; - for (JsonNode recognizerNode : recognizersNode) { - if (recognizerNode instanceof ObjectNode recognizer) { - modified |= processRecognizer(recognizer, tagFqn); - } - } - return modified; - } - - private static boolean processRecognizer(ObjectNode recognizer, String tagFqn) { - JsonNode nameNode = recognizer.get(NAME_FIELD); - if (nameNode == null) { - return false; - } - String recognizerName = nameNode.asText(); - JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); - if (!(configNode instanceof ObjectNode config)) { - return false; - } - boolean modified = removeFromKeywordsMap(recognizerName, config); - modified |= removeSpacyPersonKeyword(recognizerName, config, tagFqn); - return modified; - } - - private static boolean removeFromKeywordsMap(String recognizerName, ObjectNode config) { - Set toRemove = KEYWORDS_TO_REMOVE.get(recognizerName); - if (toRemove == null) { - return false; - } - return removeKeywordsFromContext(config, toRemove, recognizerName); - } - - private static boolean removeSpacyPersonKeyword( - String recognizerName, ObjectNode config, String tagFqn) { - if (!SPACY_RECOGNIZER.equals(recognizerName)) { - return false; - } - JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); - if (!isPersonRecognizer(entitiesNode)) { - return false; - } - return removeKeywordsFromContext(config, SPACY_PERSON_KEYWORDS_TO_REMOVE, recognizerName); - } - - private static boolean 
isPersonRecognizer(JsonNode entitiesNode) { - if (entitiesNode == null || !entitiesNode.isArray()) { - return false; - } - for (JsonNode entity : entitiesNode) { - if (PERSON_ENTITY.equals(entity.asText())) { - return true; - } - } - return false; - } - - private static boolean removeKeywordsFromContext( - ObjectNode config, Set toRemove, String recognizerName) { - JsonNode contextNode = config.get(CONTEXT_FIELD); - if (contextNode == null || !contextNode.isArray()) { - return false; - } - ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); - boolean removed = false; - for (JsonNode keyword : contextNode) { - String kw = keyword.asText(); - if (toRemove.contains(kw)) { - LOG.info("v11213: removing keyword '{}' from {} context", kw, recognizerName); - removed = true; - } else { - newContext.add(keyword); - } - } - if (removed) { - config.set(CONTEXT_FIELD, newContext); - } - return removed; + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java index 659ae25072c1..4a0363c5ace5 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java @@ -1,174 +1,12 @@ package org.openmetadata.service.migration.utils.v1131; -import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import java.util.List; -import java.util.Map; -import java.util.Set; -import lombok.extern.slf4j.Slf4j; import org.jdbi.v3.core.Handle; -import org.openmetadata.schema.utils.JsonUtils; -import 
org.openmetadata.service.resources.databases.DatasourceConfig; -import org.openmetadata.service.util.FullyQualifiedName; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; -@Slf4j public class MigrationUtil { private MigrationUtil() {} - private static final String TAG_TABLE = "tag"; - private static final String FQN_HASH_COLUMN = "fqnHash"; - private static final String JSON_COLUMN = "json"; - private static final String RECOGNIZERS_FIELD = "recognizers"; - private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; - private static final String CONTEXT_FIELD = "context"; - private static final String NAME_FIELD = "name"; - - private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; - private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; - - private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; - private static final String UPDATE_POSTGRES = - "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG_POSTGRES = - "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; - - private static final Map> KEYWORDS_TO_REMOVE = - Map.of( - "CvvRecognizer", Set.of("code", "security", "verification", "card"), - "UsBankRecognizer", Set.of("check", "save"), - "UsSsnRecognizer", Set.of("social", "security", "id_number"), - "CryptoRecognizer", Set.of("address"), - "PhoneRecognizer", Set.of("call")); - - private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; - private static final String PERSON_ENTITY = "PERSON"; - private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; - private static final Set SPACY_PERSON_KEYWORDS_TO_REMOVE = Set.of("name"); - public static void removeBroadPiiContextKeywords(Handle handle) { - LOG.info("v1131: removing overly broad context keywords from PII recognizers"); - 
boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); - migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); - migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); - LOG.info("v1131: PII recognizer context keyword cleanup complete"); - } - - private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { - String fqnHash = FullyQualifiedName.buildHash(tagFqn); - String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES; - List> rows = - handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); - if (nullOrEmpty(rows)) { - LOG.warn("v1131: tag '{}' not found, skipping", tagFqn); - return; - } - String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); - ObjectNode root; - try { - root = (ObjectNode) JsonUtils.readTree(jsonStr); - } catch (Exception e) { - LOG.warn("v1131: failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); - return; - } - boolean modified = processRecognizers(root, tagFqn); - if (modified) { - String updateSql = isMySQL ? 
UPDATE_MYSQL : UPDATE_POSTGRES; - handle - .createUpdate(updateSql) - .bind(JSON_COLUMN, root.toString()) - .bind(FQN_HASH_COLUMN, fqnHash) - .execute(); - LOG.info("v1131: updated PII recognizer context keywords for tag '{}'", tagFqn); - } else { - LOG.info("v1131: no changes needed for tag '{}'", tagFqn); - } - } - - private static boolean processRecognizers(ObjectNode root, String tagFqn) { - JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); - if (recognizersNode == null || !recognizersNode.isArray()) { - return false; - } - boolean modified = false; - for (JsonNode recognizerNode : recognizersNode) { - if (recognizerNode instanceof ObjectNode recognizer) { - modified |= processRecognizer(recognizer, tagFqn); - } - } - return modified; - } - - private static boolean processRecognizer(ObjectNode recognizer, String tagFqn) { - JsonNode nameNode = recognizer.get(NAME_FIELD); - if (nameNode == null) { - return false; - } - String recognizerName = nameNode.asText(); - JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); - if (!(configNode instanceof ObjectNode config)) { - return false; - } - boolean modified = removeFromKeywordsMap(recognizerName, config); - modified |= removeSpacyPersonKeyword(recognizerName, config, tagFqn); - return modified; - } - - private static boolean removeFromKeywordsMap(String recognizerName, ObjectNode config) { - Set toRemove = KEYWORDS_TO_REMOVE.get(recognizerName); - if (toRemove == null) { - return false; - } - return removeKeywordsFromContext(config, toRemove, recognizerName); - } - - private static boolean removeSpacyPersonKeyword( - String recognizerName, ObjectNode config, String tagFqn) { - if (!SPACY_RECOGNIZER.equals(recognizerName)) { - return false; - } - JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); - if (!isPersonRecognizer(entitiesNode)) { - return false; - } - return removeKeywordsFromContext(config, SPACY_PERSON_KEYWORDS_TO_REMOVE, recognizerName); - } - - private static boolean 
isPersonRecognizer(JsonNode entitiesNode) { - if (entitiesNode == null || !entitiesNode.isArray()) { - return false; - } - for (JsonNode entity : entitiesNode) { - if (PERSON_ENTITY.equals(entity.asText())) { - return true; - } - } - return false; - } - - private static boolean removeKeywordsFromContext( - ObjectNode config, Set toRemove, String recognizerName) { - JsonNode contextNode = config.get(CONTEXT_FIELD); - if (contextNode == null || !contextNode.isArray()) { - return false; - } - ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); - boolean removed = false; - for (JsonNode keyword : contextNode) { - String kw = keyword.asText(); - if (toRemove.contains(kw)) { - LOG.info("v1131: removing keyword '{}' from {} context", kw, recognizerName); - removed = true; - } else { - newContext.add(keyword); - } - } - if (removed) { - config.set(CONTEXT_FIELD, newContext); - } - return removed; + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); } } From 70b42091f361dc9068b35cc713136c59f5b65ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Tue, 16 Jun 2026 09:36:17 -0400 Subject: [PATCH 3/8] revert: undo broad-keyword removal; refactor TagProcessor integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts 031424c907 (fix: remove overly broad context keywords) and 16fc5297ff (refactor: extract PiiRecognizerMigrationUtil). The integration test was testing PIIProcessor behaviour (hardcoded recognizers → hardcoded tags) rather than the TagProcessor, which fetches its tags and recognizers from the server. Replace the explicit PII classification/tag creation fixtures in conftest.py with nothing — the test now relies on the server's seeded PII tags, which is what the TagProcessor actually uses at runtime. Add academic_year_code INTEGER column (values 1999–2006) to the test table. 
Assert it receives no tags, covering the false-positive case where CvvRecognizer broad context keywords ("code") caused year-valued columns to be labelled PII.Sensitive. Co-Authored-By: Claude Sonnet 4.6 --- .../native/1.12.13/mysql/schemaChanges.sql | 4 - .../native/1.12.13/postgres/schemaChanges.sql | 4 - .../native/1.13.1/mysql/schemaChanges.sql | 5 - .../native/1.13.1/postgres/schemaChanges.sql | 5 - .../auto_classification/databases/conftest.py | 505 ------ .../auto_classification/databases/init.sql | 21 +- .../databases/test_tag_processor.py | 9 +- .../migration/mysql/v11213/Migration.java | 25 - .../migration/mysql/v1131/Migration.java | 25 - .../migration/postgres/v11213/Migration.java | 25 - .../migration/postgres/v1131/Migration.java | 25 - .../utils/PiiRecognizerMigrationUtil.java | 178 -- .../migration/utils/v11213/MigrationUtil.java | 12 - .../migration/utils/v1131/MigrationUtil.java | 12 - .../data/tags/piiTagsWithRecognizers.json | 1522 +++++++++-------- 15 files changed, 781 insertions(+), 1596 deletions(-) delete mode 100644 bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql delete mode 100644 bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java delete mode 100644 
openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java diff --git a/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql deleted file mode 100644 index 5cf17d3480c0..000000000000 --- a/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql +++ /dev/null @@ -1,4 +0,0 @@ --- PII recognizer context keyword cleanup: remove overly broad context keywords --- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") --- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql deleted file mode 100644 index 5cf17d3480c0..000000000000 --- a/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql +++ /dev/null @@ -1,4 +0,0 @@ --- PII recognizer context keyword cleanup: remove overly broad context keywords --- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") --- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql index 274d95f1e0ac..6b01929401aa 100644 --- a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql @@ -29,8 +29,3 @@ CREATE INDEX worksheet_entity_name_index ON worksheet_entity (name); -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), -- which exceeds MySQL's 3072-byte index key limit (utf8mb4), and the table is small -- enough that the reindex cursor sort is not a concern. 
- --- PII recognizer context keyword cleanup: remove overly broad context keywords --- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") --- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql index 7ba0e1f78e86..27aef87ddc27 100644 --- a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql @@ -30,8 +30,3 @@ CREATE INDEX IF NOT EXISTS worksheet_entity_name_index ON worksheet_entity (name -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), too -- wide to fit a btree index row, and the table is small enough that the reindex cursor -- sort is not a concern. - --- PII recognizer context keyword cleanup: remove overly broad context keywords --- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") --- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. 
diff --git a/ingestion/tests/integration/auto_classification/databases/conftest.py b/ingestion/tests/integration/auto_classification/databases/conftest.py index a93267d6824c..18cfdb05092f 100644 --- a/ingestion/tests/integration/auto_classification/databases/conftest.py +++ b/ingestion/tests/integration/auto_classification/databases/conftest.py @@ -3,29 +3,7 @@ import pytest from testcontainers.postgres import PostgresContainer -from _openmetadata_testutils.factories.metadata.generated.schema.api.classification.create_classification import ( - CreateClassificationRequestFactory, -) -from _openmetadata_testutils.factories.metadata.generated.schema.api.classification.create_tag import ( - CreateTagRequestFactory, -) -from _openmetadata_testutils.factories.metadata.generated.schema.type.recognizer import ( - RecognizerFactory, -) from _openmetadata_testutils.helpers.docker import try_bind -from metadata.generated.schema.api.classification.createClassification import ( - CreateClassificationRequest, -) -from metadata.generated.schema.api.classification.createTag import CreateTagRequest -from metadata.generated.schema.entity.classification.classification import ( - Classification, - ConflictResolution, -) -from metadata.generated.schema.entity.classification.tag import Tag -from metadata.generated.schema.type.piiEntity import PIIEntity -from metadata.generated.schema.type.predefinedRecognizer import Name -from metadata.generated.schema.type.recognizer import Recognizer -from metadata.ingestion.ometa.ometa_api import OpenMetadata @pytest.fixture(scope="module") @@ -38,486 +16,3 @@ def postgres_container(): with try_bind(container, 5432, 5432) if not os.getenv("CI") else container as container: yield container - - -@pytest.fixture(scope="session") -def credit_card_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="credit_card_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.CreditCardRecognizer, - ) - - 
-@pytest.fixture(scope="session") -def aba_routing_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="aba_routing_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AbaRoutingRecognizer, - ) - - -@pytest.fixture(scope="session") -def crypto_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="crypto_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.CryptoRecognizer, - ) - - -@pytest.fixture(scope="session") -def date_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="date_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.DateRecognizer, - ) - - -@pytest.fixture(scope="session") -def email_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="email_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EmailRecognizer, - ) - - -@pytest.fixture(scope="session") -def iban_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="iban_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.IbanRecognizer, - ) - - -@pytest.fixture(scope="session") -def ip_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="ip_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.IpRecognizer, - ) - - -@pytest.fixture(scope="session") -def nhs_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="nhs_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.NhsRecognizer, - ) - - -@pytest.fixture(scope="session") -def medical_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="medical_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.MedicalLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def phone_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="phone_recognizer", - 
recognizerConfig__type="predefined", - recognizerConfig__name=Name.PhoneRecognizer, - ) - - -@pytest.fixture(scope="session") -def sg_fin_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="sg_fin_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SgFinRecognizer, - ) - - -@pytest.fixture(scope="session") -def url_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="url_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UrlRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_bank_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_bank_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsBankRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_itin_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_itin_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsItinRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def us_ssn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="us_ssn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UsSsnRecognizer, - ) - - -@pytest.fixture(scope="session") -def es_nif_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="es_nif_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EsNifRecognizer, - ) - - -@pytest.fixture(scope="session") 
-def pii_spacy_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="spacy_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SpacyRecognizer, - recognizerConfig__supportedEntities=[ - PIIEntity.PERSON, - ], - ) - - -@pytest.fixture(scope="session") -def non_pii_spacy_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="spacy_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.SpacyRecognizer, - recognizerConfig__supportedEntities=[ - PIIEntity.LOCATION, - PIIEntity.DATE_TIME, - ], - ) - - -@pytest.fixture(scope="session") -def au_abn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_abn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuAbnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_acn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_acn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuAcnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_tfn_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_tfn_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuTfnRecognizer, - ) - - -@pytest.fixture(scope="session") -def au_medicare_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="au_medicare_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.AuMedicareRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_driver_license_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_driver_license_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItDriverLicenseRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_fiscal_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_fiscal_code_recognizer", - recognizerConfig__type="predefined", - 
recognizerConfig__name=Name.ItFiscalCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_vat_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_vat_code_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItVatCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_identity_card_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_identity_card_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItIdentityCardRecognizer, - ) - - -@pytest.fixture(scope="session") -def it_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="it_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.ItPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_pan_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_pan_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InPanRecognizer, - ) - - -@pytest.fixture(scope="session") -def pl_pesel_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="pl_pesel_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.PlPeselRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_aadhaar_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_aadhaar_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InAadhaarRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_vehicle_registration_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_vehicle_registration_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InVehicleRegistrationRecognizer, - ) - - -@pytest.fixture(scope="session") -def sg_uen_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="sg_uen_recognizer", - recognizerConfig__type="predefined", - 
recognizerConfig__name=Name.SgUenRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_voter_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_voter_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InVoterRecognizer, - ) - - -@pytest.fixture(scope="session") -def in_passport_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="in_passport_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.InPassportRecognizer, - ) - - -@pytest.fixture(scope="session") -def fi_personal_identity_code_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="fi_personal_identity_code_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.FiPersonalIdentityCodeRecognizer, - ) - - -@pytest.fixture(scope="session") -def es_nie_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="es_nie_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.EsNieRecognizer, - ) - - -@pytest.fixture(scope="session") -def uk_nino_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="uk_nino_recognizer", - recognizerConfig__type="predefined", - recognizerConfig__name=Name.UkNinoRecognizer, - ) - - -@pytest.fixture(scope="session") -def person_column_name_recognizer() -> Recognizer: - return RecognizerFactory.create( - name="person_column_name_recognizer", - recognizerConfig__type="pattern", - recognizerConfig__patterns__0__regex=r"^.*(user|client|person|first|last|maiden|nick).*(name).*$", - for_column_name=True, - ) - - -@pytest.fixture(scope="session") -def pii_classification(metadata: OpenMetadata[Classification, CreateClassificationRequest]) -> Classification: - create_classification_request = CreateClassificationRequestFactory.create( - fqn="PII", - autoClassificationConfig__conflictResolution=ConflictResolution.highest_priority.value, - ) - entity = 
metadata.create_or_update(create_classification_request) - - return entity # noqa: RET504 - - -@pytest.fixture(scope="session") -def sensitive_pii_tag( - metadata: OpenMetadata[Tag, CreateTagRequest], - pii_classification: Classification, - credit_card_recognizer: Recognizer, - aba_routing_recognizer: Recognizer, - crypto_recognizer: Recognizer, - email_recognizer: Recognizer, - iban_recognizer: Recognizer, - nhs_recognizer: Recognizer, - medical_license_recognizer: Recognizer, - sg_fin_recognizer: Recognizer, - us_bank_recognizer: Recognizer, - us_itin_recognizer: Recognizer, - us_license_recognizer: Recognizer, - us_passport_recognizer: Recognizer, - us_ssn_recognizer: Recognizer, - es_nif_recognizer: Recognizer, - pii_spacy_recognizer: Recognizer, - au_abn_recognizer: Recognizer, - au_acn_recognizer: Recognizer, - au_tfn_recognizer: Recognizer, - au_medicare_recognizer: Recognizer, - it_driver_license_recognizer: Recognizer, - it_fiscal_code_recognizer: Recognizer, - it_vat_code_recognizer: Recognizer, - it_identity_card_recognizer: Recognizer, - it_passport_recognizer: Recognizer, - in_pan_recognizer: Recognizer, - pl_pesel_recognizer: Recognizer, - in_aadhaar_recognizer: Recognizer, - sg_uen_recognizer: Recognizer, - in_voter_recognizer: Recognizer, - in_passport_recognizer: Recognizer, - fi_personal_identity_code_recognizer: Recognizer, - es_nie_recognizer: Recognizer, - uk_nino_recognizer: Recognizer, - person_column_name_recognizer: Recognizer, -) -> Tag: - create_tag_request: CreateTagRequest = CreateTagRequestFactory.create( - tag_name="Sensitive", - tag_classification=pii_classification.fullyQualifiedName.root, - autoClassificationPriority=100, - recognizers=[ - credit_card_recognizer, - aba_routing_recognizer, - crypto_recognizer, - email_recognizer, - iban_recognizer, - nhs_recognizer, - medical_license_recognizer, - sg_fin_recognizer, - us_bank_recognizer, - us_itin_recognizer, - us_license_recognizer, - us_passport_recognizer, - us_ssn_recognizer, - 
es_nif_recognizer, - pii_spacy_recognizer, - au_abn_recognizer, - au_acn_recognizer, - au_tfn_recognizer, - au_medicare_recognizer, - it_driver_license_recognizer, - it_fiscal_code_recognizer, - it_vat_code_recognizer, - it_identity_card_recognizer, - it_passport_recognizer, - in_pan_recognizer, - pl_pesel_recognizer, - in_aadhaar_recognizer, - sg_uen_recognizer, - in_voter_recognizer, - in_passport_recognizer, - fi_personal_identity_code_recognizer, - es_nie_recognizer, - uk_nino_recognizer, - person_column_name_recognizer, - ], - ) - return metadata.create_or_update(create_tag_request) - - -@pytest.fixture(scope="session") -def non_sensitive_pii_tag( - metadata: OpenMetadata[Tag, CreateTagRequest], - pii_classification: Classification, - date_recognizer: Recognizer, - phone_recognizer: Recognizer, - non_pii_spacy_recognizer: Recognizer, -) -> Tag: - create_tag_request: CreateTagRequest = CreateTagRequestFactory.create( - tag_name="NonSensitive", - tag_classification=pii_classification.fullyQualifiedName.root, - autoClassificationPriority=80, - recognizers=[ - date_recognizer, - phone_recognizer, - non_pii_spacy_recognizer, - ], - ) - return metadata.create_or_update(create_tag_request) diff --git a/ingestion/tests/integration/auto_classification/databases/init.sql b/ingestion/tests/integration/auto_classification/databases/init.sql index 3ce571c34f4b..31f8bebb10f7 100644 --- a/ingestion/tests/integration/auto_classification/databases/init.sql +++ b/ingestion/tests/integration/auto_classification/databases/init.sql @@ -10,17 +10,18 @@ CREATE TABLE example_table ( DWH_X20 VARCHAR(255), timestamp BIGINT, version VARCHAR(50), - order_date DATE + order_date DATE, + academic_year_code INTEGER ); -- Insert sample data -INSERT INTO example_table (NHS_number, DWH_X10, user_name, address, DWH_X20, timestamp, version, order_date) +INSERT INTO example_table (NHS_number, DWH_X10, user_name, address, DWH_X20, timestamp, version, order_date, academic_year_code) VALUES - 
('999-064-3601', 'harsha@gmail.com', 'Harsha', '2240 W Ina Rd', '4242-4242-4242-4242', 1760000000123, 'v1', '2018-01-05'), - ('999-468-5678', 'suresh@gmail.com', 'Suresh', '7192 Kalanianaole Hwy', '5555-5555-5555-4444', 1760000000131, 'v1.0', '2018-01-09'), - ('999-813-4595', 'stelle@gmail.com', 'Stelle', '5900 N Cannon Ave', '4000-0566-5566-5556', 1760000000149, 'v1.1', '2018-01-12'), - ('999-313-2993', 'peter@gmail.com', 'Peter', '4350 Main St', '2223-0031-2200-3222', 1760000000156, 'v2', '2018-01-22'), - ('999-911-7562', 'teddy@gmail.com', 'Theodore', '903 W Main St', '5200-8282-8282-8210', 1760000000164, 'v3', '2018-01-26'), - ('999-595-6195', 'akash@gmail.com', 'Akash', '2220 Coit Rd', '5105-1051-0510-5100', 1760000000172, 'v1', '2018-01-28'), - ('999-056-4418', 'mary@gmail.com', 'Mary', '7 Southside Dr', '5328-7101-2269-1668', 1760000000180, 'V1', '2018-01-29'), - ('999-329-1099', 'chirag@gmail.com', 'Chirag', '2929 S 25th Ave', '4801-8451-4627-0484', 1760000000198, 'v4', '2018-01-31'); + ('999-064-3601', 'harsha@gmail.com', 'Harsha', '2240 W Ina Rd', '4242-4242-4242-4242', 1760000000123, 'v1', '2018-01-05', 1999), + ('999-468-5678', 'suresh@gmail.com', 'Suresh', '7192 Kalanianaole Hwy', '5555-5555-5555-4444', 1760000000131, 'v1.0', '2018-01-09', 2000), + ('999-813-4595', 'stelle@gmail.com', 'Stelle', '5900 N Cannon Ave', '4000-0566-5566-5556', 1760000000149, 'v1.1', '2018-01-12', 2001), + ('999-313-2993', 'peter@gmail.com', 'Peter', '4350 Main St', '2223-0031-2200-3222', 1760000000156, 'v2', '2018-01-22', 2002), + ('999-911-7562', 'teddy@gmail.com', 'Theodore', '903 W Main St', '5200-8282-8282-8210', 1760000000164, 'v3', '2018-01-26', 2003), + ('999-595-6195', 'akash@gmail.com', 'Akash', '2220 Coit Rd', '5105-1051-0510-5100', 1760000000172, 'v1', '2018-01-28', 2004), + ('999-056-4418', 'mary@gmail.com', 'Mary', '7 Southside Dr', '5328-7101-2269-1668', 1760000000180, 'V1', '2018-01-29', 2005), + ('999-329-1099', 'chirag@gmail.com', 'Chirag', '2929 S 25th 
Ave', '4801-8451-4627-0484', 1760000000198, 'v4', '2018-01-31', 2006); diff --git a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py index ee700c9218aa..8430b36c10b5 100644 --- a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py +++ b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py @@ -7,10 +7,6 @@ from metadata.generated.schema.api.services.createDatabaseService import ( CreateDatabaseServiceRequest, ) -from metadata.generated.schema.entity.classification.classification import ( - Classification, -) -from metadata.generated.schema.entity.classification.tag import Tag from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, ) @@ -118,9 +114,6 @@ def autoclassification_config(db_service, bot_workflow_config, sink_config): @pytest.fixture(scope="module") def run_autoclassification( - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, run_workflow, load_metadata: MetadataWorkflow, autoclassification_config, @@ -134,6 +127,7 @@ def test_it_returns_the_expected_classifications( run_autoclassification: AutoClassificationWorkflow, ) -> None: ( + academic_year_code_column, address_column, customer_id_column, dwh_x10_column, @@ -187,3 +181,4 @@ def test_it_returns_the_expected_classifications( reason=Contains("Detected by `ValidatedDateRecognizer`", "Patterns matched:"), ), ] + assert academic_year_code_column.tags == [] diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java deleted file mode 100644 index bda8a4e55c79..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java +++ /dev/null @@ -1,25 +0,0 
@@ -package org.openmetadata.service.migration.mysql.v11213; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.openmetadata.service.migration.api.MigrationProcessImpl; -import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v11213.MigrationUtil; - -@Slf4j -public class Migration extends MigrationProcessImpl { - - public Migration(MigrationFile migrationFile) { - super(migrationFile); - } - - @Override - @SneakyThrows - public void runDataMigration() { - try { - MigrationUtil.removeBroadPiiContextKeywords(handle); - } catch (Exception e) { - LOG.error("v11213: failed to remove broad PII context keywords", e); - } - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java deleted file mode 100644 index 490c2376dd45..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.openmetadata.service.migration.mysql.v1131; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.openmetadata.service.migration.api.MigrationProcessImpl; -import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v1131.MigrationUtil; - -@Slf4j -public class Migration extends MigrationProcessImpl { - - public Migration(MigrationFile migrationFile) { - super(migrationFile); - } - - @Override - @SneakyThrows - public void runDataMigration() { - try { - MigrationUtil.removeBroadPiiContextKeywords(handle); - } catch (Exception e) { - LOG.error("v1131: failed to remove broad PII context keywords", e); - } - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java 
b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java deleted file mode 100644 index 9cf6155b2992..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.openmetadata.service.migration.postgres.v11213; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.openmetadata.service.migration.api.MigrationProcessImpl; -import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v11213.MigrationUtil; - -@Slf4j -public class Migration extends MigrationProcessImpl { - - public Migration(MigrationFile migrationFile) { - super(migrationFile); - } - - @Override - @SneakyThrows - public void runDataMigration() { - try { - MigrationUtil.removeBroadPiiContextKeywords(handle); - } catch (Exception e) { - LOG.error("v11213: failed to remove broad PII context keywords", e); - } - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java deleted file mode 100644 index 4e0da26700a7..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.openmetadata.service.migration.postgres.v1131; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.openmetadata.service.migration.api.MigrationProcessImpl; -import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v1131.MigrationUtil; - -@Slf4j -public class Migration extends MigrationProcessImpl { - - public Migration(MigrationFile migrationFile) { - super(migrationFile); - } - - @Override - @SneakyThrows - public void runDataMigration() { - try { - 
MigrationUtil.removeBroadPiiContextKeywords(handle); - } catch (Exception e) { - LOG.error("v1131: failed to remove broad PII context keywords", e); - } - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java deleted file mode 100644 index 59145e35623f..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java +++ /dev/null @@ -1,178 +0,0 @@ -package org.openmetadata.service.migration.utils; - -import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import java.util.List; -import java.util.Map; -import java.util.Set; -import lombok.extern.slf4j.Slf4j; -import org.jdbi.v3.core.Handle; -import org.openmetadata.schema.utils.JsonUtils; -import org.openmetadata.service.resources.databases.DatasourceConfig; -import org.openmetadata.service.util.FullyQualifiedName; - -@Slf4j -public class PiiRecognizerMigrationUtil { - private PiiRecognizerMigrationUtil() {} - - private static final String FQN_HASH_COLUMN = "fqnHash"; - private static final String JSON_COLUMN = "json"; - private static final String RECOGNIZERS_FIELD = "recognizers"; - private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; - private static final String CONTEXT_FIELD = "context"; - private static final String NAME_FIELD = "name"; - private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; - - private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; - private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; - - private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; - private static final 
String UPDATE_POSTGRES = - "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; - private static final String SELECT_TAG_POSTGRES = - "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; - - private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; - private static final String PERSON_ENTITY = "PERSON"; - - /** - * Context keywords that are too generic for their respective recognizers and cause false-positive - * PII classification (e.g. ACADEMIC_YEAR_CODE being tagged as CVV because "code" is in context). - */ - private static final Map> BROAD_KEYWORDS_TO_REMOVE = - Map.of( - "CvvRecognizer", Set.of("code", "security", "verification", "card"), - "UsBankRecognizer", Set.of("check", "save"), - "UsSsnRecognizer", Set.of("social", "security", "id_number"), - "CryptoRecognizer", Set.of("address"), - "PhoneRecognizer", Set.of("call")); - - private static final Set SPACY_PERSON_BROAD_KEYWORDS = Set.of("name"); - - public static void removeBroadPiiContextKeywords(Handle handle) { - LOG.info("Removing overly broad context keywords from PII recognizers"); - boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); - migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); - migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); - LOG.info("PII recognizer context keyword cleanup complete"); - } - - private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { - String fqnHash = FullyQualifiedName.buildHash(tagFqn); - String selectSql = isMySQL ? 
SELECT_TAG : SELECT_TAG_POSTGRES; - List> rows = - handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); - if (nullOrEmpty(rows)) { - LOG.warn("Tag '{}' not found, skipping PII recognizer keyword cleanup", tagFqn); - return; - } - String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); - ObjectNode root; - try { - root = (ObjectNode) JsonUtils.readTree(jsonStr); - } catch (Exception e) { - LOG.warn("Failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); - return; - } - boolean modified = processRecognizers(root); - if (modified) { - String updateSql = isMySQL ? UPDATE_MYSQL : UPDATE_POSTGRES; - handle - .createUpdate(updateSql) - .bind(JSON_COLUMN, root.toString()) - .bind(FQN_HASH_COLUMN, fqnHash) - .execute(); - LOG.info("Updated PII recognizer context keywords for tag '{}'", tagFqn); - } else { - LOG.info("No broad PII context keywords found in tag '{}'", tagFqn); - } - } - - private static boolean processRecognizers(ObjectNode root) { - JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); - if (recognizersNode == null || !recognizersNode.isArray()) { - return false; - } - boolean modified = false; - for (JsonNode recognizerNode : recognizersNode) { - if (recognizerNode instanceof ObjectNode recognizer) { - modified |= processRecognizer(recognizer); - } - } - return modified; - } - - private static boolean processRecognizer(ObjectNode recognizer) { - JsonNode nameNode = recognizer.get(NAME_FIELD); - if (nameNode == null) { - return false; - } - String recognizerName = nameNode.asText(); - JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); - if (!(configNode instanceof ObjectNode config)) { - return false; - } - boolean modified = removeFromBroadKeywordsMap(recognizerName, config); - modified |= removeSpacyPersonBroadKeywords(recognizerName, config); - return modified; - } - - private static boolean removeFromBroadKeywordsMap(String recognizerName, ObjectNode config) { - Set toRemove = 
BROAD_KEYWORDS_TO_REMOVE.get(recognizerName); - if (toRemove == null) { - return false; - } - return removeKeywordsFromContext(config, toRemove, recognizerName); - } - - private static boolean removeSpacyPersonBroadKeywords(String recognizerName, ObjectNode config) { - if (!SPACY_RECOGNIZER.equals(recognizerName)) { - return false; - } - JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); - if (!isPersonRecognizer(entitiesNode)) { - return false; - } - return removeKeywordsFromContext(config, SPACY_PERSON_BROAD_KEYWORDS, recognizerName); - } - - private static boolean isPersonRecognizer(JsonNode entitiesNode) { - if (entitiesNode == null || !entitiesNode.isArray()) { - return false; - } - boolean found = false; - for (JsonNode entity : entitiesNode) { - if (PERSON_ENTITY.equals(entity.asText())) { - found = true; - } - } - return found; - } - - private static boolean removeKeywordsFromContext( - ObjectNode config, Set toRemove, String recognizerName) { - JsonNode contextNode = config.get(CONTEXT_FIELD); - if (contextNode == null || !contextNode.isArray()) { - return false; - } - ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); - boolean removed = false; - for (JsonNode keyword : contextNode) { - String kw = keyword.asText(); - if (toRemove.contains(kw)) { - LOG.info("Removing broad keyword '{}' from {} context", kw, recognizerName); - removed = true; - } else { - newContext.add(keyword); - } - } - if (removed) { - config.set(CONTEXT_FIELD, newContext); - } - return removed; - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java deleted file mode 100644 index f047f33f00f5..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java +++ /dev/null @@ -1,12 +0,0 @@ -package 
org.openmetadata.service.migration.utils.v11213; - -import org.jdbi.v3.core.Handle; -import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; - -public class MigrationUtil { - private MigrationUtil() {} - - public static void removeBroadPiiContextKeywords(Handle handle) { - PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java deleted file mode 100644 index 4a0363c5ace5..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.openmetadata.service.migration.utils.v1131; - -import org.jdbi.v3.core.Handle; -import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; - -public class MigrationUtil { - private MigrationUtil() {} - - public static void removeBroadPiiContextKeywords(Handle handle) { - PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); - } -} diff --git a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json index e18ab08e0a2a..825b11101d2a 100644 --- a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json +++ b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json @@ -51,7 +51,8 @@ "telephone", "cell", "cellphone", - "mobile" + "mobile", + "call" ] }, "confidenceThreshold": 0.6, @@ -215,837 +216,850 @@ "description": "PII which if lost, compromised, or disclosed without authorization, could result in substantial harm, embarrassment, inconvenience, or unfairness to an individual.", "autoClassificationEnabled": true, "autoClassificationPriority": 100, - "recognizers": [ - { - "name": 
"EnglishCreditCardRecognizer", - "displayName": "English Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "en", - "context": [ - "credit", - "card", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment", - "cc_number", - "card_number", - "payment_info" - ] + "recognizers": + [ + { + "name": "EnglishCreditCardRecognizer", + "displayName": "English Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "en", + "context": [ + "credit", + "card", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment", + "cc_number", + "card_number", + "payment_info" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "SpanishCreditCardRecognizer", - "displayName": "Spanish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "es", - "context": [ - "tarjeta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment" - ] + { + "name": "SpanishCreditCardRecognizer", + "displayName": "Spanish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + 
"supportedLanguage": "es", + "context": [ + "tarjeta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItalianCreditCardRecognizer", - "displayName": "Italian Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "it", - "context": [ - "carta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] + { + "name": "ItalianCreditCardRecognizer", + "displayName": "Italian Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "it", + "context": [ + "carta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "PolishCreditCardRecognizer", - "displayName": "Polish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "pl", - "context": [ - "karta", - "kredytowa", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] + { + "name": "PolishCreditCardRecognizer", + "displayName": "Polish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + 
"enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "pl", + "context": [ + "karta", + "kredytowa", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "CvvRecognizer", - "displayName": "CVV Recognizer", - "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "patterns": [ - { - "name": "cvv_pattern", - "regex": "\\b\\d{3,4}\\b", - "score": 0.5 - } - ], - "context": [ - "cvv", - "cvc", - "cvv2", - "cid", - "csc" - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + { + "name": "CvvRecognizer", + "displayName": "CVV Recognizer", + "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "patterns": [ + { + "name": "cvv_pattern", + "regex": "\\b\\d{3,4}\\b", + "score": 0.5 + } + ], + "context": [ + "cvv", + "cvc", + "security", + "code", + "verification", + "card", + "cvv2", + "cid", + "csc" + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + }, + "supportedLanguage": "en" }, - "supportedLanguage": "en" + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UsBankRecognizer", - "displayName": "Us Bank Recognizer", - "description": "Recognizes US bank number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UsBankRecognizer", - "supportedLanguage": "en", - "context": [ - "account", - "acct", - "bank", - "debit", - "bank_account", - "account_number" - ] + "displayName": 
"Us Bank Recognizer", + "description": "Recognizes US bank number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UsBankRecognizer", + "supportedLanguage": "en", + "context": [ + "check", + "account", + "acct", + "bank", + "save", + "debit", + "bank_account", + "bank", + "account_number" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UsLicenseRecognizer", - "displayName": "Us License Recognizer", - "description": "Recognizes US driver license using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UsLicenseRecognizer", - "supportedLanguage": "en" + "displayName": "Us License Recognizer", + "description": "Recognizes US driver license using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UsLicenseRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UsItinRecognizer", - "displayName": "Us Itin Recognizer", - "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UsItinRecognizer", - "supportedLanguage": "en" + "displayName": "Us Itin Recognizer", + "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UsItinRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UsPassportRecognizer", - "displayName": "Us Passport Recognizer", - "description": "Recognizes US 
Passport number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UsPassportRecognizer", - "supportedLanguage": "en" + "displayName": "Us Passport Recognizer", + "description": "Recognizes US Passport number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UsPassportRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UsSsnRecognizer", - "displayName": "Us Ssn Recognizer", - "description": "Recognize US Social Security Number (SSN) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UsSsnRecognizer", - "supportedLanguage": "en", - "context": [ - "ssn", - "ssns", - "ssid", - "national_id" - ] + "displayName": "Us Ssn Recognizer", + "description": "Recognize US Social Security Number (SSN) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UsSsnRecognizer", + "supportedLanguage": "en", + "context": [ + "social", + "security", + "ssn", + "ssns", + "ssid", + "national_id", + "id_number" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "NhsRecognizer", - "displayName": "Nhs Recognizer", - "description": "Recognizes NHS number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "NhsRecognizer", - "supportedLanguage": "en", - "context": [ - "nhs", - "national_health_service", - "nhs_number" - ] + "displayName": "Nhs Recognizer", + "description": "Recognizes NHS number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": 
"NhsRecognizer", + "supportedLanguage": "en", + "context": [ + "nhs", + "national_health_service", + "nhs_number" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "UkNinoRecognizer", - "displayName": "Uk Nino Recognizer", - "description": "Recognizes UK National Insurance Number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "UkNinoRecognizer", - "supportedLanguage": "en" + "displayName": "Uk Nino Recognizer", + "description": "Recognizes UK National Insurance Number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "UkNinoRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "SgFinRecognizer", - "displayName": "Sg Fin Recognizer", - "description": "Recognize SG FIN/NRIC number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "SgFinRecognizer", - "supportedLanguage": "en" + "displayName": "Sg Fin Recognizer", + "description": "Recognize SG FIN/NRIC number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SgFinRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "AuAbnRecognizer", - "displayName": "Au Abn Recognizer", - "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "AuAbnRecognizer", - "supportedLanguage": "en" + "displayName": "Au Abn Recognizer", + "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AuAbnRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "AuAcnRecognizer", - "displayName": "Au Acn Recognizer", - "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "AuAcnRecognizer", - "supportedLanguage": "en" + "displayName": "Au Acn Recognizer", + "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AuAcnRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "AuTfnRecognizer", - "displayName": "Au Tfn Recognizer", - "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "AuTfnRecognizer", - "supportedLanguage": "en" + "displayName": "Au Tfn Recognizer", + "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AuTfnRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "AuMedicareRecognizer", - "displayName": "Au Medicare Recognizer", - "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "AuMedicareRecognizer", - "supportedLanguage": "en" + "displayName": "Au Medicare Recognizer", + "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AuMedicareRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "InPanRecognizer", - "displayName": "In Pan Recognizer", - "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "InPanRecognizer", - "supportedLanguage": "en" + "displayName": "In Pan Recognizer", + "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InPanRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "InAadhaarRecognizer", - "displayName": "In Aadhaar Recognizer", - "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "InAadhaarRecognizer", - "supportedLanguage": "en" + "displayName": "In Aadhaar Recognizer", + "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InAadhaarRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "InVehicleRegistrationRecognizer", - "displayName": "In Vehicle Registration Recognizer", - "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "InVehicleRegistrationRecognizer", - "supportedLanguage": "en" + "displayName": "In Vehicle Registration Recognizer", + "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InVehicleRegistrationRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "InPassportRecognizer", - "displayName": "In Passport Recognizer", - "description": "Recognizes Indian Passport Number.

Indian Passport Number is a eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "InPassportRecognizer", - "supportedLanguage": "en" + "displayName": "In Passport Recognizer", + "description": "Recognizes Indian Passport Number.

Indian Passport Number is a eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InPassportRecognizer", + "supportedLanguage": "en" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "EsNifRecognizer", - "displayName": "Es Nif Recognizer", - "description": "Recognize NIF number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "EsNifRecognizer", - "supportedLanguage": "es" + "displayName": "Es Nif Recognizer", + "description": "Recognize NIF number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EsNifRecognizer", + "supportedLanguage": "es" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "EsNieRecognizer", - "displayName": "Es Nie Recognizer", - "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "EsNieRecognizer", - "supportedLanguage": "es" + "displayName": "Es Nie Recognizer", + "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EsNieRecognizer", + "supportedLanguage": "es" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItDriverLicenseRecognizer", - "displayName": "It Driver License Recognizer", - "description": "Recognizes IT Driver License using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "ItDriverLicenseRecognizer", - "supportedLanguage": "it" + "displayName": "It Driver License Recognizer", + "description": "Recognizes IT Driver License using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItDriverLicenseRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItFiscalCodeRecognizer", - "displayName": "It Fiscal Code Recognizer", - "description": "Recognizes IT Fiscal Code using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "ItFiscalCodeRecognizer", - "supportedLanguage": "it" + "displayName": "It Fiscal Code Recognizer", + "description": "Recognizes IT Fiscal Code using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItFiscalCodeRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItVatCodeRecognizer", - "displayName": "It Vat Code Recognizer", - "description": "Recognizes Italian VAT code using regex 
and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "ItVatCodeRecognizer", - "supportedLanguage": "it" + "displayName": "It Vat Code Recognizer", + "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItVatCodeRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItIdentityCardRecognizer", - "displayName": "It Identity Card Recognizer", - "description": "Recognizes Italian Identity Card number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "ItIdentityCardRecognizer", - "supportedLanguage": "it" + "displayName": "It Identity Card Recognizer", + "description": "Recognizes Italian Identity Card number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItIdentityCardRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItPassportRecognizer", - "displayName": "It Passport Recognizer", - "description": "Recognizes IT Passport number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "ItPassportRecognizer", - "supportedLanguage": "it" + "displayName": "It Passport Recognizer", + "description": "Recognizes IT Passport number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItPassportRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "PlPeselRecognizer", - "displayName": "Pl Pesel Recognizer", - "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "PlPeselRecognizer", - "supportedLanguage": "pl" + "displayName": "Pl Pesel Recognizer", + "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "PlPeselRecognizer", + "supportedLanguage": "pl" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "CryptoRecognizer", - "displayName": "Crypto Recognizer", - "description": "Recognize common crypto account numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "CryptoRecognizer", - "context": [ - "crypto", - "bitcoin", - "btc", - "ethereum", - "eth", - "litecoin", - "ltc", - "wallet" - ] + "displayName": "Crypto Recognizer", + "description": "Recognize common crypto account numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CryptoRecognizer", + "context": [ + "crypto", + "bitcoin", + "btc", + "ethereum", + "eth", + "litecoin", + "ltc", + "wallet", + "address" + ] + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "EmailRecognizer", - "displayName": "Email Recognizer", - "description": "Recognize email addresses using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EmailRecognizer" + { + "name": "EmailRecognizer", + "displayName": "Email Recognizer", + "description": "Recognize email addresses using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EmailRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "IbanRecognizer", - "displayName": "Iban Recognizer", - "description": "Recognize IBAN code using regex and checksum.", - "enabled": true, - 
"isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "IbanRecognizer" + { + "name": "IbanRecognizer", + "displayName": "Iban Recognizer", + "description": "Recognize IBAN code using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IbanRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "IpRecognizer", - "displayName": "Ip Recognizer", - "description": "Recognize IP address using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "IpRecognizer" + { + "name": "IpRecognizer", + "displayName": "Ip Recognizer", + "description": "Recognize IP address using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IpRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "MedicalLicenseRecognizer", - "displayName": "Medical License Recognizer", - "description": "Recognize common Medical license numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "MedicalLicenseRecognizer" + { + "name": "MedicalLicenseRecognizer", + "displayName": "Medical License Recognizer", + "description": "Recognize common Medical license numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "MedicalLicenseRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "InVoterRecognizer", - "displayName": "In Voter Recognizer", - "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVoterRecognizer" + { + "name": "InVoterRecognizer", + "displayName": "In Voter Recognizer", + "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InVoterRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "AbaRoutingRecognizer", - "displayName": "ABA Routing Recognizer", - "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AbaRoutingRecognizer" + { + "name": "AbaRoutingRecognizer", + "displayName": "ABA Routing Recognizer", + "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AbaRoutingRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "FiPersonalIdentityCodeRecognizer", - "displayName": "FI Personal Identity Code Recognizer", - "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "FiPersonalIdentityCodeRecognizer", - "supportedLanguage": "fi" + "displayName": "FI Personal Identity Code Recognizer", + "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "FiPersonalIdentityCodeRecognizer", + "supportedLanguage": "fi" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "SgUenRecognizer", - "displayName": "Singaporean UEN recognizer", - "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgUenRecognizer" + { + "name": "SgUenRecognizer", + "displayName": "Singaporean UEN recognizer", + "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SgUenRecognizer" + }, + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "SpacyRecognizer", - "displayName": "Recognizer using spaCy NLP model", - "description": "Recognize PII entities using a spaCy NLP 
model.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", + { "name": "SpacyRecognizer", - "supportedEntities": [ - "PERSON" - ], - "context": [ - "first_name", - "last_name", - "given_name", - "firstName", - "lastName", - "givenName", - "familyName" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "displayName": "US SSN column name", - "name": "us_ssn", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_ssn_pattern_0", - "regex": "^.*(ssn|social).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "displayName": "Recognizer using spaCy NLP model", + "description": "Recognize PII entities using a spaCy NLP model.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SpacyRecognizer", + "supportedEntities": [ + "PERSON" + ], + "context": [ + "name", + "first_name", + "last_name", + "given_name", + "firstName", + "lastName", + "givenName", + "familyName" + ] }, - "context": [] + "confidenceThreshold": 0.6, + "target": "content" }, - "confidenceThreshold": 0.6, - "target": "column_name" - }, - { - "displayName": "Credit card column name", - "name": "credit_card", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "credit_card_pattern_0", - "regex": "^.*(credit).*(card).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + { + "displayName": "US SSN column name", + "name": "us_ssn", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + 
"supportedLanguage": "en", + "patterns": [ + { + "name": "us_ssn_pattern_0", + "regex": "^.*(ssn|social).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + }, + "context": [] }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" }, - "confidenceThreshold": 0.6, - "target": "column_name" - }, - { - "displayName": "US bank number column name", - "name": "us_bank_number", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_bank_number_pattern_0", - "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", - "score": 0.6 + { + "displayName": "Credit card column name", + "name": "credit_card", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "credit_card_pattern_0", + "regex": "^.*(credit).*(card).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - { - "name": "us_bank_number_pattern_1", - "regex": "\\bbank[_-]?(account|number|num|no)?\\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - } + "context": [] + }, + "confidenceThreshold": 0.6, + "target": "column_name" }, - "confidenceThreshold": 0.6, - "target": "column_name" - }, - { - "displayName": "Iban code column name", - "name": "iban_code", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "iban_code_pattern_0", - "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_1", - "regex": 
"\bbank[_-]?(account|number|num|no)?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_2", - "regex": "\biban(?:[_]?(number|code))?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_3", - "regex": "\bbank[_]?iban\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_4", - "regex": "\binternational[_]?(account|bank[_]?number)\b", - "score": 0.6 + { + "displayName": "US bank number column name", + "name": "us_bank_number", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_bank_number_pattern_0", + "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", + "score": 0.6 + }, + { + "name": "us_bank_number_pattern_1", + "regex": "\\bbank[_-]?(account|number|num|no)?\\b", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" }, - "confidenceThreshold": 0.6, - "target": "column_name" - }, - { - "displayName": "Email address column name", - "name": "email_address", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "email_address_pattern_0", - "regex": "^(email|e-mail|mail)(.*address)?$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + { + "displayName": "Iban code column name", + "name": "iban_code", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "iban_code_pattern_0", + "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", + 
"score": 0.6 + }, + { + "name": "iban_code_pattern_1", + "regex": "\bbank[_-]?(account|number|num|no)?\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_2", + "regex": "\biban(?:[_]?(number|code))?\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_3", + "regex": "\bbank[_]?iban\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_4", + "regex": "\binternational[_]?(account|bank[_]?number)\b", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + }, + "context": [] }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" }, - "confidenceThreshold": 0.6, - "target": "column_name" - }, - { - "displayName": "Person column name", - "name": "person", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "person_pattern_0", - "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + { + "displayName": "Email address column name", + "name": "email_address", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "email_address_pattern_0", + "regex": "^(email|e-mail|mail)(.*address)?$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + }, + "context": [] }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" }, - "confidenceThreshold": 0.6, - "target": "column_name" - } - ] + { + "displayName": "Person column name", + "name": "person", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": 
"en", + "patterns": [ + { + "name": "person_pattern_0", + "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + }, + "context": [] + }, + "confidenceThreshold": 0.6, + "target": "column_name" + } + ] } ] -} +} \ No newline at end of file From dc80aa307d5ca1eb37e8fc5d269b24508c2c8113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Tue, 16 Jun 2026 11:28:57 -0400 Subject: [PATCH 4/8] fix(pii): restore broad-keyword removal fix with migration log/null-check hardening Reapplies 68480d978b/88d6b192f3 after they were accidentally reverted alongside the TagProcessor test refactor. Addresses prior review feedback: PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords now takes a version label so v1131/v11213 log lines stay attributable, and migrateTag guards against a null tag.json column before toString(). Updates the academic_year_code_column assertion to match actual behavior: the CVV/broad-keyword false positive is fixed, but a separate SpacyRecognizer DATE_TIME false positive on year-like integers remains (tracked in #29083). 
Fixes #29049 Co-Authored-By: Claude Sonnet 4.6 --- .../native/1.12.13/mysql/schemaChanges.sql | 4 + .../native/1.12.13/postgres/schemaChanges.sql | 4 + .../native/1.13.1/mysql/schemaChanges.sql | 5 + .../native/1.13.1/postgres/schemaChanges.sql | 5 + .../databases/test_tag_processor.py | 10 +- .../migration/mysql/v11213/Migration.java | 25 + .../migration/mysql/v1131/Migration.java | 25 + .../migration/postgres/v11213/Migration.java | 25 + .../migration/postgres/v1131/Migration.java | 25 + .../utils/PiiRecognizerMigrationUtil.java | 178 ++ .../migration/utils/v11213/MigrationUtil.java | 12 + .../migration/utils/v1131/MigrationUtil.java | 12 + .../data/tags/piiTagsWithRecognizers.json | 1528 ++++++++--------- 13 files changed, 1086 insertions(+), 772 deletions(-) create mode 100644 bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql create mode 100644 bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java diff --git a/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql new file mode 100644 index 000000000000..5cf17d3480c0 --- /dev/null +++ 
b/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql new file mode 100644 index 000000000000..5cf17d3480c0 --- /dev/null +++ b/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql @@ -0,0 +1,4 @@ +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql index 6b01929401aa..274d95f1e0ac 100644 --- a/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/mysql/schemaChanges.sql @@ -29,3 +29,8 @@ CREATE INDEX worksheet_entity_name_index ON worksheet_entity (name); -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), -- which exceeds MySQL's 3072-byte index key limit (utf8mb4), and the table is small -- enough that the reindex cursor sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. 
diff --git a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql index 27aef87ddc27..7ba0e1f78e86 100644 --- a/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.1/postgres/schemaChanges.sql @@ -30,3 +30,8 @@ CREATE INDEX IF NOT EXISTS worksheet_entity_name_index ON worksheet_entity (name -- learning_resource_entity is intentionally omitted: its `name` is varchar(3072), too -- wide to fit a btree index row, and the table is small enough that the reindex cursor -- sort is not a concern. + +-- PII recognizer context keyword cleanup: remove overly broad context keywords +-- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") +-- that caused false-positive PII classification on non-PII columns. +-- Handled by Java data migration in v1131.MigrationUtil.removeBroadPiiContextKeywords. diff --git a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py index 8430b36c10b5..f66901115ec4 100644 --- a/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py +++ b/ingestion/tests/integration/auto_classification/databases/test_tag_processor.py @@ -181,4 +181,12 @@ def test_it_returns_the_expected_classifications( reason=Contains("Detected by `ValidatedDateRecognizer`", "Patterns matched:"), ), ] - assert academic_year_code_column.tags == [] + # SpacyRecognizer's DATE_TIME entity flags 4-digit year-like integers + # regardless of column type or semantics. Tracked separately: #29083. 
+ assert academic_year_code_column.tags == [ + IsInstance(TagLabel) + & HasAttributes( + tagFQN=HasAttributes(root="PII.NonSensitive"), + reason=Contains("Detected by `SpacyRecognizer`"), + ), + ] diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java new file mode 100644 index 000000000000..bda8a4e55c79 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.mysql.v11213; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v11213.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v11213: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java new file mode 100644 index 000000000000..490c2376dd45 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v1131/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.mysql.v1131; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import 
org.openmetadata.service.migration.utils.v1131.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v1131: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java new file mode 100644 index 000000000000..9cf6155b2992 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java @@ -0,0 +1,25 @@ +package org.openmetadata.service.migration.postgres.v11213; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v11213.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v11213: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java new file mode 100644 index 000000000000..4e0da26700a7 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v1131/Migration.java @@ -0,0 +1,25 @@ +package 
org.openmetadata.service.migration.postgres.v1131; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.migration.api.MigrationProcessImpl; +import org.openmetadata.service.migration.utils.MigrationFile; +import org.openmetadata.service.migration.utils.v1131.MigrationUtil; + +@Slf4j +public class Migration extends MigrationProcessImpl { + + public Migration(MigrationFile migrationFile) { + super(migrationFile); + } + + @Override + @SneakyThrows + public void runDataMigration() { + try { + MigrationUtil.removeBroadPiiContextKeywords(handle); + } catch (Exception e) { + LOG.error("v1131: failed to remove broad PII context keywords", e); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java new file mode 100644 index 000000000000..59145e35623f --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java @@ -0,0 +1,178 @@ +package org.openmetadata.service.migration.utils; + +import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import org.jdbi.v3.core.Handle; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.resources.databases.DatasourceConfig; +import org.openmetadata.service.util.FullyQualifiedName; + +@Slf4j +public class PiiRecognizerMigrationUtil { + private PiiRecognizerMigrationUtil() {} + + private static final String FQN_HASH_COLUMN = "fqnHash"; + private static final String JSON_COLUMN = "json"; + private static final String RECOGNIZERS_FIELD = 
"recognizers"; + private static final String RECOGNIZER_CONFIG_FIELD = "recognizerConfig"; + private static final String CONTEXT_FIELD = "context"; + private static final String NAME_FIELD = "name"; + private static final String SUPPORTED_ENTITIES_FIELD = "supportedEntities"; + + private static final String PII_SENSITIVE_FQN = "PII.Sensitive"; + private static final String PII_NON_SENSITIVE_FQN = "PII.NonSensitive"; + + private static final String UPDATE_MYSQL = "UPDATE tag SET json = :json WHERE fqnHash = :fqnHash"; + private static final String UPDATE_POSTGRES = + "UPDATE tag SET json = :json::jsonb WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG = "SELECT json FROM tag WHERE fqnHash = :fqnHash"; + private static final String SELECT_TAG_POSTGRES = + "SELECT json::text AS json FROM tag WHERE fqnHash = :fqnHash"; + + private static final String SPACY_RECOGNIZER = "SpacyRecognizer"; + private static final String PERSON_ENTITY = "PERSON"; + + /** + * Context keywords that are too generic for their respective recognizers and cause false-positive + * PII classification (e.g. ACADEMIC_YEAR_CODE being tagged as CVV because "code" is in context). 
+ */ + private static final Map<String, Set<String>> BROAD_KEYWORDS_TO_REMOVE = + Map.of( + "CvvRecognizer", Set.of("code", "security", "verification", "card"), + "UsBankRecognizer", Set.of("check", "save"), + "UsSsnRecognizer", Set.of("social", "security", "id_number"), + "CryptoRecognizer", Set.of("address"), + "PhoneRecognizer", Set.of("call")); + + private static final Set<String> SPACY_PERSON_BROAD_KEYWORDS = Set.of("name"); + + public static void removeBroadPiiContextKeywords(Handle handle) { + LOG.info("Removing overly broad context keywords from PII recognizers"); + boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); + migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); + migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); + LOG.info("PII recognizer context keyword cleanup complete"); + } + + private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { + String fqnHash = FullyQualifiedName.buildHash(tagFqn); + String selectSql = isMySQL ? SELECT_TAG : SELECT_TAG_POSTGRES; + List<Map<String, Object>> rows = + handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); + if (nullOrEmpty(rows)) { + LOG.warn("Tag '{}' not found, skipping PII recognizer keyword cleanup", tagFqn); + return; + } + String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); + ObjectNode root; + try { + root = (ObjectNode) JsonUtils.readTree(jsonStr); + } catch (Exception e) { + LOG.warn("Failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); + return; + } + boolean modified = processRecognizers(root); + if (modified) { + String updateSql = isMySQL ?
UPDATE_MYSQL : UPDATE_POSTGRES; + handle + .createUpdate(updateSql) + .bind(JSON_COLUMN, root.toString()) + .bind(FQN_HASH_COLUMN, fqnHash) + .execute(); + LOG.info("Updated PII recognizer context keywords for tag '{}'", tagFqn); + } else { + LOG.info("No broad PII context keywords found in tag '{}'", tagFqn); + } + } + + private static boolean processRecognizers(ObjectNode root) { + JsonNode recognizersNode = root.get(RECOGNIZERS_FIELD); + if (recognizersNode == null || !recognizersNode.isArray()) { + return false; + } + boolean modified = false; + for (JsonNode recognizerNode : recognizersNode) { + if (recognizerNode instanceof ObjectNode recognizer) { + modified |= processRecognizer(recognizer); + } + } + return modified; + } + + private static boolean processRecognizer(ObjectNode recognizer) { + JsonNode nameNode = recognizer.get(NAME_FIELD); + if (nameNode == null) { + return false; + } + String recognizerName = nameNode.asText(); + JsonNode configNode = recognizer.get(RECOGNIZER_CONFIG_FIELD); + if (!(configNode instanceof ObjectNode config)) { + return false; + } + boolean modified = removeFromBroadKeywordsMap(recognizerName, config); + modified |= removeSpacyPersonBroadKeywords(recognizerName, config); + return modified; + } + + private static boolean removeFromBroadKeywordsMap(String recognizerName, ObjectNode config) { + Set<String> toRemove = BROAD_KEYWORDS_TO_REMOVE.get(recognizerName); + if (toRemove == null) { + return false; + } + return removeKeywordsFromContext(config, toRemove, recognizerName); + } + + private static boolean removeSpacyPersonBroadKeywords(String recognizerName, ObjectNode config) { + if (!SPACY_RECOGNIZER.equals(recognizerName)) { + return false; + } + JsonNode entitiesNode = config.get(SUPPORTED_ENTITIES_FIELD); + if (!isPersonRecognizer(entitiesNode)) { + return false; + } + return removeKeywordsFromContext(config, SPACY_PERSON_BROAD_KEYWORDS, recognizerName); + } + + private static boolean isPersonRecognizer(JsonNode entitiesNode) { +
if (entitiesNode == null || !entitiesNode.isArray()) { + return false; + } + boolean found = false; + for (JsonNode entity : entitiesNode) { + if (PERSON_ENTITY.equals(entity.asText())) { + found = true; + } + } + return found; + } + + private static boolean removeKeywordsFromContext( + ObjectNode config, Set<String> toRemove, String recognizerName) { + JsonNode contextNode = config.get(CONTEXT_FIELD); + if (contextNode == null || !contextNode.isArray()) { + return false; + } + ArrayNode newContext = JsonUtils.getObjectMapper().createArrayNode(); + boolean removed = false; + for (JsonNode keyword : contextNode) { + String kw = keyword.asText(); + if (toRemove.contains(kw)) { + LOG.info("Removing broad keyword '{}' from {} context", kw, recognizerName); + removed = true; + } else { + newContext.add(keyword); + } + } + if (removed) { + config.set(CONTEXT_FIELD, newContext); + } + return removed; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java new file mode 100644 index 000000000000..f047f33f00f5 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java @@ -0,0 +1,12 @@ +package org.openmetadata.service.migration.utils.v11213; + +import org.jdbi.v3.core.Handle; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; + +public class MigrationUtil { + private MigrationUtil() {} + + public static void removeBroadPiiContextKeywords(Handle handle) { + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java new file mode 100644 index 000000000000..4a0363c5ace5 --- /dev/null +++
b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java @@ -0,0 +1,12 @@ +package org.openmetadata.service.migration.utils.v1131; + +import org.jdbi.v3.core.Handle; +import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; + +public class MigrationUtil { + private MigrationUtil() {} + + public static void removeBroadPiiContextKeywords(Handle handle) { + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); + } +} diff --git a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json index 825b11101d2a..e18ab08e0a2a 100644 --- a/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json +++ b/openmetadata-service/src/main/resources/json/data/tags/piiTagsWithRecognizers.json @@ -51,8 +51,7 @@ "telephone", "cell", "cellphone", - "mobile", - "call" + "mobile" ] }, "confidenceThreshold": 0.6, @@ -216,850 +215,837 @@ "description": "PII which if lost, compromised, or disclosed without authorization, could result in substantial harm, embarrassment, inconvenience, or unfairness to an individual.", "autoClassificationEnabled": true, "autoClassificationPriority": 100, - "recognizers": - [ - { - "name": "EnglishCreditCardRecognizer", - "displayName": "English Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "en", - "context": [ - "credit", - "card", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment", - "cc_number", - "card_number", - "payment_info" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "recognizers": [ + { + "name": "EnglishCreditCardRecognizer", + "displayName": "English Credit 
Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "en", + "context": [ + "credit", + "card", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment", + "cc_number", + "card_number", + "payment_info" + ] }, - { - "name": "SpanishCreditCardRecognizer", - "displayName": "Spanish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "es", - "context": [ - "tarjeta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro", - "instapayment" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpanishCreditCardRecognizer", + "displayName": "Spanish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "es", + "context": [ + "tarjeta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro", + "instapayment" + ] }, - { - "name": "ItalianCreditCardRecognizer", - "displayName": "Italian Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "it", - "context": [ - "carta", - "credito", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - 
"diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItalianCreditCardRecognizer", + "displayName": "Italian Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "it", + "context": [ + "carta", + "credito", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "PolishCreditCardRecognizer", - "displayName": "Polish Credit Card Recognizer", - "description": "Recognize common credit card numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CreditCardRecognizer", - "supportedLanguage": "pl", - "context": [ - "karta", - "kredytowa", - "visa", - "mastercard", - "cc", - "amex", - "discover", - "jcb", - "diners", - "maestro" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PolishCreditCardRecognizer", + "displayName": "Polish Credit Card Recognizer", + "description": "Recognize common credit card numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "CreditCardRecognizer", + "supportedLanguage": "pl", + "context": [ + "karta", + "kredytowa", + "visa", + "mastercard", + "cc", + "amex", + "discover", + "jcb", + "diners", + "maestro" + ] }, - { - "name": "CvvRecognizer", - "displayName": "CVV Recognizer", - "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "patterns": [ - { - "name": "cvv_pattern", - "regex": "\\b\\d{3,4}\\b", - "score": 0.5 - } - 
], - "context": [ - "cvv", - "cvc", - "security", - "code", - "verification", - "card", - "cvv2", - "cid", - "csc" - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "supportedLanguage": "en" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CvvRecognizer", + "displayName": "CVV Recognizer", + "description": "Recognize CVV/CVC codes (3-4 digit card verification values).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "patterns": [ + { + "name": "cvv_pattern", + "regex": "\\b\\d{3,4}\\b", + "score": 0.5 + } + ], + "context": [ + "cvv", + "cvc", + "cvv2", + "cid", + "csc" + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsBankRecognizer", + "displayName": "Us Bank Recognizer", + "description": "Recognizes US bank number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsBankRecognizer", - "displayName": "Us Bank Recognizer", - "description": "Recognizes US bank number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsBankRecognizer", - "supportedLanguage": "en", - "context": [ - "check", - "account", - "acct", - "bank", - "save", - "debit", - "bank_account", - "bank", - "account_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "account", + "acct", + "bank", + "debit", + "bank_account", + "account_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsLicenseRecognizer", + "displayName": "Us License Recognizer", + "description": "Recognizes US driver license using regex.", + "enabled": true, + "isSystemDefault": true, + 
"recognizerConfig": { + "type": "predefined", "name": "UsLicenseRecognizer", - "displayName": "Us License Recognizer", - "description": "Recognizes US driver license using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsLicenseRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsItinRecognizer", + "displayName": "Us Itin Recognizer", + "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsItinRecognizer", - "displayName": "Us Itin Recognizer", - "description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsItinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsPassportRecognizer", + "displayName": "Us Passport Recognizer", + "description": "Recognizes US Passport number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsPassportRecognizer", - "displayName": "Us Passport Recognizer", - "description": "Recognizes US Passport number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UsSsnRecognizer", + "displayName": "Us Ssn Recognizer", + "description": "Recognize US 
Social Security Number (SSN) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UsSsnRecognizer", - "displayName": "Us Ssn Recognizer", - "description": "Recognize US Social Security Number (SSN) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "UsSsnRecognizer", - "supportedLanguage": "en", - "context": [ - "social", - "security", - "ssn", - "ssns", - "ssid", - "national_id", - "id_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "ssn", + "ssns", + "ssid", + "national_id" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "NhsRecognizer", + "displayName": "Nhs Recognizer", + "description": "Recognizes NHS number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "NhsRecognizer", - "displayName": "Nhs Recognizer", - "description": "Recognizes NHS number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "NhsRecognizer", - "supportedLanguage": "en", - "context": [ - "nhs", - "national_health_service", - "nhs_number" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en", + "context": [ + "nhs", + "national_health_service", + "nhs_number" + ] }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "UkNinoRecognizer", + "displayName": "Uk Nino Recognizer", + "description": "Recognizes UK National Insurance Number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "UkNinoRecognizer", - "displayName": "Uk Nino Recognizer", - "description": "Recognizes UK National Insurance Number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { 
- "type": "predefined", - "name": "UkNinoRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgFinRecognizer", + "displayName": "Sg Fin Recognizer", + "description": "Recognize SG FIN/NRIC number using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "SgFinRecognizer", - "displayName": "Sg Fin Recognizer", - "description": "Recognize SG FIN/NRIC number using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgFinRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAbnRecognizer", + "displayName": "Au Abn Recognizer", + "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAbnRecognizer", - "displayName": "Au Abn Recognizer", - "description": "Recognizes Australian Business Number (\"ABN\").

The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier

with two leading check digits.

The leading check digits are derived using a modulus 89 calculation.

This recognizer identifies ABN using regex, context words and checksum.

Reference: https://abr.business.gov.au/Help/AbnFormat", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAbnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuAcnRecognizer", + "displayName": "Au Acn Recognizer", + "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuAcnRecognizer", - "displayName": "Au Acn Recognizer", - "description": "Recognizes Australian Company Number (\"ACN\").

The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies ACN using regex, context words, and checksum.

Reference: https://asic.gov.au/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuAcnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuTfnRecognizer", + "displayName": "Au Tfn Recognizer", + "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuTfnRecognizer", - "displayName": "Au Tfn Recognizer", - "description": "Recognizes Australian Tax File Numbers (\"TFN\").

The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity \\u2014 an individual, company,

superannuation fund, partnership, or trust.

The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.

TFN includes a check digit for detecting erroneous number based on simple modulo 11.

This recognizer uses regex, context words,

and checksum to identify TFN.

Reference: https://www.ato.gov.au/individuals/tax-file-number/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuTfnRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AuMedicareRecognizer", + "displayName": "Au Medicare Recognizer", + "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "AuMedicareRecognizer", - "displayName": "Au Medicare Recognizer", - "description": "Recognizes Australian Medicare number using regex, context words, and checksum.

Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system.

It uses a modulus 10 checksum scheme to validate the number.

Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AuMedicareRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPanRecognizer", + "displayName": "In Pan Recognizer", + "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPanRecognizer", - "displayName": "In Pan Recognizer", - "description": "Recognizes Indian Permanent Account Number (\"PAN\").

The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.

This recognizer identifies PAN using regex and context words.

Reference: https://en.wikipedia.org/wiki/Permanent_account_number\nhttps://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPanRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InAadhaarRecognizer", + "displayName": "In Aadhaar Recognizer", + "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InAadhaarRecognizer", - "displayName": "In Aadhaar Recognizer", - "description": "Recognizes Indian UIDAI Person Identification Number (\"AADHAAR\").

Reference: https://en.wikipedia.org/wiki/Aadhaar

A 12 digit unique number that is issued to each individual by Government of India", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InAadhaarRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVehicleRegistrationRecognizer", + "displayName": "In Vehicle Registration Recognizer", + "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InVehicleRegistrationRecognizer", - "displayName": "In Vehicle Registration Recognizer", - "description": "Recognizes Indian Vehicle Registration Number issued by RTO.

Reference(s):

https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India\nhttps://en.wikipedia.org/wiki/Regional_Transport_Office\nhttps://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India

The registration scheme changed over time with multiple formats in play over the years

India has multiple active patterns for registration plates issued to different vehicle categories", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVehicleRegistrationRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InPassportRecognizer", + "displayName": "In Passport Recognizer", + "description": "Recognizes Indian Passport Number.

Indian Passport Number is an eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "InPassportRecognizer", - "displayName": "In Passport Recognizer", - "description": "Recognizes Indian Passport Number.

Indian Passport Number is a eight digit alphanumeric number.

Reference:

https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InPassportRecognizer", - "supportedLanguage": "en" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "en" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNifRecognizer", + "displayName": "Es Nif Recognizer", + "description": "Recognize NIF number using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNifRecognizer", - "displayName": "Es Nif Recognizer", - "description": "Recognize NIF number using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNifRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EsNieRecognizer", + "displayName": "Es Nie Recognizer", + "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "EsNieRecognizer", - "displayName": "Es Nie Recognizer", - "description": "Recognize NIE number using regex and checksum.

Reference(s):

https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero\nhttps://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EsNieRecognizer", - "supportedLanguage": "es" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "es" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItDriverLicenseRecognizer", + "displayName": "It Driver License Recognizer", + "description": "Recognizes IT Driver License using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItDriverLicenseRecognizer", - "displayName": "It Driver License Recognizer", - "description": "Recognizes IT Driver License using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItDriverLicenseRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItFiscalCodeRecognizer", + "displayName": "It Fiscal Code Recognizer", + "description": "Recognizes IT Fiscal Code using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItFiscalCodeRecognizer", - "displayName": "It Fiscal Code Recognizer", - "description": "Recognizes IT Fiscal Code using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItFiscalCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" - }, - { - "name": "ItVatCodeRecognizer", - "displayName": "It Vat Code Recognizer", - "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItVatCodeRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { - "name": "ItIdentityCardRecognizer", - "displayName": "It Identity Card Recognizer", - "description": "Recognizes Italian Identity Card number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItIdentityCardRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItVatCodeRecognizer", + "displayName": "It Vat Code Recognizer", + "description": "Recognizes Italian VAT code using regex and checksum.

For more information about italian VAT code:

https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItVatCodeRecognizer", + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItIdentityCardRecognizer", + "displayName": "It Identity Card Recognizer", + "description": "Recognizes Italian Identity Card number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "ItIdentityCardRecognizer", + "supportedLanguage": "it" + }, + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "ItPassportRecognizer", + "displayName": "It Passport Recognizer", + "description": "Recognizes IT Passport number using case-insensitive regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "ItPassportRecognizer", - "displayName": "It Passport Recognizer", - "description": "Recognizes IT Passport number using case-insensitive regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "ItPassportRecognizer", - "supportedLanguage": "it" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "it" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "PlPeselRecognizer", + "displayName": "Pl Pesel Recognizer", + "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "PlPeselRecognizer", - "displayName": "Pl Pesel Recognizer", - "description": "Recognize PESEL number using regex and checksum.

For more information about PESEL: https://en.wikipedia.org/wiki/PESEL", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "PlPeselRecognizer", - "supportedLanguage": "pl" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "pl" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "CryptoRecognizer", + "displayName": "Crypto Recognizer", + "description": "Recognize common crypto account numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "CryptoRecognizer", - "displayName": "Crypto Recognizer", - "description": "Recognize common crypto account numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "CryptoRecognizer", - "context": [ - "crypto", - "bitcoin", - "btc", - "ethereum", - "eth", - "litecoin", - "ltc", - "wallet", - "address" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "context": [ + "crypto", + "bitcoin", + "btc", + "ethereum", + "eth", + "litecoin", + "ltc", + "wallet" + ] }, - { - "name": "EmailRecognizer", - "displayName": "Email Recognizer", - "description": "Recognize email addresses using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "EmailRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "EmailRecognizer", + "displayName": "Email Recognizer", + "description": "Recognize email addresses using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "EmailRecognizer" }, - { - "name": "IbanRecognizer", - "displayName": "Iban Recognizer", - "description": "Recognize IBAN code using regex and checksum.", - "enabled": true, - "isSystemDefault": true, - 
"recognizerConfig": { - "type": "predefined", - "name": "IbanRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IbanRecognizer", + "displayName": "Iban Recognizer", + "description": "Recognize IBAN code using regex and checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IbanRecognizer" }, - { - "name": "IpRecognizer", - "displayName": "Ip Recognizer", - "description": "Recognize IP address using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "IpRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "IpRecognizer", + "displayName": "Ip Recognizer", + "description": "Recognize IP address using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "IpRecognizer" }, - { - "name": "MedicalLicenseRecognizer", - "displayName": "Medical License Recognizer", - "description": "Recognize common Medical license numbers using regex + checksum.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "MedicalLicenseRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "MedicalLicenseRecognizer", + "displayName": "Medical License Recognizer", + "description": "Recognize common Medical license numbers using regex + checksum.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "MedicalLicenseRecognizer" }, - { - "name": "InVoterRecognizer", - "displayName": "In Voter Recognizer", - "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "InVoterRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "InVoterRecognizer", + "displayName": "In Voter Recognizer", + "description": "Recognize Indian Voter/Election Id(EPIC).

The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18

Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "InVoterRecognizer" }, - { - "name": "AbaRoutingRecognizer", - "displayName": "ABA Routing Recognizer", - "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "AbaRoutingRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "AbaRoutingRecognizer", + "displayName": "ABA Routing Recognizer", + "description": "Recognize American Banking Association (ABA) routing number.

Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "AbaRoutingRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "FiPersonalIdentityCodeRecognizer", + "displayName": "FI Personal Identity Code Recognizer", + "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", "name": "FiPersonalIdentityCodeRecognizer", - "displayName": "FI Personal Identity Code Recognizer", - "description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "FiPersonalIdentityCodeRecognizer", - "supportedLanguage": "fi" - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedLanguage": "fi" }, - { - "name": "SgUenRecognizer", - "displayName": "Singaporean UEN recognizer", - "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SgUenRecognizer" - }, - "confidenceThreshold": 0.6, - "target": "content" + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SgUenRecognizer", + "displayName": "Singaporean UEN recognizer", + "description": "Recognize Singapore UEN (Unique Entity Number) using regex.", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "predefined", + "name": "SgUenRecognizer" }, - { + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "name": "SpacyRecognizer", + "displayName": "Recognizer using spaCy NLP model", + "description": "Recognize PII entities using a spaCy NLP model.", + "enabled": true, + "isSystemDefault": true, + 
"recognizerConfig": { + "type": "predefined", "name": "SpacyRecognizer", - "displayName": "Recognizer using spaCy NLP model", - "description": "Recognize PII entities using a spaCy NLP model.", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "predefined", - "name": "SpacyRecognizer", - "supportedEntities": [ - "PERSON" - ], - "context": [ - "name", - "first_name", - "last_name", - "given_name", - "firstName", - "lastName", - "givenName", - "familyName" - ] - }, - "confidenceThreshold": 0.6, - "target": "content" + "supportedEntities": [ + "PERSON" + ], + "context": [ + "first_name", + "last_name", + "given_name", + "firstName", + "lastName", + "givenName", + "familyName" + ] }, - { - "displayName": "US SSN column name", - "name": "us_ssn", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_ssn_pattern_0", - "regex": "^.*(ssn|social).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "content" + }, + { + "displayName": "US SSN column name", + "name": "us_ssn", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_ssn_pattern_0", + "regex": "^.*(ssn|social).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Credit card column name", - "name": "credit_card", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - 
"name": "credit_card_pattern_0", - "regex": "^.*(credit).*(card).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Credit card column name", + "name": "credit_card", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "credit_card_pattern_0", + "regex": "^.*(credit).*(card).*$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "US bank number column name", - "name": "us_bank_number", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "us_bank_number_pattern_0", - "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", - "score": 0.6 - }, - { - "name": "us_bank_number_pattern_1", - "regex": "\\bbank[_-]?(account|number|num|no)?\\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "US bank number column name", + "name": "us_bank_number", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "us_bank_number_pattern_0", + "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", + "score": 0.6 + }, + { + "name": "us_bank_number_pattern_1", + "regex": "\\bbank[_-]?(account|number|num|no)?\\b", + "score": 0.6 } - }, - "confidenceThreshold": 0.6, - "target": "column_name" + ], + 
"regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true + } }, - { - "displayName": "Iban code column name", - "name": "iban_code", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "iban_code_pattern_0", - "regex": "\b(account|acct|acc)[_-]?(number|num|no)\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_1", - "regex": "\bbank[_-]?(account|number|num|no)?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_2", - "regex": "\biban(?:[_]?(number|code))?\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_3", - "regex": "\bbank[_]?iban\b", - "score": 0.6 - }, - { - "name": "iban_code_pattern_4", - "regex": "\binternational[_]?(account|bank[_]?number)\b", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Iban code column name", + "name": "iban_code", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "iban_code_pattern_0", + "regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_1", + "regex": "\\bbank[_-]?(account|number|num|no)?\\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_2", + "regex": "\\biban(?:[_]?(number|code))?\\b", + "score": 0.6 }, - "context": [] + { + "name": "iban_code_pattern_3", + "regex": "\\bbank[_]?iban\\b", + "score": 0.6 + }, + { + "name": "iban_code_pattern_4", + "regex": "\\binternational[_]?(account|bank[_]?number)\\b", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - 
"displayName": "Email address column name", - "name": "email_address", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "email_address_pattern_0", - "regex": "^(email|e-mail|mail)(.*address)?$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Email address column name", + "name": "email_address", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "email_address_pattern_0", + "regex": "^(email|e-mail|mail)(.*address)?$", + "score": 0.6 + } + ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" + "context": [] }, - { - "displayName": "Person column name", - "name": "person", - "description": "A regex recognizer for column names", - "enabled": true, - "isSystemDefault": true, - "recognizerConfig": { - "type": "pattern", - "supportedLanguage": "en", - "patterns": [ - { - "name": "person_pattern_0", - "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", - "score": 0.6 - } - ], - "regexFlags": { - "dotAll": true, - "multiline": true, - "ignoreCase": true - }, - "context": [] + "confidenceThreshold": 0.6, + "target": "column_name" + }, + { + "displayName": "Person column name", + "name": "person", + "description": "A regex recognizer for column names", + "enabled": true, + "isSystemDefault": true, + "recognizerConfig": { + "type": "pattern", + "supportedLanguage": "en", + "patterns": [ + { + "name": "person_pattern_0", + "regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$", + "score": 0.6 + } 
+ ], + "regexFlags": { + "dotAll": true, + "multiline": true, + "ignoreCase": true }, - "confidenceThreshold": 0.6, - "target": "column_name" - } - ] + "context": [] + }, + "confidenceThreshold": 0.6, + "target": "column_name" + } + ] } ] -} \ No newline at end of file +} From 4b2313cb215d98aa272af36370a9791b4f369882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Tue, 16 Jun 2026 15:29:10 -0400 Subject: [PATCH 5/8] fix(pii): include migration util hardening that was missed in the prior commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The version-prefixed logging and null-json guard in PiiRecognizerMigrationUtil were written and spotless-formatted before d78dfeb50c but never staged — that commit captured the pre-edit content checked out from 16fc5297ff. This adds the actual diff. Also drops the unused pii_classification/sensitive_pii_tag/ non_sensitive_pii_tag parameters from test_global_sample_data_config.py; the databases/ conftest.py no longer defines them (server-seeded tags replaced the fixture-created ones), so collection failed with "fixture 'pii_classification' not found". The fixtures were never referenced in the test bodies. 
Co-Authored-By: Claude Sonnet 4.6 --- .../test_global_sample_data_config.py | 10 ------- .../utils/PiiRecognizerMigrationUtil.java | 28 +++++++++++-------- .../migration/utils/v11213/MigrationUtil.java | 2 +- .../migration/utils/v1131/MigrationUtil.java | 2 +- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py b/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py index a0a68413318c..27efa849af7e 100644 --- a/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py +++ b/ingestion/tests/integration/auto_classification/databases/test_global_sample_data_config.py @@ -17,10 +17,6 @@ ProfilerConfiguration, SampleDataIngestionConfig, ) -from metadata.generated.schema.entity.classification.classification import ( - Classification, -) -from metadata.generated.schema.entity.classification.tag import Tag from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, @@ -176,9 +172,6 @@ def _set_global_profiler_config(metadata: OpenMetadata, store: bool): def test_store_sample_data_when_global_config_enabled( db_service: DatabaseService, metadata: OpenMetadata, - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, load_metadata: MetadataWorkflow, run_workflow, autoclassification_config, @@ -201,9 +194,6 @@ def test_store_sample_data_when_global_config_enabled( def test_no_sample_data_when_global_config_disabled( db_service: DatabaseService, metadata: OpenMetadata, - pii_classification: Classification, - sensitive_pii_tag: Tag, - non_sensitive_pii_tag: Tag, load_metadata: MetadataWorkflow, run_workflow, autoclassification_config, diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java 
b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java index 59145e35623f..3986a2fee93a 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/PiiRecognizerMigrationUtil.java @@ -53,29 +53,35 @@ private PiiRecognizerMigrationUtil() {} private static final Set SPACY_PERSON_BROAD_KEYWORDS = Set.of("name"); - public static void removeBroadPiiContextKeywords(Handle handle) { - LOG.info("Removing overly broad context keywords from PII recognizers"); + public static void removeBroadPiiContextKeywords(Handle handle, String version) { + LOG.info("{}: removing overly broad context keywords from PII recognizers", version); boolean isMySQL = Boolean.TRUE.equals(DatasourceConfig.getInstance().isMySQL()); - migrateTag(handle, PII_SENSITIVE_FQN, isMySQL); - migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL); - LOG.info("PII recognizer context keyword cleanup complete"); + migrateTag(handle, PII_SENSITIVE_FQN, isMySQL, version); + migrateTag(handle, PII_NON_SENSITIVE_FQN, isMySQL, version); + LOG.info("{}: PII recognizer context keyword cleanup complete", version); } - private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { + private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL, String version) { String fqnHash = FullyQualifiedName.buildHash(tagFqn); String selectSql = isMySQL ? 
SELECT_TAG : SELECT_TAG_POSTGRES; List> rows = handle.createQuery(selectSql).bind(FQN_HASH_COLUMN, fqnHash).mapToMap().list(); if (nullOrEmpty(rows)) { - LOG.warn("Tag '{}' not found, skipping PII recognizer keyword cleanup", tagFqn); + LOG.warn("{}: tag '{}' not found, skipping PII recognizer keyword cleanup", version, tagFqn); return; } - String jsonStr = rows.getFirst().get(JSON_COLUMN).toString(); + Object jsonValue = rows.getFirst().get(JSON_COLUMN); + if (jsonValue == null) { + LOG.warn( + "{}: tag '{}' has null json, skipping PII recognizer keyword cleanup", version, tagFqn); + return; + } + String jsonStr = jsonValue.toString(); ObjectNode root; try { root = (ObjectNode) JsonUtils.readTree(jsonStr); } catch (Exception e) { - LOG.warn("Failed to parse tag '{}' JSON, skipping: {}", tagFqn, e.getMessage()); + LOG.warn("{}: failed to parse tag '{}' JSON, skipping: {}", version, tagFqn, e.getMessage()); return; } boolean modified = processRecognizers(root); @@ -86,9 +92,9 @@ private static void migrateTag(Handle handle, String tagFqn, boolean isMySQL) { .bind(JSON_COLUMN, root.toString()) .bind(FQN_HASH_COLUMN, fqnHash) .execute(); - LOG.info("Updated PII recognizer context keywords for tag '{}'", tagFqn); + LOG.info("{}: updated PII recognizer context keywords for tag '{}'", version, tagFqn); } else { - LOG.info("No broad PII context keywords found in tag '{}'", tagFqn); + LOG.info("{}: no broad PII context keywords found in tag '{}'", version, tagFqn); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java index f047f33f00f5..94cf786739ac 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java @@ -7,6 +7,6 @@ public class 
MigrationUtil { private MigrationUtil() {} public static void removeBroadPiiContextKeywords(Handle handle) { - PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v11213"); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java index 4a0363c5ace5..280133d2c412 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v1131/MigrationUtil.java @@ -7,6 +7,6 @@ public class MigrationUtil { private MigrationUtil() {} public static void removeBroadPiiContextKeywords(Handle handle) { - PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle); + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v1131"); } } From 3bd9fe378e13810ac94df163eb28b31252130d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Tue, 16 Jun 2026 15:41:02 -0400 Subject: [PATCH 6/8] fix(pii): move broad-keyword removal migration from 1.12.13 to 1.12.12 Co-Authored-By: Claude Sonnet 4.6 --- .../native/{1.12.13 => 1.12.12}/mysql/schemaChanges.sql | 0 .../native/{1.12.13 => 1.12.12}/postgres/schemaChanges.sql | 0 .../migration/mysql/{v11213 => v11212}/Migration.java | 6 +++--- .../migration/postgres/{v11213 => v11212}/Migration.java | 6 +++--- .../migration/utils/{v11213 => v11212}/MigrationUtil.java | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) rename bootstrap/sql/migrations/native/{1.12.13 => 1.12.12}/mysql/schemaChanges.sql (100%) rename bootstrap/sql/migrations/native/{1.12.13 => 1.12.12}/postgres/schemaChanges.sql (100%) rename openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/{v11213 => v11212}/Migration.java (74%) rename 
openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/{v11213 => v11212}/Migration.java (73%) rename openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/{v11213 => v11212}/MigrationUtil.java (80%) diff --git a/bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql similarity index 100% rename from bootstrap/sql/migrations/native/1.12.13/mysql/schemaChanges.sql rename to bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql diff --git a/bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql similarity index 100% rename from bootstrap/sql/migrations/native/1.12.13/postgres/schemaChanges.sql rename to bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java similarity index 74% rename from openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java rename to openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java index bda8a4e55c79..9d87c50b01ae 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11213/Migration.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/mysql/v11212/Migration.java @@ -1,10 +1,10 @@ -package org.openmetadata.service.migration.mysql.v11213; +package org.openmetadata.service.migration.mysql.v11212; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.openmetadata.service.migration.api.MigrationProcessImpl; import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v11213.MigrationUtil; +import 
org.openmetadata.service.migration.utils.v11212.MigrationUtil; @Slf4j public class Migration extends MigrationProcessImpl { @@ -19,7 +19,7 @@ public void runDataMigration() { try { MigrationUtil.removeBroadPiiContextKeywords(handle); } catch (Exception e) { - LOG.error("v11213: failed to remove broad PII context keywords", e); + LOG.error("v11212: failed to remove broad PII context keywords", e); } } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java similarity index 73% rename from openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java rename to openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java index 9cf6155b2992..f1df0a134b64 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11213/Migration.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/postgres/v11212/Migration.java @@ -1,10 +1,10 @@ -package org.openmetadata.service.migration.postgres.v11213; +package org.openmetadata.service.migration.postgres.v11212; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.openmetadata.service.migration.api.MigrationProcessImpl; import org.openmetadata.service.migration.utils.MigrationFile; -import org.openmetadata.service.migration.utils.v11213.MigrationUtil; +import org.openmetadata.service.migration.utils.v11212.MigrationUtil; @Slf4j public class Migration extends MigrationProcessImpl { @@ -19,7 +19,7 @@ public void runDataMigration() { try { MigrationUtil.removeBroadPiiContextKeywords(handle); } catch (Exception e) { - LOG.error("v11213: failed to remove broad PII context keywords", e); + LOG.error("v11212: failed to remove broad PII context keywords", e); } } } diff --git 
a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java similarity index 80% rename from openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java rename to openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java index 94cf786739ac..c10a5ccf3408 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11213/MigrationUtil.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/utils/v11212/MigrationUtil.java @@ -1,4 +1,4 @@ -package org.openmetadata.service.migration.utils.v11213; +package org.openmetadata.service.migration.utils.v11212; import org.jdbi.v3.core.Handle; import org.openmetadata.service.migration.utils.PiiRecognizerMigrationUtil; @@ -7,6 +7,6 @@ public class MigrationUtil { private MigrationUtil() {} public static void removeBroadPiiContextKeywords(Handle handle) { - PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v11213"); + PiiRecognizerMigrationUtil.removeBroadPiiContextKeywords(handle, "v11212"); } } From b4b6777ac1a5f71eccc8fe1b7e94609f743062be Mon Sep 17 00:00:00 2001 From: Eugenio Date: Tue, 16 Jun 2026 23:00:02 +0200 Subject: [PATCH 7/8] Update bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql index 5cf17d3480c0..c832c3279fbe 100644 --- a/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.12.12/mysql/schemaChanges.sql 
@@ -1,4 +1,4 @@ -- PII recognizer context keyword cleanup: remove overly broad context keywords -- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") -- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. +-- Handled by Java data migration in v11212.MigrationUtil.removeBroadPiiContextKeywords. From 7b413da24a1a572c8f5c57170de3a90cfa49721e Mon Sep 17 00:00:00 2001 From: Eugenio Date: Tue, 16 Jun 2026 23:00:24 +0200 Subject: [PATCH 8/8] Update bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .../sql/migrations/native/1.12.12/postgres/schemaChanges.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql index 5cf17d3480c0..c832c3279fbe 100644 --- a/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.12.12/postgres/schemaChanges.sql @@ -1,4 +1,4 @@ -- PII recognizer context keyword cleanup: remove overly broad context keywords -- (e.g. "code", "security", "address", "name", "call", "check", "save", "social") -- that caused false-positive PII classification on non-PII columns. --- Handled by Java data migration in v11213.MigrationUtil.removeBroadPiiContextKeywords. +-- Handled by Java data migration in v11212.MigrationUtil.removeBroadPiiContextKeywords.