diff --git a/ingestion/src/metadata/ingestion/connections/source_api_client.py b/ingestion/src/metadata/ingestion/connections/source_api_client.py index ec33da0b4e9c..2738d1f83784 100644 --- a/ingestion/src/metadata/ingestion/connections/source_api_client.py +++ b/ingestion/src/metadata/ingestion/connections/source_api_client.py @@ -24,12 +24,58 @@ response = client.get("/dashboards") # Automatically tracked """ +import re from time import perf_counter from typing import Any, Optional, Union from metadata.ingestion.ometa.client import REST, ClientConfig from metadata.utils.operation_metrics import OperationMetricsState +# Patterns used to detect path segments that are entity identifiers so they can be +# collapsed to "{id}". Source APIs that embed opaque IDs deep in the path (e.g. Sigma's +# /workbooks/{id}/lineage/elements/{elementId}) would otherwise produce a distinct metric +# key per entity, exploding the cardinality of "source_api_calls" and the dynamic field +# count of the ingestion_pipeline search index mapping. See issue #29141. +_UUID_RE = re.compile( + r"^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$", + re.IGNORECASE, +) +_NUMERIC_RE = re.compile(r"^\d+$") +_HEX_TOKEN_RE = re.compile(r"^[0-9a-f]{24,}$", re.IGNORECASE) # Mongo ObjectId & longer hex +_VERSION_RE = re.compile(r"^v\d+$", re.IGNORECASE) # API version segment, e.g. /v1 - not an id +# Opaque identifier: an id-charset token of reasonable length that contains at least one +# digit. Plain path words ("workbooks", "elements", "lineage", ...) have no digit and so +# are preserved; version segments are excluded above. +_OPAQUE_ID_RE = re.compile(r"^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$") + + +def _is_id_segment(part: str) -> bool: + """Return True if a single path segment looks like an entity identifier.""" + if _VERSION_RE.match(part): + return False + return bool( + _UUID_RE.match(part) + or _NUMERIC_RE.match(part) + or _HEX_TOKEN_RE.match(part) + or _OPAQUE_ID_RE.match(part) + ) + + +def normalize_api_path(path: str) -> str: + """ + Normalize an API path for metrics by replacing identifier segments with ``{id}``. + + This keeps the cardinality of recorded operations bounded regardless of how many + source entities are walked. + Example: ``/workbooks/3f2a.../lineage/elements/ab12`` -> ``/workbooks/{id}/lineage/elements/{id}`` + """ + cleaned_parts = [ + "{id}" if _is_id_segment(part) else part + for part in path.split("?")[0].split("/") + if part + ] + return "/" + "/".join(cleaned_parts) if cleaned_parts else "/" + class TrackedREST(REST): """ @@ -76,28 +122,7 @@ def _extract_api_path(self, path: str) -> str: Replaces IDs and UUIDs with placeholders for better aggregation. Example: /dashboard/123-abc -> /dashboard/{id} """ - import re # noqa: PLC0415 - - parts = path.split("?")[0].split("/") # noqa: PLC0207 - cleaned_parts = [] - for part in parts: - if not part: - continue - # Replace UUIDs and numeric IDs with {id} - if re.match( - r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", - part, - re.IGNORECASE, - ): - cleaned_parts.append("{id}") - elif re.match(r"^\d+$", part): - cleaned_parts.append("{id}") - elif re.match(r"^[a-f0-9]{24}$", part, re.IGNORECASE): - # MongoDB-style ObjectIds - cleaned_parts.append("{id}") - else: - cleaned_parts.append(part) - return "/" + "/".join(cleaned_parts) if cleaned_parts else "/" + return normalize_api_path(path) def _record_api_call(self, method: str, path: str, duration_ms: float) -> None: """Record an API call metric.""" diff --git a/ingestion/tests/unit/metadata/ingestion/connections/test_source_api_client.py b/ingestion/tests/unit/metadata/ingestion/connections/test_source_api_client.py new file mode 100644 index 000000000000..7f66e45bb347 --- /dev/null +++ b/ingestion/tests/unit/metadata/ingestion/connections/test_source_api_client.py @@ -0,0 +1,92 @@ +# Copyright 2025 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for source API path normalization used by TrackedREST metrics. + +Regression coverage for #29141: opaque identifier segments embedded deep in a path +(e.g. Sigma's /workbooks/{id}/lineage/elements/{elementId}) must collapse to {id} so +the cardinality of "source_api_calls" stays bounded. +""" + +import pytest + +from metadata.ingestion.connections.source_api_client import ( + _is_id_segment, + normalize_api_path, +) + + +@pytest.mark.parametrize( + "segment", + [ + "12345", # numeric id + "3f2a1b4c-1111-2222-3333-444455556666", # uuid (dashed) + "3f2a1b4c11112222333344445555666a", # uuid (no dashes, 32 hex) + "507f1f77bcf86cd799439011", # mongo objectid (24 hex) + "_8j4kP9x", # sigma-style opaque token with digits + "pg-9f2a1", # hyphenated id with digits + "elem123abc", # alphanumeric element id + ], +) +def test_should_treat_token_as_id_when_segment_is_identifier(segment): + assert _is_id_segment(segment) is True + + +@pytest.mark.parametrize( + "segment", + [ + "workbooks", + "lineage", + "elements", + "pages", + "queries", + "members", + "files", + "auth", + "token", + "v1", # api version, not an id + "v2", + "ab", # too short, no digit + ], +) +def test_should_preserve_segment_when_not_an_identifier(segment): + assert _is_id_segment(segment) is False + + +@pytest.mark.parametrize( + "raw_path,expected", + [ + # The Sigma paths that caused the field explosion in #29141 + ( + "/workbooks/3f2a1b4c-1111-2222-3333-444455556666/lineage/elements/_8j4kP9x", + "/workbooks/{id}/lineage/elements/{id}", + ), + ( + "/workbooks/2hQ9xAbc123/pages/pg-9f2a1/elements", + "/workbooks/{id}/pages/{id}/elements", + ), + ("/files/node-12ab", "/files/{id}"), + ("/members/12345", "/members/{id}"), + # No identifier segments -> unchanged + ("/workbooks", "/workbooks"), + ("/auth/token", "/auth/token"), + # API version segments are preserved + ("/v1/workbooks", "/v1/workbooks"), + # Query strings are stripped + ("/workbooks?page=2", "/workbooks"), + # Edge cases + ("/", "/"), + ("", "/"), + ], +) +def test_should_normalize_identifier_segments_when_extracting_api_path(raw_path, expected): + assert normalize_api_path(raw_path) == expected diff --git a/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json index 299cd73eccbc..89fde6e3ae67 100644 --- a/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json +++ b/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json @@ -378,6 +378,7 @@ "normalizer": "lowercase_normalizer" }, "pipelineStatuses": { + "dynamic": false, "properties": { "pipelineState": { "type": "keyword" diff --git a/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json index 6e8b2131e907..948798c2d24c 100644 --- a/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json +++ b/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json @@ -361,6 +361,7 @@ "type": "keyword" }, "pipelineStatuses": { + "dynamic": false, "properties": { "pipelineState": { "type": "keyword" diff --git a/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json index 38a8601421fe..75e8c0953a93 100644 --- a/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json +++ b/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json @@ -390,6 +390,7 @@ "normalizer": "lowercase_normalizer" }, "pipelineStatuses": { + "dynamic": false, "properties": { "pipelineState": { "type": "keyword" diff --git a/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json index d93e951a2aa6..4074d1740230 100644 --- a/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json +++ b/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json @@ -368,6 +368,7 @@ "type": "keyword" }, "pipelineStatuses": { + "dynamic": false, "properties": { "pipelineState": { "type": "keyword"