open-metadata · thiago-costa-fanatic · Jun 17, 2026 · gitar-bot · Jun 17, 2026 · greptile-apps
@@ -24,12 +24,58 @@
     response = client.get("/dashboards")  # Automatically tracked
 """
 
+import re
 from time import perf_counter
 from typing import Any, Optional, Union
 
 from metadata.ingestion.ometa.client import REST, ClientConfig
 from metadata.utils.operation_metrics import OperationMetricsState
 
+# Patterns used to detect path segments that are entity identifiers so they can be
+# collapsed to "{id}". Source APIs that embed opaque IDs deep in the path (e.g. Sigma's
+# /workbooks/{id}/lineage/elements/{elementId}) would otherwise produce a distinct metric
+# key per entity, exploding the cardinality of "source_api_calls" and the dynamic field
+# count of the ingestion_pipeline search index mapping. See issue #29141.
+_UUID_RE = re.compile(
+    r"^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$",
+    re.IGNORECASE,
+)
+_NUMERIC_RE = re.compile(r"^\d+$")
+_HEX_TOKEN_RE = re.compile(r"^[0-9a-f]{24,}$", re.IGNORECASE)  # Mongo ObjectId & longer hex
+_VERSION_RE = re.compile(r"^v\d+$", re.IGNORECASE)  # API version segment, e.g. /v1 - not an id
+# Opaque identifier: an id-charset token of reasonable length that contains at least one
+# digit. Plain path words ("workbooks", "elements", "lineage", ...) have no digit and so
+# are preserved; version segments are excluded above.
+_OPAQUE_ID_RE = re.compile(r"^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$")
+
+
+def _is_id_segment(part: str) -> bool:
+    """Return True if a single path segment looks like an entity identifier."""
+    if _VERSION_RE.match(part):
+        return False
+    return bool(
+        _UUID_RE.match(part)
+        or _NUMERIC_RE.match(part)
+        or _HEX_TOKEN_RE.match(part)
+        or _OPAQUE_ID_RE.match(part)
+    )
+
+
+def normalize_api_path(path: str) -> str:
+    """
+    Normalize an API path for metrics by replacing identifier segments with ``{id}``.
+
+    This keeps the cardinality of recorded operations bounded regardless of how many
+    source entities are walked.
+    Example: ``/workbooks/3f2a.../lineage/elements/ab12`` -> ``/workbooks/{id}/lineage/elements/{id}``
+    """
+    cleaned_parts = [
+        "{id}" if _is_id_segment(part) else part
+        for part in path.split("?")[0].split("/")
+        if part
+    ]
+    return "/" + "/".join(cleaned_parts) if cleaned_parts else "/"
+
 
 class TrackedREST(REST):
     """
@@ -76,28 +122,7 @@
         Replaces IDs and UUIDs with placeholders for better aggregation.
         Example: /dashboard/123-abc -> /dashboard/{id}
         """
-        import re  # noqa: PLC0415
-
-        parts = path.split("?")[0].split("/")  # noqa: PLC0207
-        cleaned_parts = []
-        for part in parts:
-            if not part:
-                continue
-            # Replace UUIDs and numeric IDs with {id}
-            if re.match(
-                r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
-                part,
-                re.IGNORECASE,
-            ):
-                cleaned_parts.append("{id}")
-            elif re.match(r"^\d+$", part):
-                cleaned_parts.append("{id}")
-            elif re.match(r"^[a-f0-9]{24}$", part, re.IGNORECASE):
-                # MongoDB-style ObjectIds
-                cleaned_parts.append("{id}")
-            else:
-                cleaned_parts.append(part)
-        return "/" + "/".join(cleaned_parts) if cleaned_parts else "/"
+        return normalize_api_path(path)
 
     def _record_api_call(self, method: str, path: str, duration_ms: float) -> None:
         """Record an API call metric."""
@@ -125,7 +150,7 @@
        data: Any = None,
        json: Any = None,
        headers: Optional[dict] = None,  # noqa: UP045
        timeout: Optional[Union[float, tuple[float, float]]] = None,  # noqa: UP007, UP045
        retries: Optional[int] = None,  # noqa: UP045
    ):
        """POST method with tracking."""

@@ -0,0 +1,92 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""
+Tests for source API path normalization used by TrackedREST metrics.
+
+Regression coverage for #29141: opaque identifier segments embedded deep in a path
+(e.g. Sigma's /workbooks/{id}/lineage/elements/{elementId}) must collapse to {id} so
+the cardinality of "source_api_calls" stays bounded.
+"""
+
+import pytest
+
+from metadata.ingestion.connections.source_api_client import (
+    _is_id_segment,
+    normalize_api_path,
+)
+
+
+@pytest.mark.parametrize(
+    "segment",
+    [
+        "12345",  # numeric id
+        "3f2a1b4c-1111-2222-3333-444455556666",  # uuid (dashed)
+        "3f2a1b4c11112222333344445555666a",  # uuid (no dashes, 32 hex)
+        "507f1f77bcf86cd799439011",  # mongo objectid (24 hex)
+        "_8j4kP9x",  # sigma-style opaque token with digits
+        "pg-9f2a1",  # hyphenated id with digits
+        "elem123abc",  # alphanumeric element id
+    ],
+)
+def test_should_treat_token_as_id_when_segment_is_identifier(segment):
+    assert _is_id_segment(segment) is True
+
+
+@pytest.mark.parametrize(
+    "segment",
+    [
+        "workbooks",
+        "lineage",
+        "elements",
+        "pages",
+        "queries",
+        "members",
+        "files",
+        "auth",
+        "token",
+        "v1",  # api version, not an id
+        "v2",
+        "ab",  # too short, no digit
+    ],
+)
+def test_should_preserve_segment_when_not_an_identifier(segment):
+    assert _is_id_segment(segment) is False
+
+
+@pytest.mark.parametrize(
+    "raw_path,expected",
+    [
+        # The Sigma paths that caused the field explosion in #29141
+        (
+            "/workbooks/3f2a1b4c-1111-2222-3333-444455556666/lineage/elements/_8j4kP9x",
+            "/workbooks/{id}/lineage/elements/{id}",
+        ),
+        (
+            "/workbooks/2hQ9xAbc123/pages/pg-9f2a1/elements",
+            "/workbooks/{id}/pages/{id}/elements",
+        ),
+        ("/files/node-12ab", "/files/{id}"),
+        ("/members/12345", "/members/{id}"),
+        # No identifier segments -> unchanged
+        ("/workbooks", "/workbooks"),
+        ("/auth/token", "/auth/token"),
+        # API version segments are preserved
+        ("/v1/workbooks", "/v1/workbooks"),
+        # Query strings are stripped
+        ("/workbooks?page=2", "/workbooks"),
+        # Edge cases
+        ("/", "/"),
+        ("", "/"),
+    ],
+)
+def test_should_normalize_identifier_segments_when_extracting_api_path(raw_path, expected):
+    assert normalize_api_path(raw_path) == expected
diff --git a/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/en/ingestion_pipeline_index_mapping.json
@@ -378,6 +378,7 @@
         "normalizer": "lowercase_normalizer"
       },
       "pipelineStatuses": {
+        "dynamic": false,
         "properties": {
           "pipelineState": {
             "type": "keyword"

diff --git a/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/jp/ingestion_pipeline_index_mapping.json
@@ -361,6 +361,7 @@
         "type": "keyword"
       },
       "pipelineStatuses": {
+        "dynamic": false,
         "properties": {
           "pipelineState": {
             "type": "keyword"

diff --git a/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/ru/ingestion_pipeline_index_mapping.json
@@ -390,6 +390,7 @@
         "normalizer": "lowercase_normalizer"
       },
       "pipelineStatuses": {
+        "dynamic": false,
         "properties": {
           "pipelineState": {
             "type": "keyword"

diff --git a/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json b/openmetadata-spec/src/main/resources/elasticsearch/zh/ingestion_pipeline_index_mapping.json
@@ -368,6 +368,7 @@
         "type": "keyword"
       },
       "pipelineStatuses": {
+        "dynamic": false,
         "properties": {
           "pipelineState": {
             "type": "keyword"