Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 47 additions & 22 deletions ingestion/src/metadata/ingestion/connections/source_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,58 @@
response = client.get("/dashboards") # Automatically tracked
"""

import re
from time import perf_counter
from typing import Any, Optional, Union

from metadata.ingestion.ometa.client import REST, ClientConfig
from metadata.utils.operation_metrics import OperationMetricsState

# Patterns used to detect path segments that are entity identifiers so they can be
# collapsed to "{id}". Source APIs that embed opaque IDs deep in the path (e.g. Sigma's
# /workbooks/{id}/lineage/elements/{elementId}) would otherwise produce a distinct metric
# key per entity, exploding the cardinality of "source_api_calls" and the dynamic field
# count of the ingestion_pipeline search index mapping. See issue #29141.
_UUID_RE = re.compile(
r"^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$",
re.IGNORECASE,
)
_NUMERIC_RE = re.compile(r"^\d+$")
_HEX_TOKEN_RE = re.compile(r"^[0-9a-f]{24,}$", re.IGNORECASE) # Mongo ObjectId & longer hex
_VERSION_RE = re.compile(r"^v\d+$", re.IGNORECASE) # API version segment, e.g. /v1 - not an id
# Opaque identifier: an id-charset token of reasonable length that contains at least one
# digit. Plain path words ("workbooks", "elements", "lineage", ...) have no digit and so
# are preserved; version segments are excluded above.
_OPAQUE_ID_RE = re.compile(r"^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Quality: Opaque-ID regex may collapse legitimate path words containing digits

_OPAQUE_ID_RE = ^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$ treats any segment of length >=4 that contains at least one digit as an entity ID. While this fixes the cardinality explosion (the goal), it will also collapse static, non-identifier path words that happen to contain a digit — e.g. oauth2, utf8, log4j, s3api, ec2, or version-like words such as v1beta1 (the _VERSION_RE guard only matches the exact form ^v\d+$). Those distinct endpoints will all be rewritten to {id} and merged into the same metric key, reducing the usefulness/granularity of source_api_calls for affected connectors.

This does not cause incorrect behavior or the indexing failure (change 2 fully protects the index), so it is minor and largely an accepted trade-off. If finer metric granularity matters, consider tightening the heuristic — e.g. only collapse when the digit ratio/length suggests a token, or exclude segments that are purely [A-Za-z]+\d+-style versioned resource names — or maintain a small allowlist of known static segments. At minimum, the docstring/comment could note that digit-bearing static words are also collapsed.

Was this helpful? React with 👍 / 👎

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 _OPAQUE_ID_RE false-positives on common path words that contain a digit

Any path segment ≥ 4 characters that contains at least one digit will be collapsed to {id}. This catches Sigma-style tokens correctly, but it also catches common API path words: oauth2{id}, api2{id}, v1.2{id}. A path like /oauth2/token would be normalized to /{id}/token and merged with unrelated routes in the recorded metrics.

_VERSION_RE already excludes the common v\d+ form but does not cover compound version or protocol names. Since dynamic: false is the real safety net, this only affects metric usefulness, not correctness — but a test case for oauth2 (or similar) would pin the known behavior and prevent a future "fix" from silently making things worse.



def _is_id_segment(part: str) -> bool:
"""Return True if a single path segment looks like an entity identifier."""
if _VERSION_RE.match(part):
return False
return bool(
_UUID_RE.match(part)
or _NUMERIC_RE.match(part)
or _HEX_TOKEN_RE.match(part)
or _OPAQUE_ID_RE.match(part)
)


def normalize_api_path(path: str) -> str:
"""
Normalize an API path for metrics by replacing identifier segments with ``{id}``.

This keeps the cardinality of recorded operations bounded regardless of how many
source entities are walked.
Example: ``/workbooks/3f2a.../lineage/elements/ab12`` -> ``/workbooks/{id}/lineage/elements/{id}``
"""
cleaned_parts = [
"{id}" if _is_id_segment(part) else part
for part in path.split("?")[0].split("/")
if part
]
return "/" + "/".join(cleaned_parts) if cleaned_parts else "/"


class TrackedREST(REST):
"""
Expand Down Expand Up @@ -76,28 +122,7 @@
Replaces IDs and UUIDs with placeholders for better aggregation.
Example: /dashboard/123-abc -> /dashboard/{id}
"""
import re # noqa: PLC0415

parts = path.split("?")[0].split("/") # noqa: PLC0207
cleaned_parts = []
for part in parts:
if not part:
continue
# Replace UUIDs and numeric IDs with {id}
if re.match(
r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
part,
re.IGNORECASE,
):
cleaned_parts.append("{id}")
elif re.match(r"^\d+$", part):
cleaned_parts.append("{id}")
elif re.match(r"^[a-f0-9]{24}$", part, re.IGNORECASE):
# MongoDB-style ObjectIds
cleaned_parts.append("{id}")
else:
cleaned_parts.append(part)
return "/" + "/".join(cleaned_parts) if cleaned_parts else "/"
return normalize_api_path(path)

def _record_api_call(self, method: str, path: str, duration_ms: float) -> None:
"""Record an API call metric."""
Expand Down Expand Up @@ -125,7 +150,7 @@
data: Any = None,
json: Any = None,
headers: Optional[dict] = None, # noqa: UP045
timeout: Optional[Union[float, tuple[float, float]]] = None, # noqa: UP007, UP045

Check warning on line 153 in ingestion/src/metadata/ingestion/connections/source_api_client.py

View check run for this annotation

SonarQubeCloud / [open-metadata-ingestion] SonarCloud Code Analysis

Use a union type expression for this type hint.

See more on https://sonarcloud.io/project/issues?id=open-metadata-ingestion&issues=AZ7YIwsPTt09cAoY6lV8&open=AZ7YIwsPTt09cAoY6lV8&pullRequest=29149
retries: Optional[int] = None, # noqa: UP045
):
"""POST method with tracking."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Tests for source API path normalization used by TrackedREST metrics.

Regression coverage for #29141: opaque identifier segments embedded deep in a path
(e.g. Sigma's /workbooks/{id}/lineage/elements/{elementId}) must collapse to {id} so
the cardinality of "source_api_calls" stays bounded.
"""

import pytest

from metadata.ingestion.connections.source_api_client import (
_is_id_segment,
normalize_api_path,
)


@pytest.mark.parametrize(
"segment",
[
"12345", # numeric id
"3f2a1b4c-1111-2222-3333-444455556666", # uuid (dashed)
"3f2a1b4c11112222333344445555666a", # uuid (no dashes, 32 hex)
"507f1f77bcf86cd799439011", # mongo objectid (24 hex)
"_8j4kP9x", # sigma-style opaque token with digits
"pg-9f2a1", # hyphenated id with digits
"elem123abc", # alphanumeric element id
],
)
def test_should_treat_token_as_id_when_segment_is_identifier(segment):
assert _is_id_segment(segment) is True


@pytest.mark.parametrize(
"segment",
[
"workbooks",
"lineage",
"elements",
"pages",
"queries",
"members",
"files",
"auth",
"token",
"v1", # api version, not an id
"v2",
"ab", # too short, no digit
],
)
def test_should_preserve_segment_when_not_an_identifier(segment):
assert _is_id_segment(segment) is False


@pytest.mark.parametrize(
"raw_path,expected",
[
# The Sigma paths that caused the field explosion in #29141
(
"/workbooks/3f2a1b4c-1111-2222-3333-444455556666/lineage/elements/_8j4kP9x",
"/workbooks/{id}/lineage/elements/{id}",
),
(
"/workbooks/2hQ9xAbc123/pages/pg-9f2a1/elements",
"/workbooks/{id}/pages/{id}/elements",
),
("/files/node-12ab", "/files/{id}"),
("/members/12345", "/members/{id}"),
# No identifier segments -> unchanged
("/workbooks", "/workbooks"),
("/auth/token", "/auth/token"),
# API version segments are preserved
("/v1/workbooks", "/v1/workbooks"),
# Query strings are stripped
("/workbooks?page=2", "/workbooks"),
# Edge cases
("/", "/"),
("", "/"),
],
)
def test_should_normalize_identifier_segments_when_extracting_api_path(raw_path, expected):
assert normalize_api_path(raw_path) == expected
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@
"normalizer": "lowercase_normalizer"
},
"pipelineStatuses": {
"dynamic": false,
"properties": {
"pipelineState": {
"type": "keyword"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@
"type": "keyword"
},
"pipelineStatuses": {
"dynamic": false,
"properties": {
"pipelineState": {
"type": "keyword"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@
"normalizer": "lowercase_normalizer"
},
"pipelineStatuses": {
"dynamic": false,
"properties": {
"pipelineState": {
"type": "keyword"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@
"type": "keyword"
},
"pipelineStatuses": {
"dynamic": false,
"properties": {
"pipelineState": {
"type": "keyword"
Expand Down
Loading