-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Fixes #29141: bound source_api_calls metric cardinality and guard ingestion_pipeline index #29149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,12 +24,58 @@ | |
| response = client.get("/dashboards") # Automatically tracked | ||
| """ | ||
|
|
||
| import re | ||
| from time import perf_counter | ||
| from typing import Any, Optional, Union | ||
|
|
||
| from metadata.ingestion.ometa.client import REST, ClientConfig | ||
| from metadata.utils.operation_metrics import OperationMetricsState | ||
|
|
||
| # Patterns used to detect path segments that are entity identifiers so they can be | ||
| # collapsed to "{id}". Source APIs that embed opaque IDs deep in the path (e.g. Sigma's | ||
| # /workbooks/{id}/lineage/elements/{elementId}) would otherwise produce a distinct metric | ||
| # key per entity, exploding the cardinality of "source_api_calls" and the dynamic field | ||
| # count of the ingestion_pipeline search index mapping. See issue #29141. | ||
| _UUID_RE = re.compile( | ||
| r"^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$", | ||
| re.IGNORECASE, | ||
| ) | ||
| _NUMERIC_RE = re.compile(r"^\d+$") | ||
| _HEX_TOKEN_RE = re.compile(r"^[0-9a-f]{24,}$", re.IGNORECASE) # Mongo ObjectId & longer hex | ||
| _VERSION_RE = re.compile(r"^v\d+$", re.IGNORECASE) # API version segment, e.g. /v1 - not an id | ||
| # Opaque identifier: an id-charset token of reasonable length that contains at least one | ||
| # digit. Plain path words ("workbooks", "elements", "lineage", ...) have no digit and so | ||
| # are preserved; version segments are excluded above. | ||
| _OPAQUE_ID_RE = re.compile(r"^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Any path segment ≥ 4 characters that contains at least one digit will be collapsed to
|
||
|
|
||
|
|
||
| def _is_id_segment(part: str) -> bool: | ||
| """Return True if a single path segment looks like an entity identifier.""" | ||
| if _VERSION_RE.match(part): | ||
| return False | ||
| return bool( | ||
| _UUID_RE.match(part) | ||
| or _NUMERIC_RE.match(part) | ||
| or _HEX_TOKEN_RE.match(part) | ||
| or _OPAQUE_ID_RE.match(part) | ||
| ) | ||
|
|
||
|
|
||
| def normalize_api_path(path: str) -> str: | ||
| """ | ||
| Normalize an API path for metrics by replacing identifier segments with ``{id}``. | ||
|
|
||
| This keeps the cardinality of recorded operations bounded regardless of how many | ||
| source entities are walked. | ||
| Example: ``/workbooks/3f2a.../lineage/elements/ab12`` -> ``/workbooks/{id}/lineage/elements/{id}`` | ||
| """ | ||
| cleaned_parts = [ | ||
| "{id}" if _is_id_segment(part) else part | ||
| for part in path.split("?")[0].split("/") | ||
| if part | ||
| ] | ||
| return "/" + "/".join(cleaned_parts) if cleaned_parts else "/" | ||
|
|
||
|
|
||
| class TrackedREST(REST): | ||
| """ | ||
|
|
@@ -76,28 +122,7 @@ | |
| Replaces IDs and UUIDs with placeholders for better aggregation. | ||
| Example: /dashboard/123-abc -> /dashboard/{id} | ||
| """ | ||
| import re # noqa: PLC0415 | ||
|
|
||
| parts = path.split("?")[0].split("/") # noqa: PLC0207 | ||
| cleaned_parts = [] | ||
| for part in parts: | ||
| if not part: | ||
| continue | ||
| # Replace UUIDs and numeric IDs with {id} | ||
| if re.match( | ||
| r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", | ||
| part, | ||
| re.IGNORECASE, | ||
| ): | ||
| cleaned_parts.append("{id}") | ||
| elif re.match(r"^\d+$", part): | ||
| cleaned_parts.append("{id}") | ||
| elif re.match(r"^[a-f0-9]{24}$", part, re.IGNORECASE): | ||
| # MongoDB-style ObjectIds | ||
| cleaned_parts.append("{id}") | ||
| else: | ||
| cleaned_parts.append(part) | ||
| return "/" + "/".join(cleaned_parts) if cleaned_parts else "/" | ||
| return normalize_api_path(path) | ||
|
|
||
| def _record_api_call(self, method: str, path: str, duration_ms: float) -> None: | ||
| """Record an API call metric.""" | ||
|
|
@@ -125,7 +150,7 @@ | |
| data: Any = None, | ||
| json: Any = None, | ||
| headers: Optional[dict] = None, # noqa: UP045 | ||
| timeout: Optional[Union[float, tuple[float, float]]] = None, # noqa: UP007, UP045 | ||
|
Check warning on line 153 in ingestion/src/metadata/ingestion/connections/source_api_client.py
|
||
| retries: Optional[int] = None, # noqa: UP045 | ||
| ): | ||
| """POST method with tracking.""" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| # Copyright 2025 Collate | ||
| # Licensed under the Collate Community License, Version 1.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| Tests for source API path normalization used by TrackedREST metrics. | ||
|
|
||
| Regression coverage for #29141: opaque identifier segments embedded deep in a path | ||
| (e.g. Sigma's /workbooks/{id}/lineage/elements/{elementId}) must collapse to {id} so | ||
| the cardinality of "source_api_calls" stays bounded. | ||
| """ | ||
|
|
||
| import pytest | ||
|
|
||
| from metadata.ingestion.connections.source_api_client import ( | ||
| _is_id_segment, | ||
| normalize_api_path, | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "segment", | ||
| [ | ||
| "12345", # numeric id | ||
| "3f2a1b4c-1111-2222-3333-444455556666", # uuid (dashed) | ||
| "3f2a1b4c11112222333344445555666a", # uuid (no dashes, 32 hex) | ||
| "507f1f77bcf86cd799439011", # mongo objectid (24 hex) | ||
| "_8j4kP9x", # sigma-style opaque token with digits | ||
| "pg-9f2a1", # hyphenated id with digits | ||
| "elem123abc", # alphanumeric element id | ||
| ], | ||
| ) | ||
| def test_should_treat_token_as_id_when_segment_is_identifier(segment): | ||
| assert _is_id_segment(segment) is True | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "segment", | ||
| [ | ||
| "workbooks", | ||
| "lineage", | ||
| "elements", | ||
| "pages", | ||
| "queries", | ||
| "members", | ||
| "files", | ||
| "auth", | ||
| "token", | ||
| "v1", # api version, not an id | ||
| "v2", | ||
| "ab", # too short, no digit | ||
| ], | ||
| ) | ||
| def test_should_preserve_segment_when_not_an_identifier(segment): | ||
| assert _is_id_segment(segment) is False | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "raw_path,expected", | ||
| [ | ||
| # The Sigma paths that caused the field explosion in #29141 | ||
| ( | ||
| "/workbooks/3f2a1b4c-1111-2222-3333-444455556666/lineage/elements/_8j4kP9x", | ||
| "/workbooks/{id}/lineage/elements/{id}", | ||
| ), | ||
| ( | ||
| "/workbooks/2hQ9xAbc123/pages/pg-9f2a1/elements", | ||
| "/workbooks/{id}/pages/{id}/elements", | ||
| ), | ||
| ("/files/node-12ab", "/files/{id}"), | ||
| ("/members/12345", "/members/{id}"), | ||
| # No identifier segments -> unchanged | ||
| ("/workbooks", "/workbooks"), | ||
| ("/auth/token", "/auth/token"), | ||
| # API version segments are preserved | ||
| ("/v1/workbooks", "/v1/workbooks"), | ||
| # Query strings are stripped | ||
| ("/workbooks?page=2", "/workbooks"), | ||
| # Edge cases | ||
| ("/", "/"), | ||
| ("", "/"), | ||
| ], | ||
| ) | ||
| def test_should_normalize_identifier_segments_when_extracting_api_path(raw_path, expected): | ||
| assert normalize_api_path(raw_path) == expected |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Quality: Opaque-ID regex may collapse legitimate path words containing digits
_OPAQUE_ID_RE = ^(?=.{4,}$)(?=.*\d)[A-Za-z0-9._~-]+$treats any segment of length >=4 that contains at least one digit as an entity ID. While this fixes the cardinality explosion (the goal), it will also collapse static, non-identifier path words that happen to contain a digit — e.g.oauth2,utf8,log4j,s3api,ec2, or version-like words such asv1beta1(the_VERSION_REguard only matches the exact form^v\d+$). Those distinct endpoints will all be rewritten to{id}and merged into the same metric key, reducing the usefulness/granularity ofsource_api_callsfor affected connectors.This does not cause incorrect behavior or the indexing failure (change 2 fully protects the index), so it is minor and largely an accepted trade-off. If finer metric granularity matters, consider tightening the heuristic — e.g. only collapse when the digit ratio/length suggests a token, or exclude segments that are purely
[A-Za-z]+\d+-style versioned resource names — or maintain a small allowlist of known static segments. At minimum, the docstring/comment could note that digit-bearing static words are also collapsed.Was this helpful? React with 👍 / 👎