diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/RdfResourceIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/RdfResourceIT.java index 99d618e35c53..51b126d19cb6 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/RdfResourceIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/RdfResourceIT.java @@ -1,8 +1,15 @@ package org.openmetadata.it.tests; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import java.net.URI; +import java.net.URLEncoder; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; import java.time.Duration; import java.util.List; import org.awaitility.Awaitility; @@ -15,6 +22,7 @@ import org.openmetadata.it.factories.DatabaseSchemaTestFactory; import org.openmetadata.it.factories.DatabaseServiceTestFactory; import org.openmetadata.it.util.RdfTestUtils; +import org.openmetadata.it.util.SdkClients; import org.openmetadata.it.util.TestNamespace; import org.openmetadata.it.util.TestNamespaceExtension; import org.openmetadata.schema.api.configuration.rdf.RdfConfiguration; @@ -23,6 +31,7 @@ import org.openmetadata.schema.entity.data.Table; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; +import org.openmetadata.schema.type.TableConstraint; import org.openmetadata.sdk.fluent.Tables; import org.openmetadata.sdk.fluent.builders.ColumnBuilder; import org.openmetadata.service.rdf.RdfUpdater; @@ -43,6 +52,10 @@ public class RdfResourceIT { private static final String TABLE_RDF_TYPE = "dcat:Dataset"; + private static final String BASE_URI = "https://open-metadata.org/"; + private static final String OM_NS = BASE_URI + "ontology/"; + private static 
final HttpClient HTTP_CLIENT = + HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(30)).build(); @BeforeAll static void enableRdf() { @@ -51,9 +64,9 @@ static void enableRdf() { "RDF is disabled for this test run. Use the RDF test profile to execute RdfResourceIT."); RdfConfiguration rdfConfig = new RdfConfiguration(); rdfConfig.setEnabled(true); - rdfConfig.setBaseUri(java.net.URI.create("https://open-metadata.org/")); + rdfConfig.setBaseUri(URI.create(BASE_URI)); rdfConfig.setStorageType(RdfConfiguration.StorageType.FUSEKI); - rdfConfig.setRemoteEndpoint(java.net.URI.create(TestSuiteBootstrap.getFusekiEndpoint())); + rdfConfig.setRemoteEndpoint(URI.create(TestSuiteBootstrap.getFusekiEndpoint())); rdfConfig.setUsername("admin"); rdfConfig.setPassword("test-admin"); rdfConfig.setDataset("openmetadata"); @@ -179,4 +192,251 @@ void testEntityUpdateInRdf(TestNamespace ns) { .pollInterval(Duration.ofMillis(500)) .untilAsserted(() -> RdfTestUtils.verifyEntityUpdatedInRdf(updated)); } + + // --------------------------------------------------------------------------- + // Phase-1 knowledge-graph fidelity tests (P1.1, P1.2, P1.7, P1.9). + // These exercise the new column / constraint / SHACL / ontology behavior + // against the same Fuseki container used by the tests above. 
+ // --------------------------------------------------------------------------- + + @Test + void testColumnIsNamedRdfResource(TestNamespace ns) { + DatabaseService service = DatabaseServiceTestFactory.createPostgres(ns); + DatabaseSchema schema = DatabaseSchemaTestFactory.createSimple(ns, service); + + CreateTable createRequest = new CreateTable(); + createRequest.setName(ns.prefix("rdfColumnUriTable")); + createRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + createRequest.setColumns( + List.of( + ColumnBuilder.of("id", "BIGINT").primaryKey().build(), + ColumnBuilder.of("email", "VARCHAR").dataLength(255).unique().build())); + + Table table = Tables.create(createRequest); + assertNotNull(table.getId()); + + String idColumnFqn = table.getFullyQualifiedName() + ".id"; + String emailColumnFqn = table.getFullyQualifiedName() + ".email"; + + Awaitility.await() + .atMost(Duration.ofSeconds(15)) + .pollInterval(Duration.ofMillis(500)) + .untilAsserted( + () -> { + assertTrue( + columnIsTypedAsOmColumn(idColumnFqn), + "PK column should be a named om:Column resource at the FQN-derived URI"); + assertTrue( + columnIsTypedAsOmColumn(emailColumnFqn), + "UNIQUE column should be a named om:Column resource at the FQN-derived URI"); + }); + } + + @Test + void testColumnConstraintFlagsInRdf(TestNamespace ns) { + DatabaseService service = DatabaseServiceTestFactory.createPostgres(ns); + DatabaseSchema schema = DatabaseSchemaTestFactory.createSimple(ns, service); + + CreateTable createRequest = new CreateTable(); + createRequest.setName(ns.prefix("rdfConstraintTable")); + createRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + createRequest.setColumns( + List.of( + ColumnBuilder.of("id", "BIGINT").primaryKey().build(), + ColumnBuilder.of("email", "VARCHAR").dataLength(255).unique().build(), + ColumnBuilder.of("country", "VARCHAR").dataLength(2).notNull().build())); + + Table table = Tables.create(createRequest); + String idFqn = table.getFullyQualifiedName() 
+ ".id"; + String emailFqn = table.getFullyQualifiedName() + ".email"; + String countryFqn = table.getFullyQualifiedName() + ".country"; + + Awaitility.await() + .atMost(Duration.ofSeconds(15)) + .pollInterval(Duration.ofMillis(500)) + .untilAsserted( + () -> { + assertTrue( + columnHasBooleanProperty(idFqn, "isPrimaryKey", true), + "Primary-key column should set om:isPrimaryKey true"); + assertTrue( + columnHasBooleanProperty(idFqn, "isNullable", false), + "Primary-key column should set om:isNullable false"); + assertTrue( + columnHasBooleanProperty(idFqn, "isUnique", true), + "Primary-key column should also set om:isUnique true (PKs are unique)"); + assertTrue( + columnHasBooleanProperty(emailFqn, "isUnique", true), + "UNIQUE column should set om:isUnique true"); + assertTrue( + columnHasBooleanProperty(countryFqn, "isNullable", false), + "NOT_NULL column should set om:isNullable false"); + }); + } + + @Test + void testForeignKeyReferencesInRdf(TestNamespace ns) { + DatabaseService service = DatabaseServiceTestFactory.createPostgres(ns); + DatabaseSchema schema = DatabaseSchemaTestFactory.createSimple(ns, service); + + CreateTable customers = new CreateTable(); + customers.setName(ns.prefix("rdfFkCustomers")); + customers.setDatabaseSchema(schema.getFullyQualifiedName()); + customers.setColumns( + List.of( + ColumnBuilder.of("id", "BIGINT").primaryKey().build(), + ColumnBuilder.of("name", "VARCHAR").dataLength(255).build())); + Table customersTable = Tables.create(customers); + + String customerIdFqn = customersTable.getFullyQualifiedName() + ".id"; + + CreateTable orders = new CreateTable(); + orders.setName(ns.prefix("rdfFkOrders")); + orders.setDatabaseSchema(schema.getFullyQualifiedName()); + orders.setColumns( + List.of( + ColumnBuilder.of("order_id", "BIGINT").primaryKey().build(), + ColumnBuilder.of("customer_id", "BIGINT").notNull().build())); + + TableConstraint fk = + new TableConstraint() + 
.withConstraintType(TableConstraint.ConstraintType.FOREIGN_KEY) + .withColumns(List.of("customer_id")) + .withReferredColumns(List.of(customerIdFqn)) + .withRelationshipType(TableConstraint.RelationshipType.MANY_TO_ONE); + orders.setTableConstraints(List.of(fk)); + + Table ordersTable = Tables.create(orders); + + String orderCustomerIdFqn = ordersTable.getFullyQualifiedName() + ".customer_id"; + + Awaitility.await() + .atMost(Duration.ofSeconds(15)) + .pollInterval(Duration.ofMillis(500)) + .untilAsserted( + () -> { + assertTrue( + columnReferencesColumn(orderCustomerIdFqn, customerIdFqn), + "FOREIGN_KEY constraint should produce direct om:references triple between source and referred column"); + assertTrue( + tableHasConstraintOfType(ordersTable.getFullyQualifiedName(), "FOREIGN_KEY"), + "Table should declare a FOREIGN_KEY om:TableConstraint resource"); + }); + } + + @Test + void testOntologyEndpointServesBumpedVersion() throws Exception { + String url = SdkClients.getServerUrl() + "/v1/rdf/ontology"; + HttpRequest request = + HttpRequest.newBuilder() + .uri(URI.create(url)) + .header("Authorization", "Bearer " + SdkClients.getAdminToken()) + .header("Accept", "text/turtle") + .GET() + .build(); + HttpResponse response = HTTP_CLIENT.send(request, HttpResponse.BodyHandlers.ofString()); + assertNotNull(response); + assertTrue( + response.statusCode() == 200, + "GET /v1/rdf/ontology should return 200, got " + response.statusCode()); + String body = response.body(); + assertTrue(body.contains("1.1.0"), "Ontology document should declare the bumped version 1.1.0"); + assertTrue( + body.contains("om:Column") && body.contains("om:TableConstraint"), + "Ontology document should declare core om:Column and om:TableConstraint classes"); + } + + @Test + void testShaclValidateEndpointReturnsReport(TestNamespace ns) throws Exception { + DatabaseService service = DatabaseServiceTestFactory.createPostgres(ns); + DatabaseSchema schema = DatabaseSchemaTestFactory.createSimple(ns, 
service); + + CreateTable createRequest = new CreateTable(); + createRequest.setName(ns.prefix("rdfShaclTable")); + createRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + createRequest.setColumns(List.of(ColumnBuilder.of("id", "BIGINT").primaryKey().build())); + Table table = Tables.create(createRequest); + + String entityUri = BASE_URI + "entity/table/" + table.getId(); + String url = + SdkClients.getServerUrl() + + "/v1/rdf/validate?entityUri=" + + URLEncoder.encode(entityUri, StandardCharsets.UTF_8); + + HttpRequest request = + HttpRequest.newBuilder() + .uri(URI.create(url)) + .header("Authorization", "Bearer " + SdkClients.getAdminToken()) + .header("Accept", "text/turtle") + .POST(HttpRequest.BodyPublishers.noBody()) + .build(); + + Awaitility.await() + .atMost(Duration.ofSeconds(15)) + .pollInterval(Duration.ofSeconds(1)) + .untilAsserted( + () -> { + HttpResponse response = + HTTP_CLIENT.send(request, HttpResponse.BodyHandlers.ofString()); + assertTrue( + response.statusCode() == 200, + "POST /v1/rdf/validate should return 200, got " + response.statusCode()); + String body = response.body(); + assertTrue( + body.contains("ValidationReport") || body.contains("conforms"), + "Response should be a SHACL validation report"); + assertTrue( + response.headers().firstValue("OM-SHACL-Conforms").isPresent(), + "Endpoint should set OM-SHACL-Conforms header"); + }); + } + + // ---- helpers for the fidelity tests ---- + + private boolean columnIsTypedAsOmColumn(String columnFqn) { + String columnUri = + BASE_URI + "entity/column/" + URLEncoder.encode(columnFqn, StandardCharsets.UTF_8); + String sparql = + String.format( + "PREFIX om: <%s> " + + "ASK { GRAPH ?g { <%s> a om:Column ; om:fullyQualifiedName \"%s\" } }", + OM_NS, columnUri, columnFqn); + return RdfTestUtils.executeSparqlAsk(sparql); + } + + private boolean columnHasBooleanProperty(String columnFqn, String predicate, boolean expected) { + String columnUri = + BASE_URI + "entity/column/" + 
URLEncoder.encode(columnFqn, StandardCharsets.UTF_8); + String sparql = + String.format( + "PREFIX om: <%s> " + + "PREFIX xsd: " + + "ASK { GRAPH ?g { <%s> om:%s \"%s\"^^xsd:boolean } }", + OM_NS, columnUri, predicate, expected); + return RdfTestUtils.executeSparqlAsk(sparql); + } + + private boolean columnReferencesColumn(String fromFqn, String toFqn) { + String fromUri = + BASE_URI + "entity/column/" + URLEncoder.encode(fromFqn, StandardCharsets.UTF_8); + String toUri = BASE_URI + "entity/column/" + URLEncoder.encode(toFqn, StandardCharsets.UTF_8); + String sparql = + String.format( + "PREFIX om: <%s> ASK { GRAPH ?g { <%s> om:references <%s> } }", OM_NS, fromUri, toUri); + return RdfTestUtils.executeSparqlAsk(sparql); + } + + private boolean tableHasConstraintOfType(String tableFqn, String constraintType) { + String escaped = tableFqn.replace("\\", "\\\\").replace("\"", "\\\""); + String sparql = + String.format( + "PREFIX om: <%s> " + + "ASK { GRAPH ?g { " + + " ?table om:fullyQualifiedName \"%s\" ; " + + " om:hasConstraint ?c . " + + " ?c om:constraintType \"%s\" . 
" + + "} }", + OM_NS, escaped, constraintType); + return RdfTestUtils.executeSparqlAsk(sparql); + } } diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/DefaultToolContext.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/DefaultToolContext.java index 60f3358cef19..680a50b3dc96 100644 --- a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/DefaultToolContext.java +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/DefaultToolContext.java @@ -135,6 +135,21 @@ public CallToolOutcome callToolWithMetadata( case "create_data_product": result = new CreateDataProductTool().execute(authorizer, limits, securityContext, params); break; + case "sparql_query": + result = new SparqlQueryTool().execute(authorizer, securityContext, params); + break; + case "entity_neighborhood": + result = new EntityNeighborhoodTool().execute(authorizer, securityContext, params); + break; + case "find_by_tag": + result = new FindByTagTool().execute(authorizer, securityContext, params); + break; + case "shacl_validate": + result = new ShaclValidateTool().execute(authorizer, securityContext, params); + break; + case "ontology_describe": + result = new OntologyDescribeTool().execute(authorizer, securityContext, params); + break; default: return new CallToolOutcome( McpSchema.CallToolResult.builder() diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/EntityNeighborhoodTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/EntityNeighborhoodTool.java new file mode 100644 index 000000000000..43c72ab8c6a2 --- /dev/null +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/EntityNeighborhoodTool.java @@ -0,0 +1,211 @@ +package org.openmetadata.mcp.tools; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import lombok.extern.slf4j.Slf4j; +import 
org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.limits.Limits; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Returns the n-hop neighborhood of an entity in the knowledge graph as triples. + * + *

Driven by a SPARQL CONSTRUCT with bounded property paths so depth is enforced even for + * adversarial inputs. Depth is hard-capped at 3 (matches REST graph-explorer). + */ +@Slf4j +public class EntityNeighborhoodTool implements McpTool { + + private static final int MIN_DEPTH = 1; + private static final int MAX_DEPTH = 3; + private static final int DEFAULT_DEPTH = 2; + + @Override + public Map execute( + Authorizer authorizer, CatalogSecurityContext securityContext, Map params) + throws IOException { + String entityId = string(params, "entityId"); + String entityType = string(params, "entityType"); + if (entityId == null || entityId.isBlank()) { + return error("'entityId' parameter is required"); + } + if (entityType == null || entityType.isBlank()) { + return error("'entityType' parameter is required"); + } + try { + UUID.fromString(entityId); + } catch (IllegalArgumentException e) { + return error("'entityId' must be a UUID"); + } + if (!entityType.matches("[A-Za-z][A-Za-z0-9]*")) { + return error("'entityType' must be alphanumeric"); + } + + int depth = clampDepth(intParam(params, "depth", DEFAULT_DEPTH)); + int limit = Math.min(Math.max(intParam(params, "limit", 200), 1), 2000); + + RdfRepository repository = RdfRepository.getInstanceOrNull(); + if (repository == null || !repository.isEnabled()) { + return error("RDF repository is not enabled on this OpenMetadata server"); + } + + String entityUri = repository.getBaseUri() + "entity/" + entityType + "/" + entityId; + + String constructQuery = buildConstructQuery(entityUri, depth, limit); + String triples; + try { + triples = repository.executeSparqlQuery(constructQuery, "text/turtle"); + } catch (Exception e) { + LOG.error("CONSTRUCT for neighborhood failed for {}", entityUri, e); + return error("Neighborhood query failed: " + e.getMessage()); + } + + String selectQuery = buildSelectQuery(entityUri, depth, limit); + String selectJson; + try { + selectJson = repository.executeSparqlQuery(selectQuery, 
"application/sparql-results+json"); + } catch (Exception e) { + LOG.error("SELECT for neighborhood failed for {}", entityUri, e); + selectJson = null; + } + + Map result = new LinkedHashMap<>(); + result.put("entityUri", entityUri); + result.put("depth", depth); + result.put("limit", limit); + result.put("triples", triples == null ? "" : triples); + result.put("edges", parseEdges(selectJson)); + return result; + } + + @Override + public Map execute( + Authorizer authorizer, + Limits limits, + CatalogSecurityContext securityContext, + Map params) { + throw new UnsupportedOperationException( + "EntityNeighborhoodTool does not enforce write limits."); + } + + /** + * Builds a CONSTRUCT query that yields all triples on every path of length 1..depth radiating + * from the entity. Each UNION arm emits exactly one triple template ({@code ?s ?p ?o}), with + * {@code ?s} bound to the *actual* subject of that triple — not the start entity. The previous + * implementation bound {@code ?s = } unconditionally, which collapsed all 2- and + * 3-hop edges onto the start node and produced an incorrect graph. + */ + static String buildConstructQuery(String entityUri, int depth, int limit) { + String e = "<" + entityUri + ">"; + StringBuilder w = new StringBuilder(); + // depth-1 outgoing + w.append(" { BIND(").append(e).append(" AS ?s) ").append(e).append(" ?p ?o }\n"); + // depth-1 incoming + w.append(" UNION { BIND(").append(e).append(" AS ?o) ?s ?p ").append(e).append(" }\n"); + if (depth >= 2) { + // depth-2 outgoing: emit BOTH edges as separate arms + w.append(" UNION { BIND(") + .append(e) + .append(" AS ?s) ") + .append(e) + .append(" ?p ?o . ?o ?p2 ?n2 }\n"); + w.append(" UNION { ").append(e).append(" ?p1 ?s . ?s ?p ?o }\n"); + } + if (depth >= 3) { + // depth-3 outgoing: three edges, three arms + w.append(" UNION { BIND(") + .append(e) + .append(" AS ?s) ") + .append(e) + .append(" ?p ?o . ?o ?p2 ?n2 . ?n2 ?p3 ?n3 }\n"); + w.append(" UNION { ").append(e).append(" ?p1 ?s . 
?s ?p ?o . ?o ?p3 ?n3 }\n"); + w.append(" UNION { ").append(e).append(" ?p1 ?n1 . ?n1 ?p2 ?s . ?s ?p ?o }\n"); + } + return "CONSTRUCT { ?s ?p ?o } WHERE {\n" + w + "} LIMIT " + limit; + } + + static String buildSelectQuery(String entityUri, int depth, int limit) { + return "PREFIX om: \n" + + "SELECT ?direction ?predicate ?neighbor ?neighborLabel WHERE {\n" + + " { BIND('outgoing' AS ?direction) <" + + entityUri + + "> ?predicate ?neighbor }\n" + + " UNION { BIND('incoming' AS ?direction) ?neighbor ?predicate <" + + entityUri + + "> }\n" + + " OPTIONAL { ?neighbor ?neighborLabel }\n" + + "} LIMIT " + + limit; + } + + private static List> parseEdges(String selectJson) { + if (selectJson == null || selectJson.isBlank()) { + return List.of(); + } + try { + Map sparql = JsonUtils.readValue(selectJson, Map.class); + Object results = sparql.get("results"); + if (!(results instanceof Map resultsMap)) return List.of(); + Object bindings = resultsMap.get("bindings"); + if (!(bindings instanceof List rows)) return List.of(); + List> edges = new ArrayList<>(rows.size()); + for (Object row : rows) { + if (!(row instanceof Map r)) continue; + Map edge = new LinkedHashMap<>(); + edge.put("direction", bindingValue(r, "direction")); + edge.put("predicate", bindingValue(r, "predicate")); + edge.put("neighbor", bindingValue(r, "neighbor")); + Object label = bindingValue(r, "neighborLabel"); + if (label != null) { + edge.put("neighborLabel", label); + } + edges.add(edge); + } + return edges; + } catch (Exception e) { + LOG.warn("Failed to parse neighborhood SELECT results: {}", e.getMessage()); + return List.of(); + } + } + + private static Object bindingValue(Map row, String name) { + Object node = row.get(name); + if (!(node instanceof Map nodeMap)) return null; + return nodeMap.get("value"); + } + + private static int clampDepth(int depth) { + return Math.min(Math.max(depth, MIN_DEPTH), MAX_DEPTH); + } + + private static int intParam(Map params, String key, int defaultValue) { + 
Object v = params.get(key); + if (v instanceof Number n) return n.intValue(); + if (v instanceof String s) { + try { + return Integer.parseInt(s); + } catch (NumberFormatException e) { + return defaultValue; + } + } + return defaultValue; + } + + private static String string(Map params, String key) { + Object v = params.get(key); + return v instanceof String s ? s : null; + } + + private static Map error(String message) { + Map result = new HashMap<>(); + result.put("error", message); + return result; + } +} diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/FindByTagTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/FindByTagTool.java new file mode 100644 index 000000000000..824f70544139 --- /dev/null +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/FindByTagTool.java @@ -0,0 +1,168 @@ +package org.openmetadata.mcp.tools; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.limits.Limits; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Finds entities tagged with a given tag or glossary term FQN. + * + *

Resolves the tag/glossary URI from the FQN, then runs a SELECT for everything connected + * via {@code om:hasTag} or {@code om:hasGlossaryTerm}. Results include the entity URI, its FQN, + * its rdf:type, and its label. + */ +@Slf4j +public class FindByTagTool implements McpTool { + + @Override + public Map execute( + Authorizer authorizer, CatalogSecurityContext securityContext, Map params) + throws IOException { + String tagFqn = string(params, "tagFqn"); + if (tagFqn == null || tagFqn.isBlank()) { + return error("'tagFqn' parameter is required"); + } + if (tagFqn.contains("\"") || tagFqn.contains("\\") || tagFqn.contains("\n")) { + return error("'tagFqn' contains illegal characters"); + } + int limit = Math.min(Math.max(intParam(params, "limit", 50), 1), 500); + int offset = Math.max(intParam(params, "offset", 0), 0); + String entityType = string(params, "entityType"); + if (entityType != null && !entityType.matches("[A-Za-z][A-Za-z0-9]*")) { + return error("'entityType' must be alphanumeric"); + } + + RdfRepository repository = RdfRepository.getInstanceOrNull(); + if (repository == null || !repository.isEnabled()) { + return error("RDF repository is not enabled on this OpenMetadata server"); + } + + String sparql = buildSparql(tagFqn, entityType, limit, offset); + String json; + try { + json = repository.executeSparqlQuery(sparql, "application/sparql-results+json"); + } catch (Exception e) { + LOG.error("find_by_tag SPARQL failed for {}", tagFqn, e); + return error("SPARQL execution failed: " + e.getMessage()); + } + + List> entities = parseRows(json); + Map result = new LinkedHashMap<>(); + result.put("tagFqn", tagFqn); + result.put("entityTypeFilter", entityType); + result.put("limit", limit); + result.put("offset", offset); + result.put("results", entities); + result.put("returnedCount", entities.size()); + return result; + } + + @Override + public Map execute( + Authorizer authorizer, + Limits limits, + CatalogSecurityContext securityContext, + Map params) { 
+ throw new UnsupportedOperationException("FindByTagTool does not enforce write limits."); + } + + static String buildSparql(String tagFqn, String entityType, int limit, int offset) { + String escapedFqn = tagFqn.replace("\"", "\\\""); + StringBuilder sb = new StringBuilder(); + // Match either a Tag (om:tagFQN) or a GlossaryTerm (om:fullyQualifiedName) — the input FQN + // can be either. GlossaryTerms in OM RDF do not carry om:tagFQN, so without this UNION the + // tool silently returned zero results for any glossary FQN. + sb.append("PREFIX om: \n") + .append("PREFIX rdfs: \n") + .append("SELECT DISTINCT ?entity ?entityType ?fqn ?label WHERE {\n") + .append(" { ?tag om:tagFQN \"") + .append(escapedFqn) + .append("\" }\n") + .append(" UNION { ?tag om:fullyQualifiedName \"") + .append(escapedFqn) + .append("\" }\n") + .append(" { ?entity om:hasTag ?tag } UNION { ?entity om:hasGlossaryTerm ?tag }\n") + .append(" ?entity a ?entityType .\n") + .append(" OPTIONAL { ?entity om:fullyQualifiedName ?fqn }\n") + .append(" OPTIONAL { ?entity rdfs:label ?label }\n"); + if (entityType != null && !entityType.isBlank()) { + String capitalized = Character.toUpperCase(entityType.charAt(0)) + entityType.substring(1); + sb.append(" FILTER(?entityType = )\n"); + } + sb.append("} ORDER BY ?fqn LIMIT ").append(limit).append(" OFFSET ").append(offset); + return sb.toString(); + } + + private static List> parseRows(String selectJson) { + if (selectJson == null || selectJson.isBlank()) { + return List.of(); + } + try { + Map sparql = JsonUtils.readValue(selectJson, Map.class); + Object results = sparql.get("results"); + if (!(results instanceof Map resultsMap)) return List.of(); + Object bindings = resultsMap.get("bindings"); + if (!(bindings instanceof List rows)) return List.of(); + List> entities = new ArrayList<>(rows.size()); + for (Object row : rows) { + if (!(row instanceof Map r)) continue; + Map entity = new LinkedHashMap<>(); + Object e = bindingValue(r, "entity"); + if (e == 
null) continue; + entity.put("entity", e); + Object t = bindingValue(r, "entityType"); + if (t != null) entity.put("entityType", t); + Object fqn = bindingValue(r, "fqn"); + if (fqn != null) entity.put("fullyQualifiedName", fqn); + Object label = bindingValue(r, "label"); + if (label != null) entity.put("label", label); + entities.add(entity); + } + return entities; + } catch (Exception e) { + LOG.warn("Failed to parse find_by_tag SELECT results: {}", e.getMessage()); + return List.of(); + } + } + + private static Object bindingValue(Map row, String name) { + Object node = row.get(name); + if (!(node instanceof Map nodeMap)) return null; + return nodeMap.get("value"); + } + + private static String string(Map params, String key) { + Object v = params.get(key); + return v instanceof String s ? s : null; + } + + private static int intParam(Map params, String key, int defaultValue) { + Object v = params.get(key); + if (v instanceof Number n) return n.intValue(); + if (v instanceof String s) { + try { + return Integer.parseInt(s); + } catch (NumberFormatException e) { + return defaultValue; + } + } + return defaultValue; + } + + private static Map error(String message) { + Map result = new HashMap<>(); + result.put("error", message); + return result; + } +} diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/OntologyDescribeTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/OntologyDescribeTool.java new file mode 100644 index 000000000000..302b91f9fa17 --- /dev/null +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/OntologyDescribeTool.java @@ -0,0 +1,116 @@ +package org.openmetadata.mcp.tools; + +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.limits.Limits; +import org.openmetadata.service.rdf.RdfIriValidator; +import org.openmetadata.service.rdf.RdfRepository; +import 
org.openmetadata.service.resources.rdf.OntologyDocument; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Returns either the entire OpenMetadata ontology or a focused DESCRIBE for a single class / + * property URI. + * + *

The full ontology is served from the bundled classpath resource (no triplestore round + * trip). A focused DESCRIBE goes through the triplestore so it picks up any side-ontology + * extensions registered there. + */ +@Slf4j +public class OntologyDescribeTool implements McpTool { + + @Override + public Map execute( + Authorizer authorizer, CatalogSecurityContext securityContext, Map params) + throws IOException { + String resource = string(params, "resource"); + String format = normalizeFormat(string(params, "format")); + + if (resource == null || resource.isBlank()) { + OntologyDocument.SerializedOntology serialized = OntologyDocument.serializeAsString(format); + Map result = new LinkedHashMap<>(); + result.put("scope", "full-ontology"); + result.put("format", format); + result.put("mediaType", serialized.mediaType()); + result.put("body", serialized.body()); + return result; + } + + String validatedResource = RdfIriValidator.validateEntityIri(resource); + if (validatedResource == null) { + return error( + "'resource' must be a valid absolute http(s) IRI (no whitespace, control characters," + + " angle brackets, or quotes)"); + } + + RdfRepository repository = RdfRepository.getInstanceOrNull(); + if (repository == null || !repository.isEnabled()) { + return error("RDF repository is not enabled; cannot DESCRIBE individual ontology resources"); + } + + String describe = "DESCRIBE <" + validatedResource + ">"; + String mime = formatMime(format); + String body; + try { + body = repository.executeSparqlQueryDirect(describe, mime); + } catch (Exception e) { + LOG.error("Ontology DESCRIBE failed for {}", validatedResource, e); + return error("DESCRIBE failed: " + e.getMessage()); + } + + Map result = new LinkedHashMap<>(); + result.put("scope", "describe"); + result.put("resource", validatedResource); + result.put("format", format); + result.put("mediaType", mime); + result.put("body", body == null ? 
"" : body); + return result; + } + + @Override + public Map execute( + Authorizer authorizer, + Limits limits, + CatalogSecurityContext securityContext, + Map params) { + throw new UnsupportedOperationException("OntologyDescribeTool does not enforce write limits."); + } + + private static String normalizeFormat(String format) { + if (format == null) return "turtle"; + return switch (format.toLowerCase()) { + case "jsonld", "json-ld", "ld+json" -> "jsonld"; + case "rdfxml", "rdf+xml", "rdf/xml" -> "rdfxml"; + case "ntriples", "n-triples" -> "ntriples"; + default -> "turtle"; + }; + } + + /** + * Maps a normalised format name to the SPARQL-accept MIME type the triplestore expects. Every + * format returned by {@link #normalizeFormat} must round-trip through here. + */ + private static String formatMime(String format) { + return switch (format) { + case "jsonld" -> "application/ld+json"; + case "rdfxml" -> "application/rdf+xml"; + case "ntriples" -> "application/n-triples"; + default -> "text/turtle"; + }; + } + + private static String string(Map params, String key) { + Object v = params.get(key); + return v instanceof String s ? 
s : null; + } + + private static Map error(String message) { + Map result = new HashMap<>(); + result.put("error", message); + return result; + } +} diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/ShaclValidateTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/ShaclValidateTool.java new file mode 100644 index 000000000000..e33421f4233a --- /dev/null +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/ShaclValidateTool.java @@ -0,0 +1,153 @@ +package org.openmetadata.mcp.tools; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.UUID; +import lombok.extern.slf4j.Slf4j; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.riot.RDFFormat; +import org.apache.jena.shacl.ValidationReport; +import org.openmetadata.service.limits.Limits; +import org.openmetadata.service.rdf.RdfIriValidator; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.resources.rdf.RdfShaclValidator; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Runs SHACL validation against either a single entity's subgraph or the entire dataset and + * returns the resulting validation report. + * + *

If {@code entityId} + {@code entityType} (or {@code entityUri}) is supplied, the tool runs + * {@code DESCRIBE <entityUri>} first and validates only that subgraph. Otherwise, and only when + * the caller explicitly passes {@code fullGraph=true}, it pulls everything and validates the + * whole graph (admin-style usage; expensive). Without that flag an unscoped call returns an error. + */ +@Slf4j +public class ShaclValidateTool implements McpTool { + + @Override + public Map execute( + Authorizer authorizer, CatalogSecurityContext securityContext, Map params) + throws IOException { + RdfRepository repository = RdfRepository.getInstanceOrNull(); + if (repository == null || !repository.isEnabled()) { + return error("RDF repository is not enabled on this OpenMetadata server"); + } + + String entityUri = resolveEntityUri(params, repository.getBaseUri()); + if (entityUri != null && entityUri.startsWith("error:")) { + return error(entityUri.substring("error:".length())); + } + + if (entityUri == null && !Boolean.TRUE.equals(params.get("fullGraph"))) { + return error( + "Full-graph SHACL validation must be explicitly enabled by passing fullGraph=true. " + + "It loads the entire triplestore into memory and can OOM the MCP server. Prefer " + + "passing entityId+entityType (or entityUri) to scope the check."); + } + + String constructQuery = + entityUri != null + ? "DESCRIBE <" + entityUri + ">" + : "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }"; + + String dataTurtle; + try { + dataTurtle = repository.executeSparqlQueryDirect(constructQuery, "text/turtle"); + } catch (Exception e) { + LOG.error("Subgraph fetch for SHACL failed", e); + return error("Failed to fetch subgraph: " + e.getMessage()); + } + + Model dataModel = ModelFactory.createDefaultModel(); + try (StringReader reader = new StringReader(dataTurtle == null ? 
"" : dataTurtle)) { + RDFDataMgr.read(dataModel, reader, repository.getBaseUri(), Lang.TURTLE); + } catch (Exception e) { + LOG.error("Failed to parse subgraph for SHACL validation", e); + return error("Failed to parse subgraph: " + e.getMessage()); + } + + ValidationReport report = RdfShaclValidator.validate(dataModel); + String format = normalizeFormat(string(params, "format")); + RDFFormat rdfFormat = + "jsonld".equals(format) ? RDFFormat.JSONLD_PRETTY : RDFFormat.TURTLE_PRETTY; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + RDFDataMgr.write(out, report.getModel(), rdfFormat); + + Map result = new LinkedHashMap<>(); + result.put("scope", entityUri == null ? "full-graph" : "entity"); + if (entityUri != null) { + result.put("entityUri", entityUri); + } + result.put("conforms", report.conforms()); + long violationCount = report.getEntries() == null ? 0 : report.getEntries().stream().count(); + result.put("violationCount", violationCount); + result.put("format", format); + result.put("report", out.toString(StandardCharsets.UTF_8)); + return result; + } + + @Override + public Map execute( + Authorizer authorizer, + Limits limits, + CatalogSecurityContext securityContext, + Map params) { + throw new UnsupportedOperationException("ShaclValidateTool does not enforce write limits."); + } + + private static String resolveEntityUri(Map params, String baseUri) { + String entityUri = string(params, "entityUri"); + if (entityUri != null && !entityUri.isBlank()) { + String validated = RdfIriValidator.validateEntityIri(entityUri); + if (validated == null) { + return "error:'entityUri' must be a valid absolute http(s) IRI"; + } + return validated; + } + String entityId = string(params, "entityId"); + String entityType = string(params, "entityType"); + if (entityId == null && entityType == null) { + return null; // full-graph scope + } + if (entityId == null || entityType == null) { + return "error:Both 'entityId' and 'entityType' are required when scoping by 
entity, or omit both for full-graph scope"; + } + try { + UUID.fromString(entityId); + } catch (IllegalArgumentException e) { + return "error:'entityId' must be a UUID"; + } + if (!entityType.matches("[A-Za-z][A-Za-z0-9]*")) { + return "error:'entityType' must be alphanumeric"; + } + return baseUri + "entity/" + entityType + "/" + entityId; + } + + private static String normalizeFormat(String format) { + if (format == null) return "turtle"; + return switch (format.toLowerCase()) { + case "jsonld", "json-ld", "ld+json" -> "jsonld"; + default -> "turtle"; + }; + } + + private static String string(Map params, String key) { + Object v = params.get(key); + return v instanceof String s ? s : null; + } + + private static Map error(String message) { + Map result = new HashMap<>(); + result.put("error", message); + return result; + } +} diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SparqlQueryTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SparqlQueryTool.java new file mode 100644 index 000000000000..f1dfaf0bd8c4 --- /dev/null +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SparqlQueryTool.java @@ -0,0 +1,181 @@ +package org.openmetadata.mcp.tools; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryException; +import org.apache.jena.query.QueryFactory; +import org.openmetadata.service.limits.Limits; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.rdf.federation.SparqlFederationGuard; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Read-only SPARQL query tool for AI agents. Wraps {@code RdfRepository.executeSparqlQuery}. + * + *

This tool is deliberately the only path through which an MCP client can talk SPARQL. + * Hardening: + * + *

    + *
  • Rejects SPARQL UPDATE / INSERT / DELETE / LOAD / CLEAR / DROP / CREATE — only + * SELECT, ASK, DESCRIBE, CONSTRUCT pass the gate. + *
  • Federation: enforces the same allowlist as REST, so an MCP client cannot bypass it. + *
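  • Result size: caps the returned body at {@code maxBytes} (default 1 MiB, clamped to the range + * 1 KiB–16 MiB) so that a huge result is not passed into the agent context. The cap is applied + * after the full result has been fetched from the triplestore, so it limits what the agent + * receives rather than what the server reads. + *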
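  • Format: accepts the SPARQL result formats (json, xml, csv, tsv) and the RDF graph formats + * (turtle, rdfxml, ntriples, jsonld); any other value silently falls back to json. + *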
+ */ +@Slf4j +public class SparqlQueryTool implements McpTool { + + private static final int DEFAULT_MAX_BYTES = 1 * 1024 * 1024; + private static final int HARD_MAX_BYTES = 16 * 1024 * 1024; + + @Override + public Map execute( + Authorizer authorizer, CatalogSecurityContext securityContext, Map params) + throws IOException { + String query = string(params, "query"); + if (query == null || query.isBlank()) { + return error("'query' parameter is required"); + } + + Query parsed; + try { + parsed = QueryFactory.create(query); + } catch (QueryException e) { + return error("SPARQL parse error: " + e.getMessage()); + } + if (!isReadOnly(parsed)) { + return error( + "Only read-only SPARQL queries (SELECT, ASK, DESCRIBE, CONSTRUCT) are allowed via this tool. Use the admin REST endpoint for SPARQL UPDATE."); + } + + RdfRepository repository = RdfRepository.getInstanceOrNull(); + if (repository == null || !repository.isEnabled()) { + return error("RDF repository is not enabled on this OpenMetadata server"); + } + + try { + new SparqlFederationGuard(repository.getConfig()).enforce(query); + } catch (SparqlFederationGuard.FederationDisallowedException e) { + return error(e.getMessage()); + } + + String format = normalizeFormat(string(params, "format")); + String mimeType = mimeFor(format); + String inferenceLevel = string(params, "inferenceLevel"); + + int maxBytes = clamp(intParam(params, "maxBytes", DEFAULT_MAX_BYTES), 1024, HARD_MAX_BYTES); + + String body; + try { + body = + inferenceLevel != null + && !inferenceLevel.isBlank() + && !"none".equalsIgnoreCase(inferenceLevel) + ? 
repository.executeSparqlQueryWithInference(query, mimeType, inferenceLevel) + : repository.executeSparqlQuery(query, mimeType); + } catch (Exception e) { + LOG.error("SPARQL query execution failed", e); + return error("SPARQL execution failed: " + e.getMessage()); + } + + Map result = new HashMap<>(); + result.put("format", format); + result.put("queryType", parsed.queryType().toString()); + if (body == null) { + result.put("body", ""); + result.put("truncated", false); + result.put("byteCount", 0); + return result; + } + byte[] bytes = body.getBytes(java.nio.charset.StandardCharsets.UTF_8); + boolean truncated = bytes.length > maxBytes; + if (truncated) { + // Truncate by bytes (not chars). Multi-byte UTF-8 sequences must not be split mid-rune, + // so back off until we land on the start of a code point (top bits != 10xxxxxx). + int cut = maxBytes; + while (cut > 0 && (bytes[cut] & 0xC0) == 0x80) { + cut--; + } + result.put("body", new String(bytes, 0, cut, java.nio.charset.StandardCharsets.UTF_8)); + } else { + result.put("body", body); + } + result.put("truncated", truncated); + result.put("byteCount", bytes.length); + return result; + } + + @Override + public Map execute( + Authorizer authorizer, + Limits limits, + CatalogSecurityContext securityContext, + Map params) { + throw new UnsupportedOperationException("SparqlQueryTool does not enforce write limits."); + } + + private static boolean isReadOnly(Query query) { + return query.isSelectType() + || query.isAskType() + || query.isDescribeType() + || query.isConstructType(); + } + + private static String normalizeFormat(String format) { + if (format == null || format.isBlank()) { + return "json"; + } + return switch (format.toLowerCase()) { + case "json", "xml", "csv", "tsv", "turtle", "rdfxml", "ntriples", "jsonld" -> format + .toLowerCase(); + default -> "json"; + }; + } + + private static String mimeFor(String format) { + return switch (format) { + case "xml" -> "application/sparql-results+xml"; + case 
"csv" -> "text/csv"; + case "tsv" -> "text/tab-separated-values"; + case "turtle" -> "text/turtle"; + case "rdfxml" -> "application/rdf+xml"; + case "ntriples" -> "application/n-triples"; + case "jsonld" -> "application/ld+json"; + default -> "application/sparql-results+json"; + }; + } + + private static String string(Map params, String key) { + Object v = params.get(key); + return v instanceof String s ? s : null; + } + + private static int intParam(Map params, String key, int defaultValue) { + Object v = params.get(key); + if (v instanceof Number n) return n.intValue(); + if (v instanceof String s) { + try { + return Integer.parseInt(s); + } catch (NumberFormatException e) { + return defaultValue; + } + } + return defaultValue; + } + + private static int clamp(int v, int lo, int hi) { + return Math.min(Math.max(v, lo), hi); + } + + private static Map error(String message) { + Map result = new HashMap<>(); + result.put("error", message); + return result; + } +} diff --git a/openmetadata-mcp/src/main/resources/json/data/mcp/tools.json b/openmetadata-mcp/src/main/resources/json/data/mcp/tools.json index 9de66dc2a683..13442257d006 100644 --- a/openmetadata-mcp/src/main/resources/json/data/mcp/tools.json +++ b/openmetadata-mcp/src/main/resources/json/data/mcp/tools.json @@ -860,6 +860,135 @@ "entityType" ] } + }, + { + "name": "sparql_query", + "description": "Run a read-only SPARQL query (SELECT, ASK, DESCRIBE, CONSTRUCT) against the OpenMetadata knowledge graph. Use this when the question is graph-shaped — e.g. 'all tables with FK references into customers.id', 'all upstream tables of dashboard X', 'columns tagged PII whose null count > 0'. UPDATE / INSERT / DELETE / DROP / LOAD / CLEAR / CREATE are rejected; use the admin REST endpoint for writes. SERVICE clauses against external endpoints are rejected unless on the federation allowlist. Returns the result body in the chosen format (default 'json' for SELECT/ASK; 'turtle' is recommended for DESCRIBE/CONSTRUCT). 
Body is capped at maxBytes (default 1 MiB).", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The SPARQL query string. Use https://open-metadata.org/ontology/ as the om: prefix." + }, + "format": { + "type": "string", + "description": "Result format. SELECT/ASK: json (default), xml, csv, tsv. CONSTRUCT/DESCRIBE: turtle, jsonld, ntriples, rdfxml.", + "default": "json" + }, + "inferenceLevel": { + "type": "string", + "description": "Optional. Reasoning level applied at query time: none (default), rdfs, owl, custom.", + "default": "none" + }, + "maxBytes": { + "type": "integer", + "description": "Cap on the response body size in bytes. Default 1048576 (1 MiB). Min 1024, max 16777216.", + "default": 1048576 + } + }, + "required": ["query"] + } + }, + { + "name": "entity_neighborhood", + "description": "Return the n-hop neighborhood of an OpenMetadata entity in the knowledge graph as triples plus a flat edges list. Use this when you have a specific entity (UUID + type) and want to see what else is connected to it: hasColumn, belongsToSchema, hasTag, owners, lineage, etc. Depth is bounded at 1-3 hops. Returns Turtle for the triples and a structured edges array (direction, predicate, neighbor URI, neighbor label).", + "parameters": { + "type": "object", + "properties": { + "entityId": { + "type": "string", + "description": "UUID of the entity (the 'id' field on the entity)." + }, + "entityType": { + "type": "string", + "description": "Entity type singular (table, dashboard, pipeline, glossaryTerm, ...)." + }, + "depth": { + "type": "integer", + "description": "Hop depth, 1-3. Default 2.", + "default": 2 + }, + "limit": { + "type": "integer", + "description": "Maximum triples returned. Default 200, max 2000.", + "default": 200 + } + }, + "required": ["entityId", "entityType"] + } + }, + { + "name": "find_by_tag", + "description": "Find all entities tagged with a given classification tag or glossary term FQN. 
Walks om:hasTag and om:hasGlossaryTerm. Use this for 'show me everything classified PII.Sensitive' or 'all assets associated with the BusinessTerms.Customer glossary term'. Optionally filter by entity type. Paginated.", + "parameters": { + "type": "object", + "properties": { + "tagFqn": { + "type": "string", + "description": "FQN of the tag or glossary term (e.g. 'PII.Sensitive', 'BusinessTerms.Customer')." + }, + "entityType": { + "type": "string", + "description": "Optional. Restrict results to a single entity type (e.g. 'table')." + }, + "limit": { + "type": "integer", + "description": "Maximum results. Default 50, max 500.", + "default": 50 + }, + "offset": { + "type": "integer", + "description": "Page offset. Default 0.", + "default": 0 + } + }, + "required": ["tagFqn"] + } + }, + { + "name": "shacl_validate", + "description": "Run SHACL validation against either a single entity's subgraph (scoped via entityId+entityType or entityUri) or the entire dataset. Whole-dataset validation runs only when fullGraph=true is passed explicitly, because it loads the entire triplestore into memory; without a scope or that flag the tool returns an error. Returns conforms (boolean), violationCount, and the SHACL ValidationReport in the requested RDF format. Read-only; never blocks writes. Use this to debug data-quality issues before mutating the graph.", + "parameters": { + "type": "object", + "properties": { + "entityId": { + "type": "string", + "description": "Optional. UUID of the entity to scope validation to." + }, + "entityType": { + "type": "string", + "description": "Optional. Entity type singular. Required if entityId is provided." + }, + "entityUri": { + "type": "string", + "description": "Optional alternative to entityId+entityType: full URI of the entity." + }, + "format": { + "type": "string", + "description": "Report format: turtle (default) or jsonld.", + "default": "turtle" + } + } + } + }, + { + "name": "ontology_describe", + "description": "Return the OpenMetadata ontology — either the full canonical ontology (when 'resource' is omitted) or a SPARQL DESCRIBE for a single class or property URI. 
Use this to discover available classes, properties, ranges, and domains the agent can query. Format: turtle (default), jsonld, ntriples, rdfxml.", + "parameters": { + "type": "object", + "properties": { + "resource": { + "type": "string", + "description": "Optional. Full URI of the class or property to DESCRIBE (e.g. 'https://open-metadata.org/ontology/Column'). Omit to return the full ontology." + }, + "format": { + "type": "string", + "description": "Output format. Default 'turtle'.", + "default": "turtle" + } + } + } } ] } diff --git a/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/EntityNeighborhoodToolTest.java b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/EntityNeighborhoodToolTest.java new file mode 100644 index 000000000000..6aa44c932632 --- /dev/null +++ b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/EntityNeighborhoodToolTest.java @@ -0,0 +1,179 @@ +package org.openmetadata.mcp.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Map; +import java.util.UUID; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +class EntityNeighborhoodToolTest { + + private static final Authorizer AUTHORIZER = mock(Authorizer.class); + private static final CatalogSecurityContext SEC = mock(CatalogSecurityContext.class); + + @Test + @DisplayName("Missing entityId is 
rejected") + void missingEntityId() throws IOException { + Map result = + new EntityNeighborhoodTool().execute(AUTHORIZER, SEC, Map.of("entityType", "table")); + assertEquals("'entityId' parameter is required", result.get("error")); + } + + @Test + @DisplayName("Missing entityType is rejected") + void missingEntityType() throws IOException { + Map result = + new EntityNeighborhoodTool() + .execute(AUTHORIZER, SEC, Map.of("entityId", UUID.randomUUID().toString())); + assertEquals("'entityType' parameter is required", result.get("error")); + } + + @Test + @DisplayName("Non-UUID entityId is rejected with a clean error") + void nonUuidEntityIdRejected() throws IOException { + Map result = + new EntityNeighborhoodTool() + .execute(AUTHORIZER, SEC, Map.of("entityId", "not-a-uuid", "entityType", "table")); + assertEquals("'entityId' must be a UUID", result.get("error")); + } + + @Test + @DisplayName("Non-alphanumeric entityType is rejected (defends against URI injection)") + void badEntityTypeRejected() throws IOException { + Map result = + new EntityNeighborhoodTool() + .execute( + AUTHORIZER, + SEC, + Map.of("entityId", UUID.randomUUID().toString(), "entityType", "table> ; DROP --")); + assertEquals("'entityType' must be alphanumeric", result.get("error")); + } + + @Test + @DisplayName("RDF disabled returns service-unavailable error") + void rdfDisabled() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(false); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new EntityNeighborhoodTool() + .execute( + AUTHORIZER, + SEC, + Map.of("entityId", UUID.randomUUID().toString(), "entityType", "table")); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("Successful call returns triples + edges + clamped depth") + void successfulCall() throws IOException { + try (MockedStatic mocked = 
mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + when(repo.executeSparqlQuery(anyString(), org.mockito.ArgumentMatchers.eq("text/turtle"))) + .thenReturn( + " ."); + when(repo.executeSparqlQuery( + anyString(), org.mockito.ArgumentMatchers.eq("application/sparql-results+json"))) + .thenReturn( + "{\"results\":{\"bindings\":[" + + "{\"direction\":{\"value\":\"outgoing\"},\"predicate\":{\"value\":\"https://open-metadata.org/ontology/hasColumn\"},\"neighbor\":{\"value\":\"urn:c1\"}}" + + "]}}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new EntityNeighborhoodTool() + .execute( + AUTHORIZER, + SEC, + Map.of( + "entityId", + "11111111-1111-1111-1111-111111111111", + "entityType", + "table", + "depth", + 99)); + + assertNull(result.get("error")); + assertEquals(3, result.get("depth"), "Depth must be clamped to 3"); + assertNotNull(result.get("triples")); + @SuppressWarnings("unchecked") + var edges = (java.util.List>) result.get("edges"); + assertEquals(1, edges.size()); + assertEquals("outgoing", edges.get(0).get("direction")); + } + } + + @Test + @DisplayName("buildConstructQuery includes inverse direction (incoming edges)") + void constructQueryIncludesInverse() { + String q = + EntityNeighborhoodTool.buildConstructQuery( + "https://open-metadata.org/entity/table/abc", 2, 100); + // Incoming arm now binds ?o = and matches ?s ?p + assertTrue(q.contains("?s ?p ")); + } + + @Test + @DisplayName("buildConstructQuery respects depth") + void constructQueryDepthBounds() { + String d1 = EntityNeighborhoodTool.buildConstructQuery("urn:e", 1, 100); + String d3 = EntityNeighborhoodTool.buildConstructQuery("urn:e", 3, 100); + // depth-1: only outgoing + incoming arms (split by UNION = 2 pieces + 1 = 3) + assertTrue(d1.split("UNION").length <= 3, "Depth 1 should not contain 2/3-hop unions"); 
+ assertTrue(d3.contains("?n3"), "Depth 3 must include the 3-hop chain variable"); + } + + @Test + @DisplayName("buildConstructQuery preserves real subjects on multi-hop arms") + void constructQueryPreservesMultiHopSubjects() { + String q = EntityNeighborhoodTool.buildConstructQuery("urn:e", 2, 100); + // The 2-hop "second-edge" arm must NOT bind ?s to the start entity — it must let the + // intermediate node be ?s so the emitted triple is faithful to the real graph. + assertTrue( + q.contains(" ?p1 ?s . ?s ?p ?o"), + "Depth-2 second-edge arm must bind ?s to the intermediate node, not "); + } + + @Test + @DisplayName("Repository throws → tool returns clean error") + void repositoryThrows() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenThrow(new RuntimeException("boom")); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new EntityNeighborhoodTool() + .execute( + AUTHORIZER, + SEC, + Map.of( + "entityId", + "22222222-2222-2222-2222-222222222222", + "entityType", + "pipeline")); + assertTrue(((String) result.get("error")).contains("Neighborhood query failed")); + } + } +} diff --git a/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/FindByTagToolTest.java b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/FindByTagToolTest.java new file mode 100644 index 000000000000..544da8d611a7 --- /dev/null +++ b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/FindByTagToolTest.java @@ -0,0 +1,159 @@ +package org.openmetadata.mcp.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static 
org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +class FindByTagToolTest { + + private static final Authorizer AUTHORIZER = mock(Authorizer.class); + private static final CatalogSecurityContext SEC = mock(CatalogSecurityContext.class); + + @Test + @DisplayName("Missing tagFqn is rejected") + void missingTagFqn() throws IOException { + Map result = new FindByTagTool().execute(AUTHORIZER, SEC, Map.of()); + assertEquals("'tagFqn' parameter is required", result.get("error")); + } + + @Test + @DisplayName("tagFqn containing a quote is rejected (defends against SPARQL string injection)") + void illegalQuoteRejected() throws IOException { + Map result = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII\".Sensitive")); + assertEquals("'tagFqn' contains illegal characters", result.get("error")); + } + + @Test + @DisplayName("tagFqn with backslash or newline is rejected") + void illegalControlCharsRejected() throws IOException { + Map result = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII.Sens\nitive")); + assertEquals("'tagFqn' contains illegal characters", result.get("error")); + + Map result2 = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII\\Sensitive")); + assertEquals("'tagFqn' contains illegal characters", result2.get("error")); + } + + @Test + @DisplayName("Non-alphanumeric entityType is rejected") + void badEntityTypeRejected() throws IOException { + Map result = + new FindByTagTool() + 
.execute( + AUTHORIZER, SEC, Map.of("tagFqn", "PII.Sensitive", "entityType", "table OR 1=1")); + assertEquals("'entityType' must be alphanumeric", result.get("error")); + } + + @Test + @DisplayName("RDF disabled returns service-unavailable error") + void rdfDisabled() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(false); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII.Sensitive")); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("Successful call returns parsed entities with FQN, type, label") + void successfulCall() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn( + "{\"results\":{\"bindings\":[" + + "{\"entity\":{\"value\":\"https://open-metadata.org/entity/table/abc\"}," + + " \"entityType\":{\"value\":\"https://open-metadata.org/ontology/Table\"}," + + " \"fqn\":{\"value\":\"svc.db.s.t\"}," + + " \"label\":{\"value\":\"t\"}}" + + "]}}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII.Sensitive")); + assertNull(result.get("error")); + assertEquals(1, result.get("returnedCount")); + @SuppressWarnings("unchecked") + java.util.List> rows = + (java.util.List>) result.get("results"); + assertEquals("svc.db.s.t", rows.get(0).get("fullyQualifiedName")); + } + } + + @Test + @DisplayName( + "buildSparql escapes embedded quotes via the regex earlier; the final query is parameterized correctly") + void buildSparqlContainsEscapedFqn() { + String sparql = FindByTagTool.buildSparql("PII.Sensitive", "table", 50, 0); + 
assertTrue(sparql.contains("\"PII.Sensitive\"")); + assertTrue(sparql.contains("LIMIT 50")); + assertTrue(sparql.contains("OFFSET 0")); + assertTrue(sparql.contains("ontology/Table")); + } + + @Test + @DisplayName("buildSparql matches GlossaryTerm by om:fullyQualifiedName, not only om:tagFQN") + void buildSparqlMatchesGlossaryFqn() { + String sparql = FindByTagTool.buildSparql("MyGlossary.PII", null, 50, 0); + assertTrue( + sparql.contains("om:tagFQN \"MyGlossary.PII\""), "must still match Tags by om:tagFQN"); + assertTrue( + sparql.contains("om:fullyQualifiedName \"MyGlossary.PII\""), + "must also match GlossaryTerms by om:fullyQualifiedName"); + } + + @Test + @DisplayName("Empty result set returns empty list, not error") + void emptyResultSet() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn("{\"results\":{\"bindings\":[]}}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new FindByTagTool().execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII.None")); + assertNull(result.get("error")); + assertEquals(0, result.get("returnedCount")); + } + } + + @Test + @DisplayName("Limit beyond hard max is clamped to 500") + void limitClamped() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn("{\"results\":{\"bindings\":[]}}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new FindByTagTool() + .execute(AUTHORIZER, SEC, Map.of("tagFqn", "PII.None", "limit", 999_999)); + assertEquals(500, result.get("limit")); + } + } +} diff --git 
a/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/OntologyDescribeToolTest.java b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/OntologyDescribeToolTest.java new file mode 100644 index 000000000000..d8e303f587dc --- /dev/null +++ b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/OntologyDescribeToolTest.java @@ -0,0 +1,93 @@ +package org.openmetadata.mcp.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +class OntologyDescribeToolTest { + + private static final Authorizer AUTHORIZER = mock(Authorizer.class); + private static final CatalogSecurityContext SEC = mock(CatalogSecurityContext.class); + + @Test + @DisplayName( + "No 'resource' returns the full ontology from classpath without touching the triplestore") + void fullOntologyServedFromClasspath() throws IOException { + Map result = new OntologyDescribeTool().execute(AUTHORIZER, SEC, Map.of()); + assertNull(result.get("error")); + assertEquals("full-ontology", result.get("scope")); + String body = (String) result.get("body"); + assertNotNull(body); + assertTrue( + body.contains("om:") || body.contains("ontology"), + "Full ontology body must look like RDF, got: " + + body.substring(0, Math.min(200, body.length()))); + } + + @Test + @DisplayName("Non-URI 
'resource' is rejected") + void nonUriResourceRejected() throws IOException { + Map result = + new OntologyDescribeTool().execute(AUTHORIZER, SEC, Map.of("resource", "Column")); + assertEquals( + "'resource' must be an absolute http(s) URI for the class or property", + result.get("error")); + } + + @Test + @DisplayName("RDF disabled while DESCRIBE-ing a single class returns service-unavailable error") + void describeRequiresRdf() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(null); + Map result = + new OntologyDescribeTool() + .execute( + AUTHORIZER, SEC, Map.of("resource", "https://open-metadata.org/ontology/Column")); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("Successful DESCRIBE call returns turtle by default") + void successfulDescribe() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.executeSparqlQueryDirect( + anyString(), org.mockito.ArgumentMatchers.eq("text/turtle"))) + .thenReturn("@prefix om: ."); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new OntologyDescribeTool() + .execute( + AUTHORIZER, SEC, Map.of("resource", "https://open-metadata.org/ontology/Column")); + assertEquals("describe", result.get("scope")); + assertEquals("turtle", result.get("format")); + assertEquals("text/turtle", result.get("mediaType")); + assertNotNull(result.get("body")); + } + } + + @Test + @DisplayName("Format normalizes 'json-ld' to 'jsonld'") + void formatNormalization() throws IOException { + Map result = + new OntologyDescribeTool().execute(AUTHORIZER, SEC, Map.of("format", "json-ld")); + assertEquals("jsonld", result.get("format")); + } +} diff --git a/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/ShaclValidateToolTest.java 
b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/ShaclValidateToolTest.java new file mode 100644 index 000000000000..ab9e98e7bd83 --- /dev/null +++ b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/ShaclValidateToolTest.java @@ -0,0 +1,179 @@ +package org.openmetadata.mcp.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Map; +import java.util.UUID; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +class ShaclValidateToolTest { + + private static final Authorizer AUTHORIZER = mock(Authorizer.class); + private static final CatalogSecurityContext SEC = mock(CatalogSecurityContext.class); + + @Test + @DisplayName("RDF disabled returns service-unavailable error") + void rdfDisabled() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(null); + Map result = new ShaclValidateTool().execute(AUTHORIZER, SEC, Map.of()); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("Non-URI entityUri is rejected") + void nonUriEntityUri() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + 
mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool().execute(AUTHORIZER, SEC, Map.of("entityUri", "not-an-http-uri")); + assertEquals("'entityUri' must be an absolute http(s) URI", result.get("error")); + } + } + + @Test + @DisplayName("entityId without entityType is rejected") + void entityIdWithoutType() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool() + .execute(AUTHORIZER, SEC, Map.of("entityId", UUID.randomUUID().toString())); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("Non-UUID entityId is rejected") + void badEntityId() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool() + .execute(AUTHORIZER, SEC, Map.of("entityId", "abc", "entityType", "table")); + assertEquals("'entityId' must be a UUID", result.get("error")); + } + } + + @Test + @DisplayName("Successful entity-scoped validation reports conforms with violationCount") + void successfulEntityScopedValidation() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + // Conforming subgraph: a typed entity with the required label and FQN. 
+ String turtleSubgraph = + "@prefix om: .\n" + + "@prefix rdfs: .\n" + + " a om:Table ;\n" + + " rdfs:label \"abc\" ;\n" + + " om:fullyQualifiedName \"svc.db.s.abc\" ;\n" + + " om:hasColumn .\n" + + " a om:Column ;\n" + + " om:fullyQualifiedName \"svc.db.s.abc.id\" ."; + when(repo.executeSparqlQueryDirect( + anyString(), org.mockito.ArgumentMatchers.eq("text/turtle"))) + .thenReturn(turtleSubgraph); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool() + .execute( + AUTHORIZER, + SEC, + Map.of( + "entityId", "11111111-1111-1111-1111-111111111111", "entityType", "table")); + assertNull(result.get("error")); + assertEquals("entity", result.get("scope")); + assertNotNull(result.get("conforms")); + assertNotNull(result.get("report")); + } + } + + @Test + @DisplayName("Subgraph that violates a shape returns conforms=false and a non-empty report") + void violationDetected() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + // Bad column lineage: fromColumn is a literal where shape requires om:Column. 
+ String bad = + "@prefix om: .\n" + + " a om:ColumnLineage ;\n" + + " om:fromColumn \"svc.db.s.t.col_a\" ."; + when(repo.executeSparqlQueryDirect(anyString(), anyString())).thenReturn(bad); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool() + .execute(AUTHORIZER, SEC, Map.of("entityUri", "https://open-metadata.org/lin/1")); + assertEquals(false, result.get("conforms")); + assertTrue(((Long) result.get("violationCount")) > 0); + } + } + + @Test + @DisplayName("Empty body from triplestore is handled gracefully") + void emptyBodyHandled() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + when(repo.executeSparqlQueryDirect(anyString(), anyString())).thenReturn(null); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + new ShaclValidateTool() + .execute( + AUTHORIZER, + SEC, + Map.of( + "entityId", "11111111-1111-1111-1111-111111111111", "entityType", "table")); + assertNull(result.get("error")); + assertNotNull(result.get("conforms")); + } + } + + @Test + @DisplayName("No scope params → full-graph validation") + void fullGraphScope() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getBaseUri()).thenReturn("https://open-metadata.org/"); + when(repo.executeSparqlQueryDirect(anyString(), anyString())).thenReturn(""); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = new ShaclValidateTool().execute(AUTHORIZER, SEC, Map.of()); + assertEquals("full-graph", result.get("scope")); + } + } +} diff --git a/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/SparqlQueryToolTest.java 
b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/SparqlQueryToolTest.java new file mode 100644 index 000000000000..2941d37eeb2b --- /dev/null +++ b/openmetadata-mcp/src/test/java/org/openmetadata/mcp/tools/SparqlQueryToolTest.java @@ -0,0 +1,256 @@ +package org.openmetadata.mcp.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.schema.api.configuration.rdf.RdfConfiguration; +import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.auth.CatalogSecurityContext; + +/** + * Failure-mode tests for SparqlQueryTool. Each test names the bad input it is exercising — + * these are the queries we expect adversarial or sloppy MCP clients to send. 
+ */ +class SparqlQueryToolTest { + + private static final Authorizer AUTHORIZER = mock(Authorizer.class); + private static final CatalogSecurityContext SEC = mock(CatalogSecurityContext.class); + + private static SparqlQueryTool newTool() { + return new SparqlQueryTool(); + } + + @Test + @DisplayName("Missing 'query' parameter returns a clean error") + void missingQueryParam() throws IOException { + Map result = newTool().execute(AUTHORIZER, SEC, Map.of()); + assertEquals("'query' parameter is required", result.get("error")); + } + + @Test + @DisplayName("Empty / blank query is rejected") + void blankQueryRejected() throws IOException { + Map result = newTool().execute(AUTHORIZER, SEC, Map.of("query", " ")); + assertNotNull(result.get("error")); + } + + @Test + @DisplayName("Garbage SPARQL surfaces as a parse error rather than a 500") + void garbageQuerySurfacesParseError() throws IOException { + Map result = + newTool().execute(AUTHORIZER, SEC, Map.of("query", "not sparql at all {{}}")); + assertNotNull(result.get("error")); + assertTrue(((String) result.get("error")).startsWith("SPARQL parse error")); + } + + @Test + @DisplayName("INSERT DATA is rejected — only read-only queries allowed via this tool") + void insertDataRejected() throws IOException { + String q = "INSERT DATA { }"; + Map result = newTool().execute(AUTHORIZER, SEC, Map.of("query", q)); + String err = (String) result.get("error"); + // INSERT DATA fails the SPARQL Query parser (it's an update operation), so we surface a + // parse error. Either way, the tool refuses it. 
+ assertNotNull(err); + } + + @Test + @DisplayName("DELETE WHERE is rejected — only read-only queries allowed via this tool") + void deleteRejected() throws IOException { + String q = "DELETE WHERE { ?s ?p ?o }"; + Map result = newTool().execute(AUTHORIZER, SEC, Map.of("query", q)); + assertNotNull(result.get("error")); + } + + @Test + @DisplayName("DROP GRAPH is rejected") + void dropRejected() throws IOException { + String q = "DROP GRAPH "; + Map result = newTool().execute(AUTHORIZER, SEC, Map.of("query", q)); + assertNotNull(result.get("error")); + } + + @Test + @DisplayName("RDF disabled on the server returns a service-unavailable error") + void repositoryDisabled() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(false); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool().execute(AUTHORIZER, SEC, Map.of("query", "SELECT * WHERE { ?s ?p ?o }")); + assertEquals( + "RDF repository is not enabled on this OpenMetadata server", result.get("error")); + } + } + + @Test + @DisplayName("RDF instance missing returns a service-unavailable error") + void repositoryMissing() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(null); + + Map result = + newTool().execute(AUTHORIZER, SEC, Map.of("query", "SELECT * WHERE { ?s ?p ?o }")); + assertNotNull(result.get("error")); + } + } + + @Test + @DisplayName("SERVICE clause to non-allowlisted endpoint is rejected") + void serviceClauseRejected() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + String q = "SELECT * WHERE 
{ SERVICE { ?s ?p ?o } }"; + Map result = newTool().execute(AUTHORIZER, SEC, Map.of("query", q)); + assertNotNull(result.get("error")); + assertTrue(((String) result.get("error")).contains("SERVICE")); + } + } + + @Test + @DisplayName("Successful SELECT returns body, format, queryType, and untruncated metadata") + void successfulSelect() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + when(repo.executeSparqlQuery( + "SELECT * WHERE { ?s ?p ?o } LIMIT 1", "application/sparql-results+json")) + .thenReturn("{\"results\":{\"bindings\":[]}}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool() + .execute(AUTHORIZER, SEC, Map.of("query", "SELECT * WHERE { ?s ?p ?o } LIMIT 1")); + + assertNull(result.get("error")); + assertEquals("json", result.get("format")); + assertEquals("SELECT", result.get("queryType")); + assertFalse((Boolean) result.get("truncated")); + assertNotNull(result.get("body")); + } + } + + @Test + @DisplayName("Result larger than maxBytes is truncated and flagged") + void resultTruncatedWhenOversized() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + String hugeBody = "x".repeat(8_000); + when(repo.executeSparqlQuery( + "SELECT * WHERE { ?s ?p ?o }", "application/sparql-results+json")) + .thenReturn(hugeBody); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool() + .execute( + AUTHORIZER, + SEC, + Map.of("query", "SELECT * WHERE { ?s ?p ?o }", "maxBytes", 2048)); + assertTrue((Boolean) result.get("truncated")); + assertTrue(((String) result.get("body")).length() <= 2048); + assertEquals(8_000, 
result.get("byteCount")); + } + } + + @Test + @DisplayName("Repository throws → tool returns clean error rather than propagating") + void repositoryThrows() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + when(repo.executeSparqlQuery( + org.mockito.ArgumentMatchers.anyString(), org.mockito.ArgumentMatchers.anyString())) + .thenThrow(new RuntimeException("Fuseki connection refused")); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool().execute(AUTHORIZER, SEC, Map.of("query", "ASK { ?s ?p ?o }")); + String err = (String) result.get("error"); + assertNotNull(err); + assertTrue(err.contains("SPARQL execution failed")); + } + } + + @Test + @DisplayName("Inference level 'rdfs' routes through the inference path") + void inferenceLevelRouted() throws IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + when(repo.executeSparqlQueryWithInference( + org.mockito.ArgumentMatchers.anyString(), + org.mockito.ArgumentMatchers.anyString(), + org.mockito.ArgumentMatchers.eq("rdfs"))) + .thenReturn("{}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool() + .execute( + AUTHORIZER, + SEC, + Map.of("query", "SELECT * WHERE { ?s ?p ?o }", "inferenceLevel", "rdfs")); + assertNull(result.get("error")); + org.mockito.Mockito.verify(repo) + .executeSparqlQueryWithInference( + org.mockito.ArgumentMatchers.anyString(), + org.mockito.ArgumentMatchers.anyString(), + org.mockito.ArgumentMatchers.eq("rdfs")); + } + } + + @Test + @DisplayName("Format defaults to json when unspecified or unrecognized") + void formatDefaultsToJson() throws 
IOException { + try (MockedStatic mocked = mockStatic(RdfRepository.class)) { + RdfRepository repo = mock(RdfRepository.class); + when(repo.isEnabled()).thenReturn(true); + when(repo.getConfig()).thenReturn(new RdfConfiguration()); + when(repo.executeSparqlQuery( + org.mockito.ArgumentMatchers.anyString(), + org.mockito.ArgumentMatchers.eq("application/sparql-results+json"))) + .thenReturn("{}"); + mocked.when(RdfRepository::getInstanceOrNull).thenReturn(repo); + + Map result = + newTool() + .execute( + AUTHORIZER, + SEC, + Map.of("query", "SELECT * WHERE { ?s ?p ?o }", "format", "weirdformat")); + assertEquals("json", result.get("format")); + } + } + + @Test + @DisplayName("Limits-aware execute throws — write-tool contract not applicable") + void writeContractNotSupported() { + org.junit.jupiter.api.Assertions.assertThrows( + UnsupportedOperationException.class, + () -> newTool().execute(AUTHORIZER, null, SEC, Map.of("query", "ASK {}"))); + } +} diff --git a/openmetadata-service/pom.xml b/openmetadata-service/pom.xml index 365729fd2e84..a42af58369e3 100644 --- a/openmetadata-service/pom.xml +++ b/openmetadata-service/pom.xml @@ -1061,7 +1061,14 @@ jena-rdfconnection ${jena.version} - + + + org.apache.jena + jena-shacl + ${jena.version} + + org.apache.calcite diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/migration/api/MigrationWorkflow.java b/openmetadata-service/src/main/java/org/openmetadata/service/migration/api/MigrationWorkflow.java index b01fe1dd2f4a..02f30adf0fa3 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/migration/api/MigrationWorkflow.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/migration/api/MigrationWorkflow.java @@ -106,9 +106,11 @@ public void loadMigrations() { public void validateMigrationsForServer() { if (!migrations.isEmpty()) { + List pendingVersions = migrations.stream().map(MigrationProcess::getVersion).toList(); throw new IllegalStateException( - "There are 
pending migrations to be run on the database." - + " Please backup your data and run `./bootstrap/openmetadata-ops.sh migrate`." + "There are pending migrations to be run on the database: " + + pendingVersions + + ". Please backup your data and run `./bootstrap/openmetadata-ops.sh migrate`." + " You can find more information on upgrading OpenMetadata at" + " https://docs.open-metadata.org/deployment/upgrade "); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfIriValidator.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfIriValidator.java new file mode 100644 index 000000000000..47ad159ae848 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfIriValidator.java @@ -0,0 +1,63 @@ +package org.openmetadata.service.rdf; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Locale; + +/** + * Validates user-supplied IRIs before they are interpolated into SPARQL queries. + * + *

SPARQL angle-bracket-delimited IRI references must not contain characters that can escape the + * template (newlines, {@code #} comments, quotes, control characters, etc.). Stripping {@code >} + * alone is not enough — see the {@code DESCRIBE <…>} injection finding from the May 2026 PR + * review. This single utility is shared by every code path that builds a SPARQL DESCRIBE / CLEAR + * GRAPH query around a user-supplied URI so updates only need to land in one place. + * + *

Accepted form: {@code http(s)://host[/path][?query][#fragment]} with no whitespace, no angle + * brackets, no quotes, no backticks, and no control characters. Maximum 2048 characters. + */ +public final class RdfIriValidator { + + /** Soft cap. 2 KiB is well above legitimate OM URIs and keeps logs bounded. */ + static final int MAX_LENGTH = 2048; + + private RdfIriValidator() {} + + /** + * Returns the sanitized IRI when valid, {@code null} otherwise. Trims leading/trailing + * whitespace before validation; the validated form is the trimmed candidate. + */ + public static String validateEntityIri(String raw) { + if (raw == null) { + return null; + } + String candidate = raw.trim(); + if (candidate.isEmpty() || candidate.length() > MAX_LENGTH) { + return null; + } + for (int i = 0; i < candidate.length(); i++) { + char c = candidate.charAt(i); + if (c < 0x20 || c == 0x7F || c == ' ' || c == '<' || c == '>' || c == '"' || c == '\'' + || c == '`') { + return null; + } + } + try { + URI uri = new URI(candidate); + if (!uri.isAbsolute()) { + return null; + } + String scheme = uri.getScheme(); + if (scheme == null) { + return null; + } + String schemeLower = scheme.toLowerCase(Locale.ROOT); + if (!"http".equals(schemeLower) && !"https".equals(schemeLower)) { + return null; + } + } catch (URISyntaxException e) { + return null; + } + return candidate; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfUtils.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfUtils.java index e3dccc64722e..31a132987cc1 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfUtils.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfUtils.java @@ -1,11 +1,13 @@ package org.openmetadata.service.rdf; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.Set; /** * Utility methods for RDF operations */ -public class RdfUtils { +public final class RdfUtils 
{ private static final Set PROV_ACTIVITY_TYPES = Set.of( @@ -44,9 +46,7 @@ public class RdfUtils { "dataproduct", "domain"); - private RdfUtils() { - // Private constructor for utility class - } + private RdfUtils() {} /** * Maps an entity type to its PROV-O class (Entity, Activity, or Agent). @@ -98,7 +98,30 @@ public static String getRdfType(String entityType) { case "policy" -> "om:Policy"; case "dataproduct" -> "dprod:DataProduct"; // W3C Data Product vocabulary case "domain" -> "skos:Collection"; // Organizational grouping + case "persona" -> "om:Persona"; + case "llmmodel" -> "om:LLMModel"; + case "aiapplication" -> "om:AIApplication"; + case "mcpserver" -> "om:McpServer"; + case "agentexecution" -> "om:AgentExecution"; + case "mcpexecution" -> "om:McpExecution"; + case "prompttemplate" -> "om:PromptTemplate"; + case "workflow", "workflowdefinition" -> "om:Workflow"; + case "workflowinstance" -> "om:WorkflowInstance"; + case "automation" -> "om:Automation"; default -> "om:" + entityType.substring(0, 1).toUpperCase() + entityType.substring(1); }; } + + /** + * Mints a stable, FQN-derived URI for a Column resource. Columns are sub-objects of a Table and + * have no UUID, so the FQN is the only universal identifier. The same scheme is used by the + * Table-side mapping (Table om:hasColumn) and by column-level lineage (om:fromColumn / + * om:toColumn) so that SPARQL traversal across both sides resolves to the same resource. 
+ */ + public static String columnUri(String baseUri, String columnFqn) { + if (columnFqn == null || columnFqn.isEmpty()) { + return null; + } + return baseUri + "entity/column/" + URLEncoder.encode(columnFqn, StandardCharsets.UTF_8); + } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyRegistry.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyRegistry.java new file mode 100644 index 000000000000..35eed22b4e9b --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyRegistry.java @@ -0,0 +1,60 @@ +package org.openmetadata.service.rdf.extension; + +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.api.configuration.rdf.CustomOntology; + +/** + * In-memory registry of user-authored ontology extensions. Each extension is keyed by its + * {@code name}. Reads are lock-free; writes are synchronized. + * + *

Persistence is intentionally deferred — admin writes that pass {@link + * CustomOntologyValidator#validate(CustomOntology)} are upserted into this registry, and the + * registry is rebuilt on server restart from any DB-backed store added in a future phase. + */ +@Slf4j +public final class CustomOntologyRegistry { + + private static final CustomOntologyRegistry INSTANCE = new CustomOntologyRegistry(); + + public static CustomOntologyRegistry getInstance() { + return INSTANCE; + } + + // ConcurrentHashMap supports lock-free reads while writes mutate concurrently. Iteration order + // is not preserved; {@link #list()} returns in whatever order ConcurrentHashMap chooses, which + // is acceptable since the only stable contract here is "all current extensions". + private final ConcurrentMap extensions = new ConcurrentHashMap<>(); + + private CustomOntologyRegistry() {} + + /** @return all extensions; iteration order is not guaranteed. */ + public List list() { + return List.copyOf(extensions.values()); + } + + public Optional get(String name) { + return Optional.ofNullable(extensions.get(name)); + } + + /** + * Insert or replace an extension. The caller is responsible for validation; this method does + * none. Returns the previous extension at that name (if any). + */ + public synchronized Optional upsert(CustomOntology extension) { + return Optional.ofNullable(extensions.put(extension.getName(), extension)); + } + + /** @return true if the extension was removed. */ + public synchronized boolean delete(String name) { + return extensions.remove(name) != null; + } + + /** Visible for tests. 
*/ + synchronized void resetForTests() { + extensions.clear(); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyValidator.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyValidator.java new file mode 100644 index 000000000000..f14d88cfb65d --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/extension/CustomOntologyValidator.java @@ -0,0 +1,283 @@ +package org.openmetadata.service.rdf.extension; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.api.configuration.rdf.CustomOntology; +import org.openmetadata.schema.api.configuration.rdf.CustomOntologyClass; +import org.openmetadata.schema.api.configuration.rdf.CustomOntologyProperty; + +/** + * Validates a {@link CustomOntology} extension before it is registered with the server. + * + *

Hard rules — the validator must reject any extension that violates one of these: + * + *

    + *
  1. The extension has a non-blank name and at least one class or property. + *
  2. Every custom class / property URI is in the {@code om-extension:} namespace. The + * canonical {@code om:} namespace is read-only — extensions cannot redefine + * {@code om:Column}, {@code om:Table}, etc. + *
  3. No two classes (or two properties) share the same URI within the same extension. + *
  4. Each class declares at least one parent in {@code subClassOf}; the parent must reference + * a known canonical OpenMetadata class or another class declared in this same extension. + *
  5. The class hierarchy contains no cycles ({@code A → B → A}). + *
  6. Property domains and ObjectProperty ranges reference either a canonical class or a class + * in this extension; DatatypeProperty ranges are {@code xsd:} datatype URIs. + *
+ */ +@Slf4j +public final class CustomOntologyValidator { + + private static final String EXTENSION_NS = "https://open-metadata.org/ontology-extension/"; + private static final String CANONICAL_NS = "https://open-metadata.org/ontology/"; + + /** + * The canonical class URIs that admins are allowed to reference as parents / domains / ranges. + * Pulled from {@code openmetadata-spec/src/main/resources/rdf/ontology/openmetadata.ttl}; the + * list is intentionally small — anything outside it must be a class declared in the same + * extension. + */ + private static final Set KNOWN_CANONICAL_CLASSES = + Set.of( + "Entity", + "DataAsset", + "Service", + "Database", + "DatabaseSchema", + "Table", + "Column", + "TableConstraint", + "Pipeline", + "Topic", + "Dashboard", + "Chart", + "MLModel", + "Container", + "SearchIndex", + "APICollection", + "APIEndpoint", + "Glossary", + "GlossaryTerm", + "Tag", + "Classification", + "Domain", + "DataProduct", + "DataContract", + "Persona", + "User", + "Team", + "LineageDetails", + "ColumnLineage", + "LLMModel", + "AIApplication", + "McpServer", + "AgentExecution", + "PromptTemplate", + "Workflow", + "Automation"); + + private CustomOntologyValidator() {} + + /** @return list of validation errors. Empty list means the extension is valid. */ + public static List validate(CustomOntology extension) { + List errors = new ArrayList<>(); + if (extension == null) { + errors.add("extension must not be null"); + return errors; + } + if (isBlank(extension.getName())) { + errors.add("'name' must not be blank"); + } else if (!extension.getName().matches("^[a-z][a-z0-9-]{1,62}[a-z0-9]$")) { + errors.add( + "'name' must be 3-64 chars, lowercase letters / digits / hyphen, start with a letter"); + } + List classes = + extension.getClasses() == null ? List.of() : extension.getClasses(); + List properties = + extension.getProperties() == null ? 
List.of() : extension.getProperties(); + if (classes.isEmpty() && properties.isEmpty()) { + errors.add("extension must declare at least one class or property"); + } + + Set classUris = new HashSet<>(); + for (CustomOntologyClass cls : classes) { + validateClass(cls, classUris, errors); + } + + Set propertyUris = new HashSet<>(); + for (CustomOntologyProperty prop : properties) { + validateProperty(prop, propertyUris, classUris, errors); + } + + detectClassHierarchyCycles(classes, errors); + + return errors; + } + + /** @return true if validation passed; false otherwise (errors are logged at WARN). */ + public static boolean isValid(CustomOntology extension) { + List errors = validate(extension); + if (!errors.isEmpty()) { + LOG.warn( + "Custom ontology '{}' failed validation: {}", + extension == null ? "" : extension.getName(), + errors); + } + return errors.isEmpty(); + } + + private static void validateClass( + CustomOntologyClass cls, Set seenUris, List errors) { + if (cls == null) { + errors.add("null class entry"); + return; + } + if (cls.getUri() == null || cls.getUri().isBlank()) { + errors.add("class missing 'uri'"); + return; + } + if (!isExtensionUri(cls.getUri())) { + errors.add( + "class URI '" + + cls.getUri() + + "' must be in the om-extension namespace (" + + EXTENSION_NS + + "); the canonical om: namespace is read-only"); + } + if (!seenUris.add(cls.getUri())) { + errors.add("duplicate class URI in this extension: " + cls.getUri()); + } + if (cls.getSubClassOf() == null || cls.getSubClassOf().isEmpty()) { + errors.add("class '" + cls.getUri() + "' must declare at least one subClassOf parent"); + } else { + for (String parent : cls.getSubClassOf()) { + if (!isKnownClassReference(parent, seenUris)) { + errors.add( + "class '" + + cls.getUri() + + "' references unknown parent class '" + + parent + + "'; expected canonical om: class or another class in this extension"); + } + } + } + } + + private static void validateProperty( + CustomOntologyProperty 
prop, + Set seenUris, + Set declaredClassUris, + List errors) { + if (prop == null) { + errors.add("null property entry"); + return; + } + if (prop.getUri() == null || prop.getUri().isBlank()) { + errors.add("property missing 'uri'"); + return; + } + if (!isExtensionUri(prop.getUri())) { + errors.add("property URI '" + prop.getUri() + "' must be in the om-extension namespace"); + } + if (!seenUris.add(prop.getUri())) { + errors.add("duplicate property URI in this extension: " + prop.getUri()); + } + if (prop.getDomain() == null || prop.getDomain().isBlank()) { + errors.add("property '" + prop.getUri() + "' missing 'domain'"); + } else if (!isKnownClassReference(prop.getDomain(), declaredClassUris)) { + errors.add( + "property '" + + prop.getUri() + + "' has domain '" + + prop.getDomain() + + "' which is not a known canonical class or a class in this extension"); + } + if (prop.getRange() == null || prop.getRange().isBlank()) { + errors.add("property '" + prop.getUri() + "' missing 'range'"); + } else if (prop.getType() == CustomOntologyProperty.Type.OBJECT_PROPERTY) { + if (!isKnownClassReference(prop.getRange(), declaredClassUris)) { + errors.add( + "ObjectProperty '" + + prop.getUri() + + "' has range '" + + prop.getRange() + + "' which is not a known canonical class or a class in this extension"); + } + } else if (prop.getType() == CustomOntologyProperty.Type.DATATYPE_PROPERTY) { + if (!prop.getRange().startsWith("http://www.w3.org/2001/XMLSchema#")) { + errors.add( + "DatatypeProperty '" + + prop.getUri() + + "' range must be an xsd: datatype URI (got '" + + prop.getRange() + + "')"); + } + } + } + + /** + * Detect a cycle in the subClassOf graph using iterative DFS. A cycle is any path that returns + * to a node already on the current path stack. 
+ */ + private static void detectClassHierarchyCycles( + List classes, List errors) { + Map> graph = new HashMap<>(); + for (CustomOntologyClass cls : classes) { + if (cls != null && cls.getUri() != null) { + graph.put(cls.getUri(), cls.getSubClassOf() == null ? List.of() : cls.getSubClassOf()); + } + } + Set visited = new HashSet<>(); + Set onStack = new HashSet<>(); + for (String node : graph.keySet()) { + if (hasCycle(node, graph, visited, onStack)) { + errors.add("class hierarchy contains a cycle through " + node); + } + } + } + + private static boolean hasCycle( + String node, Map> graph, Set visited, Set onStack) { + if (onStack.contains(node)) return true; + if (visited.contains(node)) return false; + onStack.add(node); + visited.add(node); + for (String parent : graph.getOrDefault(node, List.of())) { + if (graph.containsKey(parent) && hasCycle(parent, graph, visited, onStack)) { + return true; + } + } + onStack.remove(node); + return false; + } + + private static boolean isExtensionUri(String uri) { + return uri != null && uri.startsWith(EXTENSION_NS); + } + + /** + * @return true if the URI references either a canonical om: class on the allowlist or a class + * declared inside the current extension. + */ + private static boolean isKnownClassReference(String uri, Set extensionClassUris) { + if (uri == null) return false; + if (extensionClassUris.contains(uri)) return true; + if (uri.startsWith(CANONICAL_NS)) { + String localName = uri.substring(CANONICAL_NS.length()); + return KNOWN_CANONICAL_CLASSES.contains(localName); + } + // Allow short-form prefixed names like "om:Table". 
+ if (uri.startsWith("om:")) { + return KNOWN_CANONICAL_CLASSES.contains(uri.substring(3)); + } + return false; + } + + private static boolean isBlank(String s) { + return s == null || s.isBlank(); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/federation/SparqlFederationGuard.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/federation/SparqlFederationGuard.java new file mode 100644 index 000000000000..0fa99160e48b --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/federation/SparqlFederationGuard.java @@ -0,0 +1,187 @@ +package org.openmetadata.service.rdf.federation; + +import java.net.URI; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryException; +import org.apache.jena.query.QueryFactory; +import org.apache.jena.sparql.syntax.ElementService; +import org.apache.jena.sparql.syntax.ElementSubQuery; +import org.apache.jena.sparql.syntax.ElementVisitorBase; +import org.apache.jena.sparql.syntax.ElementWalker; +import org.openmetadata.schema.api.configuration.rdf.RdfConfiguration; +import org.openmetadata.schema.api.configuration.rdf.SparqlFederationConfig; + +/** + * Inspects an incoming SPARQL query for {@code SERVICE } clauses and rejects any whose + * endpoint URI is not in the configured allowlist. + * + *

Detection uses Jena's {@link ElementWalker} so it sees only real SERVICE elements — the + * keyword "SERVICE" inside a string literal or a comment is correctly ignored. The walker also + * recurses into subqueries, OPTIONAL/UNION/MINUS branches, and nested SERVICE blocks. + * + *

Behavior: + * + *

    + *
  • Federation disabled (the default) — any SERVICE clause is a violation. + *
  • Federation enabled — a SERVICE clause must reference a URI present in + * {@code allowedEndpoints} verbatim. + *
  • Variable SERVICE endpoints ({@code SERVICE ?endpoint}) cannot be statically allowlisted + * and are always rejected. + *
  • Queries that fail to parse here are passed through; the SPARQL engine returns its own + * parse error to the caller, preserving message fidelity. + *
+ */ +@Slf4j +public final class SparqlFederationGuard { + + private final boolean federationEnabled; + private final Set allowedEndpoints; + + public SparqlFederationGuard(RdfConfiguration config) { + SparqlFederationConfig federation = config == null ? null : config.getFederation(); + this.federationEnabled = federation != null && Boolean.TRUE.equals(federation.getEnabled()); + this.allowedEndpoints = + federation == null || federation.getAllowedEndpoints() == null + ? Set.of() + : federation.getAllowedEndpoints().stream() + .map(URI::toString) + .collect(Collectors.toUnmodifiableSet()); + } + + /** Visible for tests. Package-private constructor that takes the policy directly. */ + SparqlFederationGuard(boolean federationEnabled, Set allowedEndpoints) { + this.federationEnabled = federationEnabled; + this.allowedEndpoints = allowedEndpoints == null ? Set.of() : Set.copyOf(allowedEndpoints); + } + + /** + * @return all distinct SERVICE endpoint URIs found in the query. Order of first appearance is + * preserved. Variable endpoints surface as the literal string {@code ?varname}. + */ + public List serviceEndpoints(String sparql) { + Optional parsed = parseQuietly(sparql); + if (parsed.isEmpty()) { + return List.of(); + } + EndpointCollector collector = new EndpointCollector(); + ElementWalker.walk(parsed.get().getQueryPattern(), collector); + return List.copyOf(collector.endpoints); + } + + /** + * @return the first endpoint that violates the policy, or empty if the query is allowed. + */ + public Optional firstDisallowedEndpoint(String sparql) { + for (String endpoint : serviceEndpoints(sparql)) { + if (!isAllowed(endpoint)) { + return Optional.of(endpoint); + } + } + return Optional.empty(); + } + + /** + * Convenience: throw {@link FederationDisallowedException} if any SERVICE clause is rejected. 
+ */ + public void enforce(String sparql) { + Optional blocked = firstDisallowedEndpoint(sparql); + if (blocked.isPresent()) { + throw new FederationDisallowedException(blocked.get(), federationEnabled, allowedEndpoints); + } + } + + private boolean isAllowed(String endpoint) { + if (endpoint.startsWith("?")) { + // Variable endpoints can't be statically allowlisted. + return false; + } + if (!federationEnabled) { + return false; + } + return allowedEndpoints.contains(endpoint); + } + + private Optional parseQuietly(String sparql) { + try { + return Optional.ofNullable(QueryFactory.create(sparql)); + } catch (QueryException e) { + LOG.debug( + "SPARQL parse failed inside federation guard; deferring to engine: {}", e.getMessage()); + return Optional.empty(); + } + } + + private static final class EndpointCollector extends ElementVisitorBase { + private final Set endpoints = new LinkedHashSet<>(); + + @Override + public void visit(ElementService el) { + if (el.getServiceNode().isVariable()) { + endpoints.add("?" + el.getServiceNode().getName()); + } else if (el.getServiceNode().isURI()) { + endpoints.add(el.getServiceNode().getURI()); + } + } + + @Override + public void visit(ElementSubQuery el) { + // ElementWalker stops at subquery boundaries; descend manually so a SERVICE inside an + // inner SELECT still gets caught. + if (el.getQuery() != null && el.getQuery().getQueryPattern() != null) { + ElementWalker.walk(el.getQuery().getQueryPattern(), this); + } + } + } + + /** + * Thrown by {@link #enforce(String)} when a query references a disallowed endpoint. Carries the + * effective policy so callers can include it in the error response. 
+ */ + public static final class FederationDisallowedException extends RuntimeException { + + private final String blockedEndpoint; + private final boolean federationEnabled; + private final Set allowedEndpoints; + + FederationDisallowedException( + String blockedEndpoint, boolean federationEnabled, Set allowedEndpoints) { + super(buildMessage(blockedEndpoint, federationEnabled, allowedEndpoints)); + this.blockedEndpoint = blockedEndpoint; + this.federationEnabled = federationEnabled; + this.allowedEndpoints = Collections.unmodifiableSet(allowedEndpoints); + } + + public String getBlockedEndpoint() { + return blockedEndpoint; + } + + public boolean isFederationEnabled() { + return federationEnabled; + } + + public Set getAllowedEndpoints() { + return allowedEndpoints; + } + + private static String buildMessage( + String endpoint, boolean federationEnabled, Set allowedEndpoints) { + if (!federationEnabled) { + return "SPARQL SERVICE clause references " + + endpoint + + " but federated SPARQL is disabled. " + + "Enable rdf.federation.enabled and add the endpoint to rdf.federation.allowedEndpoints."; + } + return "SPARQL SERVICE clause references " + + endpoint + + " which is not in the allowlist. 
Allowed endpoints: " + + allowedEndpoints; + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleRegistry.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleRegistry.java new file mode 100644 index 000000000000..3b6f6661887f --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleRegistry.java @@ -0,0 +1,114 @@ +package org.openmetadata.service.rdf.inference; + +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.io.InputStream; +import java.util.Comparator; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.api.configuration.rdf.InferenceRule; + +/** + * In-memory registry of OpenMetadata inference rules. The starter pack is loaded once per JVM + * from the classpath under {@code rdf/inference-rules/}; further entries can be {@link + * #upsert(InferenceRule) upserted} programmatically. + * + *

This is intentionally a Phase-1 storage shape — there is no DB-backed persistence yet. + * Admin REST writes call {@link #upsert(InferenceRule)} after passing validation. A follow-up + * phase will swap this in-memory registry for a JDBI-backed repository without changing the + * exposed API. + */ +@Slf4j +public final class InferenceRuleRegistry { + + private static final String[] STARTER_PACK = { + "/rdf/inference-rules/transitive-lineage-closure.json", + "/rdf/inference-rules/pii-propagation-via-lineage.json", + "/rdf/inference-rules/schema-tag-inheritance.json", + "/rdf/inference-rules/domain-membership-inheritance.json" + }; + + private static final InferenceRuleRegistry INSTANCE = new InferenceRuleRegistry(); + + public static InferenceRuleRegistry getInstance() { + return INSTANCE; + } + + // ConcurrentHashMap supports lock-free reads from {@link #list()} / {@link #get(String)} while + // {@link #upsert} / {@link #delete} mutate concurrently. Iteration order isn't preserved, so + // {@code list()} sorts explicitly (priority + name) for deterministic API output. + private final ConcurrentMap rules = new ConcurrentHashMap<>(); + private final ObjectMapper mapper = new ObjectMapper(); + private volatile boolean starterLoaded = false; + + private InferenceRuleRegistry() {} + + /** + * Load the starter pack from the classpath. Idempotent. 
+ */ + public synchronized void loadStarterPackIfNeeded() { + if (starterLoaded) return; + int loaded = 0; + for (String path : STARTER_PACK) { + try (InputStream is = InferenceRuleRegistry.class.getResourceAsStream(path)) { + if (is == null) { + LOG.warn("Inference rule starter pack resource missing: {}", path); + continue; + } + InferenceRule rule = mapper.readValue(is, InferenceRule.class); + List errors = InferenceRuleValidator.validate(rule); + if (!errors.isEmpty()) { + LOG.error("Starter pack rule '{}' failed validation: {}", path, errors); + continue; + } + rules.put(rule.getName(), rule); + loaded++; + } catch (IOException e) { + LOG.error("Failed to load starter pack rule {}", path, e); + } + } + starterLoaded = true; + LOG.info("Loaded {} inference rules from starter pack", loaded); + } + + /** @return all rules in priority order, then by name. */ + public List list() { + loadStarterPackIfNeeded(); + return rules.values().stream() + .sorted( + Comparator.comparing( + (InferenceRule r) -> r.getPriority() == null ? 100 : r.getPriority()) + .thenComparing(InferenceRule::getName)) + .toList(); + } + + /** @return the rule with the given name, or empty if no such rule. */ + public Optional get(String name) { + loadStarterPackIfNeeded(); + return Optional.ofNullable(rules.get(name)); + } + + /** + * Insert or replace a rule. The caller is responsible for validation (see {@link + * InferenceRuleValidator}); this method does no validation itself. + */ + public synchronized void upsert(InferenceRule rule) { + loadStarterPackIfNeeded(); + rules.put(rule.getName(), rule); + } + + /** @return true if a rule with that name was removed. */ + public synchronized boolean delete(String name) { + loadStarterPackIfNeeded(); + return rules.remove(name) != null; + } + + /** Clear the registry. Visible for tests. 
*/ + synchronized void resetForTests() { + rules.clear(); + starterLoaded = false; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleValidator.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleValidator.java new file mode 100644 index 000000000000..4ccb49413db4 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/inference/InferenceRuleValidator.java @@ -0,0 +1,142 @@ +package org.openmetadata.service.rdf.inference; + +import java.util.ArrayList; +import java.util.List; +import lombok.extern.slf4j.Slf4j; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryException; +import org.apache.jena.query.QueryFactory; +import org.apache.jena.sparql.syntax.ElementService; +import org.apache.jena.sparql.syntax.ElementSubQuery; +import org.apache.jena.sparql.syntax.ElementVisitorBase; +import org.apache.jena.sparql.syntax.ElementWalker; +import org.openmetadata.schema.api.configuration.rdf.InferenceRule; + +/** + * Validates {@link InferenceRule} payloads before they are accepted or executed. + * + *

The validator is intentionally strict — admins write these rules and they run server-side + * against the whole graph, so a malformed or hostile rule has wide blast radius. + * + *

Checks performed (field-level problems are reported together; the SPARQL-level checks run only once those pass): + * + *

    + *
  1. The rule has a non-blank name and rule body. + *
  2. The body parses as a SPARQL Query (rejects SPARQL UPDATE — those are emitted via the + * resulting CONSTRUCT triples, not by the rule body itself). + *
  3. For {@code ruleType=CONSTRUCT}, the parsed query must be CONSTRUCT type. SELECT, ASK, + * and DESCRIBE rules are rejected — they don't produce inferable triples. + *
  4. The body must not contain any SERVICE clauses. Inference must be deterministic and run + * against the local graph; federated lookups are rejected. + *
  5. The body must not be a no-op CONSTRUCT (empty WHERE) — those would either produce + * nothing or, with ASK semantics, blow up. + *
  6. Priority is within bounds. + *
+ */ +@Slf4j +public final class InferenceRuleValidator { + + private InferenceRuleValidator() {} + + /** @return the list of validation errors. Empty list means the rule is valid. */ + public static List validate(InferenceRule rule) { + List errors = new ArrayList<>(); + if (rule == null) { + errors.add("rule must not be null"); + return errors; + } + if (isBlank(rule.getName())) { + errors.add("'name' must not be blank"); + } else if (!rule.getName().matches("^[a-z][a-z0-9-]{1,62}[a-z0-9]$")) { + errors.add( + "'name' must be 3-64 chars, lowercase letters / digits / hyphen, start with a letter, end with a letter or digit"); + } + if (isBlank(rule.getRuleBody())) { + errors.add("'ruleBody' must not be blank"); + } + InferenceRule.RuleType ruleType = + rule.getRuleType() == null ? InferenceRule.RuleType.CONSTRUCT : rule.getRuleType(); + if (ruleType == InferenceRule.RuleType.RDFS) { + // RDFS is a placeholder — the engine doesn't ship a parser for that body shape yet. + errors.add( + "ruleType=RDFS is reserved for future use; current engine only ships CONSTRUCT support"); + return errors; + } + if (rule.getPriority() != null && (rule.getPriority() < 0 || rule.getPriority() > 10_000)) { + errors.add("'priority' must be between 0 and 10000"); + } + if (errors.size() > 0) { + return errors; + } + + Query parsed; + try { + parsed = QueryFactory.create(rule.getRuleBody()); + } catch (QueryException e) { + errors.add("ruleBody failed to parse as SPARQL: " + e.getMessage()); + return errors; + } + if (!parsed.isConstructType()) { + errors.add( + "ruleBody must be a SPARQL CONSTRUCT query for ruleType=CONSTRUCT (got " + + parsed.queryType() + + "); inference rules emit new triples and only CONSTRUCT does that"); + return errors; + } + if (parsed.getQueryPattern() == null || isEmptyPattern(parsed.getQueryPattern().toString())) { + errors.add("ruleBody must have a non-empty WHERE pattern"); + } + if (parsed.getConstructTemplate() == null + || 
parsed.getConstructTemplate().getTriples().isEmpty()) { + errors.add("ruleBody CONSTRUCT template must contain at least one triple pattern"); + } + ServiceFinder serviceFinder = new ServiceFinder(); + if (parsed.getQueryPattern() != null) { + ElementWalker.walk(parsed.getQueryPattern(), serviceFinder); + } + if (serviceFinder.found) { + errors.add( + "ruleBody must not contain SERVICE clauses; inference is local-only and federated rules are rejected"); + } + return errors; + } + + /** + * @return true if the rule passed validation; false otherwise. Errors are logged at WARN. + */ + public static boolean isValid(InferenceRule rule) { + List errors = validate(rule); + if (!errors.isEmpty()) { + LOG.warn( + "Inference rule '{}' failed validation: {}", + rule == null ? "" : rule.getName(), + errors); + } + return errors.isEmpty(); + } + + private static boolean isBlank(String s) { + return s == null || s.isBlank(); + } + + private static boolean isEmptyPattern(String pattern) { + String trimmed = pattern.replaceAll("\\s", ""); + return trimmed.isEmpty() || trimmed.equals("{}"); + } + + private static final class ServiceFinder extends ElementVisitorBase { + boolean found; + + @Override + public void visit(ElementService el) { + found = true; + } + + @Override + public void visit(ElementSubQuery el) { + if (el.getQuery() != null && el.getQuery().getQueryPattern() != null) { + ElementWalker.walk(el.getQuery().getQueryPattern(), this); + } + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CentralityComputation.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CentralityComputation.java new file mode 100644 index 000000000000..2d7de87dd8b8 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CentralityComputation.java @@ -0,0 +1,188 @@ +package org.openmetadata.service.rdf.insights; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.HashMap; 
+import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.rdf.RdfRepository; + +/** + * Pulls a graph snapshot of one entity type out of Fuseki, runs {@link PageRank} on it, and + * persists the resulting scores back into the named graph + * {@code }. The {@code /v1/rdf/insights/important} + * endpoint reads those triples through the {@code om:centralityScore} predicate. + * + *

Edge weights chosen to reflect importance for governance purposes: + * + *

    + *
  • {@code prov:wasDerivedFrom} — 1.0 (lineage) + *
  • {@code om:hasTag}, {@code om:hasGlossaryTerm} — 0.5 (semantic linkage) + *
  • {@code om:hasColumn} — 0.2 (containment, weak) + *
+ * + *

Scope: only entities of the requested type are added as nodes; off-type targets become + * dangling sinks so they don't hijack mass. PageRank is run with default damping 0.85. + */ +@Slf4j +public final class CentralityComputation { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String INSIGHTS_GRAPH_PREFIX = OM_NS + "insights/centrality/"; + + private final RdfRepository repository; + private final PageRank pageRank; + + public CentralityComputation(RdfRepository repository) { + this(repository, new PageRank()); + } + + CentralityComputation(RdfRepository repository, PageRank pageRank) { + this.repository = repository; + this.pageRank = pageRank; + } + + /** + * Run centrality for one entity type end-to-end: extract → compute → persist. + * + * @return summary describing what got computed. + */ + public Result computeAndPersist(String entityType) { + String safeType = ImportanceQueryBuilder.validateEntityType(entityType); + String classLocalName = capitalize(safeType); + + Map> graph = extractGraph(classLocalName); + if (graph.isEmpty()) { + LOG.info("No entities of type {} found; skipping centrality run", classLocalName); + return new Result(safeType, 0, 0, false); + } + PageRank.Result ranked = pageRank.compute(graph); + persistScores(classLocalName, ranked.scores()); + return new Result(safeType, ranked.scores().size(), ranked.iterations(), ranked.converged()); + } + + /** Extract the graph snapshot from Fuseki via SPARQL. Visible for testing. 
*/ + Map> extractGraph(String classLocalName) { + String sparql = + String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "PREFIX prov: ", + "SELECT ?from ?to ?predicate WHERE {", + " ?from a om:" + classLocalName + " .", + " {", + " ?from prov:wasDerivedFrom ?to .", + " BIND(\"prov:wasDerivedFrom\" AS ?predicate)", + " } UNION {", + " ?from om:hasTag ?to .", + " BIND(\"om:hasTag\" AS ?predicate)", + " } UNION {", + " ?from om:hasGlossaryTerm ?to .", + " BIND(\"om:hasGlossaryTerm\" AS ?predicate)", + " } UNION {", + " ?from om:hasColumn ?to .", + " BIND(\"om:hasColumn\" AS ?predicate)", + " }", + "}"); + + String json; + try { + json = repository.executeSparqlQuery(sparql, "application/sparql-results+json"); + } catch (Exception e) { + LOG.error("Failed to extract centrality graph for {}", classLocalName, e); + return Map.of(); + } + return parseGraph(json); + } + + /** Parse SPARQL JSON results into a weighted adjacency map. Visible for testing. */ + static Map> parseGraph(String selectJson) { + Map> graph = new LinkedHashMap<>(); + if (selectJson == null || selectJson.isBlank()) { + return graph; + } + try { + JsonNode root = JsonUtils.readTree(selectJson); + JsonNode bindings = root.path("results").path("bindings"); + if (!bindings.isArray()) return graph; + for (JsonNode row : bindings) { + String from = textValue(row, "from"); + String to = textValue(row, "to"); + String predicate = textValue(row, "predicate"); + if (from == null || to == null || predicate == null) continue; + double weight = weightFor(predicate); + graph.computeIfAbsent(from, k -> new HashMap<>()).merge(to, weight, Double::sum); + } + } catch (Exception e) { + LOG.warn("Failed to parse centrality SPARQL result: {}", e.getMessage()); + } + return graph; + } + + static double weightFor(String predicate) { + return switch (predicate) { + case "prov:wasDerivedFrom" -> 1.0; + case "om:hasTag", "om:hasGlossaryTerm" -> 0.5; + case "om:hasColumn" -> 0.2; + default -> 0.0; + }; + } + + /** Write the 
scores back to Fuseki under a dedicated named graph. Visible for testing. */ + void persistScores(String classLocalName, Map scores) { + String graphUri = INSIGHTS_GRAPH_PREFIX + classLocalName.toLowerCase(java.util.Locale.ROOT); + StringBuilder update = new StringBuilder(); + update.append("PREFIX om: <").append(OM_NS).append(">\n"); + update.append("PREFIX xsd: \n"); + update + .append("WITH <") + .append(graphUri) + .append( + "> DELETE { ?s om:centralityScore ?o ; om:centralityRank ?r } WHERE { ?s om:centralityScore ?o . OPTIONAL { ?s om:centralityRank ?r } } ;\n"); + update.append("INSERT DATA { GRAPH <").append(graphUri).append("> {\n"); + int rank = 1; + for (Map.Entry e : + scores.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .toList()) { + update + .append(" <") + .append(e.getKey()) + .append("> om:centralityScore \"") + .append(e.getValue()) + .append("\"^^xsd:double ; om:centralityRank \"") + .append(rank++) + .append("\"^^xsd:integer .\n"); + } + update.append("} }"); + try { + repository.executeSparqlUpdate(update.toString()); + } catch (Exception e) { + LOG.error("Failed to persist centrality scores for {}", classLocalName, e); + } + } + + private static String textValue(JsonNode row, String varName) { + JsonNode node = row.path(varName); + if (node.isMissingNode() || node.isNull()) return null; + JsonNode value = node.path("value"); + return value.isMissingNode() || value.isNull() ? null : value.asText(); + } + + private static String capitalize(String s) { + if (s == null || s.isEmpty()) return s; + return Character.toUpperCase(s.charAt(0)) + s.substring(1); + } + + /** Result of a centrality run. */ + public record Result(String entityType, int nodesScored, int iterations, boolean converged) {} + + /** Helper for tests to construct rows for parseGraph. 
*/ + static List exampleRows() { + return List.of( + new String[] {"urn:t1", "urn:t2", "prov:wasDerivedFrom"}, + new String[] {"urn:t1", "urn:c1", "om:hasColumn"}); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilder.java new file mode 100644 index 000000000000..aed4cbe85d2b --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilder.java @@ -0,0 +1,101 @@ +package org.openmetadata.service.rdf.insights; + +/** + * SPARQL builders for Phase 3.5 catalog-wide insights — pure aggregate views over the existing + * triples, no precomputation. Each method is a standalone SPARQL SELECT with input validation; + * callers ({@code RdfResource}) hand the query verbatim to the SPARQL endpoint and stream the + * results back as SPARQL-JSON. + * + *

    + *
  • {@link #tagCoOccurrence(int, int)} — pairs of tags that get applied to the same entity, by + * overlap count. Pairs are canonicalised (str(a) < str(b)) so each pair is reported once. + *
  • {@link #glossaryReach(int, int)} — glossary terms ranked by the number of distinct domains + * they appear under. Useful for "term used across the most domains" insight in Phase 3.5. + *
+ */ +public final class CoOccurrenceQueryBuilder { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + + /** Default top-N when caller doesn't specify. */ + public static final int DEFAULT_LIMIT = 20; + + /** Hard cap so a buggy caller can't ask for tens of thousands of rows. */ + public static final int MAX_LIMIT = 100; + + /** Default minimum overlap threshold — pairs that co-occur on fewer entities are dropped. */ + public static final int DEFAULT_MIN_COUNT = 2; + + private CoOccurrenceQueryBuilder() {} + + /** + * Tag co-occurrence: pairs of tags applied to the same entity together at least {@code minCount} + * times. Result columns: ?tagA, ?tagB, ?count. + * + * @param minCount minimum number of shared entities; values < 1 are clamped to 1 + * @param limit number of rows; clamped to [1, {@link #MAX_LIMIT}] + */ + public static String tagCoOccurrence(int minCount, int limit) { + int safeMin = clamp(minCount, 1, Integer.MAX_VALUE); + int safeLimit = clamp(limit, 1, MAX_LIMIT); + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?tagA ?tagB (COUNT(?entity) AS ?count) WHERE {", + " ?entity om:hasTag ?tagA .", + " ?entity om:hasTag ?tagB .", + " FILTER(STR(?tagA) < STR(?tagB))", + "}", + "GROUP BY ?tagA ?tagB", + "HAVING (COUNT(?entity) >= " + safeMin + ")", + "ORDER BY DESC(?count) ?tagA ?tagB", + "LIMIT " + safeLimit); + } + + /** + * Glossary reach: each glossary term + the number of distinct domains it shows up in. A term + * that's used by tables across many domains is more cross-cutting and signals a richer concept. + * + *

Result columns: ?term, ?domainCount. + * + * @param minDomains floor on the count; values < 1 become 1 + * @param limit number of rows; clamped to [1, {@link #MAX_LIMIT}] + */ + public static String glossaryReach(int minDomains, int limit) { + int safeMin = clamp(minDomains, 1, Integer.MAX_VALUE); + int safeLimit = clamp(limit, 1, MAX_LIMIT); + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?term (COUNT(DISTINCT ?domain) AS ?domainCount) WHERE {", + " ?entity om:hasGlossaryTerm ?term .", + " ?entity om:hasDomain ?domain .", + "}", + "GROUP BY ?term", + "HAVING (COUNT(DISTINCT ?domain) >= " + safeMin + ")", + "ORDER BY DESC(?domainCount) ?term", + "LIMIT " + safeLimit); + } + + /** + * Tag popularity: tags ranked by the number of entities they're applied to. Result columns: + * ?tag, ?entityCount. + */ + public static String tagPopularity(int limit) { + int safeLimit = clamp(limit, 1, MAX_LIMIT); + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?tag (COUNT(DISTINCT ?entity) AS ?entityCount) WHERE {", + " ?entity om:hasTag ?tag .", + "}", + "GROUP BY ?tag", + "ORDER BY DESC(?entityCount) ?tag", + "LIMIT " + safeLimit); + } + + private static int clamp(int v, int lo, int hi) { + if (v < lo) return lo; + return Math.min(v, hi); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CommunityComputation.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CommunityComputation.java new file mode 100644 index 000000000000..d4946fac4a33 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/CommunityComputation.java @@ -0,0 +1,281 @@ +package org.openmetadata.service.rdf.insights; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import 
org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.rdf.RdfRepository; + +/** + * Phase 3.2 community-detection driver. Pulls a sub-graph out of Fuseki for one + * {@link GraphType graph kind} (lineage or tag-co-occurrence) restricted to one entity type, runs + * {@link Louvain}, and persists the resulting partition under a dedicated named graph + * {@code }. + * + *

Each persisted community is an {@code om:Community} resource with {@code om:hasMember}, + * {@code om:modularity}, {@code om:communityType}, and {@code om:communitySize}. The + * {@code modularity} value is the same on every community of a given run — it describes the + * partition as a whole, not any single cluster. + * + *

Determinism: {@code Louvain} processes nodes in the SPARQL result iteration order, so as long + * as Fuseki returns rows in a stable order (Jena's TDB does, modulo equal-cost solutions), the + * persisted membership is reproducible. + */ +@Slf4j +public final class CommunityComputation { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String INSIGHTS_GRAPH_PREFIX = OM_NS + "insights/communities/"; + private static final String COMMUNITY_URI_PREFIX = OM_NS + "instance/Community/"; + + private final RdfRepository repository; + private final Louvain louvain; + + public CommunityComputation(RdfRepository repository) { + this(repository, new Louvain()); + } + + CommunityComputation(RdfRepository repository, Louvain louvain) { + this.repository = repository; + this.louvain = louvain; + } + + public Result computeAndPersist(String entityType, String graphType) { + String safeType = ImportanceQueryBuilder.validateEntityType(entityType); + GraphType gt = GraphType.parse(graphType); + String classLocalName = capitalize(safeType); + + Map> graph = extractGraph(classLocalName, gt); + if (graph.isEmpty()) { + LOG.info( + "No edges of kind {} found for entity type {}; skipping community run", + gt, + classLocalName); + return new Result(safeType, gt.label, 0, 0, 0.0); + } + Louvain.Result partition = louvain.compute(graph); + persistCommunities(classLocalName, gt, partition); + return new Result( + safeType, + gt.label, + partition.communityCount(), + partition.communityByNode().size(), + partition.modularity()); + } + + /** + * Build the SPARQL SELECT used by GET /v1/rdf/insights/communities to list previously persisted + * communities for the given (entityType, graphType) pair. 
+ */ + public static String listingSparql(String entityType, String graphType) { + String safeType = ImportanceQueryBuilder.validateEntityType(entityType); + GraphType gt = GraphType.parse(graphType); + String classLocalName = capitalize(safeType); + String graphUri = + INSIGHTS_GRAPH_PREFIX + gt.label + "/" + classLocalName.toLowerCase(Locale.ROOT); + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?community ?size ?modularity ?member", + "FROM <" + graphUri + ">", + "WHERE {", + " ?community a om:Community ;", + " om:communitySize ?size ;", + " om:modularity ?modularity ;", + " om:hasMember ?member .", + "}", + "ORDER BY DESC(?size) ?community ?member"); + } + + Map> extractGraph(String classLocalName, GraphType graphType) { + String sparql = + switch (graphType) { + case LINEAGE -> lineageGraphSparql(classLocalName); + case TAG_CO_OCCURRENCE -> tagCoOccurrenceSparql(classLocalName); + }; + String json; + try { + json = repository.executeSparqlQuery(sparql, "application/sparql-results+json"); + } catch (Exception e) { + LOG.error("Failed to extract {} graph for {}", graphType, classLocalName, e); + return Map.of(); + } + return parseGraph(json); + } + + static String lineageGraphSparql(String classLocalName) { + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "PREFIX prov: ", + "SELECT ?from ?to (1.0 AS ?weight) WHERE {", + " ?from a om:" + classLocalName + " .", + " ?to a om:" + classLocalName + " .", + " {", + " ?from prov:wasDerivedFrom ?to .", + " } UNION {", + " ?from om:upstream ?to .", + " } UNION {", + " ?to om:downstream ?from .", + " }", + " FILTER(?from != ?to)", + "}"); + } + + static String tagCoOccurrenceSparql(String classLocalName) { + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?from ?to (COUNT(?shared) AS ?weight) WHERE {", + " ?from a om:" + classLocalName + " .", + " ?to a om:" + classLocalName + " .", + " {", + " ?from om:hasTag ?shared .", + " ?to om:hasTag ?shared .", + " } UNION 
{", + " ?from om:hasGlossaryTerm ?shared .", + " ?to om:hasGlossaryTerm ?shared .", + " }", + " FILTER(STR(?from) < STR(?to))", + "}", + "GROUP BY ?from ?to"); + } + + /** + * Parses the SPARQL bindings into a directed adjacency map (one weighted entry per edge). The + * companion SPARQL query already canonicalises with {@code FILTER(STR(?from) < STR(?to))} so + * each pair appears once. {@link Louvain#addAllEdges} is responsible for symmetrising the + * adjacency internally — emitting both directions here would double-count every edge weight. + * + *

Each target node is still registered as a (possibly-empty) key so that Louvain's + * {@code graph.keySet()} includes every participating node, not only the ones that appear as a + * source. + */ + static Map> parseGraph(String selectJson) { + Map> graph = new LinkedHashMap<>(); + if (selectJson == null || selectJson.isBlank()) return graph; + try { + JsonNode root = JsonUtils.readTree(selectJson); + JsonNode bindings = root.path("results").path("bindings"); + if (!bindings.isArray()) return graph; + for (JsonNode row : bindings) { + String from = textValue(row, "from"); + String to = textValue(row, "to"); + if (from == null || to == null || from.equals(to)) continue; + double weight = doubleValue(row, "weight", 1.0); + if (weight <= 0) continue; + graph.computeIfAbsent(from, k -> new HashMap<>()).merge(to, weight, Double::sum); + // Register the target as a node too, without adding the reverse edge weight. Louvain + // will symmetrise the adjacency itself; we just need every node visible to it. 
+ graph.computeIfAbsent(to, k -> new HashMap<>()); + } + } catch (Exception e) { + LOG.warn("Failed to parse community graph SPARQL result: {}", e.getMessage()); + } + return graph; + } + + void persistCommunities( + String classLocalName, GraphType graphType, Louvain.Result partition) { + String graphUri = + INSIGHTS_GRAPH_PREFIX + graphType.label + "/" + classLocalName.toLowerCase(Locale.ROOT); + Map> members = partition.membersByCommunity(); + + StringBuilder update = new StringBuilder(); + update.append("PREFIX om: <").append(OM_NS).append(">\n"); + update.append("PREFIX xsd: \n"); + update + .append("WITH <") + .append(graphUri) + .append("> DELETE { ?s ?p ?o } WHERE { ?s a om:Community ; ?p ?o } ;\n"); + update.append("INSERT DATA { GRAPH <").append(graphUri).append("> {\n"); + for (Map.Entry> entry : members.entrySet()) { + String communityUri = + COMMUNITY_URI_PREFIX + + graphType.label + + "/" + + classLocalName.toLowerCase(Locale.ROOT) + + "/" + + entry.getKey(); + update + .append(" <") + .append(communityUri) + .append("> a om:Community ; om:communityType \"") + .append(graphType.label) + .append("\" ; om:communitySize \"") + .append(entry.getValue().size()) + .append("\"^^xsd:integer ; om:modularity \"") + .append(partition.modularity()) + .append("\"^^xsd:double"); + for (String member : entry.getValue()) { + update.append(" ; om:hasMember <").append(member).append(">"); + } + update.append(" .\n"); + } + update.append("} }"); + try { + repository.executeSparqlUpdate(update.toString()); + } catch (Exception e) { + LOG.error("Failed to persist communities for {} / {}", classLocalName, graphType, e); + } + } + + private static String textValue(JsonNode row, String varName) { + JsonNode node = row.path(varName); + if (node.isMissingNode() || node.isNull()) return null; + JsonNode value = node.path("value"); + return value.isMissingNode() || value.isNull() ? 
null : value.asText(); + } + + private static double doubleValue(JsonNode row, String varName, double fallback) { + JsonNode node = row.path(varName); + if (node.isMissingNode() || node.isNull()) return fallback; + JsonNode value = node.path("value"); + if (value.isMissingNode() || value.isNull()) return fallback; + try { + return Double.parseDouble(value.asText()); + } catch (NumberFormatException e) { + return fallback; + } + } + + private static String capitalize(String s) { + if (s == null || s.isEmpty()) return s; + return Character.toUpperCase(s.charAt(0)) + s.substring(1); + } + + /** Source graph kinds supported by community detection. */ + public enum GraphType { + LINEAGE("lineage"), + TAG_CO_OCCURRENCE("tagCoOccurrence"); + + public final String label; + + GraphType(String label) { + this.label = label; + } + + public static GraphType parse(String value) { + if (value == null || value.isBlank()) return LINEAGE; + String norm = value.trim().toLowerCase(Locale.ROOT); + return switch (norm) { + case "lineage" -> LINEAGE; + case "tagcooccurrence", + "tag", + "tags", + "tag-co-occurrence", + "tag_co_occurrence" -> TAG_CO_OCCURRENCE; + default -> throw new IllegalArgumentException( + "graphType must be one of: lineage, tagCoOccurrence (got: " + value + ")"); + }; + } + } + + /** Result of a community-detection run. 
*/ + public record Result( + String entityType, String graphType, int communities, int membersTotal, double modularity) {} +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilder.java new file mode 100644 index 000000000000..3c71a529a9c4 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilder.java @@ -0,0 +1,111 @@ +package org.openmetadata.service.rdf.insights; + +/** + * Builds the SPARQL query that ranks entities by an importance score blending OpenMetadata's + * existing usage percentile (real query data) with downstream lineage count (graph topology). + * + *

Scoring formula: + * + *

{@code
+ * score = 0.6 * (usagePercentile / 100)
+ *       + 0.4 * (downstreamCount / max(downstreamCount across the entity type))
+ *       + 0.0 * centralityScore   // 3.1.b will plug PageRank here for null-usage entities
+ * }
+ * + *

Both terms are 0–1 after normalization. Entities without usage data fall to the bottom + * until 3.1.b's PageRank fallback lands and {@code om:centralityScore} starts populating. + */ +public final class ImportanceQueryBuilder { + + /** Hard-capped to keep the response page-sized. */ + static final int MIN_LIMIT = 1; + + static final int MAX_LIMIT = 100; + static final int DEFAULT_LIMIT = 20; + + /** Allowed window values map to the matching `om:usage{Window}Percentile` predicate. */ + private static final java.util.Set WINDOWS = + java.util.Set.of("daily", "weekly", "monthly"); + + private ImportanceQueryBuilder() {} + + public static String build(String entityType, String window, int limit) { + String safeType = validateEntityType(entityType); + String safeWindow = validateWindow(window); + int safeLimit = clamp(limit, MIN_LIMIT, MAX_LIMIT); + String classLocalName = capitalize(safeType); + String pctPredicate = "usage" + capitalize(safeWindow) + "Percentile"; + + return String.join( + "\n", + "PREFIX om: ", + "PREFIX rdfs: ", + "PREFIX prov: ", + "PREFIX xsd: ", + "SELECT ?entity ?fqn ?label ?usagePct ?downstreamCount ?score WHERE {", + " {", + " SELECT ?entity (COUNT(?downstream) AS ?downstreamCount) WHERE {", + " ?entity a om:" + classLocalName + " .", + " OPTIONAL { ?downstream prov:wasDerivedFrom ?entity }", + " } GROUP BY ?entity", + " }", + " ?entity om:fullyQualifiedName ?fqn .", + " OPTIONAL { ?entity rdfs:label ?label }", + " OPTIONAL { ?entity om:" + pctPredicate + " ?usagePct }", + " OPTIONAL { ?entity om:centralityScore ?centrality }", + " {", + " SELECT (MAX(?dc) AS ?maxDownstream) WHERE {", + " SELECT (COUNT(?ds) AS ?dc) WHERE {", + " ?e a om:" + classLocalName + " .", + " OPTIONAL { ?ds prov:wasDerivedFrom ?e }", + " } GROUP BY ?e", + " }", + " }", + " BIND(COALESCE(?usagePct, 0.0) / 100.0 AS ?usageNorm)", + " BIND(", + " IF(?maxDownstream > 0,", + " xsd:double(?downstreamCount) / xsd:double(?maxDownstream),", + " 0.0)", + " AS ?downstreamNorm)", 
+ " BIND(COALESCE(?centrality, 0.0) AS ?centralityNorm)", + " BIND(", + " (0.6 * ?usageNorm) + (0.4 * ?downstreamNorm) + (0.0 * ?centralityNorm)", + " AS ?score)", + "}", + "ORDER BY DESC(?score) DESC(?downstreamCount)", + "LIMIT " + safeLimit); + } + + static String validateEntityType(String entityType) { + if (entityType == null || entityType.isBlank()) { + throw new IllegalArgumentException("'entityType' is required"); + } + String trimmed = entityType.trim(); + if (!trimmed.matches("[a-zA-Z][a-zA-Z0-9]*")) { + throw new IllegalArgumentException( + "'entityType' must be alphanumeric (got '" + entityType + "')"); + } + return trimmed; + } + + static String validateWindow(String window) { + if (window == null || window.isBlank()) { + return "daily"; + } + String lower = window.trim().toLowerCase(java.util.Locale.ROOT); + if (!WINDOWS.contains(lower)) { + throw new IllegalArgumentException( + "'window' must be one of " + WINDOWS + " (got '" + window + "')"); + } + return lower; + } + + static int clamp(int v, int lo, int hi) { + return Math.min(Math.max(v, lo), hi); + } + + private static String capitalize(String s) { + if (s == null || s.isEmpty()) return s; + return Character.toUpperCase(s.charAt(0)) + s.substring(1); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathBuilder.java new file mode 100644 index 000000000000..ead72dfbd256 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathBuilder.java @@ -0,0 +1,167 @@ +package org.openmetadata.service.rdf.insights; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.Locale; +import java.util.Set; + +/** + * Builds the SPARQL fragments used by {@link LineagePathFinder} to walk one BFS frontier step + * across the lineage graph. 
+ * + *

The lineage graph is built from three predicates emitted by {@code RdfPropertyMapper}: + * + *

    + *
  • {@code prov:wasDerivedFrom} — entity → its upstream source + *
  • {@code om:upstream} — same direction, OpenMetadata-flavored alias + *
  • {@code om:downstream} — entity → an entity that derives from it + *
+ * + *

For an "upstream" walk from {@code A}, the algorithm follows {@code A prov:wasDerivedFrom ?x} + * and {@code A om:upstream ?x}. For a "downstream" walk it inverts {@code prov:wasDerivedFrom} + * (asks for {@code ?x prov:wasDerivedFrom A}) and follows {@code A om:downstream ?x}. {@code both} + * does both. + * + *

All inputs are validated; any URI not recognized by {@link URI} is rejected before SPARQL is + * emitted. The class is intentionally side-effect-free so it can be exhaustively unit-tested. + */ +public final class LineagePathBuilder { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String PROV_NS = "http://www.w3.org/ns/prov#"; + + /** Default upper bound on BFS depth. */ + public static final int DEFAULT_MAX_HOPS = 6; + + /** Hard cap so a buggy caller can't ask for thousands of frontier expansions. */ + public static final int HARD_MAX_HOPS = 25; + + private LineagePathBuilder() {} + + /** Allowed walk directions for {@link #frontierQuery(Collection, Direction)}. */ + public enum Direction { + UPSTREAM, + DOWNSTREAM, + BOTH; + + public static Direction parse(String value) { + if (value == null || value.isBlank()) return UPSTREAM; + return switch (value.trim().toLowerCase(Locale.ROOT)) { + case "upstream" -> UPSTREAM; + case "downstream" -> DOWNSTREAM; + case "both" -> BOTH; + default -> throw new IllegalArgumentException( + "direction must be one of: upstream, downstream, both (got: " + value + ")"); + }; + } + } + + /** Validate a node URI. Returns the URI string. Throws if missing/blank/malformed. 
*/ + public static String validateNodeUri(String label, String uri) { + if (uri == null || uri.isBlank()) { + throw new IllegalArgumentException(label + " is required"); + } + String trimmed = uri.trim(); + if (trimmed.contains(">") || trimmed.contains("<") || trimmed.contains("\n")) { + throw new IllegalArgumentException(label + " contains illegal characters"); + } + URI parsed; + try { + parsed = new URI(trimmed); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(label + " is not a valid URI: " + e.getMessage()); + } + if (!parsed.isAbsolute()) { + throw new IllegalArgumentException(label + " must be an absolute URI"); + } + String scheme = parsed.getScheme(); + if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) { + throw new IllegalArgumentException( + label + " must use http or https scheme (got: " + scheme + ")"); + } + return trimmed; + } + + /** Clamp maxHops to [1, HARD_MAX_HOPS]; null or < 1 falls back to DEFAULT_MAX_HOPS. */ + public static int clampMaxHops(Integer requested) { + if (requested == null || requested < 1) return DEFAULT_MAX_HOPS; + return Math.min(requested, HARD_MAX_HOPS); + } + + /** + * Build a SPARQL SELECT that, given a frontier of node URIs, returns every + * (?from, ?to, ?predicate) triple reachable in one hop in the requested direction. The result is + * always written so that ?from is the frontier node being expanded — i.e. for a downstream walk + * the inverse of {@code prov:wasDerivedFrom} is rewritten so the caller can treat ?from as + * "current" and ?to as "next" without branching. 
+ */ + public static String frontierQuery(Collection frontier, Direction direction) { + if (frontier == null || frontier.isEmpty()) { + throw new IllegalArgumentException("frontier must contain at least one URI"); + } + Set validated = new LinkedHashSet<>(); + for (String uri : frontier) validated.add(validateNodeUri("frontier node", uri)); + + StringBuilder values = new StringBuilder(); + for (String uri : validated) values.append(" <").append(uri).append(">\n"); + + StringBuilder unions = new StringBuilder(); + if (direction == Direction.UPSTREAM || direction == Direction.BOTH) { + unions.append(union("?from prov:wasDerivedFrom ?to", "prov:wasDerivedFrom")); + unions.append(" UNION\n"); + unions.append(union("?from om:upstream ?to", "om:upstream")); + } + if (direction == Direction.DOWNSTREAM || direction == Direction.BOTH) { + if (unions.length() > 0) unions.append(" UNION\n"); + unions.append(union("?to prov:wasDerivedFrom ?from", "^prov:wasDerivedFrom")); + unions.append(" UNION\n"); + unions.append(union("?from om:downstream ?to", "om:downstream")); + } + + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "PREFIX prov: <" + PROV_NS + ">", + "SELECT ?from ?to ?predicate WHERE {", + " VALUES ?from {", + values.toString().stripTrailing(), + " }", + " {", + unions.toString(), + " }", + " FILTER(?to != ?from)", + "}"); + } + + /** + * Build a SPARQL SELECT that returns the rdf:type values for a set of nodes. Used to decorate + * the final path with class info without round-tripping per node. 
+ */ + public static String typesQuery(Collection nodes) { + if (nodes == null || nodes.isEmpty()) { + throw new IllegalArgumentException("nodes must contain at least one URI"); + } + Set validated = new LinkedHashSet<>(); + for (String uri : nodes) validated.add(validateNodeUri("node", uri)); + + StringBuilder values = new StringBuilder(); + for (String uri : validated) values.append(" <").append(uri).append(">\n"); + + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "SELECT ?node ?type WHERE {", + " VALUES ?node {", + values.toString().stripTrailing(), + " }", + " ?node a ?type .", + " FILTER(STRSTARTS(STR(?type), \"" + OM_NS + "\"))", + "}"); + } + + private static String union(String triple, String predicateLabel) { + return " { " + triple + " . BIND(\"" + predicateLabel + "\" AS ?predicate) }"; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathFinder.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathFinder.java new file mode 100644 index 000000000000..ea54f876b3bd --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/LineagePathFinder.java @@ -0,0 +1,251 @@ +package org.openmetadata.service.rdf.insights; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.rdf.RdfRepository; + +/** + * BFS path finder over the lineage graph. 
Walks one frontier at a time using SPARQL queries built + * by {@link LineagePathBuilder}, so the algorithm is independent of dataset size — only the active + * frontier is held in memory. + * + *

Termination conditions: + * + *

    + *
  • Target reached → reconstruct path via parent map; return {@code found = true}. + *
  • Frontier becomes empty → return {@code found = false}. + *
  • maxHops reached → return {@code found = false}. + *
+ * + *

The {@code visited} set guards against cycles. Each node is expanded at most once. The found + * path is always the shortest one (BFS), with ties broken by SPARQL result order. + */ +@Slf4j +public final class LineagePathFinder { + + private final RdfRepository repository; + + public LineagePathFinder(RdfRepository repository) { + this.repository = repository; + } + + /** + * Walk the lineage graph from {@code fromUri} to {@code toUri}. + * + * @param fromUri starting node URI (validated) + * @param toUri target node URI (validated) + * @param direction upstream / downstream / both + * @param maxHops upper bound on path length; clamped via {@link LineagePathBuilder#clampMaxHops} + */ + public Path findPath( + String fromUri, String toUri, LineagePathBuilder.Direction direction, Integer maxHops) { + String from = LineagePathBuilder.validateNodeUri("from", fromUri); + String to = LineagePathBuilder.validateNodeUri("to", toUri); + LineagePathBuilder.Direction dir = + direction == null ? LineagePathBuilder.Direction.UPSTREAM : direction; + int hopBudget = LineagePathBuilder.clampMaxHops(maxHops); + + if (from.equals(to)) { + return Path.found(from, to, dir, hopBudget, List.of(new Hop(0, from, null, List.of()))); + } + + Map parents = new HashMap<>(); + Set visited = new LinkedHashSet<>(); + visited.add(from); + Set frontier = new LinkedHashSet<>(); + frontier.add(from); + + int depth = 0; + String reached = null; + while (!frontier.isEmpty() && depth < hopBudget) { + Map nextLevel = expandFrontier(frontier, dir, visited); + if (nextLevel.isEmpty()) break; + depth++; + + for (Map.Entry e : nextLevel.entrySet()) { + String node = e.getKey(); + if (parents.putIfAbsent(node, e.getValue()) == null) { + visited.add(node); + } + if (node.equals(to)) { + reached = node; + break; + } + } + if (reached != null) break; + frontier = new LinkedHashSet<>(nextLevel.keySet()); + } + + if (reached == null) { + return Path.notFound(from, to, dir, hopBudget); + } + List hops = 
reconstructPath(from, to, parents); + decorateWithTypes(hops); + return Path.found(from, to, dir, hopBudget, hops); + } + + /** + * Run one frontier query and return a map of (newly discovered node → ParentEdge). Already + * visited nodes are filtered out so the BFS stays acyclic. + */ + Map expandFrontier( + Set frontier, LineagePathBuilder.Direction direction, Set visited) { + String sparql = LineagePathBuilder.frontierQuery(frontier, direction); + String json; + try { + json = repository.executeSparqlQuery(sparql, "application/sparql-results+json"); + } catch (Exception e) { + LOG.warn("Path finder frontier query failed: {}", e.getMessage()); + return Map.of(); + } + return parseFrontierResult(json, visited); + } + + static Map parseFrontierResult(String json, Set visited) { + Map next = new LinkedHashMap<>(); + if (json == null || json.isBlank()) return next; + try { + JsonNode root = JsonUtils.readTree(json); + JsonNode bindings = root.path("results").path("bindings"); + if (!bindings.isArray()) return next; + for (JsonNode row : bindings) { + String from = textValue(row, "from"); + String to = textValue(row, "to"); + String predicate = textValue(row, "predicate"); + if (from == null || to == null || predicate == null) continue; + if (visited.contains(to)) continue; + next.putIfAbsent(to, new ParentEdge(from, predicate)); + } + } catch (Exception e) { + LOG.warn("Failed to parse path frontier result: {}", e.getMessage()); + } + return next; + } + + private List reconstructPath(String from, String to, Map parents) { + Deque stack = new ArrayDeque<>(); + String cursor = to; + int safety = parents.size() + 2; + while (cursor != null && !cursor.equals(from) && safety-- > 0) { + ParentEdge edge = parents.get(cursor); + if (edge == null) break; + stack.push(new Hop(0, cursor, edge.predicate(), List.of())); + cursor = edge.parent(); + } + stack.push(new Hop(0, from, null, List.of())); + + List ordered = new ArrayList<>(stack.size()); + int step = 0; + while 
(!stack.isEmpty()) { + Hop h = stack.pop(); + ordered.add(new Hop(step++, h.node(), h.predicate(), h.rdfTypes())); + } + return ordered; + } + + /** Single SPARQL round-trip to fetch {@code rdf:type} for every node in the path. */ + void decorateWithTypes(List hops) { + if (hops.isEmpty()) return; + Set nodes = new LinkedHashSet<>(); + for (Hop h : hops) nodes.add(h.node()); + String sparql = LineagePathBuilder.typesQuery(nodes); + String json; + try { + json = repository.executeSparqlQuery(sparql, "application/sparql-results+json"); + } catch (Exception e) { + LOG.debug("Type decoration failed (non-fatal): {}", e.getMessage()); + return; + } + Map> types = parseTypesResult(json); + for (int i = 0; i < hops.size(); i++) { + Hop h = hops.get(i); + List t = types.getOrDefault(h.node(), List.of()); + hops.set(i, new Hop(h.step(), h.node(), h.predicate(), t)); + } + } + + static Map> parseTypesResult(String json) { + Map> result = new HashMap<>(); + if (json == null || json.isBlank()) return result; + try { + JsonNode root = JsonUtils.readTree(json); + JsonNode bindings = root.path("results").path("bindings"); + if (!bindings.isArray()) return result; + for (JsonNode row : bindings) { + String node = textValue(row, "node"); + String type = textValue(row, "type"); + if (node == null || type == null) continue; + result.computeIfAbsent(node, k -> new ArrayList<>()).add(type); + } + for (Map.Entry> e : result.entrySet()) { + Set uniq = new LinkedHashSet<>(e.getValue()); + e.setValue(List.copyOf(uniq)); + } + } catch (Exception e) { + LOG.debug("Failed to parse type decoration result: {}", e.getMessage()); + } + return Collections.unmodifiableMap(result); + } + + private static String textValue(JsonNode row, String varName) { + JsonNode node = row.path(varName); + if (node.isMissingNode() || node.isNull()) return null; + JsonNode value = node.path("value"); + return value.isMissingNode() || value.isNull() ? 
null : value.asText(); + } + + /** Edge that points back to a node's BFS parent. */ + record ParentEdge(String parent, String predicate) {} + + /** One hop in a returned path. */ + public record Hop(int step, String node, String predicate, List rdfTypes) {} + + /** + * BFS path response. {@code hops} is {@code nodes.size() - 1} when found, else {@code 0}. + */ + public record Path( + String from, + String to, + String direction, + int maxHops, + boolean found, + int hops, + List nodes) { + + static Path notFound(String from, String to, LineagePathBuilder.Direction dir, int maxHops) { + return new Path(from, to, dir.name().toLowerCase(), maxHops, false, 0, List.of()); + } + + static Path found( + String from, String to, LineagePathBuilder.Direction dir, int maxHops, List nodes) { + return new Path( + from, to, dir.name().toLowerCase(), maxHops, true, Math.max(0, nodes.size() - 1), nodes); + } + } + + /** Helper to build a Hop list for unit tests. */ + static List hopList(String... nodes) { + List out = new ArrayList<>(); + for (int i = 0; i < nodes.length; i++) { + out.add(new Hop(i, nodes[i], i == 0 ? null : "prov:wasDerivedFrom", List.of())); + } + return out; + } + + /** No-op visited-set placeholder used in tests. */ + static Set emptyVisited() { + return new HashSet<>(); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/Louvain.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/Louvain.java new file mode 100644 index 000000000000..1d46acb96dc4 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/Louvain.java @@ -0,0 +1,228 @@ +package org.openmetadata.service.rdf.insights; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * Modularity-optimizing community detection — the greedy "first pass" of the Louvain algorithm + * (Blondel et al., 2008). 
Each node starts in its own community; we repeatedly move every node to + * the neighbouring community that yields the highest modularity gain until no move improves the + * partition. The aggregation/recursion step of full multi-level Louvain is intentionally left out — + * a single greedy pass already produces high-quality communities on the kinds of graphs an + * OpenMetadata catalog produces (lineage chains, tag co-occurrence) and keeps the algorithm + * deterministic and ~150 lines instead of ~400. + * + *

Determinism: nodes are processed in input-map iteration order; candidate communities are + * sorted by integer id; ties on modularity gain favour the lower-numbered community. A given input + * graph therefore always produces the same partition, which is exactly what the Phase 3.2 success + * criterion calls for ("Louvain run produces om:Community resources with deterministic + * membership for the seed graph"). + * + *

Edge weight handling: the input is treated as undirected; if both directions are present they + * sum, single-direction edges are mirrored. Self-loops are ignored. Negative weights are clamped + * to zero — modularity isn't well-defined for them. + * + *

Modularity gain when inserting node {@code i} into community {@code C} (i has been removed + * from its current community before the test) is the standard Louvain shortcut: + * + *

+ *   ΔQ ∝ k_iC − Σ_tot(C) · k_i / (2m)
+ * 
+ * + * where {@code k_iC} = sum of weights from i to members of C, {@code Σ_tot(C)} = sum of degrees of + * nodes in C, {@code k_i} = degree of i, {@code 2m} = sum of all degrees. The constant 1/(2m) is + * the same across candidates so we drop it for the argmax. + */ +public final class Louvain { + + private static final int DEFAULT_MAX_ITERATIONS = 32; + + private final int maxIterations; + + public Louvain() { + this(DEFAULT_MAX_ITERATIONS); + } + + public Louvain(int maxIterations) { + if (maxIterations < 1) { + throw new IllegalArgumentException("maxIterations must be >= 1"); + } + this.maxIterations = maxIterations; + } + + /** + * Run greedy modularity optimization on a weighted graph. + * + * @param graph adjacency map. {@code graph.get(a).get(b)} = edge weight from a to b. Undirected; + * symmetrized internally. Null or empty graphs return an empty result. + * @param node id type (must support equals/hashCode and have a stable toString) + * @return final partition + modularity + iteration count + */ + public Result compute(Map> graph) { + if (graph == null || graph.isEmpty()) return new Result<>(Map.of(), 0.0, 0); + + List nodes = new ArrayList<>(graph.keySet()); + int n = nodes.size(); + Map idx = new LinkedHashMap<>(n); + for (int i = 0; i < n; i++) idx.put(nodes.get(i), i); + + Map> adj = new LinkedHashMap<>(n); + for (int i = 0; i < n; i++) adj.put(i, new LinkedHashMap<>()); + addAllEdges(graph, idx, adj); + + double[] degree = new double[n]; + double totalWeight = 0.0; + for (int i = 0; i < n; i++) { + double d = 0.0; + for (double w : adj.get(i).values()) d += w; + degree[i] = d; + totalWeight += d; + } + if (totalWeight == 0.0) { + return singletonsResult(nodes); + } + + int[] community = new int[n]; + double[] commTotal = new double[n]; + for (int i = 0; i < n; i++) { + community[i] = i; + commTotal[i] = degree[i]; + } + + int iterations = 0; + boolean moved = true; + while (moved && iterations < maxIterations) { + moved = false; + iterations++; + 
for (int i = 0; i < n; i++) { + int chosen = considerMoves(i, community, commTotal, degree, totalWeight, adj); + if (chosen != community[i]) { + commTotal[community[i]] -= degree[i]; + commTotal[chosen] += degree[i]; + community[i] = chosen; + moved = true; + } + } + } + + Map renumbered = renumber(community); + Map finalPartition = new LinkedHashMap<>(n); + for (int i = 0; i < n; i++) finalPartition.put(nodes.get(i), renumbered.get(community[i])); + double modularity = computeModularity(adj, community, degree, totalWeight); + return new Result<>(finalPartition, modularity, iterations); + } + + private static void addAllEdges( + Map> graph, Map idx, Map> adj) { + for (Map.Entry> e : graph.entrySet()) { + Integer src = idx.get(e.getKey()); + if (src == null || e.getValue() == null) continue; + for (Map.Entry e2 : e.getValue().entrySet()) { + Integer dst = idx.get(e2.getKey()); + if (dst == null || dst.equals(src)) continue; + double w = e2.getValue() == null ? 0.0 : Math.max(0.0, e2.getValue()); + if (w == 0.0) continue; + adj.get(src).merge(dst, w, Double::sum); + adj.get(dst).merge(src, w, Double::sum); + } + } + } + + /** Choose the best community for node {@code i}, breaking ties toward lower community id. 
*/ + private static int considerMoves( + int i, + int[] community, + double[] commTotal, + double[] degree, + double totalWeight, + Map> adj) { + int currentComm = community[i]; + double k_i = degree[i]; + + Map commLinks = new LinkedHashMap<>(); + for (Map.Entry e : adj.get(i).entrySet()) { + if (e.getKey() == i) continue; + commLinks.merge(community[e.getKey()], e.getValue(), Double::sum); + } + + commTotal[currentComm] -= k_i; + + int bestComm = currentComm; + double bestGain = 0.0; + List candidates = new ArrayList<>(commLinks.keySet()); + Collections.sort(candidates); + for (int c : candidates) { + double k_iC = commLinks.getOrDefault(c, 0.0); + double gain = k_iC - commTotal[c] * k_i / totalWeight; + if (gain > bestGain || (gain == bestGain && c < bestComm)) { + bestGain = gain; + bestComm = c; + } + } + commTotal[currentComm] += k_i; + return bestComm; + } + + /** Compress community ids to a dense [0..k-1] range, preserving discovery order. */ + static Map renumber(int[] community) { + Map map = new LinkedHashMap<>(); + for (int c : community) { + map.computeIfAbsent(c, k -> map.size()); + } + return map; + } + + /** Modularity Q = (1/2m) Σ [A_ij − k_i k_j / 2m] · δ(c_i, c_j). */ + static double computeModularity( + Map> adj, + int[] community, + double[] degree, + double totalWeight) { + if (totalWeight == 0) return 0.0; + double q = 0.0; + int n = adj.size(); + for (int i = 0; i < n; i++) { + for (Map.Entry e : adj.get(i).entrySet()) { + int j = e.getKey(); + if (community[i] != community[j]) continue; + double aij = e.getValue(); + double expected = degree[i] * degree[j] / totalWeight; + q += aij - expected; + } + } + return q / totalWeight; + } + + private static Result singletonsResult(List nodes) { + Map partition = new LinkedHashMap<>(nodes.size()); + for (int i = 0; i < nodes.size(); i++) partition.put(nodes.get(i), i); + return new Result<>(partition, 0.0, 0); + } + + /** + * Result of a Louvain run. 
+ * + * @param communityByNode every input node mapped to a dense 0-based community id + * @param modularity Q score of the final partition; higher is better, [-1, 1] + * @param iterations number of greedy passes performed + */ + public record Result(Map communityByNode, double modularity, int iterations) { + + /** Inverse view: community id → list of members in input iteration order. */ + public Map> membersByCommunity() { + Map> out = new LinkedHashMap<>(); + for (Map.Entry e : communityByNode.entrySet()) { + out.computeIfAbsent(e.getValue(), k -> new ArrayList<>()).add(e.getKey()); + } + return out; + } + + /** Distinct community count. */ + public int communityCount() { + return (int) communityByNode.values().stream().distinct().count(); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/PageRank.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/PageRank.java new file mode 100644 index 000000000000..1bbb67ed9ea8 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/insights/PageRank.java @@ -0,0 +1,167 @@ +package org.openmetadata.service.rdf.insights; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +/** + * Iterative weighted PageRank with proper handling of dangling nodes and + * disconnected components. The output is a normalized importance score in + * {@code [0,1]} that sums to 1.0 across all nodes in the graph. + * + *

/**
 * Iterative weighted PageRank with handling of dangling nodes and disconnected components.
 * Scores are normalized so that they lie in {@code [0, 1]} and sum to 1.0 across all nodes.
 *
 * <p>This is intentionally a tiny, dependency-free implementation. JGraphT or Spark GraphFrames
 * would be appropriate for million-edge graphs; OpenMetadata's metadata graph for a single
 * tenant typically has &lt; 50k nodes and the simpler approach keeps the codebase lean.
 *
 * <p>Algorithm (classic Brin/Page formulation, weighted edges):
 *
 * <pre>{@code
 * for each iteration:
 *   for each node v:
 *     newScore[v] = (1 - d) / N
 *                 + d * sum over u in inEdges(v) of (score[u] * w(u,v) / outWeight(u))
 *                 + d * danglingMass / N    // redistribute mass from sinks
 * }</pre>
 *
 * <p>Stops when {@code max |newScore[v] - score[v]| < tolerance} or {@code maxIterations}
 * is reached. Output normalizes to sum 1.0.
 */
public final class PageRank {

  /** Standard damping factor. */
  public static final double DEFAULT_DAMPING = 0.85;

  public static final int DEFAULT_MAX_ITERATIONS = 100;
  public static final double DEFAULT_TOLERANCE = 1e-6;

  private final double damping;
  private final int maxIterations;
  private final double tolerance;

  /** Creates a ranker with the default damping, iteration cap and tolerance. */
  public PageRank() {
    this(DEFAULT_DAMPING, DEFAULT_MAX_ITERATIONS, DEFAULT_TOLERANCE);
  }

  /**
   * Creates a ranker with explicit parameters.
   *
   * @param damping damping factor, strictly between 0 and 1
   * @param maxIterations upper bound on power-iteration passes; must be positive
   * @param tolerance convergence threshold on the largest per-node score change; must be positive
   * @throws IllegalArgumentException if any argument is out of range or is {@code NaN}
   */
  public PageRank(double damping, int maxIterations, double tolerance) {
    // Written as negated range checks so that NaN (for which every comparison is false) is
    // rejected instead of silently producing NaN scores.
    if (!(damping > 0 && damping < 1)) {
      throw new IllegalArgumentException("damping must be in (0, 1)");
    }
    if (maxIterations <= 0) {
      throw new IllegalArgumentException("maxIterations must be positive");
    }
    if (!(tolerance > 0)) {
      throw new IllegalArgumentException("tolerance must be positive");
    }
    this.damping = damping;
    this.maxIterations = maxIterations;
    this.tolerance = tolerance;
  }

  /**
   * Compute weighted PageRank.
   *
   * @param outgoing for each node, the map of {@code targetNode → edgeWeight}. Source nodes
   *     with no outgoing edges (dangling sinks) are still included in the result; their mass is
   *     redistributed uniformly. Nodes that appear only as targets are treated as having no
   *     outgoing edges. A {@code null} adjacency map is treated as empty, and {@code null} or
   *     non-positive weights are ignored. Empty input returns an empty result.
   * @return the scores (in [0, 1], summing to 1.0 modulo floating-point drift), the number of
   *     iterations performed and whether the tolerance was reached
   * @throws NullPointerException if {@code outgoing} is {@code null}
   */
  public Result<String> compute(Map<String, Map<String, Double>> outgoing) {
    Objects.requireNonNull(outgoing, "outgoing");
    // Nodes = union of sources and all targets (so a dangling target still gets a score).
    Map<String, Map<String, Double>> graph = new HashMap<>();
    for (Map.Entry<String, Map<String, Double>> e : outgoing.entrySet()) {
      Map<String, Double> edges = graph.computeIfAbsent(e.getKey(), k -> new HashMap<>());
      Map<String, Double> targets = e.getValue();
      if (targets == null) {
        continue; // source declared without an adjacency map: a dangling node
      }
      for (String t : targets.keySet()) {
        graph.computeIfAbsent(t, k -> new HashMap<>());
      }
      edges.putAll(targets);
    }
    int n = graph.size();
    if (n == 0) {
      return new Result<>(Collections.emptyMap(), 0, true);
    }

    // Pre-compute outgoing weight totals once per source.
    Map<String, Double> outWeight = new HashMap<>();
    for (Map.Entry<String, Map<String, Double>> e : graph.entrySet()) {
      double sum = 0.0;
      for (Double w : e.getValue().values()) {
        if (w != null && w > 0) sum += w;
      }
      outWeight.put(e.getKey(), sum);
    }

    Map<String, Double> score = new HashMap<>(n);
    double init = 1.0 / n;
    for (String node : graph.keySet()) score.put(node, init);

    int iterations = 0;
    boolean converged = false;
    while (iterations < maxIterations) {
      iterations++;
      // Sum of mass from dangling nodes — to be redistributed uniformly.
      double dangling = 0.0;
      for (Map.Entry<String, Double> e : score.entrySet()) {
        if (outWeight.getOrDefault(e.getKey(), 0.0) <= 0) {
          dangling += e.getValue();
        }
      }
      Map<String, Double> next = new HashMap<>(n);
      double danglingShare = damping * dangling / n;
      double base = (1 - damping) / n + danglingShare;
      for (String node : graph.keySet()) next.put(node, base);

      for (Map.Entry<String, Map<String, Double>> e : graph.entrySet()) {
        String src = e.getKey();
        double srcOut = outWeight.getOrDefault(src, 0.0);
        if (srcOut <= 0) continue;
        double srcScore = score.get(src);
        for (Map.Entry<String, Double> edge : e.getValue().entrySet()) {
          double w = edge.getValue() == null ? 0 : edge.getValue();
          if (w <= 0) continue;
          double contribution = damping * srcScore * (w / srcOut);
          next.merge(edge.getKey(), contribution, Double::sum);
        }
      }

      double maxDelta = 0.0;
      for (Map.Entry<String, Double> e : next.entrySet()) {
        double d = Math.abs(e.getValue() - score.getOrDefault(e.getKey(), 0.0));
        if (d > maxDelta) maxDelta = d;
      }
      score = next;
      if (maxDelta < tolerance) {
        converged = true;
        break;
      }
    }

    // Normalize so scores sum to 1.0 (guards against floating-point drift).
    double total = 0.0;
    for (double v : score.values()) total += v;
    if (total > 0) {
      Map<String, Double> normalized = new HashMap<>(n);
      for (Map.Entry<String, Double> e : score.entrySet()) {
        normalized.put(e.getKey(), e.getValue() / total);
      }
      score = normalized;
    }
    return new Result<>(score, iterations, converged);
  }

  /**
   * Visible-for-tests convenience: which nodes appear in the result.
   *
   * @param graph adjacency map in the same shape accepted by {@link #compute(Map)}
   * @return every key of {@code graph} plus every target named in a non-null adjacency map
   */
  public static Set<String> nodes(Map<String, Map<String, Double>> graph) {
    Set<String> all = new java.util.HashSet<>();
    for (Map.Entry<String, Map<String, Double>> e : graph.entrySet()) {
      all.add(e.getKey());
      if (e.getValue() != null) {
        all.addAll(e.getValue().keySet());
      }
    }
    return all;
  }

  /**
   * Outcome of a PageRank run.
   *
   * @param scores node to normalized score
   * @param iterations number of passes performed
   * @param converged whether the largest score change dropped below the tolerance
   */
  public record Result<T>(Map<T, Double> scores, int iterations, boolean converged) {}
}

    + *
  • Tag overlap — number of distinct {@code om:hasTag} values shared with the seed. + *
  • Glossary overlap — number of distinct {@code om:hasGlossaryTerm} values shared. + *
  • Lineage proximity — number of distinct lineage neighbours both entities share, where + * a "neighbour" is reachable in one hop along {@code om:upstream}, {@code om:downstream}, + * {@code prov:wasDerivedFrom}, or its inverse. + *
+ * + *

The composite score is {@code 1.0·tags + 1.5·glossary + 2.0·lineage}. Lineage proximity is + * weighted highest because it's a stronger structural signal than tag co-occurrence — two tables + * that derive from the same source are tightly coupled, whereas two tables sharing a "PII" tag may + * be entirely unrelated. + * + *

The query is a single SPARQL aggregate with three sub-SELECTs unioned together so the engine + * can compute each dimension once and the outer GROUP BY sums them; this keeps the query plan + * cache-friendly and keeps the result set naturally sparse — entities with zero overlap on any + * dimension never appear in the union and so never reach the score formula. + */ +public final class RecommendationsQueryBuilder { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String PROV_NS = "http://www.w3.org/ns/prov#"; + + /** Default top-N when caller doesn't specify. */ + public static final int DEFAULT_LIMIT = 10; + + /** Hard cap so a buggy caller can't ask for hundreds of recommendations. */ + public static final int MAX_LIMIT = 50; + + static final double WEIGHT_TAG = 1.0; + static final double WEIGHT_GLOSSARY = 1.5; + static final double WEIGHT_LINEAGE = 2.0; + + private RecommendationsQueryBuilder() {} + + /** + * Build the recommendations SPARQL. 
+ * + * @param entityUri seed entity URI; validated as absolute http(s) + * @param limit number of recommendations; clamped to [1, {@link #MAX_LIMIT}] + */ + public static String build(String entityUri, int limit) { + String safeUri = LineagePathBuilder.validateNodeUri("entityUri", entityUri); + int safeLimit = clamp(limit, 1, MAX_LIMIT); + return String.join( + "\n", + "PREFIX om: <" + OM_NS + ">", + "PREFIX prov: <" + PROV_NS + ">", + "SELECT ?candidate", + " (SUM(?t) AS ?tagOverlap)", + " (SUM(?g) AS ?glossaryOverlap)", + " (SUM(?l) AS ?lineageOverlap)", + " ((SUM(?t) * " + WEIGHT_TAG + ")", + " + (SUM(?g) * " + WEIGHT_GLOSSARY + ")", + " + (SUM(?l) * " + WEIGHT_LINEAGE + ") AS ?score)", + "WHERE {", + " {", + " SELECT ?candidate (COUNT(DISTINCT ?tag) AS ?t) (0 AS ?g) (0 AS ?l) WHERE {", + " <" + safeUri + "> om:hasTag ?tag .", + " ?candidate om:hasTag ?tag .", + " FILTER(?candidate != <" + safeUri + ">)", + " } GROUP BY ?candidate", + " } UNION {", + " SELECT ?candidate (0 AS ?t) (COUNT(DISTINCT ?term) AS ?g) (0 AS ?l) WHERE {", + " <" + safeUri + "> om:hasGlossaryTerm ?term .", + " ?candidate om:hasGlossaryTerm ?term .", + " FILTER(?candidate != <" + safeUri + ">)", + " } GROUP BY ?candidate", + " } UNION {", + " SELECT ?candidate (0 AS ?t) (0 AS ?g) (COUNT(DISTINCT ?n) AS ?l) WHERE {", + " <" + + safeUri + + "> (om:upstream|om:downstream|prov:wasDerivedFrom|^prov:wasDerivedFrom) ?n .", + " ?candidate (om:upstream|om:downstream|prov:wasDerivedFrom|^prov:wasDerivedFrom) ?n .", + " FILTER(?candidate != <" + safeUri + ">)", + " } GROUP BY ?candidate", + " }", + "}", + "GROUP BY ?candidate", + "ORDER BY DESC(?score) ?candidate", + "LIMIT " + safeLimit); + } + + private static int clamp(int v, int lo, int hi) { + if (v < lo) return lo; + return Math.min(v, hi); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/JsonLdTranslator.java 
b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/JsonLdTranslator.java index 4512e9e469b0..81e0a311db45 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/JsonLdTranslator.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/JsonLdTranslator.java @@ -51,7 +51,9 @@ private void loadContexts() { "governance", "quality", "operations", - "lineage" + "lineage", + "ai", + "automation" }; for (String contextName : contexts) { try { @@ -112,6 +114,7 @@ public ObjectNode toJsonLd(EntityInterface entity) { JsonNode entityJson = objectMapper.valueToTree(entity); Map entityMap = objectMapper.convertValue(entityJson, Map.class); addJsonLdPropertiesToReferences(entityMap); + assignColumnIds(entityMap); String entityType = entity.getEntityReference().getType(); String id = baseUri + "entity/" + entityType + "/" + entity.getId(); @@ -173,6 +176,41 @@ private void addJsonLdPropertiesToReferences(Map map) { } } + /** + * Assign FQN-derived URIs to every Column nested in a Table (or another column) so each Column + * is a first-class named resource. The same URI is minted by column-level lineage so SPARQL can + * traverse from a lineage edge to the column it references. 
+ */ + private void assignColumnIds(Map entityMap) { + Object columnsValue = entityMap.get("columns"); + if (columnsValue instanceof java.util.List) { + for (Object column : (java.util.List) columnsValue) { + if (column instanceof Map) { + assignColumnId((Map) column); + } + } + } + } + + private void assignColumnId(Map column) { + Object fqn = column.get("fullyQualifiedName"); + if (fqn instanceof String && !((String) fqn).isEmpty()) { + String columnUri = RdfUtils.columnUri(baseUri, (String) fqn); + if (columnUri != null) { + column.put("@id", columnUri); + column.put("@type", "om:Column"); + } + } + Object children = column.get("children"); + if (children instanceof java.util.List) { + for (Object child : (java.util.List) children) { + if (child instanceof Map) { + assignColumnId((Map) child); + } + } + } + } + private ObjectNode createSimpleJsonLd(EntityInterface entity) { ObjectNode result = objectMapper.createObjectNode(); @@ -320,10 +358,6 @@ private Object selectContext(String entityType) { "testcaseresult", "testcaseresolutionstatus" -> contextCache.get("quality"); case "ingestionpipeline", - "workflow", - "workflowdefinition", - "workflowinstance", - "workflowinstancestate", "eventsubscription", "kpi", "datainsightchart", @@ -332,6 +366,17 @@ private Object selectContext(String entityType) { "appmarketplacedefinition", "document", "page" -> contextCache.get("operations"); + case "llmmodel", + "aiapplication", + "mcpserver", + "mcpexecution", + "agentexecution", + "prompttemplate" -> contextCache.get("ai"); + case "workflow", + "workflowdefinition", + "workflowinstance", + "workflowinstancestate", + "automation" -> contextCache.get("automation"); default -> contextCache.get("base"); }; } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfActivityMapper.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfActivityMapper.java new file mode 100644 index 000000000000..7bee241a6af5 --- 
/dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfActivityMapper.java @@ -0,0 +1,132 @@ +package org.openmetadata.service.rdf.translator; + +import com.fasterxml.jackson.databind.JsonNode; +import org.apache.jena.datatypes.xsd.XSDDatatype; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.vocabulary.RDF; + +/** + * Emits {@code prov:Activity} resources for OpenMetadata pipeline runs. + * + *

OpenMetadata stores the latest run of a pipeline under {@code Pipeline.pipelineStatus}. We + * surface that as a navigable PROV-O activity tied to the pipeline (as the agent associated + * with the activity) and to the input/output datasets via {@code prov:used} / + * {@code prov:generated}, so SPARQL can answer "who ran what at when, against which datasets, + * with what outcome." + */ +public final class RdfActivityMapper { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String PROV_NS = "http://www.w3.org/ns/prov#"; + + private RdfActivityMapper() {} + + static void emitPipelineActivity( + JsonNode pipelineStatus, + String pipelineFqn, + Resource pipelineResource, + String baseUri, + Model model) { + if (pipelineStatus == null || pipelineStatus.isNull() || !pipelineStatus.isObject()) { + return; + } + if (!pipelineStatus.has("timestamp") || pipelineStatus.get("timestamp").isNull()) { + // PROV activity is meaningless without a startedAtTime equivalent. 
+ return; + } + long startMillis = pipelineStatus.get("timestamp").asLong(); + String activityUri = activityUri(pipelineResource, pipelineFqn, startMillis); + Resource activity = model.createResource(activityUri); + activity.addProperty(RDF.type, model.createResource(PROV_NS + "Activity")); + activity.addProperty(RDF.type, model.createResource(OM_NS + "PipelineExecution")); + + String startedAt = java.time.Instant.ofEpochMilli(startMillis).toString(); + activity.addProperty( + model.createProperty(PROV_NS, "startedAtTime"), + model.createTypedLiteral(startedAt, XSDDatatype.XSDdateTime)); + if (pipelineStatus.has("endTime") && pipelineStatus.get("endTime").isNumber()) { + String endedAt = + java.time.Instant.ofEpochMilli(pipelineStatus.get("endTime").asLong()).toString(); + activity.addProperty( + model.createProperty(PROV_NS, "endedAtTime"), + model.createTypedLiteral(endedAt, XSDDatatype.XSDdateTime)); + } + + if (pipelineStatus.has("executionStatus") && !pipelineStatus.get("executionStatus").isNull()) { + activity.addProperty( + model.createProperty(OM_NS, "executionStatus"), + pipelineStatus.get("executionStatus").asText()); + } + if (pipelineStatus.has("executionId") && !pipelineStatus.get("executionId").isNull()) { + activity.addProperty( + model.createProperty(OM_NS, "executionId"), pipelineStatus.get("executionId").asText()); + } + + // PROV-O: prov:wasInformedBy is an Activity → Activity relation. The pipeline run "was + // informed by" its pipeline definition (the template Activity). Previously this used + // prov:wasGeneratedBy, which has domain prov:Entity and range prov:Activity — inverted. 
+ activity.addProperty(model.createProperty(PROV_NS, "wasInformedBy"), pipelineResource); + pipelineResource.addProperty(model.createProperty(OM_NS, "hasExecution"), activity); + + addAgent(activity, pipelineStatus.get("executedBy"), baseUri, model); + addUsedDatasets(activity, pipelineStatus.get("inputs"), "datasetFQN", "used", baseUri, model); + addUsedDatasets( + activity, pipelineStatus.get("outputs"), "datasetFQN", "generated", baseUri, model); + } + + private static void addAgent( + Resource activity, JsonNode executedBy, String baseUri, Model model) { + if (executedBy == null || executedBy.isNull() || !executedBy.has("id")) { + return; + } + String type = + executedBy.has("type") && !executedBy.get("type").isNull() + ? executedBy.get("type").asText() + : "user"; + // Always mint agent IRIs under the deployment's entity namespace, never the ontology + // namespace. JsonLdTranslator/RdfRepository wire the `om:` prefix to the *ontology* URI + // (https://open-metadata.org/ontology/...), so reading that prefix here would place agent + // resources alongside class definitions and mix ontology + instance data. + Resource agent = + model.createResource(baseUri + "entity/" + type + "/" + executedBy.get("id").asText()); + activity.addProperty(model.createProperty(PROV_NS, "wasAssociatedWith"), agent); + } + + private static void addUsedDatasets( + Resource activity, + JsonNode datasets, + String fqnField, + String predicate, + String baseUri, + Model model) { + if (datasets == null || !datasets.isArray()) { + return; + } + Property prop = model.createProperty(PROV_NS, predicate); + for (JsonNode item : datasets) { + if (!item.isObject() || !item.has(fqnField) || item.get(fqnField).isNull()) { + continue; + } + String fqn = item.get(fqnField).asText(); + // Datasets in Pipeline runs are referenced by FQN (no UUID at this layer); mint a stable + // table URI from the FQN. 
The triplestore may already contain the table at a UUID-based + // URI; both will participate in queries via the om:fullyQualifiedName literal. + String datasetUri = + baseUri + + "entity/datasetByFqn/" + + java.net.URLEncoder.encode(fqn, java.nio.charset.StandardCharsets.UTF_8); + Resource dataset = model.createResource(datasetUri); + dataset.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), fqn); + activity.addProperty(prop, dataset); + } + } + + private static String activityUri( + Resource pipelineResource, String pipelineFqn, long startMillis) { + String suffix = pipelineFqn != null ? pipelineFqn : pipelineResource.getURI(); + int hash = (suffix + ":" + startMillis).hashCode(); + return pipelineResource.getURI() + "/run/" + Integer.toHexString(hash); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java index 2eeeeee033b5..33448f928960 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java @@ -50,8 +50,17 @@ public class RdfPropertyMapper { private static final Set STRUCTURED_PROPERTIES = Set.of("lifeCycle", "customProperties", "extension", "certification"); - // Properties that should be omitted from RDF because they are audit/helper data. - private static final Set IGNORED_PROPERTIES = Set.of("changeDescription", "votes"); + // Properties that should be omitted from RDF because they are audit/helper data, or are + // handled by a dedicated emission step elsewhere in this class (e.g. tableConstraints, which + // requires the parent table FQN to mint constrained-column URIs). 
+ private static final Set IGNORED_PROPERTIES = + Set.of( + "changeDescription", + "votes", + "tableConstraints", + "profile", + "pipelineStatus", + "usageSummary"); // Lineage properties that need special handling private static final Set LINEAGE_PROPERTIES = @@ -120,6 +129,37 @@ public void mapEntityToRdf(EntityInterface entity, Resource entityResource, Mode processContextMappings((Map) context, entityJson, entityResource, model); } + // Table-level constraints need the parent table's FQN to resolve column-name references + // into om:Column URIs, so they're emitted here rather than through the field-mapping loop. + if (entityJson.has("tableConstraints") && entityJson.get("tableConstraints").isArray()) { + emitTableConstraints( + entityJson.get("tableConstraints"), + entity.getFullyQualifiedName(), + entityResource, + model); + } + + // Table profile becomes structured DQV measurements rather than an opaque JSON literal. + if (entityJson.has("profile")) { + RdfQualityMapper.emitTableProfile(entityJson.get("profile"), entityResource, model); + } + + // Pipeline runs surface as prov:Activity resources tied back to inputs and outputs. + if (entityJson.has("pipelineStatus") && !entityJson.get("pipelineStatus").isNull()) { + RdfActivityMapper.emitPipelineActivity( + entityJson.get("pipelineStatus"), + entity.getFullyQualifiedName(), + entityResource, + baseUri, + model); + } + + // Usage summary becomes om:usageDaily/Weekly/MonthlyCount + Percentile triples so the + // /v1/rdf/insights/important endpoint can rank entities by real query usage. 
+ if (entityJson.has("usageSummary") && !entityJson.get("usageSummary").isNull()) { + RdfUsageMapper.emitUsageSummary(entityJson.get("usageSummary"), entityResource, model); + } + // Always add standard properties addStandardProperties(entity, entityResource, model); @@ -236,6 +276,11 @@ private void processFieldMapping( return; } + if ("columns".equals(fieldName) && fieldValue.isArray()) { + emitColumns(fieldValue, entityResource, model); + return; + } + if (mapping instanceof String) { // Simple property mapping: "name": "rdfs:label" addSimpleProperty(entityResource, (String) mapping, fieldValue, model); @@ -1001,14 +1046,22 @@ private void addLineageDetails( /** * Adds column-level lineage as structured RDF. Enables SPARQL queries like: "Which columns feed - * into column X" or "What transformation is applied to column Y" + * into column X" or "What transformation is applied to column Y". + * + *

Source and destination columns are emitted as URI references via om:fromColumn / + * om:toColumn so that SPARQL property paths can join them with om:Column resources minted on the + * Table side. The original FQN string is preserved as om:fromColumnFqn / om:toColumnFqn for + * back-compatibility with consumers that match by string FQN. */ private void addColumnLineage( JsonNode columnsLineage, Resource lineageDetailsResource, Model model) { Property hasColumnLineage = model.createProperty(OM_NS, "hasColumnLineage"); + Property fromColumn = model.createProperty(OM_NS, "fromColumn"); + Property toColumn = model.createProperty(OM_NS, "toColumn"); + Property fromColumnFqn = model.createProperty(OM_NS, "fromColumnFqn"); + Property toColumnFqn = model.createProperty(OM_NS, "toColumnFqn"); for (JsonNode colLineage : columnsLineage) { - // Create column lineage resource String colLineageUri = lineageDetailsResource.getURI() + "/columnLineage/" + UUID.randomUUID(); Resource colLineageResource = model.createResource(colLineageUri); @@ -1016,21 +1069,16 @@ private void addColumnLineage( lineageDetailsResource.addProperty(hasColumnLineage, colLineageResource); colLineageResource.addProperty(RDF.type, model.createResource(OM_NS + "ColumnLineage")); - // Add source columns if (colLineage.has("fromColumns") && colLineage.get("fromColumns").isArray()) { - Property fromColumnProp = model.createProperty(OM_NS, "fromColumn"); for (JsonNode fromCol : colLineage.get("fromColumns")) { - colLineageResource.addProperty(fromColumnProp, fromCol.asText()); + linkColumn(colLineageResource, fromColumn, fromColumnFqn, fromCol, model); } } - // Add destination column if (colLineage.has("toColumn") && !colLineage.get("toColumn").isNull()) { - colLineageResource.addProperty( - model.createProperty(OM_NS, "toColumn"), colLineage.get("toColumn").asText()); + linkColumn(colLineageResource, toColumn, toColumnFqn, colLineage.get("toColumn"), model); } - // Add transformation function if 
(colLineage.has("function") && !colLineage.get("function").isNull()) { colLineageResource.addProperty( model.createProperty(OM_NS, "transformFunction"), colLineage.get("function").asText()); @@ -1038,6 +1086,27 @@ private void addColumnLineage( } } + private void linkColumn( + Resource colLineageResource, + Property uriProperty, + Property fqnProperty, + JsonNode columnFqnNode, + Model model) { + String fqn = columnFqnNode.asText(); + if (fqn == null || fqn.isEmpty()) { + return; + } + colLineageResource.addProperty(fqnProperty, fqn); + String columnUri = RdfUtils.columnUri(baseUri, fqn); + if (columnUri == null) { + return; + } + Resource columnResource = model.createResource(columnUri); + columnResource.addProperty(RDF.type, model.createResource(OM_NS + "Column")); + columnResource.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), fqn); + colLineageResource.addProperty(uriProperty, columnResource); + } + /** * Handles full lineage object (entity + nodes + upstreamEdges + downstreamEdges) */ @@ -1119,6 +1188,218 @@ private void addContainerProperty( } } + /** + * Emit each Column in a Table.columns array as a first-class named resource and link the table + * to it via om:hasColumn. URIs are derived from the Column's FQN so that lineage edges + * (om:fromColumn / om:toColumn) resolve to the same resource. + */ + private void emitColumns(JsonNode columns, Resource tableResource, Model model) { + Property hasColumn = model.createProperty(OM_NS, "hasColumn"); + for (JsonNode column : columns) { + if (!column.isObject() || !column.has("fullyQualifiedName")) { + continue; + } + Resource columnResource = buildColumnResource(column, model); + if (columnResource != null) { + tableResource.addProperty(hasColumn, columnResource); + if (column.has("children") && column.get("children").isArray()) { + emitColumnChildren(column.get("children"), columnResource, model); + } + } + } + } + + /** + * Emit table-level constraints (PRIMARY_KEY, UNIQUE, FOREIGN_KEY, ...) 
from + * {@code Table.tableConstraints[]} as named om:TableConstraint resources, and project + * back onto the constrained columns. For FOREIGN_KEY, also emit + * {@code om:references } triples so SPARQL queries can + * traverse FK edges directly. + */ + private void emitTableConstraints( + JsonNode constraints, String tableFqn, Resource tableResource, Model model) { + if (tableFqn == null || tableFqn.isEmpty()) { + return; + } + Property hasConstraint = model.createProperty(OM_NS, "hasConstraint"); + Property constraintType = model.createProperty(OM_NS, "constraintType"); + Property hasConstrainedColumn = model.createProperty(OM_NS, "hasConstrainedColumn"); + Property hasReferredColumn = model.createProperty(OM_NS, "hasReferredColumn"); + Property references = model.createProperty(OM_NS, "references"); + Property relationshipType = model.createProperty(OM_NS, "relationshipType"); + Property isUnique = model.createProperty(OM_NS, "isUnique"); + Property isPrimaryKey = model.createProperty(OM_NS, "isPrimaryKey"); + Resource tableConstraintClass = model.createResource(OM_NS + "TableConstraint"); + + int index = 0; + for (JsonNode constraint : constraints) { + if (!constraint.isObject() || !constraint.has("constraintType")) { + index++; + continue; + } + String type = constraint.get("constraintType").asText(); + Resource constraintResource = model.createResource(constraintUri(tableResource, type, index)); + constraintResource.addProperty(RDF.type, tableConstraintClass); + constraintResource.addProperty(constraintType, type); + tableResource.addProperty(hasConstraint, constraintResource); + if (constraint.has("relationshipType") && !constraint.get("relationshipType").isNull()) { + constraintResource.addProperty( + relationshipType, constraint.get("relationshipType").asText()); + } + + java.util.List sourceColumns = + resolveColumns(constraint.get("columns"), tableFqn, model); + for (Resource sourceColumn : sourceColumns) { + 
constraintResource.addProperty(hasConstrainedColumn, sourceColumn); + if ("PRIMARY_KEY".equals(type)) { + sourceColumn.addProperty(isPrimaryKey, model.createTypedLiteral(true)); + sourceColumn.addProperty( + model.createProperty(OM_NS, "isNullable"), model.createTypedLiteral(false)); + sourceColumn.addProperty(isUnique, model.createTypedLiteral(true)); + } else if ("UNIQUE".equals(type)) { + sourceColumn.addProperty(isUnique, model.createTypedLiteral(true)); + } + } + + if ("FOREIGN_KEY".equals(type)) { + java.util.List referredColumns = + resolveReferredColumns(constraint.get("referredColumns"), model); + for (Resource referred : referredColumns) { + constraintResource.addProperty(hasReferredColumn, referred); + } + // Pair source columns with referred columns positionally so SPARQL can traverse + // om:references directly without going through the + // constraint resource. The pairs are in declared array order. + int pairs = Math.min(sourceColumns.size(), referredColumns.size()); + for (int i = 0; i < pairs; i++) { + sourceColumns.get(i).addProperty(references, referredColumns.get(i)); + } + } + index++; + } + } + + private String constraintUri(Resource tableResource, String type, int index) { + return tableResource.getURI() + "/constraint/" + type + "/" + index; + } + + private java.util.List resolveColumns( + JsonNode columnNames, String tableFqn, Model model) { + java.util.List resolved = new java.util.ArrayList<>(); + if (columnNames == null || !columnNames.isArray()) { + return resolved; + } + for (JsonNode name : columnNames) { + if (!name.isTextual()) { + continue; + } + String columnFqn = tableFqn + "." 
+ name.asText(); + Resource columnResource = ensureColumnResource(columnFqn, model); + if (columnResource != null) { + resolved.add(columnResource); + } + } + return resolved; + } + + private java.util.List resolveReferredColumns(JsonNode referred, Model model) { + java.util.List resolved = new java.util.ArrayList<>(); + if (referred == null || !referred.isArray()) { + return resolved; + } + for (JsonNode fqnNode : referred) { + if (!fqnNode.isTextual()) { + continue; + } + Resource columnResource = ensureColumnResource(fqnNode.asText(), model); + if (columnResource != null) { + resolved.add(columnResource); + } + } + return resolved; + } + + private Resource ensureColumnResource(String columnFqn, Model model) { + String columnUri = RdfUtils.columnUri(baseUri, columnFqn); + if (columnUri == null) { + return null; + } + Resource columnResource = model.createResource(columnUri); + columnResource.addProperty(RDF.type, model.createResource(OM_NS + "Column")); + columnResource.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), columnFqn); + return columnResource; + } + + private void emitColumnChildren(JsonNode children, Resource parentColumn, Model model) { + Property hasChild = model.createProperty(OM_NS, "hasChildColumn"); + for (JsonNode child : children) { + if (!child.isObject() || !child.has("fullyQualifiedName")) { + continue; + } + Resource childResource = buildColumnResource(child, model); + if (childResource != null) { + parentColumn.addProperty(hasChild, childResource); + if (child.has("children") && child.get("children").isArray()) { + emitColumnChildren(child.get("children"), childResource, model); + } + } + } + } + + private Resource buildColumnResource(JsonNode column, Model model) { + String fqn = column.get("fullyQualifiedName").asText(); + String columnUri = RdfUtils.columnUri(baseUri, fqn); + if (columnUri == null) { + return null; + } + Resource columnResource = model.createResource(columnUri); + columnResource.addProperty(RDF.type, 
model.createResource(OM_NS + "Column")); + columnResource.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), fqn); + if (column.has("name") && !column.get("name").isNull()) { + columnResource.addProperty(RDFS.label, column.get("name").asText()); + } + if (column.has("dataType") && !column.get("dataType").isNull()) { + columnResource.addProperty( + model.createProperty(OM_NS, "columnDataType"), column.get("dataType").asText()); + } + if (column.has("description") && !column.get("description").isNull()) { + columnResource.addProperty( + model.createProperty(OM_NS, "columnDescription"), column.get("description").asText()); + } + if (column.has("ordinalPosition") && column.get("ordinalPosition").isNumber()) { + columnResource.addProperty( + model.createProperty(OM_NS, "ordinalPosition"), + model.createTypedLiteral(column.get("ordinalPosition").asInt())); + } + if (column.has("constraint") && !column.get("constraint").isNull()) { + applyColumnConstraint(columnResource, column.get("constraint").asText(), model); + } + if (column.has("profile") && !column.get("profile").isNull()) { + RdfQualityMapper.emitColumnProfile(column.get("profile"), columnResource, model); + } + return columnResource; + } + + private void applyColumnConstraint(Resource columnResource, String constraint, Model model) { + Property isPrimaryKey = model.createProperty(OM_NS, "isPrimaryKey"); + Property isNullable = model.createProperty(OM_NS, "isNullable"); + Property isUnique = model.createProperty(OM_NS, "isUnique"); + switch (constraint) { + case "PRIMARY_KEY" -> { + columnResource.addProperty(isPrimaryKey, model.createTypedLiteral(true)); + columnResource.addProperty(isNullable, model.createTypedLiteral(false)); + columnResource.addProperty(isUnique, model.createTypedLiteral(true)); + } + case "UNIQUE" -> columnResource.addProperty(isUnique, model.createTypedLiteral(true)); + case "NOT_NULL" -> columnResource.addProperty(isNullable, model.createTypedLiteral(false)); + case "NULL" -> 
columnResource.addProperty(isNullable, model.createTypedLiteral(true)); + default -> { + // Unknown / vendor-specific (DIST_KEY, SORT_KEY, etc.) — fall through; surfaced via + // table-level constraints if relevant. + } + } + } + private void addTypedProperty( Resource resource, String propertyId, JsonNode value, String type, Model model) { Property property = createProperty(propertyId, model); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfQualityMapper.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfQualityMapper.java new file mode 100644 index 000000000000..5efb77f13a76 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfQualityMapper.java @@ -0,0 +1,150 @@ +package org.openmetadata.service.rdf.translator; + +import com.fasterxml.jackson.databind.JsonNode; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import org.apache.jena.datatypes.xsd.XSDDatatype; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.vocabulary.RDF; + +/** + * Emits {@code dqv:QualityMeasurement} triples from OpenMetadata table and column profiles. + * + *

OpenMetadata historically stored these as opaque JSON literals, which makes them + * unqueryable. This mapper turns each numeric profile field into a navigable measurement so + * SPARQL can answer "tables with completeness < 95%", "columns whose null count exceeded N", + * etc. + */ +public final class RdfQualityMapper { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String DQV_NS = "http://www.w3.org/ns/dqv#"; + private static final String PROV_NS = "http://www.w3.org/ns/prov#"; + + // Maps the JSON field name on Table.profile / Column.profile to the metric URI under om:. + // Only numeric metrics are emitted as dqv:value literals; min/max are skipped because their + // type is polymorphic (string / number / dateTime) and a single dqv:value triple can't express + // that without losing the datatype. + private static final Map TABLE_METRICS = + Map.of( + "rowCount", "RowCountMetric", + "columnCount", "ColumnCountMetric", + "sizeInByte", "SizeInBytesMetric"); + + private static final Map COLUMN_METRICS = buildColumnMetricMap(); + + private static Map buildColumnMetricMap() { + return Map.ofEntries( + Map.entry("valuesCount", "ValuesCountMetric"), + Map.entry("validCount", "ValidCountMetric"), + Map.entry("nullCount", "NullCountMetric"), + Map.entry("nullProportion", "NullProportionMetric"), + Map.entry("missingCount", "MissingCountMetric"), + Map.entry("missingPercentage", "MissingPercentageMetric"), + Map.entry("uniqueCount", "UniqueCountMetric"), + Map.entry("uniqueProportion", "UniqueProportionMetric"), + Map.entry("distinctCount", "DistinctCountMetric"), + Map.entry("distinctProportion", "DistinctProportionMetric"), + Map.entry("duplicateCount", "DuplicateCountMetric"), + Map.entry("mean", "MeanMetric"), + Map.entry("sum", "SumMetric"), + Map.entry("stddev", "StddevMetric"), + Map.entry("variance", "VarianceMetric"), + Map.entry("median", "MedianMetric"), + Map.entry("minLength", "MinLengthMetric"), + 
Map.entry("maxLength", "MaxLengthMetric")); + } + + private RdfQualityMapper() {} + + static void emitTableProfile(JsonNode profile, Resource tableResource, Model model) { + if (profile == null || profile.isNull() || !profile.isObject()) { + return; + } + String timestamp = readTimestamp(profile); + emitMeasurements(profile, TABLE_METRICS, timestamp, tableResource, model); + } + + static void emitColumnProfile(JsonNode profile, Resource columnResource, Model model) { + if (profile == null || profile.isNull() || !profile.isObject()) { + return; + } + String timestamp = readTimestamp(profile); + emitMeasurements(profile, COLUMN_METRICS, timestamp, columnResource, model); + } + + private static void emitMeasurements( + JsonNode profile, + Map metricMap, + String timestamp, + Resource subjectResource, + Model model) { + Property hasMeasurement = model.createProperty(DQV_NS, "hasQualityMeasurement"); + Property isMeasurementOf = model.createProperty(DQV_NS, "isMeasurementOf"); + Property dqvValue = model.createProperty(DQV_NS, "value"); + Property dqvComputedOn = model.createProperty(DQV_NS, "computedOn"); + Property generatedAtTime = model.createProperty(PROV_NS, "generatedAtTime"); + Resource measurementClass = model.createResource(DQV_NS + "QualityMeasurement"); + + for (Map.Entry entry : metricMap.entrySet()) { + String fieldName = entry.getKey(); + String metricLocalName = entry.getValue(); + JsonNode value = profile.get(fieldName); + if (value == null || value.isNull() || !value.isNumber()) { + continue; + } + String measurementUri = measurementUri(subjectResource.getURI(), metricLocalName, timestamp); + Resource measurement = model.createResource(measurementUri); + measurement.addProperty(RDF.type, measurementClass); + measurement.addProperty(isMeasurementOf, model.createResource(OM_NS + metricLocalName)); + measurement.addProperty(dqvComputedOn, subjectResource); + addNumericValue(measurement, dqvValue, value, model); + if (timestamp != null) { + 
measurement.addProperty( + generatedAtTime, model.createTypedLiteral(timestamp, XSDDatatype.XSDdateTime)); + } + subjectResource.addProperty(hasMeasurement, measurement); + } + } + + private static void addNumericValue( + Resource measurement, Property dqvValue, JsonNode value, Model model) { + if (value.isInt()) { + measurement.addProperty(dqvValue, model.createTypedLiteral(value.asInt())); + } else if (value.isLong()) { + measurement.addProperty(dqvValue, model.createTypedLiteral(value.asLong())); + } else { + measurement.addProperty(dqvValue, model.createTypedLiteral(value.asDouble())); + } + } + + /** + * Mints a deterministic URI for a dqv:QualityMeasurement so that re-emit overwrites prior + * triples instead of creating orphans. Each (subject, metric, timestamp) tuple maps to exactly + * one URI. Missing timestamps fall back to "latest" so back-to-back emits without a profile + * timestamp are idempotent. + */ + static String measurementUri(String subjectUri, String metricLocalName, String timestamp) { + String slot = timestamp == null || timestamp.isEmpty() ? "latest" : timestamp; + String encodedSlot = URLEncoder.encode(slot, StandardCharsets.UTF_8); + return subjectUri + "/measurement/" + metricLocalName + "/" + encodedSlot; + } + + private static String readTimestamp(JsonNode profile) { + JsonNode ts = profile.get("timestamp"); + if (ts == null || ts.isNull()) { + return null; + } + if (ts.isTextual()) { + return ts.asText(); + } + if (ts.isNumber()) { + // OpenMetadata profiles record the timestamp as epoch millis; convert to ISO-8601. 
+ return java.time.Instant.ofEpochMilli(ts.asLong()).toString(); + } + return null; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfUsageMapper.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfUsageMapper.java new file mode 100644 index 000000000000..c88c34092218 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfUsageMapper.java @@ -0,0 +1,64 @@ +package org.openmetadata.service.rdf.translator; + +import com.fasterxml.jackson.databind.JsonNode; +import org.apache.jena.datatypes.xsd.XSDDatatype; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; + +/** + * Emits RDF triples from {@code Entity.usageSummary} ({@code UsageDetails}). Surfaces query + * usage as a navigable signal so SPARQL — and the {@code /v1/rdf/insights/important} endpoint + * built on top — can rank entities by how often they're actually queried. + * + *

Triples emitted on the entity (only when the corresponding stat is present and numeric): + * + *

    + *
  • {@code om:usageDailyCount}, {@code om:usageDailyPercentile} + *
  • {@code om:usageWeeklyCount}, {@code om:usageWeeklyPercentile} + *
  • {@code om:usageMonthlyCount}, {@code om:usageMonthlyPercentile} + *
  • {@code om:usageDate} ({@code xsd:date}) + *
+ * + *

Percentile values come from OpenMetadata's usage pipeline as 0–100 floats; we keep them in + * that scale (consumers divide by 100 when blending into a 0–1 importance score). + */ +public final class RdfUsageMapper { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + + private RdfUsageMapper() {} + + public static void emitUsageSummary(JsonNode usage, Resource entityResource, Model model) { + if (usage == null || usage.isNull() || !usage.isObject()) { + return; + } + emitStats(usage.get("dailyStats"), "Daily", entityResource, model); + emitStats(usage.get("weeklyStats"), "Weekly", entityResource, model); + emitStats(usage.get("monthlyStats"), "Monthly", entityResource, model); + + JsonNode date = usage.get("date"); + if (date != null && date.isTextual()) { + Property usageDate = model.createProperty(OM_NS, "usageDate"); + entityResource.addProperty( + usageDate, model.createTypedLiteral(date.asText(), XSDDatatype.XSDdate)); + } + } + + private static void emitStats( + JsonNode stats, String window, Resource entityResource, Model model) { + if (stats == null || stats.isNull() || !stats.isObject()) { + return; + } + JsonNode count = stats.get("count"); + if (count != null && count.isNumber()) { + Property countProp = model.createProperty(OM_NS, "usage" + window + "Count"); + entityResource.addProperty(countProp, model.createTypedLiteral(count.asLong())); + } + JsonNode pct = stats.get("percentileRank"); + if (pct != null && pct.isNumber()) { + Property pctProp = model.createProperty(OM_NS, "usage" + window + "Percentile"); + entityResource.addProperty(pctProp, model.createTypedLiteral(pct.asDouble())); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/OntologyDocument.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/OntologyDocument.java new file mode 100644 index 000000000000..628638e3644f --- /dev/null +++ 
b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/OntologyDocument.java @@ -0,0 +1,124 @@
package org.openmetadata.service.resources.rdf;

import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import lombok.extern.slf4j.Slf4j;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFFormat;

/**
 * Loads the canonical OpenMetadata ontology from the classpath and serves it in the requested
 * RDF serialization. The TTL files ship in the openmetadata-spec module:
 *
 * <ul>
 *   <li>{@code /rdf/ontology/openmetadata.ttl} — the main OWL ontology
 *   <li>{@code /rdf/ontology/openmetadata-prov.ttl} — PROV-aligned extension
 * </ul>
 *
 * <p>Both are merged into a single response so that consumers don't have to follow imports.
 */
@Slf4j
public final class OntologyDocument {

  private static final String MAIN_RESOURCE = "/rdf/ontology/openmetadata.ttl";
  private static final String PROV_RESOURCE = "/rdf/ontology/openmetadata-prov.ttl";

  private OntologyDocument() {}

  /** Holder pattern for thread-safe lazy parsing. The merged model is not modified post-load. */
  private static final class Holder {
    private static final Model MODEL = loadModel();
  }

  /** Parses both ontology files into one model; a file that fails to load contributes nothing. */
  private static Model loadModel() {
    Model model = ModelFactory.createDefaultModel();
    readInto(model, MAIN_RESOURCE);
    readInto(model, PROV_RESOURCE);
    return model;
  }

  /**
   * Reads one Turtle classpath resource into {@code model}. Missing, unreadable and malformed
   * resources are logged and skipped rather than propagated.
   */
  private static void readInto(Model model, String resourcePath) {
    try (InputStream is = OntologyDocument.class.getResourceAsStream(resourcePath)) {
      if (is == null) {
        LOG.warn("Ontology resource not found on classpath: {}", resourcePath);
        return;
      }
      RDFDataMgr.read(model, is, Lang.TURTLE);
    } catch (IOException e) {
      // The exception is passed as the last argument so the stack trace is logged too.
      LOG.warn("Failed to read ontology resource {}", resourcePath, e);
    } catch (RuntimeException e) {
      // RDFDataMgr.read can throw Jena RuntimeExceptions (e.g. RiotException) on a malformed
      // TTL. Catch broadly so a corrupt ontology file degrades to a partial/empty model rather
      // than failing class initialization and taking down /rdf/ontology + MCP describe.
      LOG.warn("Failed to parse ontology resource {}", resourcePath, e);
    }
  }

  /**
   * Serialize the merged ontology in the requested RDF format. Returns body, the chosen media
   * type, and the file extension. Suitable for callers that don't speak JAX-RS Response (e.g.
   * MCP tools).
   *
   * @param format requested serialization name; null or unrecognized values select Turtle
   * @return the serialized ontology together with its media type and file extension
   */
  public static SerializedOntology serializeAsString(String format) {
    Format f = Format.parse(format);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    RDFDataMgr.write(out, Holder.MODEL, f.rdfFormat);
    return new SerializedOntology(out.toString(StandardCharsets.UTF_8), f.mediaType, f.extension);
  }

  /** Serialized ontology body plus the media type and file extension that describe it. */
  public record SerializedOntology(String body, String mediaType, String extension) {}

  /**
   * JAX-RS flavour of {@link #serializeAsString}: 200 with the ontology and an inline
   * Content-Disposition, or a 500 JSON error when serialization throws.
   */
  static Response serve(String format) {
    try {
      SerializedOntology ontology = serializeAsString(format);
      return Response.ok(ontology.body())
          .type(ontology.mediaType())
          .header(
              "Content-Disposition",
              "inline; filename=openmetadata-ontology." + ontology.extension())
          .build();
    } catch (Exception e) {
      LOG.error("Failed to serialize ontology as {}", format, e);
      return Response.serverError()
          .entity("{\"error\": \"failed to serialize ontology\"}")
          .type(MediaType.APPLICATION_JSON)
          .build();
    }
  }

  /** Supported serializations with their Jena format, media type and file extension. */
  private enum Format {
    TURTLE(RDFFormat.TURTLE_PRETTY, "text/turtle", "ttl"),
    RDFXML(RDFFormat.RDFXML_PRETTY, "application/rdf+xml", "rdf"),
    NTRIPLES(RDFFormat.NTRIPLES, "application/n-triples", "nt"),
    JSONLD(RDFFormat.JSONLD_PRETTY, "application/ld+json", "jsonld");

    final RDFFormat rdfFormat;
    final String mediaType;
    final String extension;

    Format(RDFFormat rdfFormat, String mediaType, String extension) {
      this.rdfFormat = rdfFormat;
      this.mediaType = mediaType;
      this.extension = extension;
    }

    /** Maps a case-insensitive format name to a Format; null and unknown names yield TURTLE. */
    static Format parse(String requested) {
      if (requested == null) {
        return TURTLE;
      }
      // Locale.ROOT keeps the match independent of the JVM default locale (under a Turkish
      // locale "NTRIPLES".toLowerCase() yields a dotless i and would miss its case label).
      return switch (requested.toLowerCase(Locale.ROOT)) {
        case "rdfxml", "rdf+xml", "rdf/xml" -> RDFXML;
        case "ntriples", "n-triples" -> NTRIPLES;
        case "jsonld", "json-ld", "ld+json" -> JSONLD;
        default -> TURTLE;
      };
    }
  }
}
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java
b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java index a07e26176622..607255c7f991 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java @@ -19,19 +19,45 @@ import jakarta.ws.rs.core.Response; import jakarta.ws.rs.core.SecurityContext; import jakarta.ws.rs.core.UriInfo; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import javax.validation.constraints.NotEmpty; import lombok.extern.slf4j.Slf4j; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.riot.RDFFormat; +import org.apache.jena.shacl.ValidationReport; +import org.apache.jena.update.UpdateFactory; +import org.openmetadata.schema.api.configuration.rdf.CustomOntology; +import org.openmetadata.schema.api.configuration.rdf.InferenceRule; import org.openmetadata.schema.api.rdf.SparqlQuery; import org.openmetadata.schema.utils.JsonUtils; import org.openmetadata.service.Entity; import org.openmetadata.service.OpenMetadataApplicationConfig; +import org.openmetadata.service.rdf.RdfIriValidator; import org.openmetadata.service.rdf.RdfRepository; +import org.openmetadata.service.rdf.extension.CustomOntologyRegistry; +import org.openmetadata.service.rdf.extension.CustomOntologyValidator; +import org.openmetadata.service.rdf.federation.SparqlFederationGuard; +import org.openmetadata.service.rdf.inference.InferenceRuleRegistry; +import org.openmetadata.service.rdf.inference.InferenceRuleValidator; +import org.openmetadata.service.rdf.insights.CentralityComputation; 
+import org.openmetadata.service.rdf.insights.CoOccurrenceQueryBuilder; +import org.openmetadata.service.rdf.insights.CommunityComputation; +import org.openmetadata.service.rdf.insights.ImportanceQueryBuilder; +import org.openmetadata.service.rdf.insights.LineagePathBuilder; +import org.openmetadata.service.rdf.insights.LineagePathFinder; +import org.openmetadata.service.rdf.insights.RecommendationsQueryBuilder; import org.openmetadata.service.rdf.semantic.SemanticSearchEngine; import org.openmetadata.service.resources.Collection; import org.openmetadata.service.security.Authorizer; @@ -49,6 +75,7 @@ public class RdfResource { private volatile RdfRepository rdfRepository; private final Authorizer authorizer; private volatile SemanticSearchEngine semanticSearchEngine; + private volatile SparqlFederationGuard federationGuard; private OpenMetadataApplicationConfig config; public static final String RDF_XML = "application/rdf+xml"; @@ -91,6 +118,15 @@ public void initialize(OpenMetadataApplicationConfig config) { || !Boolean.TRUE.equals(config.getRdfConfiguration().getEnabled())) { LOG.info("RDF support is disabled in configuration"); } + this.federationGuard = new SparqlFederationGuard(config.getRdfConfiguration()); + } + + private synchronized SparqlFederationGuard getFederationGuard() { + if (federationGuard == null) { + // Tests or restarted resource without initialize(); default to closed allowlist. 
+ federationGuard = new SparqlFederationGuard(null); + } + return federationGuard; } @GET @@ -132,6 +168,598 @@ public Response getRdfStatus(@Context SecurityContext securityContext) { return Response.ok().entity(statusJson).type(MediaType.APPLICATION_JSON).build(); } + @POST + @Path("/validate") + @Produces({TURTLE, JSON_LD, MediaType.APPLICATION_JSON}) + @Operation( + operationId = "validateGraph", + summary = "Run SHACL validation against the OpenMetadata knowledge graph", + description = + "Loads the canonical SHACL shapes (rdf/shapes/openmetadata-shapes.ttl) and validates either a single entity's subgraph or the entire dataset against them. The endpoint reports violations; it does not mutate the graph or block writes.", + responses = { + @ApiResponse( + responseCode = "200", + description = + "SHACL validation report (sh:ValidationReport). Conforms field is true when there are no violations.", + content = {@Content(mediaType = TURTLE), @Content(mediaType = JSON_LD)}), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response validateGraph( + @Context SecurityContext securityContext, + @Parameter( + description = + "Optional. Full URI of the entity to scope the validation to (DESCRIBE ). 
Omit to validate the whole dataset (admin-only, expensive).") + @QueryParam("entityUri") + String entityUri, + @Parameter(description = "Report serialization: turtle (default) or jsonld") + @QueryParam("format") + @DefaultValue("turtle") + String format) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"error\": \"RDF service not enabled\"}") + .type(MediaType.APPLICATION_JSON) + .build(); + } + + String constructQuery; + if (entityUri != null && !entityUri.isBlank()) { + String validated = RdfIriValidator.validateEntityIri(entityUri); + if (validated == null) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse("entityUri must be an absolute http(s) IRI")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + constructQuery = String.format("DESCRIBE <%s>", validated); + } else { + constructQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }"; + } + + String dataTurtle = getRdfRepository().executeSparqlQueryDirect(constructQuery, "text/turtle"); + Model dataModel = ModelFactory.createDefaultModel(); + try (StringReader reader = new StringReader(dataTurtle)) { + RDFDataMgr.read(dataModel, reader, getRdfRepository().getBaseUri(), Lang.TURTLE); + } catch (Exception e) { + LOG.error("Failed to parse subgraph for SHACL validation", e); + return Response.serverError() + .entity("{\"error\": \"failed to load subgraph for validation\"}") + .type(MediaType.APPLICATION_JSON) + .build(); + } + + ValidationReport report = RdfShaclValidator.validate(dataModel); + + RDFFormat rdfFormat = + "jsonld".equalsIgnoreCase(format) ? RDFFormat.JSONLD_PRETTY : RDFFormat.TURTLE_PRETTY; + String responseMediaType = "jsonld".equalsIgnoreCase(format) ? 
JSON_LD : TURTLE; + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + RDFDataMgr.write(out, report.getModel(), rdfFormat); + return Response.ok(out.toString(StandardCharsets.UTF_8)) + .type(responseMediaType) + .header("OM-SHACL-Conforms", String.valueOf(report.conforms())) + .build(); + } + + @GET + @Path("/rules") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "listInferenceRules", + summary = "List loaded inference rules", + description = + "Returns all inference rules loaded into this server, in execution order (priority then name). Includes the shipped starter pack plus any rules that have been upserted at runtime.", + responses = { + @ApiResponse(responseCode = "200", description = "List of inference rules"), + @ApiResponse(responseCode = "403", description = "Forbidden") + }) + public Response listInferenceRules(@Context SecurityContext securityContext) { + authorizer.authorizeAdmin(securityContext); + List rules = InferenceRuleRegistry.getInstance().list(); + return Response.ok(JsonUtils.pojoToJson(Map.of("rules", rules))).build(); + } + + @GET + @Path("/rules/{name}") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "getInferenceRule", + summary = "Get a single inference rule by name", + responses = { + @ApiResponse(responseCode = "200", description = "The rule"), + @ApiResponse(responseCode = "403", description = "Forbidden"), + @ApiResponse(responseCode = "404", description = "Rule not found") + }) + public Response getInferenceRule( + @Context SecurityContext securityContext, @PathParam("name") String name) { + authorizer.authorizeAdmin(securityContext); + return InferenceRuleRegistry.getInstance() + .get(name) + .map(rule -> Response.ok(JsonUtils.pojoToJson(rule)).build()) + .orElse( + Response.status(Response.Status.NOT_FOUND) + .entity(buildErrorResponse("Inference rule not found: " + name)) + .type(MediaType.APPLICATION_JSON) + .build()); + } + + @GET + @Path("/insights/important") + 
@Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "listImportantEntities", + summary = "Rank entities by an importance score that blends usage data and lineage topology", + description = + "Returns the top-N entities of the given type ranked by a composite importance score. The score blends OpenMetadata's existing usage percentile (real query data — 0.6 weight) with downstream lineage edge count (graph topology — 0.4 weight). Once Phase 3.1.b ships, an om:centralityScore from PageRank will fill in for entities that have no query usage data. Results are SPARQL JSON.", + responses = { + @ApiResponse( + responseCode = "200", + description = + "Ranked list of entities with usage percentile, downstream count, and composite score"), + @ApiResponse(responseCode = "400", description = "Invalid entityType, window, or limit"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response listImportantEntities( + @Context SecurityContext securityContext, + @Parameter( + description = + "Entity type to rank (singular: table, dashboard, pipeline, mlmodel, ...). Required.", + required = true) + @QueryParam("entityType") + @NotEmpty + String entityType, + @Parameter(description = "Usage window: daily, weekly, or monthly. Defaults to daily.") + @QueryParam("window") + @DefaultValue("daily") + String window, + @Parameter(description = "Number of results. 
1–100, defaults to 20.") + @QueryParam("limit") + @DefaultValue("20") + int limit) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + String sparql; + try { + sparql = ImportanceQueryBuilder.build(entityType, window, limit); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery(sparql, "json", "none"); + } + + @POST + @Path("/insights/recompute-centrality") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "recomputeCentrality", + summary = "Run weighted PageRank on the entity graph and persist scores", + description = + "Triggers Phase 3.1.b's centrality computation: walks lineage / tagging / containment edges of the requested entity type, runs weighted PageRank, and writes the results to the named graph . The /v1/rdf/insights/important endpoint blends these scores in for entities without query usage data. Admin-only; expensive — designed to run on a schedule, but exposed for manual triggering.", + responses = { + @ApiResponse(responseCode = "200", description = "Centrality computation result"), + @ApiResponse(responseCode = "400", description = "Invalid entityType"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response recomputeCentrality( + @Context SecurityContext securityContext, + @Parameter( + description = "Entity type to score (e.g. table, dashboard, pipeline). 
Required.", + required = true) + @QueryParam("entityType") + @NotEmpty + String entityType) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + try { + CentralityComputation.Result result = + new CentralityComputation(getRdfRepository()).computeAndPersist(entityType); + return Response.ok(JsonUtils.pojoToJson(result)).build(); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + } + + @POST + @Path("/insights/recompute-communities") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "recomputeCommunities", + summary = "Run Louvain community detection and persist communities", + description = + "Phase 3.2: extracts the lineage or tag-co-occurrence graph for the requested entity type, runs Louvain modularity optimization, and persists communities to the named graph . Each community is an om:Community resource with om:hasMember triples and modularity score. Admin-only; designed to be triggered on a schedule.", + responses = { + @ApiResponse(responseCode = "200", description = "Community detection result"), + @ApiResponse(responseCode = "400", description = "Invalid entityType or graphType"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response recomputeCommunities( + @Context SecurityContext securityContext, + @Parameter(description = "Entity type to cluster (e.g. 
table, dashboard).", required = true) + @QueryParam("entityType") + @NotEmpty + String entityType, + @Parameter(description = "Source graph: lineage (default) or tagCoOccurrence.") + @QueryParam("graphType") + @DefaultValue("lineage") + String graphType) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + try { + CommunityComputation.Result result = + new CommunityComputation(getRdfRepository()).computeAndPersist(entityType, graphType); + return Response.ok(JsonUtils.pojoToJson(result)).build(); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + } + + @GET + @Path("/insights/communities") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "listCommunities", + summary = "List communities discovered by the latest community-detection run", + description = + "Returns a SPARQL SELECT JSON document with rows of (community, size, modularity, member) for the named graph populated by /insights/recompute-communities. 
Communities are ordered by size descending; one row per (community, member) pair so the caller can group as needed.", + responses = { + @ApiResponse(responseCode = "200", description = "Communities + members"), + @ApiResponse(responseCode = "400", description = "Invalid entityType or graphType"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response listCommunities( + @Context SecurityContext securityContext, + @Parameter(description = "Entity type whose community partition you want.", required = true) + @QueryParam("entityType") + @NotEmpty + String entityType, + @Parameter(description = "Source graph: lineage (default) or tagCoOccurrence.") + @QueryParam("graphType") + @DefaultValue("lineage") + String graphType) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + String sparql; + try { + sparql = CommunityComputation.listingSparql(entityType, graphType); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery(sparql, "json", "none"); + } + + @GET + @Path("/insights/path") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "findLineagePath", + summary = "Find the shortest lineage path between two entities", + description = + "BFS over the lineage graph (prov:wasDerivedFrom, om:upstream, om:downstream) returning the shortest path between two URIs. Use direction=upstream to walk from entity to its sources, downstream to walk to derived entities, both for either. Each hop returns the URI, the predicate that connected it, and any om:* rdf:type values. 
Useful for explain-lineage UIs and impact-analysis tooling.", + responses = { + @ApiResponse( + responseCode = "200", + description = "Path between the two entities (or found=false)"), + @ApiResponse( + responseCode = "400", + description = "Invalid from/to URI, direction, or maxHops"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response findLineagePath( + @Context SecurityContext securityContext, + @Parameter(description = "Starting entity URI (absolute http(s)).", required = true) + @QueryParam("from") + @NotEmpty + String from, + @Parameter(description = "Target entity URI (absolute http(s)).", required = true) + @QueryParam("to") + @NotEmpty + String to, + @Parameter(description = "Walk direction: upstream (default), downstream, or both.") + @QueryParam("direction") + @DefaultValue("upstream") + String direction, + @Parameter(description = "Max hops to explore. 1–25, defaults to 6.") @QueryParam("maxHops") + Integer maxHops) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + try { + LineagePathBuilder.Direction dir = LineagePathBuilder.Direction.parse(direction); + LineagePathFinder.Path path = + new LineagePathFinder(getRdfRepository()).findPath(from, to, dir, maxHops); + return Response.ok(JsonUtils.pojoToJson(path)).build(); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + } + + @GET + @Path("/insights/recommendations") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "datasetRecommendations", + summary = "Recommend related entities for a given seed URI", + description = + "Phase 3.4: ranks every other 
entity by graph-topology similarity to the given seed — overlap on tags, glossary terms, and direct lineage neighbours. Pure SPARQL, no precomputation. Score formula: 1.0 · tagOverlap + 1.5 · glossaryOverlap + 2.0 · lineageOverlap.", + responses = { + @ApiResponse(responseCode = "200", description = "Ranked recommendations"), + @ApiResponse(responseCode = "400", description = "Invalid entityUri or limit"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response datasetRecommendations( + @Context SecurityContext securityContext, + @Parameter(description = "Seed entity URI (absolute http(s)).", required = true) + @QueryParam("entityUri") + @NotEmpty + String entityUri, + @Parameter(description = "Number of recommendations. 1–50, default 10.") + @QueryParam("limit") + @DefaultValue("10") + int limit) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + String sparql; + try { + sparql = RecommendationsQueryBuilder.build(entityUri, limit); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery(sparql, "json", "none"); + } + + @GET + @Path("/insights/tag-cooccurrence") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "tagCoOccurrence", + summary = "Pairs of tags applied to the same entities", + description = + "Phase 3.5: returns pairs of tags that appear together on the same entity, sorted by overlap count descending. Surfaces governance signals like 'PII and Confidential are almost always co-applied'. 
Pure SPARQL aggregate over om:hasTag — no precomputation required.", + responses = { + @ApiResponse(responseCode = "200", description = "Tag pair counts"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response tagCoOccurrence( + @Context SecurityContext securityContext, + @Parameter(description = "Minimum number of shared entities. 1+, default 2.") + @QueryParam("minCount") + @DefaultValue("2") + int minCount, + @Parameter(description = "Number of pairs to return. 1–100, default 20.") + @QueryParam("limit") + @DefaultValue("20") + int limit) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery( + CoOccurrenceQueryBuilder.tagCoOccurrence(minCount, limit), "json", "none"); + } + + @GET + @Path("/insights/glossary-reach") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "glossaryReach", + summary = "Glossary terms ranked by domain reach", + description = + "Phase 3.5: returns glossary terms ordered by the number of distinct domains in which they're used, surfacing the most cross-cutting concepts. Pure SPARQL aggregate over om:hasGlossaryTerm × om:hasDomain.", + responses = { + @ApiResponse(responseCode = "200", description = "Term reach counts"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response glossaryReach( + @Context SecurityContext securityContext, + @Parameter(description = "Minimum number of domains. 1+, default 2.") + @QueryParam("minDomains") + @DefaultValue("2") + int minDomains, + @Parameter(description = "Number of terms to return. 
1–100, default 20.") + @QueryParam("limit") + @DefaultValue("20") + int limit) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery( + CoOccurrenceQueryBuilder.glossaryReach(minDomains, limit), "json", "none"); + } + + @GET + @Path("/insights/tag-popularity") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "tagPopularity", + summary = "Tags ranked by number of tagged entities", + description = + "Phase 3.5: returns tags ordered by the number of distinct entities they're applied to. Companion to /insights/tag-cooccurrence — useful for triaging tag taxonomy bloat.", + responses = { + @ApiResponse(responseCode = "200", description = "Tag entity counts"), + @ApiResponse(responseCode = "503", description = "RDF service not enabled") + }) + public Response tagPopularity( + @Context SecurityContext securityContext, + @Parameter(description = "Number of tags to return. 1–100, default 20.") + @QueryParam("limit") + @DefaultValue("20") + int limit) { + authorizer.authorizeAdmin(securityContext); + if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity(buildErrorResponse("RDF repository is not enabled")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + return executeSparqlQuery(CoOccurrenceQueryBuilder.tagPopularity(limit), "json", "none"); + } + + @GET + @Path("/ontology/extensions") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "listCustomOntologyExtensions", + summary = "List user-authored ontology extensions", + description = + "Returns every ontology extension registered with this server. 
Each extension is a bundle of custom OWL classes and properties under the om-extension namespace.", + responses = {@ApiResponse(responseCode = "200", description = "Extension list")}) + public Response listCustomOntologyExtensions(@Context SecurityContext securityContext) { + authorizer.authorizeAdmin(securityContext); + return Response.ok( + JsonUtils.pojoToJson(Map.of("extensions", CustomOntologyRegistry.getInstance().list()))) + .build(); + } + + @GET + @Path("/ontology/extensions/{name}") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "getCustomOntologyExtension", + summary = "Get a single custom ontology extension by name", + responses = { + @ApiResponse(responseCode = "200", description = "The extension"), + @ApiResponse(responseCode = "404", description = "Extension not found") + }) + public Response getCustomOntologyExtension( + @Context SecurityContext securityContext, @PathParam("name") String name) { + authorizer.authorizeAdmin(securityContext); + return CustomOntologyRegistry.getInstance() + .get(name) + .map(ext -> Response.ok(JsonUtils.pojoToJson(ext)).build()) + .orElse( + Response.status(Response.Status.NOT_FOUND) + .entity(buildErrorResponse("Custom ontology extension not found: " + name)) + .type(MediaType.APPLICATION_JSON) + .build()); + } + + @POST + @Path("/ontology/extensions/validate") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "validateCustomOntologyExtension", + summary = "Validate a candidate ontology extension without persisting it", + description = + "Runs the same validator that admin writes are gated on (URIs in om-extension namespace, no redefinition of canonical classes, no cycles, valid domain/range references) and returns the list of errors.", + responses = {@ApiResponse(responseCode = "200", description = "Validation result")}) + public Response validateCustomOntologyExtension( + @Context SecurityContext securityContext, CustomOntology candidate) { + 
authorizer.authorizeAdmin(securityContext); + List errors = CustomOntologyValidator.validate(candidate); + Map body = new java.util.LinkedHashMap<>(); + body.put("valid", errors.isEmpty()); + body.put("errors", errors); + return Response.ok(JsonUtils.pojoToJson(body)).build(); + } + + @POST + @Path("/rules/validate") + @Produces(MediaType.APPLICATION_JSON) + @Operation( + operationId = "validateInferenceRule", + summary = "Validate a candidate inference rule without persisting it", + description = + "Runs the same validator that admin writes are gated on (CONSTRUCT-only, no SERVICE clauses, syntactically well-formed) and returns the list of errors. Useful for an admin UI that wants live feedback while editing a rule.", + responses = { + @ApiResponse(responseCode = "200", description = "Validation result"), + @ApiResponse(responseCode = "403", description = "Forbidden") + }) + public Response validateInferenceRule( + @Context SecurityContext securityContext, InferenceRule candidate) { + authorizer.authorizeAdmin(securityContext); + List errors = InferenceRuleValidator.validate(candidate); + Map body = new java.util.LinkedHashMap<>(); + body.put("valid", errors.isEmpty()); + body.put("errors", errors); + return Response.ok(JsonUtils.pojoToJson(body)).build(); + } + + @GET + @Path("/ontology") + @Produces({TURTLE, RDF_XML, N_TRIPLES, JSON_LD, MediaType.WILDCARD}) + @Operation( + operationId = "getOntology", + summary = "Download the OpenMetadata ontology", + description = + "Returns the canonical OpenMetadata OWL ontology and its PROV-aligned extension as a single document. The ontology imports DCAT, PROV-O, and SKOS by reference. 
Format is selected via the Accept header or the format query param: turtle (default), rdfxml, ntriples, jsonld.", + responses = { + @ApiResponse( + responseCode = "200", + description = "Ontology document in the requested serialization", + content = { + @Content(mediaType = TURTLE), + @Content(mediaType = RDF_XML), + @Content(mediaType = N_TRIPLES), + @Content(mediaType = JSON_LD) + }), + @ApiResponse(responseCode = "500", description = "Ontology resource missing or unreadable") + }) + public Response getOntology( + @Parameter( + description = + "Output serialization. One of: turtle, rdfxml, ntriples, jsonld. Defaults to turtle.") + @QueryParam("format") + @DefaultValue("turtle") + String format) { + return OntologyDocument.serve(format); + } + @GET @Path("/debug/glossary-relations") @Operation( @@ -470,22 +1098,32 @@ public Response updateSparql( .build(); } + String query = sparqlQuery.getQuery(); + if (query == null || query.isBlank()) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse("SPARQL update body is required")) + .type(MediaType.APPLICATION_JSON) + .build(); + } + + // Validate with Jena's parser, which understands PREFIX / BASE prologues, comments, and + // whitespace. The prior implementation matched the first keyword as a substring after + // upper-casing the body — that rejected legitimate updates beginning with `PREFIX …` and + // could be bypassed by injecting whitespace or comments. UpdateFactory throws if the body + // doesn't parse as a SPARQL UPDATE (which includes SELECT/ASK/CONSTRUCT/DESCRIBE — those + // belong on the read endpoint). 
try { - String query = sparqlQuery.getQuery().trim().toUpperCase(); - if (!query.startsWith("INSERT") - && !query.startsWith("DELETE") - && !query.startsWith("LOAD") - && !query.startsWith("CLEAR") - && !query.startsWith("CREATE") - && !query.startsWith("DROP")) { - return Response.status(Response.Status.BAD_REQUEST) - .entity("Only SPARQL UPDATE operations are allowed on this endpoint") - .build(); - } + UpdateFactory.create(query); + } catch (Exception e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(buildErrorResponse("Invalid SPARQL UPDATE: " + e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } - getRdfRepository().executeSparqlUpdate(sparqlQuery.getQuery()); + try { + getRdfRepository().executeSparqlUpdate(query); return Response.ok().entity("{\"status\": \"success\"}").build(); - } catch (Exception e) { LOG.error("Error executing SPARQL update", e); return Response.status(Response.Status.INTERNAL_SERVER_ERROR) @@ -501,6 +1139,16 @@ private Response executeSparqlQuery(String query, String format, String inferenc .build(); } + try { + getFederationGuard().enforce(query); + } catch (SparqlFederationGuard.FederationDisallowedException e) { + LOG.warn("Rejected SPARQL with disallowed SERVICE clause: {}", e.getBlockedEndpoint()); + return Response.status(Response.Status.FORBIDDEN) + .entity(buildErrorResponse(e.getMessage())) + .type(MediaType.APPLICATION_JSON) + .build(); + } + try { String mimeType = getMimeTypeForFormat(format); String results; diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfShaclValidator.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfShaclValidator.java new file mode 100644 index 000000000000..19338e6fa3f4 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfShaclValidator.java @@ -0,0 +1,63 @@ +package org.openmetadata.service.resources.rdf; + +import java.io.IOException; +import 
java.io.InputStream; +import lombok.extern.slf4j.Slf4j; +import org.apache.jena.graph.Graph; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.shacl.Shapes; +import org.apache.jena.shacl.ValidationReport; + +/** + * Loads {@code openmetadata-shapes.ttl} from the classpath and validates an arbitrary RDF model + * against it. The validation is "report only" — callers decide whether a non-empty report is a + * blocker. The shapes graph is parsed once per JVM (Holder pattern). + */ +@Slf4j +public final class RdfShaclValidator { + + private static final String SHAPES_RESOURCE = "/rdf/shapes/openmetadata-shapes.ttl"; + + private RdfShaclValidator() {} + + private static final class Holder { + private static final Shapes SHAPES = loadShapes(); + } + + private static Shapes loadShapes() { + Model shapesModel = ModelFactory.createDefaultModel(); + try (InputStream is = RdfShaclValidator.class.getResourceAsStream(SHAPES_RESOURCE)) { + if (is == null) { + LOG.warn("SHACL shapes resource not found on classpath: {}", SHAPES_RESOURCE); + return Shapes.parse(shapesModel.getGraph()); + } + RDFDataMgr.read(shapesModel, is, Lang.TURTLE); + } catch (IOException e) { + LOG.warn("Failed to read SHACL shapes resource {}: {}", SHAPES_RESOURCE, e.getMessage()); + } catch (RuntimeException e) { + // RDFDataMgr.read can throw RiotException (and other Jena RuntimeExceptions) on a malformed + // TTL. Catch broadly so a corrupt resource degrades to an empty shape set rather than + // failing class initialization and taking down callers (RdfResource, MCP tools). + LOG.warn("Failed to parse SHACL shapes resource {}: {}", SHAPES_RESOURCE, e.getMessage()); + shapesModel = ModelFactory.createDefaultModel(); + } + return Shapes.parse(shapesModel.getGraph()); + } + + /** Validate {@code data} against the OpenMetadata shapes. Never throws on conforming data. 
*/ + public static ValidationReport validate(Graph data) { + return org.apache.jena.shacl.ShaclValidator.get().validate(Holder.SHAPES, data); + } + + public static ValidationReport validate(Model data) { + return validate(data.getGraph()); + } + + /** Expose the shapes for callers that need to inspect or extend them. */ + public static Shapes shapes() { + return Holder.SHAPES; + } +} diff --git a/openmetadata-service/src/main/resources/rdf/inference-rules/domain-membership-inheritance.json b/openmetadata-service/src/main/resources/rdf/inference-rules/domain-membership-inheritance.json new file mode 100644 index 000000000000..8f7c332513b6 --- /dev/null +++ b/openmetadata-service/src/main/resources/rdf/inference-rules/domain-membership-inheritance.json @@ -0,0 +1,10 @@ +{ + "name": "domain-membership-inheritance", + "displayName": "Domain membership inheritance", + "description": "If a Table belongs to a Domain via om:belongsToDomain, every Column of that Table inherits the same domain membership. 
Lets SPARQL queries against om:belongsToDomain return both table-level and column-level results without separate lookups.", + "ruleType": "CONSTRUCT", + "priority": 400, + "enabled": true, + "tags": ["governance"], + "ruleBody": "PREFIX om: \nCONSTRUCT { ?column om:belongsToDomain ?domain }\nWHERE {\n ?table a om:Table ;\n om:belongsToDomain ?domain ;\n om:hasColumn ?column .\n}" +} diff --git a/openmetadata-service/src/main/resources/rdf/inference-rules/pii-propagation-via-lineage.json b/openmetadata-service/src/main/resources/rdf/inference-rules/pii-propagation-via-lineage.json new file mode 100644 index 000000000000..cb7dbf2d4890 --- /dev/null +++ b/openmetadata-service/src/main/resources/rdf/inference-rules/pii-propagation-via-lineage.json @@ -0,0 +1,10 @@ +{ + "name": "pii-propagation-via-lineage", + "displayName": "PII propagation through lineage", + "description": "If a column is tagged with a PII classification AND another column receives data from it via column-level lineage, propagate the same PII tag to the downstream column. 
The propagated tag is marked om:inferred so admins can distinguish derived tags from manually applied ones.", + "ruleType": "CONSTRUCT", + "priority": 200, + "enabled": true, + "tags": ["security", "lineage"], + "ruleBody": "PREFIX om: \nCONSTRUCT {\n ?downstream om:hasTag ?piiTag .\n ?downstream om:inferredTagSource ?upstream\n}\nWHERE {\n ?upstream om:hasTag ?piiTag .\n ?piiTag om:tagFQN ?fqn .\n FILTER(STRSTARTS(?fqn, \"PII.\"))\n ?colLineage om:fromColumn ?upstream ;\n om:toColumn ?downstream .\n}" +} diff --git a/openmetadata-service/src/main/resources/rdf/inference-rules/schema-tag-inheritance.json b/openmetadata-service/src/main/resources/rdf/inference-rules/schema-tag-inheritance.json new file mode 100644 index 000000000000..b6c77af81c73 --- /dev/null +++ b/openmetadata-service/src/main/resources/rdf/inference-rules/schema-tag-inheritance.json @@ -0,0 +1,10 @@ +{ + "name": "schema-tag-inheritance", + "displayName": "Schema → table → column tag inheritance", + "description": "Propagate tags down the containment hierarchy: a tag on a DatabaseSchema is inherited by every Table in that schema, and a tag on a Table is inherited by every Column. 
Inferred tags are marked om:inferredTagSource pointing to the parent that supplied them.", + "ruleType": "CONSTRUCT", + "priority": 300, + "enabled": true, + "tags": ["governance"], + "ruleBody": "PREFIX om: \nCONSTRUCT {\n ?descendant om:hasTag ?tag .\n ?descendant om:inferredTagSource ?ancestor\n}\nWHERE {\n {\n ?ancestor a om:DatabaseSchema ;\n om:hasTag ?tag .\n ?descendant om:belongsToSchema ?ancestor .\n } UNION {\n ?ancestor a om:Table ;\n om:hasTag ?tag .\n ?ancestor om:hasColumn ?descendant .\n }\n}" +} diff --git a/openmetadata-service/src/main/resources/rdf/inference-rules/transitive-lineage-closure.json b/openmetadata-service/src/main/resources/rdf/inference-rules/transitive-lineage-closure.json new file mode 100644 index 000000000000..7bd0e6ee5805 --- /dev/null +++ b/openmetadata-service/src/main/resources/rdf/inference-rules/transitive-lineage-closure.json @@ -0,0 +1,10 @@ +{ + "name": "transitive-lineage-closure", + "displayName": "Transitive lineage closure", + "description": "Materialize indirect upstream/downstream lineage edges by walking prov:wasDerivedFrom transitively. 
Lets SPARQL answer 'all upstream tables of dashboard X' without requiring property paths in user queries.", + "ruleType": "CONSTRUCT", + "priority": 100, + "enabled": true, + "tags": ["lineage"], + "ruleBody": "PREFIX prov: \nPREFIX om: \nCONSTRUCT { ?x om:transitivelyDerivedFrom ?y }\nWHERE {\n ?x prov:wasDerivedFrom+ ?y .\n FILTER(?x != ?y)\n}" +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfJsonLdContextTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfJsonLdContextTest.java index 0a4cd1dce0d5..2fb9452e8dd7 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfJsonLdContextTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfJsonLdContextTest.java @@ -19,6 +19,9 @@ class RdfJsonLdContextTest { private static ObjectMapper objectMapper; private static JsonNode baseContext; private static JsonNode lineageContext; + private static JsonNode governanceContext; + private static JsonNode aiContext; + private static JsonNode automationContext; @BeforeAll static void loadContexts() throws Exception { @@ -44,6 +47,42 @@ static void loadContexts() throws Exception { } } } + + try (InputStream is = + RdfJsonLdContextTest.class.getResourceAsStream("/rdf/contexts/governance.jsonld")) { + if (is != null) { + JsonNode contextDoc = objectMapper.readTree(is); + if (contextDoc.get("@context").isArray()) { + governanceContext = contextDoc.get("@context").get(1); + } else { + governanceContext = contextDoc.get("@context"); + } + } + } + + try (InputStream is = + RdfJsonLdContextTest.class.getResourceAsStream("/rdf/contexts/ai.jsonld")) { + if (is != null) { + JsonNode contextDoc = objectMapper.readTree(is); + if (contextDoc.get("@context").isArray()) { + aiContext = contextDoc.get("@context").get(1); + } else { + aiContext = contextDoc.get("@context"); + } + } + } + + try (InputStream is = + 
RdfJsonLdContextTest.class.getResourceAsStream("/rdf/contexts/automation.jsonld")) { + if (is != null) { + JsonNode contextDoc = objectMapper.readTree(is); + if (contextDoc.get("@context").isArray()) { + automationContext = contextDoc.get("@context").get(1); + } else { + automationContext = contextDoc.get("@context"); + } + } + } } @Nested @@ -386,4 +425,143 @@ void testColumnsLineageMapping() { } } } + + @Nested + @DisplayName("P1.8: Governance Context SKOS Hierarchy") + class GovernanceContextSkosTests { + + @Test + @DisplayName("glossary field should map to skos:inScheme on a glossary term") + void testGlossaryMapsToInScheme() { + assertNotNull(governanceContext, "governance.jsonld should be loaded"); + JsonNode glossary = governanceContext.get("glossary"); + assertNotNull(glossary, "'glossary' field mapping must be defined"); + assertEquals( + "skos:inScheme", + glossary.get("@id").asText(), + "GlossaryTerm.glossary should use SKOS inScheme, not the legacy om:belongsToGlossary"); + assertEquals("@id", glossary.get("@type").asText()); + } + + @Test + @DisplayName("classification field should map to skos:inScheme on a tag") + void testClassificationMapsToInScheme() { + assertNotNull(governanceContext); + JsonNode classification = governanceContext.get("classification"); + assertNotNull(classification, "'classification' field mapping must be defined"); + assertEquals( + "skos:inScheme", + classification.get("@id").asText(), + "Tag.classification should use SKOS inScheme to align with skos:ConceptScheme membership"); + } + + @Test + @DisplayName("parent field should map to skos:broader") + void testParentMapsToSkosBroader() { + assertNotNull(governanceContext); + JsonNode parent = governanceContext.get("parent"); + assertNotNull(parent, "'parent' field mapping must be defined"); + assertEquals("skos:broader", parent.get("@id").asText()); + assertEquals("@id", parent.get("@type").asText()); + } + + @Test + @DisplayName("children field should map to skos:narrower 
with @set container") + void testChildrenMapsToSkosNarrower() { + assertNotNull(governanceContext); + JsonNode children = governanceContext.get("children"); + assertNotNull( + children, + "'children' field mapping must be defined (it replaces the prior 'childTerms' alias which referenced a non-existent field)"); + assertEquals("skos:narrower", children.get("@id").asText()); + assertEquals("@id", children.get("@type").asText()); + assertEquals("@set", children.get("@container").asText()); + } + + @Test + @DisplayName("Stale childTerms alias should no longer be present") + void testNoLegacyChildTermsAlias() { + assertNotNull(governanceContext); + assertNull( + governanceContext.get("childTerms"), + "Legacy 'childTerms' alias must not coexist with 'children' — GlossaryTerm has no 'childTerms' field, so the alias would never fire"); + } + + @Test + @DisplayName("DataContract and Persona fields should be wired to ontology predicates") + void testDataContractAndPersonaFieldsWired() { + assertNotNull(governanceContext); + assertNotNull(governanceContext.get("contractStatus")); + assertEquals( + "om:contractStatus", governanceContext.get("contractStatus").get("@id").asText()); + assertNotNull(governanceContext.get("appliesTo")); + assertEquals("om:appliesToEntity", governanceContext.get("appliesTo").get("@id").asText()); + assertEquals("@id", governanceContext.get("appliesTo").get("@type").asText()); + assertNotNull(governanceContext.get("users")); + assertEquals("om:appliesToUser", governanceContext.get("users").get("@id").asText()); + } + } + + @Nested + @DisplayName("P1.5: AI / Automation Contexts") + class AiAutomationContextTests { + + @Test + @DisplayName("ai.jsonld should be loadable and define LLMModel + AIApplication types") + void testAiContextLoaded() { + assertNotNull(aiContext, "ai.jsonld should be on the classpath"); + assertEquals("om:LLMModel", aiContext.get("LLMModel").get("@id").asText()); + assertEquals("om:AIApplication", 
aiContext.get("AIApplication").get("@id").asText()); + assertEquals("om:McpServer", aiContext.get("McpServer").get("@id").asText()); + assertEquals("om:PromptTemplate", aiContext.get("PromptTemplate").get("@id").asText()); + } + + @Test + @DisplayName("AgentExecution and McpExecution should be PROV activities") + void testExecutionsAreProvActivities() { + assertNotNull(aiContext); + JsonNode agentExec = aiContext.get("AgentExecution"); + assertNotNull(agentExec); + assertTrue( + agentExec.get("@type").toString().contains("prov:Activity"), + "AgentExecution must be typed as prov:Activity for cross-system PROV traversal"); + JsonNode mcpExec = aiContext.get("McpExecution"); + assertTrue(mcpExec.get("@type").toString().contains("prov:Activity")); + } + + @Test + @DisplayName("LLMModel.trainingDatasets should map to om:hasTrainingDataset for AI lineage") + void testTrainingDatasetsMapping() { + assertNotNull(aiContext); + JsonNode trainingDatasets = aiContext.get("trainingDatasets"); + assertNotNull( + trainingDatasets, + "trainingDatasets must be wired so AI lineage queries can traverse model -> dataset"); + assertEquals("om:hasTrainingDataset", trainingDatasets.get("@id").asText()); + assertEquals("@id", trainingDatasets.get("@type").asText()); + assertEquals("@set", trainingDatasets.get("@container").asText()); + } + + @Test + @DisplayName("AIApplication.models should map to om:usesModel object property") + void testAiAppUsesModelMapping() { + assertNotNull(aiContext); + JsonNode models = aiContext.get("models"); + assertNotNull(models); + assertEquals("om:usesModel", models.get("@id").asText()); + assertEquals("@id", models.get("@type").asText()); + } + + @Test + @DisplayName("automation.jsonld should define Workflow + WorkflowInstance types") + void testAutomationContextLoaded() { + assertNotNull(automationContext, "automation.jsonld should be on the classpath"); + assertEquals("om:Workflow", automationContext.get("Workflow").get("@id").asText()); + JsonNode 
wfInstance = automationContext.get("WorkflowInstance"); + assertNotNull(wfInstance); + assertTrue( + wfInstance.get("@type").toString().contains("prov:Activity"), + "WorkflowInstance is a single run and must be typed as prov:Activity"); + } + } } diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java index 49441df3b7a3..806c52dcf46b 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java @@ -313,7 +313,7 @@ void testDownstreamEdges() throws Exception { } @Test - @DisplayName("Column lineage should be stored with fromColumn and toColumn properties") + @DisplayName("Column lineage should emit URI references plus FQN strings for back-compat") void testColumnLineage() throws Exception { ArrayNode upstreamEdges = objectMapper.createArrayNode(); ObjectNode edge = objectMapper.createObjectNode(); @@ -328,10 +328,10 @@ void testColumnLineage() throws Exception { ObjectNode colLineage = objectMapper.createObjectNode(); ArrayNode fromColumns = objectMapper.createArrayNode(); - fromColumns.add("source_table.column_a"); - fromColumns.add("source_table.column_b"); + fromColumns.add("service.db.schema.source_table.column_a"); + fromColumns.add("service.db.schema.source_table.column_b"); colLineage.set("fromColumns", fromColumns); - colLineage.put("toColumn", "target_table.merged_column"); + colLineage.put("toColumn", "service.db.schema.target_table.merged_column"); colLineage.put("function", "CONCAT(column_a, column_b)"); columnsLineage.add(colLineage); @@ -346,22 +346,437 @@ void testColumnLineage() throws Exception { method.setAccessible(true); method.invoke(propertyMapper, "upstreamEdges", upstreamEdges, entityResource, model); - // Find column lineage in the model Property hasColumnLineage = 
model.createProperty(OM_NS, "hasColumnLineage"); StmtIterator stmts = model.listStatements(null, hasColumnLineage, (Resource) null); assertTrue(stmts.hasNext(), "Should have column lineage"); + Resource colLineageResource = stmts.next().getObject().asResource(); + + Resource expectedFromA = + model.createResource( + RdfUtils.columnUri(BASE_URI, "service.db.schema.source_table.column_a")); + Resource expectedFromB = + model.createResource( + RdfUtils.columnUri(BASE_URI, "service.db.schema.source_table.column_b")); + Resource expectedTo = + model.createResource( + RdfUtils.columnUri(BASE_URI, "service.db.schema.target_table.merged_column")); + + Property fromColumn = model.createProperty(OM_NS, "fromColumn"); + Property toColumn = model.createProperty(OM_NS, "toColumn"); + assertTrue( + model.contains(colLineageResource, fromColumn, expectedFromA), + "fromColumn should reference URI for column_a"); + assertTrue( + model.contains(colLineageResource, fromColumn, expectedFromB), + "fromColumn should reference URI for column_b"); + assertTrue( + model.contains(colLineageResource, toColumn, expectedTo), + "toColumn should reference URI for merged_column"); + + Property fromColumnFqn = model.createProperty(OM_NS, "fromColumnFqn"); + Property toColumnFqn = model.createProperty(OM_NS, "toColumnFqn"); + assertTrue( + model.contains( + colLineageResource, fromColumnFqn, "service.db.schema.source_table.column_a"), + "fromColumnFqn literal should be retained for back-compat"); + assertTrue( + model.contains( + colLineageResource, toColumnFqn, "service.db.schema.target_table.merged_column"), + "toColumnFqn literal should be retained for back-compat"); + + Resource columnClass = model.createResource(OM_NS + "Column"); + assertTrue( + model.contains(expectedFromA, RDF.type, columnClass), + "Source column resource should be typed as om:Column"); + assertTrue( + model.contains(expectedTo, RDF.type, columnClass), + "Target column resource should be typed as om:Column"); + + Property 
transformFunc = model.createProperty(OM_NS, "transformFunction"); + assertTrue( + model.contains(colLineageResource, transformFunc, "CONCAT(column_a, column_b)"), + "transformFunction should be stored as a literal on the column-lineage resource"); + } + } + + @Nested + @DisplayName("P1.1: Column resource emission") + class ColumnResourceTests { - // Verify fromColumn is stored - Property fromColumnProp = model.createProperty(OM_NS, "fromColumn"); - assertTrue(model.contains(null, fromColumnProp), "Should have fromColumn properties"); + @Test + @DisplayName("Table.columns should be emitted as named om:Column resources at FQN-derived URIs") + void testTableColumnsEmittedAsNamedResources() throws Exception { + Map contextCache = new HashMap<>(); + contextCache.put("dataAsset-complete", Map.of()); + propertyMapper = new RdfPropertyMapper(BASE_URI, objectMapper, contextCache); + + ArrayNode columns = objectMapper.createArrayNode(); + ObjectNode pkColumn = objectMapper.createObjectNode(); + pkColumn.put("name", "id"); + pkColumn.put("dataType", "BIGINT"); + pkColumn.put("constraint", "PRIMARY_KEY"); + pkColumn.put("ordinalPosition", 0); + pkColumn.put("description", "Primary key"); + pkColumn.put("fullyQualifiedName", "service.db.schema.orders.id"); + columns.add(pkColumn); + + ObjectNode amountColumn = objectMapper.createObjectNode(); + amountColumn.put("name", "amount"); + amountColumn.put("dataType", "DECIMAL"); + amountColumn.put("ordinalPosition", 1); + amountColumn.put("fullyQualifiedName", "service.db.schema.orders.amount"); + columns.add(amountColumn); - // Verify toColumn is stored - Property toColumnProp = model.createProperty(OM_NS, "toColumn"); - assertTrue(model.contains(null, toColumnProp), "Should have toColumn property"); + invokePrivate( + "emitColumns", + new Class[] {JsonNode.class, Resource.class, Model.class}, + columns, + entityResource, + model); + + Resource pkResource = + model.createResource(RdfUtils.columnUri(BASE_URI, 
"service.db.schema.orders.id")); + Resource amountResource = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.schema.orders.amount")); + + Property hasColumn = model.createProperty(OM_NS, "hasColumn"); + assertTrue( + model.contains(entityResource, hasColumn, pkResource), + "Table should link to PK column via om:hasColumn"); + assertTrue( + model.contains(entityResource, hasColumn, amountResource), + "Table should link to amount column via om:hasColumn"); + + Resource columnClass = model.createResource(OM_NS + "Column"); + assertTrue(model.contains(pkResource, RDF.type, columnClass)); + assertTrue( + model.contains(pkResource, model.createProperty(OM_NS, "columnDataType"), "BIGINT")); + assertTrue( + model.getProperty(pkResource, model.createProperty(OM_NS, "isPrimaryKey")).getBoolean(), + "Primary key constraint should set om:isPrimaryKey true"); + assertFalse( + model.getProperty(pkResource, model.createProperty(OM_NS, "isNullable")).getBoolean(), + "Primary key implies om:isNullable false"); + assertTrue( + model.contains(amountResource, model.createProperty(OM_NS, "columnDataType"), "DECIMAL")); + } + + @Test + @DisplayName("Nested struct/map columns should link via om:hasChildColumn") + void testNestedChildColumns() throws Exception { + ArrayNode columns = objectMapper.createArrayNode(); + ObjectNode struct = objectMapper.createObjectNode(); + struct.put("name", "address"); + struct.put("dataType", "STRUCT"); + struct.put("fullyQualifiedName", "service.db.schema.users.address"); + + ArrayNode children = objectMapper.createArrayNode(); + ObjectNode street = objectMapper.createObjectNode(); + street.put("name", "street"); + street.put("dataType", "VARCHAR"); + street.put("fullyQualifiedName", "service.db.schema.users.address.street"); + children.add(street); + struct.set("children", children); + + columns.add(struct); + + invokePrivate( + "emitColumns", + new Class[] {JsonNode.class, Resource.class, Model.class}, + columns, + entityResource, + 
model); + + Resource addressResource = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.schema.users.address")); + Resource streetResource = + model.createResource( + RdfUtils.columnUri(BASE_URI, "service.db.schema.users.address.street")); + + assertTrue( + model.contains( + addressResource, model.createProperty(OM_NS, "hasChildColumn"), streetResource), + "Parent struct column should link to child via om:hasChildColumn"); + assertTrue(model.contains(streetResource, RDF.type, model.createResource(OM_NS + "Column"))); + } + + @Test + @DisplayName("Per-column constraints map to isPrimaryKey, isNullable, and isUnique") + void testPerColumnConstraintFlags() throws Exception { + ArrayNode columns = objectMapper.createArrayNode(); + columns.add(columnNode("id", "BIGINT", "service.db.s.t.id", "PRIMARY_KEY")); + columns.add(columnNode("email", "VARCHAR", "service.db.s.t.email", "UNIQUE")); + columns.add(columnNode("country", "VARCHAR", "service.db.s.t.country", "NOT_NULL")); + columns.add(columnNode("nickname", "VARCHAR", "service.db.s.t.nickname", "NULL")); + + invokePrivate( + "emitColumns", + new Class[] {JsonNode.class, Resource.class, Model.class}, + columns, + entityResource, + model); + + Resource id = model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.t.id")); + Resource email = model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.t.email")); + Resource country = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.t.country")); + Resource nickname = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.t.nickname")); + + Property isPrimaryKey = model.createProperty(OM_NS, "isPrimaryKey"); + Property isUnique = model.createProperty(OM_NS, "isUnique"); + Property isNullable = model.createProperty(OM_NS, "isNullable"); + + assertTrue(model.getProperty(id, isPrimaryKey).getBoolean()); + assertTrue(model.getProperty(id, isUnique).getBoolean()); + assertFalse(model.getProperty(id, isNullable).getBoolean()); 
+ + assertTrue(model.getProperty(email, isUnique).getBoolean()); + assertFalse( + model.contains(email, isPrimaryKey), + "UNIQUE alone should not imply primary-key membership"); + + assertFalse(model.getProperty(country, isNullable).getBoolean()); + assertTrue(model.getProperty(nickname, isNullable).getBoolean()); + } + + @Test + @DisplayName("FOREIGN_KEY table constraint emits om:references and TableConstraint resource") + void testForeignKeyTableConstraint() throws Exception { + ArrayNode constraints = objectMapper.createArrayNode(); + ObjectNode fk = objectMapper.createObjectNode(); + fk.put("constraintType", "FOREIGN_KEY"); + fk.put("relationshipType", "MANY_TO_ONE"); + ArrayNode cols = objectMapper.createArrayNode(); + cols.add("customer_id"); + fk.set("columns", cols); + ArrayNode referred = objectMapper.createArrayNode(); + referred.add("service.db.s.customers.id"); + fk.set("referredColumns", referred); + constraints.add(fk); + + invokePrivate( + "emitTableConstraints", + new Class[] {JsonNode.class, String.class, Resource.class, Model.class}, + constraints, + "service.db.s.orders", + entityResource, + model); + + Resource customerIdCol = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.orders.customer_id")); + Resource referredCol = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.customers.id")); + + Property references = model.createProperty(OM_NS, "references"); + assertTrue( + model.contains(customerIdCol, references, referredCol), + "FK should produce direct om:references triple between source and referred column"); + + Property hasConstraint = model.createProperty(OM_NS, "hasConstraint"); + Resource constraintResource = + model.listObjectsOfProperty(entityResource, hasConstraint).next().asResource(); + assertTrue( + model.contains( + constraintResource, RDF.type, model.createResource(OM_NS + "TableConstraint"))); + assertTrue( + model.contains( + constraintResource, model.createProperty(OM_NS, "constraintType"), 
"FOREIGN_KEY")); + assertTrue( + model.contains( + constraintResource, model.createProperty(OM_NS, "relationshipType"), "MANY_TO_ONE")); + assertTrue( + model.contains( + constraintResource, + model.createProperty(OM_NS, "hasConstrainedColumn"), + customerIdCol)); + assertTrue( + model.contains( + constraintResource, model.createProperty(OM_NS, "hasReferredColumn"), referredCol)); + } + + @Test + @DisplayName("Multi-column PRIMARY_KEY constraint marks every member column") + void testMultiColumnPrimaryKey() throws Exception { + ArrayNode constraints = objectMapper.createArrayNode(); + ObjectNode pk = objectMapper.createObjectNode(); + pk.put("constraintType", "PRIMARY_KEY"); + ArrayNode cols = objectMapper.createArrayNode(); + cols.add("tenant_id"); + cols.add("user_id"); + pk.set("columns", cols); + constraints.add(pk); + + invokePrivate( + "emitTableConstraints", + new Class[] {JsonNode.class, String.class, Resource.class, Model.class}, + constraints, + "service.db.s.users", + entityResource, + model); + + Resource tenantId = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.users.tenant_id")); + Resource userId = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.users.user_id")); + + Property isPrimaryKey = model.createProperty(OM_NS, "isPrimaryKey"); + assertTrue(model.getProperty(tenantId, isPrimaryKey).getBoolean()); + assertTrue(model.getProperty(userId, isPrimaryKey).getBoolean()); + } + + private ObjectNode columnNode(String name, String dataType, String fqn, String constraint) { + ObjectNode col = objectMapper.createObjectNode(); + col.put("name", name); + col.put("dataType", dataType); + col.put("fullyQualifiedName", fqn); + if (constraint != null) { + col.put("constraint", constraint); + } + return col; + } + + @Test + @DisplayName("Column.profile is emitted as DQV measurements rather than a JSON literal") + void testColumnProfileEmittedAsDqv() throws Exception { + ArrayNode columns = objectMapper.createArrayNode(); + 
ObjectNode col = objectMapper.createObjectNode(); + col.put("name", "email"); + col.put("dataType", "VARCHAR"); + col.put("fullyQualifiedName", "service.db.s.users.email"); + ObjectNode profile = objectMapper.createObjectNode(); + profile.put("valuesCount", 1000); + profile.put("nullCount", 12); + profile.put("nullProportion", 0.012); + profile.put("uniqueCount", 985); + profile.put("timestamp", 1714300000000L); + col.set("profile", profile); + columns.add(col); + + invokePrivate( + "emitColumns", + new Class[] {JsonNode.class, Resource.class, Model.class}, + columns, + entityResource, + model); + + Resource emailColumn = + model.createResource(RdfUtils.columnUri(BASE_URI, "service.db.s.users.email")); + Property hasMeasurement = + model.createProperty("http://www.w3.org/ns/dqv#", "hasQualityMeasurement"); + java.util.List measurements = + model.listObjectsOfProperty(emailColumn, hasMeasurement).toList().stream() + .map(node -> node.asResource()) + .toList(); + assertEquals( + 4, + measurements.size(), + "Expected 4 numeric profile metrics (valuesCount, nullCount, nullProportion, uniqueCount)"); + + Property isMeasurementOf = + model.createProperty("http://www.w3.org/ns/dqv#", "isMeasurementOf"); + Property dqvValue = model.createProperty("http://www.w3.org/ns/dqv#", "value"); + java.util.Map byMetric = new java.util.HashMap<>(); + for (Resource m : measurements) { + Resource metric = model.getProperty(m, isMeasurementOf).getObject().asResource(); + double v = model.getProperty(m, dqvValue).getDouble(); + byMetric.put(metric.getURI(), v); + } + assertEquals(1000.0, byMetric.get(OM_NS + "ValuesCountMetric"), 0.0); + assertEquals(12.0, byMetric.get(OM_NS + "NullCountMetric"), 0.0); + assertEquals(0.012, byMetric.get(OM_NS + "NullProportionMetric"), 1e-9); + assertEquals(985.0, byMetric.get(OM_NS + "UniqueCountMetric"), 0.0); + + // Each measurement should also be tied back to the column via dqv:computedOn. 
+ Property computedOn = model.createProperty("http://www.w3.org/ns/dqv#", "computedOn"); + for (Resource m : measurements) { + assertTrue(model.contains(m, computedOn, emailColumn)); + } + } + + @Test + @DisplayName("Pipeline run is emitted as a prov:Activity tied to inputs and outputs") + void testPipelineRunEmitsProvActivity() throws Exception { + ObjectNode pipelineStatus = objectMapper.createObjectNode(); + pipelineStatus.put("timestamp", 1714300000000L); + pipelineStatus.put("endTime", 1714300120000L); + pipelineStatus.put("executionStatus", "Successful"); + pipelineStatus.put("executionId", "airflow-run-123"); + ArrayNode inputs = objectMapper.createArrayNode(); + ObjectNode in = objectMapper.createObjectNode(); + in.put("datasetFQN", "service.db.s.source"); + inputs.add(in); + pipelineStatus.set("inputs", inputs); + ArrayNode outputs = objectMapper.createArrayNode(); + ObjectNode out = objectMapper.createObjectNode(); + out.put("datasetFQN", "service.db.s.target"); + outputs.add(out); + pipelineStatus.set("outputs", outputs); + ObjectNode executedBy = objectMapper.createObjectNode(); + executedBy.put("id", UUID.randomUUID().toString()); + executedBy.put("type", "user"); + pipelineStatus.set("executedBy", executedBy); + + java.lang.reflect.Method method = + org.openmetadata.service.rdf.translator.RdfActivityMapper.class.getDeclaredMethod( + "emitPipelineActivity", + JsonNode.class, + String.class, + Resource.class, + String.class, + Model.class); + method.setAccessible(true); + method.invoke( + null, pipelineStatus, "service.pipeline.daily_etl", entityResource, BASE_URI, model); + + Property hasExecution = model.createProperty(OM_NS, "hasExecution"); + Resource activity = + model.listObjectsOfProperty(entityResource, hasExecution).next().asResource(); + + assertTrue( + model.contains( + activity, RDF.type, model.createResource("http://www.w3.org/ns/prov#Activity"))); + assertTrue( + model.contains(activity, model.createProperty(OM_NS, "executionStatus"), 
"Successful")); + assertTrue( + model.contains(activity, model.createProperty(OM_NS, "executionId"), "airflow-run-123")); + // PROV-O: activity-to-activity relation. Pipeline run wasInformedBy pipeline definition. + assertTrue( + model.contains( + activity, + model.createProperty("http://www.w3.org/ns/prov#", "wasInformedBy"), + entityResource)); + assertTrue( + model.contains( + activity, model.createProperty("http://www.w3.org/ns/prov#", "startedAtTime"))); + assertTrue( + model.contains( + activity, model.createProperty("http://www.w3.org/ns/prov#", "endedAtTime"))); + assertTrue( + model.contains(activity, model.createProperty("http://www.w3.org/ns/prov#", "used")), + "Activity should reference its input dataset via prov:used"); + assertTrue( + model.contains(activity, model.createProperty("http://www.w3.org/ns/prov#", "generated")), + "Activity should reference its output dataset via prov:generated"); + assertTrue( + model.contains( + activity, model.createProperty("http://www.w3.org/ns/prov#", "wasAssociatedWith")), + "Activity should record who triggered the run via prov:wasAssociatedWith"); + } + + @Test + @DisplayName("RdfUtils.columnUri should be deterministic and percent-encode FQNs") + void testColumnUri() { + String uri = RdfUtils.columnUri(BASE_URI, "service.db.schema.orders.amount"); + assertEquals(BASE_URI + "entity/column/service.db.schema.orders.amount", uri); + + String specialUri = RdfUtils.columnUri(BASE_URI, "service db.weird name"); + assertTrue( + specialUri.contains("service+db.weird+name") || specialUri.contains("service%20db"), + "FQN with whitespace should be percent-encoded"); - // Verify transformation function is stored - Property transformFuncProp = model.createProperty(OM_NS, "transformFunction"); - assertTrue(model.contains(null, transformFuncProp), "Should have transformFunction property"); + assertNull(RdfUtils.columnUri(BASE_URI, null)); + assertNull(RdfUtils.columnUri(BASE_URI, "")); } } diff --git 
a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/extension/CustomOntologyValidatorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/extension/CustomOntologyValidatorTest.java new file mode 100644 index 000000000000..a6131b63a710 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/extension/CustomOntologyValidatorTest.java @@ -0,0 +1,345 @@ +package org.openmetadata.service.rdf.extension; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.openmetadata.schema.api.configuration.rdf.CustomOntology; +import org.openmetadata.schema.api.configuration.rdf.CustomOntologyClass; +import org.openmetadata.schema.api.configuration.rdf.CustomOntologyProperty; + +class CustomOntologyValidatorTest { + + private static final String EXT_NS = "https://open-metadata.org/ontology-extension/"; + + private static CustomOntologyClass cls(String localName, String... 
parents) { + return new CustomOntologyClass() + .withUri(EXT_NS + localName) + .withSubClassOf(java.util.List.of(parents)); + } + + private static CustomOntologyProperty objProp(String localName, String domain, String range) { + return new CustomOntologyProperty() + .withUri(EXT_NS + localName) + .withType(CustomOntologyProperty.Type.OBJECT_PROPERTY) + .withDomain(domain) + .withRange(range); + } + + private static CustomOntologyProperty datatypeProp( + String localName, String domain, String range) { + return new CustomOntologyProperty() + .withUri(EXT_NS + localName) + .withType(CustomOntologyProperty.Type.DATATYPE_PROPERTY) + .withDomain(domain) + .withRange(range); + } + + private static CustomOntology ext(String name) { + return new CustomOntology().withName(name); + } + + @Nested + @DisplayName("Required fields and shape") + class RequiredFieldsAndShape { + + @Test + @DisplayName("Null extension is rejected") + void nullExtension() { + assertTrue( + CustomOntologyValidator.validate(null).stream() + .anyMatch(e -> e.contains("must not be null"))); + } + + @Test + @DisplayName("Blank name is rejected") + void blankName() { + assertTrue( + CustomOntologyValidator.validate(ext("")).stream() + .anyMatch(e -> e.contains("'name' must not be blank"))); + } + + @Test + @DisplayName("Name with uppercase is rejected") + void uppercaseName() { + assertTrue( + CustomOntologyValidator.validate(ext("MyExtension")).stream() + .anyMatch(e -> e.contains("name") && e.contains("lowercase"))); + } + + @Test + @DisplayName("Extension with no classes and no properties is rejected") + void emptyExtension() { + List errors = CustomOntologyValidator.validate(ext("empty-ext")); + assertTrue(errors.stream().anyMatch(e -> e.contains("at least one class or property"))); + } + } + + @Nested + @DisplayName("Namespace enforcement") + class NamespaceEnforcement { + + @Test + @DisplayName("Class URI in canonical om: namespace is rejected (cannot redefine)") + void 
canonicalNamespaceClassRejected() { + CustomOntology e = + ext("redefine-table") + .withClasses( + List.of( + new CustomOntologyClass() + .withUri("https://open-metadata.org/ontology/Table") + .withSubClassOf(List.of("https://open-metadata.org/ontology/Entity")))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch( + err -> err.contains("om-extension namespace") && err.contains("ontology/Table"))); + } + + @Test + @DisplayName("Property URI outside om-extension namespace is rejected") + void canonicalNamespacePropertyRejected() { + CustomOntology e = + ext("bad-prop") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties( + List.of( + new CustomOntologyProperty() + .withUri("https://example.org/somewhere") + .withType(CustomOntologyProperty.Type.OBJECT_PROPERTY) + .withDomain(EXT_NS + "Foo") + .withRange("om:Entity"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("om-extension namespace"))); + } + } + + @Nested + @DisplayName("Class hierarchy checks") + class ClassHierarchy { + + @Test + @DisplayName("Class without subClassOf is rejected") + void classNeedsParent() { + CustomOntologyClass orphan = new CustomOntologyClass().withUri(EXT_NS + "Orphan"); + CustomOntology e = ext("orphan-ext").withClasses(List.of(orphan)); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("must declare at least one subClassOf parent"))); + } + + @Test + @DisplayName("Class referencing unknown canonical parent is rejected") + void unknownCanonicalParentRejected() { + CustomOntology e = + ext("unknown-parent") + .withClasses( + List.of(cls("Widget", "https://open-metadata.org/ontology/NonexistentClass"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("unknown parent class"))); + } + + @Test + @DisplayName("Class referencing canonical om: short-form parent is accepted") + void shortFormCanonicalParent() { + 
CustomOntology e = ext("short-form").withClasses(List.of(cls("Widget", "om:DataAsset"))); + assertTrue(CustomOntologyValidator.validate(e).isEmpty()); + } + + @Test + @DisplayName("Class referencing another class in the same extension is accepted") + void siblingExtensionClassReference() { + CustomOntology e = + ext("siblings") + .withClasses(List.of(cls("Parent", "om:Entity"), cls("Child", EXT_NS + "Parent"))); + List errors = CustomOntologyValidator.validate(e); + assertTrue(errors.isEmpty(), "Got: " + errors); + } + + @Test + @DisplayName("Cycle in class hierarchy is detected (A → B → A)") + void hierarchyCycleDetected() { + CustomOntology e = + ext("cycle").withClasses(List.of(cls("A", EXT_NS + "B"), cls("B", EXT_NS + "A"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("contains a cycle")), + "Validator must detect 2-node hierarchy cycles"); + } + + @Test + @DisplayName("Self-referencing class is detected as a cycle") + void selfCycleDetected() { + CustomOntology e = ext("self-cycle").withClasses(List.of(cls("Self", EXT_NS + "Self"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("contains a cycle"))); + } + + @Test + @DisplayName("3-node cycle (A → B → C → A) is detected") + void threeNodeCycleDetected() { + CustomOntology e = + ext("3-cycle") + .withClasses( + List.of(cls("A", EXT_NS + "B"), cls("B", EXT_NS + "C"), cls("C", EXT_NS + "A"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("contains a cycle"))); + } + + @Test + @DisplayName("Duplicate class URI within the same extension is rejected") + void duplicateClassUriRejected() { + CustomOntology e = + ext("dupes").withClasses(List.of(cls("Same", "om:Entity"), cls("Same", "om:Entity"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("duplicate class URI"))); + } + } + + @Nested + @DisplayName("Property checks") + class 
PropertyChecks { + + @Test + @DisplayName("ObjectProperty with unknown domain is rejected") + void unknownDomainRejected() { + CustomOntology e = + ext("unknown-domain") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties( + List.of( + objProp( + "rel", "https://open-metadata.org/ontology/NoSuchClass", "om:Entity"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("domain") && err.contains("not a known"))); + } + + @Test + @DisplayName("ObjectProperty with non-class range is rejected") + void objectPropertyRangeMustBeClass() { + CustomOntology e = + ext("bad-range") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties( + List.of( + objProp("rel", EXT_NS + "Foo", "http://www.w3.org/2001/XMLSchema#string"))); + // ObjectProperty range that is an xsd type is rejected because it's not a known class. + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("range") && err.contains("not a known"))); + } + + @Test + @DisplayName("DatatypeProperty range must be an xsd: datatype URI") + void datatypePropertyRequiresXsdRange() { + CustomOntology e = + ext("bad-dt-range") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties(List.of(datatypeProp("score", EXT_NS + "Foo", EXT_NS + "Foo"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch( + err -> err.contains("DatatypeProperty") && err.contains("xsd: datatype URI"))); + } + + @Test + @DisplayName("Valid DatatypeProperty with xsd:string range is accepted") + void validDatatypeProperty() { + CustomOntology e = + ext("valid-dt") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties( + List.of( + datatypeProp( + "score", EXT_NS + "Foo", "http://www.w3.org/2001/XMLSchema#string"))); + List errors = CustomOntologyValidator.validate(e); + assertTrue(errors.isEmpty(), "Got: " + errors); + } + + @Test + @DisplayName("Duplicate property URI within the same extension is 
rejected") + void duplicatePropertyUri() { + CustomOntology e = + ext("dupe-props") + .withClasses(List.of(cls("Foo", "om:Entity"))) + .withProperties( + List.of( + objProp("rel", EXT_NS + "Foo", "om:Entity"), + objProp("rel", EXT_NS + "Foo", "om:Entity"))); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("duplicate property URI"))); + } + + @Test + @DisplayName("Property with no domain is rejected") + void missingDomain() { + CustomOntologyProperty p = + new CustomOntologyProperty() + .withUri(EXT_NS + "rel") + .withType(CustomOntologyProperty.Type.OBJECT_PROPERTY) + .withRange("om:Entity"); + CustomOntology e = + ext("no-domain").withClasses(List.of(cls("Foo", "om:Entity"))).withProperties(List.of(p)); + assertTrue( + CustomOntologyValidator.validate(e).stream() + .anyMatch(err -> err.contains("missing 'domain'"))); + } + } + + @Nested + @DisplayName("Happy path") + class HappyPath { + + @Test + @DisplayName("Full valid extension passes validation cleanly") + void fullValidExtension() { + CustomOntology e = + ext("regulatory-controls") + .withDescription("SOX compliance controls") + .withClasses( + List.of( + cls("RegulatoryControl", "om:Entity"), + cls("SoxControl", EXT_NS + "RegulatoryControl"))) + .withProperties( + List.of( + objProp("hasControl", "om:DataAsset", EXT_NS + "RegulatoryControl"), + datatypeProp( + "controlOwnerEmail", + EXT_NS + "RegulatoryControl", + "http://www.w3.org/2001/XMLSchema#string"))); + List errors = CustomOntologyValidator.validate(e); + assertTrue(errors.isEmpty(), "Expected no errors but got: " + errors); + assertTrue(CustomOntologyValidator.isValid(e)); + } + + @Test + @DisplayName("Property-only extension (no classes) is accepted") + void propertyOnlyExtension() { + CustomOntology e = + ext("annotations") + .withProperties( + List.of( + datatypeProp( + "soxRelevant", + "om:DataAsset", + "http://www.w3.org/2001/XMLSchema#boolean"))); + 
assertTrue(CustomOntologyValidator.validate(e).isEmpty()); + } + } + + @Test + @DisplayName("isValid wrapper returns boolean and logs on failure") + void isValidWrapper() { + assertFalse(CustomOntologyValidator.isValid(null)); + assertFalse(CustomOntologyValidator.isValid(ext("empty-test"))); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/federation/SparqlFederationGuardTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/federation/SparqlFederationGuardTest.java new file mode 100644 index 000000000000..f6db29b8bc76 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/federation/SparqlFederationGuardTest.java @@ -0,0 +1,253 @@ +package org.openmetadata.service.rdf.federation; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import java.util.Set; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +/** + * Failure-mode coverage for the SPARQL federation guard. Each test names exactly the case it + * is exercising — these are the queries that real users (and adversaries) will send. + */ +class SparqlFederationGuardTest { + + private static final String WIKIDATA = "https://query.wikidata.org/sparql"; + private static final String DBPEDIA = "https://dbpedia.org/sparql"; + + private SparqlFederationGuard disabled() { + return new SparqlFederationGuard(false, Set.of()); + } + + private SparqlFederationGuard withAllowlist(String... 
endpoints) { + return new SparqlFederationGuard(true, Set.of(endpoints)); + } + + @Nested + @DisplayName("Federation disabled (default policy)") + class FederationDisabled { + + @Test + @DisplayName("Plain query without SERVICE is always allowed") + void plainQueryAllowed() { + String q = "SELECT * WHERE { ?s ?p ?o } LIMIT 1"; + assertTrue(disabled().firstDisallowedEndpoint(q).isEmpty()); + } + + @Test + @DisplayName("Any SERVICE clause is rejected when federation is disabled") + void serviceRejectedWhenDisabled() { + String q = "SELECT * WHERE { SERVICE <" + WIKIDATA + "> { ?s ?p ?o } } LIMIT 1"; + assertEquals(WIKIDATA, disabled().firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("enforce throws FederationDisallowedException with helpful message") + void enforceThrows() { + String q = "SELECT * WHERE { SERVICE <" + WIKIDATA + "> { ?s ?p ?o } }"; + var ex = + assertThrows( + SparqlFederationGuard.FederationDisallowedException.class, + () -> disabled().enforce(q)); + assertEquals(WIKIDATA, ex.getBlockedEndpoint()); + assertFalse(ex.isFederationEnabled()); + assertTrue(ex.getMessage().contains("federated SPARQL is disabled")); + } + } + + @Nested + @DisplayName("Federation enabled with allowlist") + class FederationEnabled { + + @Test + @DisplayName("Allowlisted endpoint passes") + void allowlistedPasses() { + String q = "SELECT * WHERE { SERVICE <" + WIKIDATA + "> { ?s ?p ?o } }"; + assertTrue(withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).isEmpty()); + } + + @Test + @DisplayName("Endpoint not on allowlist is rejected even when federation is enabled") + void notAllowlistedRejected() { + String q = "SELECT * WHERE { SERVICE <" + DBPEDIA + "> { ?s ?p ?o } }"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("Multiple SERVICE clauses: allowed + disallowed → reject the disallowed one") + void mixedServicesRejected() { + String q = + "SELECT * WHERE { " + + " SERVICE <" + + 
WIKIDATA + + "> { ?s ?p ?o } " + + " SERVICE <" + + DBPEDIA + + "> { ?s ?p ?o } " + + "}"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("All SERVICE clauses allowlisted → query allowed") + void allServicesAllowed() { + String q = + "SELECT * WHERE { " + + " SERVICE <" + + WIKIDATA + + "> { ?a ?b ?c } " + + " SERVICE <" + + DBPEDIA + + "> { ?d ?e ?f } " + + "}"; + assertTrue(withAllowlist(WIKIDATA, DBPEDIA).firstDisallowedEndpoint(q).isEmpty()); + } + + @Test + @DisplayName("SERVICE SILENT is detected the same as SERVICE") + void silentServiceDetected() { + String q = "SELECT * WHERE { SERVICE SILENT <" + DBPEDIA + "> { ?s ?p ?o } }"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("SERVICE with variable endpoint is always rejected — can't be allowlisted") + void variableServiceRejected() { + String q = "SELECT * WHERE { ?endpoint a SERVICE ?endpoint { ?s ?p ?o } }"; + // The SPARQL parser may or may not accept this exact form depending on context; if it does, + // a variable endpoint is unprovable against a static allowlist and must be rejected. + var blocked = withAllowlist(WIKIDATA).firstDisallowedEndpoint(q); + // If the engine parses it, we expect rejection; if not, no SERVICE was extracted, which is + // also acceptable because the engine itself will reject the query. 
+ blocked.ifPresent(b -> assertTrue(b.startsWith("?"))); + } + + @Test + @DisplayName("Trailing-slash mismatch: allowlist must match the URI exactly") + void trailingSlashDoesNotMatch() { + String q = "SELECT * WHERE { SERVICE <" + WIKIDATA + "/> { ?s ?p ?o } }"; + assertEquals( + WIKIDATA + "/", + withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow(), + "We compare endpoint URIs as strings, including trailing slashes; this is documented behavior"); + } + + @Test + @DisplayName("Nested SERVICE inside OPTIONAL is detected") + void nestedInsideOptional() { + String q = "SELECT * WHERE { ?s ?p ?o OPTIONAL { SERVICE <" + DBPEDIA + "> { ?s ?p ?o } } }"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("Nested SERVICE inside UNION branch is detected") + void nestedInsideUnion() { + String q = "SELECT * WHERE { { ?s ?p ?o } UNION { SERVICE <" + DBPEDIA + "> { ?s ?p ?o } } }"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("SERVICE inside subquery is detected") + void nestedInsideSubquery() { + String q = + "SELECT * WHERE { " + + " { SELECT ?s WHERE { SERVICE <" + + DBPEDIA + + "> { ?s ?p ?o } } } " + + "}"; + assertEquals(DBPEDIA, withAllowlist(WIKIDATA).firstDisallowedEndpoint(q).orElseThrow()); + } + } + + @Nested + @DisplayName("Adversarial inputs") + class AdversarialInputs { + + @Test + @DisplayName("The literal text 'SERVICE' inside a string literal must NOT trigger the guard") + void serviceLiteralInString() { + String q = "SELECT * WHERE { ?s ?p \"SERVICE <" + DBPEDIA + ">\" }"; + assertTrue( + disabled().firstDisallowedEndpoint(q).isEmpty(), + "Regex-based detectors fail this; the parser-based guard must not"); + } + + @Test + @DisplayName("SPARQL comment with 'SERVICE' must not trigger the guard") + void serviceInComment() { + String q = "# SERVICE <" + DBPEDIA + "> { ?s ?p ?o }\n" + "SELECT * WHERE { ?s ?p 
?o }"; + assertTrue(disabled().firstDisallowedEndpoint(q).isEmpty()); + } + + @Test + @DisplayName("Unparseable garbage SPARQL is passed through (engine emits its own parse error)") + void unparseableQueryPassesThrough() { + String garbage = "this is not sparql {{{}}}"; + assertTrue( + disabled().firstDisallowedEndpoint(garbage).isEmpty(), + "Guard must not turn a parse error into a federation error — engine handles parsing"); + } + + @Test + @DisplayName("Empty / null / whitespace queries are passed through") + void emptyQueriesPassedThrough() { + assertTrue(disabled().firstDisallowedEndpoint("").isEmpty()); + assertTrue(disabled().firstDisallowedEndpoint(" ").isEmpty()); + } + + @Test + @DisplayName("Lowercase 'service' keyword is detected (SPARQL is case-insensitive)") + void lowercaseService() { + String q = "select * where { service <" + DBPEDIA + "> { ?s ?p ?o } }"; + assertEquals(DBPEDIA, disabled().firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("ASK with SERVICE is also guarded (not just SELECT)") + void askQueryGuarded() { + String q = "ASK { SERVICE <" + DBPEDIA + "> { ?s ?p ?o } }"; + assertEquals(DBPEDIA, disabled().firstDisallowedEndpoint(q).orElseThrow()); + } + + @Test + @DisplayName("CONSTRUCT with SERVICE is also guarded") + void constructQueryGuarded() { + String q = "CONSTRUCT { ?s ?p ?o } WHERE { SERVICE <" + DBPEDIA + "> { ?s ?p ?o } }"; + assertEquals(DBPEDIA, disabled().firstDisallowedEndpoint(q).orElseThrow()); + } + } + + @Nested + @DisplayName("serviceEndpoints listing") + class ServiceEndpointsListing { + + @Test + @DisplayName("Returns endpoints in order of first appearance, deduplicated") + void listingOrderAndDedup() { + String q = + "SELECT * WHERE { " + + " SERVICE <" + + DBPEDIA + + "> { ?s ?p ?o } " + + " SERVICE <" + + WIKIDATA + + "> { ?s ?p ?o } " + + " SERVICE <" + + DBPEDIA + + "> { ?s ?p ?o } " + + "}"; + assertEquals(List.of(DBPEDIA, WIKIDATA), disabled().serviceEndpoints(q)); + } + + @Test + 
@DisplayName("Returns empty list for queries without SERVICE") + void emptyListForNoService() { + assertTrue(disabled().serviceEndpoints("SELECT ?s WHERE { ?s a ?t }").isEmpty()); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/inference/InferenceRuleValidatorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/inference/InferenceRuleValidatorTest.java new file mode 100644 index 000000000000..a141a87932f6 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/inference/InferenceRuleValidatorTest.java @@ -0,0 +1,226 @@ +package org.openmetadata.service.rdf.inference; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.openmetadata.schema.api.configuration.rdf.InferenceRule; + +/** + * Failure-mode coverage for {@link InferenceRuleValidator}. Each test exercises a way the + * validator must reject a hostile or malformed rule before it can be applied server-side. + */ +class InferenceRuleValidatorTest { + + private static final String VALID_BODY = + "PREFIX om: \n" + + "CONSTRUCT { ?x om:transitivelyDerivedFrom ?y }\n" + + "WHERE { ?x + ?y . 
FILTER(?x != ?y) }"; + + private static InferenceRule rule(String name, String body) { + return new InferenceRule() + .withName(name) + .withRuleType(InferenceRule.RuleType.CONSTRUCT) + .withRuleBody(body) + .withEnabled(true) + .withPriority(100); + } + + private static List validate(InferenceRule rule) { + return InferenceRuleValidator.validate(rule); + } + + @Nested + @DisplayName("Required fields") + class RequiredFields { + + @Test + @DisplayName("Null rule produces a single error") + void nullRuleRejected() { + List errors = validate(null); + assertFalse(errors.isEmpty()); + assertTrue(errors.get(0).contains("must not be null")); + } + + @Test + @DisplayName("Blank name is rejected") + void blankNameRejected() { + assertTrue( + validate(rule("", VALID_BODY)).stream() + .anyMatch(e -> e.contains("'name' must not be blank"))); + } + + @Test + @DisplayName("Name with uppercase or spaces is rejected (must match the schema pattern)") + void invalidNamePatternRejected() { + assertTrue( + validate(rule("MyRule", VALID_BODY)).stream() + .anyMatch(e -> e.contains("name") && e.contains("lowercase"))); + assertTrue( + validate(rule("my rule", VALID_BODY)).stream() + .anyMatch(e -> e.contains("name") && e.contains("lowercase"))); + assertTrue( + validate(rule("a", VALID_BODY)).stream() + .anyMatch(e -> e.contains("name") && e.contains("3-64"))); + } + + @Test + @DisplayName("Blank ruleBody is rejected") + void blankBodyRejected() { + assertTrue( + validate(rule("good-name", "")).stream() + .anyMatch(e -> e.contains("'ruleBody' must not be blank"))); + } + } + + @Nested + @DisplayName("RuleType handling") + class RuleTypeHandling { + + @Test + @DisplayName("RDFS ruleType is rejected (reserved for future)") + void rdfsTypeRejected() { + InferenceRule r = rule("future-rdfs", VALID_BODY).withRuleType(InferenceRule.RuleType.RDFS); + assertTrue(validate(r).stream().anyMatch(e -> e.contains("RDFS is reserved for future use"))); + } + } + + @Nested + @DisplayName("SPARQL body 
shape") + class SparqlBodyShape { + + @Test + @DisplayName("Garbage SPARQL is rejected with a parse error") + void parseError() { + List errors = validate(rule("bad-syntax", "this is not sparql")); + assertTrue(errors.stream().anyMatch(e -> e.startsWith("ruleBody failed to parse as SPARQL"))); + } + + @Test + @DisplayName("SELECT body is rejected — must be CONSTRUCT for ruleType=CONSTRUCT") + void selectBodyRejected() { + String selectBody = "SELECT ?s WHERE { ?s ?p ?o }"; + assertTrue( + validate(rule("must-be-construct", selectBody)).stream() + .anyMatch(e -> e.contains("must be a SPARQL CONSTRUCT query"))); + } + + @Test + @DisplayName("ASK body is rejected") + void askBodyRejected() { + String askBody = "ASK { ?s ?p ?o }"; + assertTrue( + validate(rule("ask-not-allowed", askBody)).stream() + .anyMatch(e -> e.contains("must be a SPARQL CONSTRUCT"))); + } + + @Test + @DisplayName("DESCRIBE body is rejected") + void describeBodyRejected() { + String describeBody = "DESCRIBE "; + assertTrue( + validate(rule("describe-not-allowed", describeBody)).stream() + .anyMatch(e -> e.contains("must be a SPARQL CONSTRUCT"))); + } + + @Test + @DisplayName("CONSTRUCT with empty WHERE pattern is rejected") + void emptyWherePatternRejected() { + String emptyWhere = "CONSTRUCT { } WHERE { }"; + List errors = validate(rule("empty-where", emptyWhere)); + assertTrue( + errors.stream().anyMatch(e -> e.contains("non-empty WHERE pattern")), "Got: " + errors); + } + + @Test + @DisplayName("CONSTRUCT with empty template is rejected") + void emptyTemplateRejected() { + String emptyTemplate = "CONSTRUCT { } WHERE { ?s ?p ?o }"; + assertTrue( + validate(rule("empty-template", emptyTemplate)).stream() + .anyMatch(e -> e.contains("CONSTRUCT template must contain at least one triple"))); + } + } + + @Nested + @DisplayName("SERVICE rejection") + class ServiceRejection { + + @Test + @DisplayName("CONSTRUCT body containing a SERVICE clause is rejected") + void serviceClauseRejected() { + String 
body = + "CONSTRUCT { ?s ?p ?o } WHERE { SERVICE { ?s ?p ?o } }"; + List errors = validate(rule("federated-rule", body)); + assertTrue( + errors.stream().anyMatch(e -> e.contains("must not contain SERVICE clauses")), + "Got: " + errors); + } + + @Test + @DisplayName("SERVICE inside a subquery is also rejected") + void serviceInSubqueryRejected() { + String body = + "CONSTRUCT { ?s ?p ?o } WHERE { { SELECT ?s ?p ?o WHERE { SERVICE { ?s ?p ?o } } } }"; + assertTrue( + validate(rule("nested-service", body)).stream() + .anyMatch(e -> e.contains("must not contain SERVICE clauses"))); + } + } + + @Nested + @DisplayName("Priority bounds") + class PriorityBounds { + + @Test + @DisplayName("Priority below 0 is rejected") + void negativePriorityRejected() { + InferenceRule r = rule("low-pri", VALID_BODY).withPriority(-1); + assertTrue( + validate(r).stream().anyMatch(e -> e.contains("'priority' must be between 0 and 10000"))); + } + + @Test + @DisplayName("Priority above 10000 is rejected") + void highPriorityRejected() { + InferenceRule r = rule("high-pri", VALID_BODY).withPriority(10_001); + assertTrue(validate(r).stream().anyMatch(e -> e.contains("'priority' must be between"))); + } + } + + @Nested + @DisplayName("Happy path") + class HappyPath { + + @Test + @DisplayName("Well-formed CONSTRUCT rule passes validation with no errors") + void validRulePasses() { + InferenceRule r = rule("transitive-lineage", VALID_BODY); + List errors = validate(r); + assertTrue(errors.isEmpty(), "Expected no errors but got: " + errors); + } + } + + @Nested + @DisplayName("Starter pack rules ship valid") + class StarterPackValidation { + + @Test + @DisplayName("All shipped starter-pack rules pass validation") + void starterPackValid() { + InferenceRuleRegistry registry = InferenceRuleRegistry.getInstance(); + registry.loadStarterPackIfNeeded(); + List rules = registry.list(); + assertTrue(rules.size() >= 4, "Starter pack must ship at least 4 rules"); + for (InferenceRule rule : rules) { + 
List errors = validate(rule); + assertTrue( + errors.isEmpty(), + "Starter pack rule '" + rule.getName() + "' must validate cleanly. Got: " + errors); + } + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CentralityComputationTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CentralityComputationTest.java new file mode 100644 index 000000000000..babf3a63a6e3 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CentralityComputationTest.java @@ -0,0 +1,231 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.contains; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.openmetadata.service.rdf.RdfRepository; + +class CentralityComputationTest { + + @Nested + @DisplayName("Predicate weights") + class PredicateWeights { + + @Test + @DisplayName("Lineage weighted highest, hasColumn weakest, unknown predicates excluded") + void weights() { + assertEquals(1.0, CentralityComputation.weightFor("prov:wasDerivedFrom")); + assertEquals(0.5, CentralityComputation.weightFor("om:hasTag")); + assertEquals(0.5, CentralityComputation.weightFor("om:hasGlossaryTerm")); + assertEquals(0.2, CentralityComputation.weightFor("om:hasColumn")); + assertEquals(0.0, CentralityComputation.weightFor("om:somethingElse")); + } + } + + @Nested + 
@DisplayName("SPARQL JSON → adjacency map parsing") + class GraphParsing { + + private static String row(String from, String to, String pred) { + return "{\"from\":{\"value\":\"" + + from + + "\"},\"to\":{\"value\":\"" + + to + + "\"},\"predicate\":{\"value\":\"" + + pred + + "\"}}"; + } + + private static String body(String... rows) { + return "{\"results\":{\"bindings\":[" + String.join(",", rows) + "]}}"; + } + + @Test + @DisplayName("Empty / null / blank input returns an empty graph") + void emptyInputs() { + assertTrue(CentralityComputation.parseGraph(null).isEmpty()); + assertTrue(CentralityComputation.parseGraph("").isEmpty()); + assertTrue(CentralityComputation.parseGraph("not json").isEmpty()); + assertTrue(CentralityComputation.parseGraph("{\"results\":{\"bindings\":[]}}").isEmpty()); + } + + @Test + @DisplayName("Single lineage edge produces weight 1.0") + void singleLineageEdge() { + Map> g = + CentralityComputation.parseGraph(body(row("urn:a", "urn:b", "prov:wasDerivedFrom"))); + assertEquals(1.0, g.get("urn:a").get("urn:b")); + } + + @Test + @DisplayName("hasColumn edge produces weight 0.2") + void hasColumnEdge() { + Map> g = + CentralityComputation.parseGraph(body(row("urn:t", "urn:c", "om:hasColumn"))); + assertEquals(0.2, g.get("urn:t").get("urn:c")); + } + + @Test + @DisplayName("Multiple edges to the same target sum their weights") + void parallelEdges() { + Map> g = + CentralityComputation.parseGraph( + body( + row("urn:a", "urn:b", "prov:wasDerivedFrom"), + row("urn:a", "urn:b", "om:hasColumn"))); + assertEquals(1.2, g.get("urn:a").get("urn:b"), 1e-9); + } + + @Test + @DisplayName("Unknown predicate produces weight 0.0 (effectively dropped)") + void unknownPredicateSilentlyDropped() { + Map> g = + CentralityComputation.parseGraph(body(row("urn:a", "urn:b", "om:unknown"))); + assertEquals(0.0, g.get("urn:a").get("urn:b")); + } + + @Test + @DisplayName("Rows missing 'from' or 'to' are skipped") + void missingBindingsSkipped() { + String partial 
= "{\"results\":{\"bindings\":[{\"from\":{\"value\":\"urn:a\"}}]}}"; + assertTrue(CentralityComputation.parseGraph(partial).isEmpty()); + } + } + + @Nested + @DisplayName("End-to-end computeAndPersist") + class EndToEnd { + + @Test + @DisplayName("Empty graph: returns 0 nodes, no SPARQL UPDATE") + void emptyGraph() { + RdfRepository repo = mock(RdfRepository.class); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn("{\"results\":{\"bindings\":[]}}"); + CentralityComputation comp = new CentralityComputation(repo); + + CentralityComputation.Result r = comp.computeAndPersist("table"); + + assertEquals("table", r.entityType()); + assertEquals(0, r.nodesScored()); + assertFalse(r.converged()); + verify(repo, never()).executeSparqlUpdate(anyString()); + } + + @Test + @DisplayName("Lineage triangle persisted with normalized scores summing to 1.0") + void lineageTriangle() { + RdfRepository repo = mock(RdfRepository.class); + String body = + "{\"results\":{\"bindings\":[" + + "{\"from\":{\"value\":\"urn:a\"},\"to\":{\"value\":\"urn:b\"},\"predicate\":{\"value\":\"prov:wasDerivedFrom\"}}," + + "{\"from\":{\"value\":\"urn:b\"},\"to\":{\"value\":\"urn:c\"},\"predicate\":{\"value\":\"prov:wasDerivedFrom\"}}," + + "{\"from\":{\"value\":\"urn:c\"},\"to\":{\"value\":\"urn:a\"},\"predicate\":{\"value\":\"prov:wasDerivedFrom\"}}" + + "]}}"; + when(repo.executeSparqlQuery(anyString(), anyString())).thenReturn(body); + + CentralityComputation comp = new CentralityComputation(repo); + CentralityComputation.Result r = comp.computeAndPersist("table"); + + assertEquals(3, r.nodesScored()); + assertTrue(r.converged()); + + ArgumentCaptor update = ArgumentCaptor.forClass(String.class); + verify(repo, times(1)).executeSparqlUpdate(update.capture()); + String sparqlUpdate = update.getValue(); + assertTrue( + sparqlUpdate.contains( + "WITH "), + "Should write to the entityType-specific named graph: " + sparqlUpdate); + 
assertTrue(sparqlUpdate.contains("om:centralityScore")); + assertTrue(sparqlUpdate.contains("om:centralityRank")); + assertTrue(sparqlUpdate.contains("DELETE")); + assertTrue(sparqlUpdate.contains("INSERT DATA")); + // All three nodes referenced + assertTrue(sparqlUpdate.contains("urn:a")); + assertTrue(sparqlUpdate.contains("urn:b")); + assertTrue(sparqlUpdate.contains("urn:c")); + } + + @Test + @DisplayName("Star topology: hub gets the highest persisted score (rank 1)") + void hubGetsRank1() { + RdfRepository repo = mock(RdfRepository.class); + StringBuilder body = new StringBuilder("{\"results\":{\"bindings\":["); + for (int i = 0; i < 5; i++) { + if (i > 0) body.append(","); + body.append("{\"from\":{\"value\":\"urn:leaf-") + .append(i) + .append( + "\"},\"to\":{\"value\":\"urn:hub\"},\"predicate\":{\"value\":\"prov:wasDerivedFrom\"}}"); + } + body.append("]}}"); + when(repo.executeSparqlQuery(anyString(), anyString())).thenReturn(body.toString()); + + CentralityComputation comp = new CentralityComputation(repo); + comp.computeAndPersist("table"); + + ArgumentCaptor update = ArgumentCaptor.forClass(String.class); + verify(repo).executeSparqlUpdate(update.capture()); + String sparql = update.getValue(); + // The hub should appear before any leaf in the INSERT block (sorted by score desc). 
+ int hubIdx = sparql.indexOf("urn:hub"); + int firstLeafIdx = sparql.indexOf("urn:leaf-"); + assertTrue(hubIdx > 0 && firstLeafIdx > 0); + assertTrue(hubIdx < firstLeafIdx, "Hub should be persisted first (highest score)"); + } + + @Test + @DisplayName("Bad entityType is rejected before any SPARQL is sent") + void badEntityTypeRejected() { + RdfRepository repo = mock(RdfRepository.class); + CentralityComputation comp = new CentralityComputation(repo); + + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, () -> comp.computeAndPersist("table OR 1=1")); + verify(repo, never()).executeSparqlQuery(anyString(), anyString()); + verify(repo, never()).executeSparqlUpdate(anyString()); + } + + @Test + @DisplayName("Repository SPARQL throws → empty result, no update attempted") + void repositoryThrows() { + RdfRepository repo = mock(RdfRepository.class); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenThrow(new RuntimeException("Fuseki unreachable")); + CentralityComputation comp = new CentralityComputation(repo); + + CentralityComputation.Result r = comp.computeAndPersist("table"); + assertEquals(0, r.nodesScored()); + verify(repo, never()).executeSparqlUpdate(anyString()); + } + + @Test + @DisplayName("Persisted graph URI uses lowercase entityType") + void graphUriLowercase() { + RdfRepository repo = mock(RdfRepository.class); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn( + "{\"results\":{\"bindings\":[" + + "{\"from\":{\"value\":\"urn:a\"},\"to\":{\"value\":\"urn:b\"},\"predicate\":{\"value\":\"prov:wasDerivedFrom\"}}" + + "]}}"); + new CentralityComputation(repo).computeAndPersist("Dashboard"); + verify(repo) + .executeSparqlUpdate( + contains("")); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilderTest.java new file mode 
100644 index 000000000000..a47c7809dd86 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CoOccurrenceQueryBuilderTest.java @@ -0,0 +1,130 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +class CoOccurrenceQueryBuilderTest { + + @Nested + @DisplayName("Tag co-occurrence") + class TagCoOccurrence { + + @Test + @DisplayName("Generated query parses with Jena and selects the expected vars") + void parses() { + Query q = QueryFactory.create(CoOccurrenceQueryBuilder.tagCoOccurrence(2, 20)); + assertTrue(q.isSelectType()); + assertTrue(q.getResultVars().contains("tagA")); + assertTrue(q.getResultVars().contains("tagB")); + assertTrue(q.getResultVars().contains("count")); + } + + @Test + @DisplayName("Pairs are canonicalised so each pair appears once") + void canonicalised() { + String sparql = CoOccurrenceQueryBuilder.tagCoOccurrence(2, 20); + assertTrue( + sparql.contains("STR(?tagA) < STR(?tagB)"), + "Without canonical pair filter (a, b) and (b, a) would each appear"); + } + + @Test + @DisplayName("HAVING enforces the minimum count") + void havingPresent() { + String sparql = CoOccurrenceQueryBuilder.tagCoOccurrence(5, 20); + assertTrue(sparql.contains("HAVING (COUNT(?entity) >= 5)")); + } + + @Test + @DisplayName("minCount below 1 is clamped to 1") + void minCountClampedLow() { + String sparql = CoOccurrenceQueryBuilder.tagCoOccurrence(-100, 10); + assertTrue(sparql.contains("HAVING (COUNT(?entity) >= 1)")); + } + + @Test + @DisplayName("limit above MAX_LIMIT is clamped down") + void limitClampedHigh() { + String sparql = CoOccurrenceQueryBuilder.tagCoOccurrence(2, 9999); + assertTrue(sparql.endsWith("LIMIT " + CoOccurrenceQueryBuilder.MAX_LIMIT)); + } + + @Test + 
@DisplayName("limit below 1 is clamped to 1") + void limitClampedLow() { + String sparql = CoOccurrenceQueryBuilder.tagCoOccurrence(2, 0); + assertTrue(sparql.endsWith("LIMIT 1")); + } + + @Test + @DisplayName("ORDER BY DESC(?count) is present") + void orderByCount() { + assertTrue(CoOccurrenceQueryBuilder.tagCoOccurrence(2, 20).contains("ORDER BY DESC(?count)")); + } + } + + @Nested + @DisplayName("Glossary reach") + class GlossaryReach { + + @Test + @DisplayName("Generated query parses with Jena and counts DISTINCT domains") + void parses() { + Query q = QueryFactory.create(CoOccurrenceQueryBuilder.glossaryReach(2, 20)); + assertTrue(q.isSelectType()); + assertTrue(q.getResultVars().contains("term")); + assertTrue(q.getResultVars().contains("domainCount")); + String sparql = CoOccurrenceQueryBuilder.glossaryReach(2, 20); + assertTrue(sparql.contains("COUNT(DISTINCT ?domain)")); + } + + @Test + @DisplayName("HAVING enforces minDomains floor; clamped values are used") + void minDomainsClamped() { + assertTrue( + CoOccurrenceQueryBuilder.glossaryReach(-5, 20) + .contains("HAVING (COUNT(DISTINCT ?domain) >= 1)")); + assertTrue( + CoOccurrenceQueryBuilder.glossaryReach(7, 20) + .contains("HAVING (COUNT(DISTINCT ?domain) >= 7)")); + } + + @Test + @DisplayName("Joins om:hasGlossaryTerm with om:hasDomain on the same entity") + void joinsCorrectly() { + String sparql = CoOccurrenceQueryBuilder.glossaryReach(2, 20); + assertTrue(sparql.contains("?entity om:hasGlossaryTerm ?term")); + assertTrue(sparql.contains("?entity om:hasDomain ?domain")); + } + } + + @Nested + @DisplayName("Tag popularity") + class TagPopularity { + + @Test + @DisplayName("Generated query parses, counts DISTINCT entities, no HAVING required") + void parses() { + Query q = QueryFactory.create(CoOccurrenceQueryBuilder.tagPopularity(20)); + assertTrue(q.isSelectType()); + assertTrue(q.getResultVars().contains("tag")); + assertTrue(q.getResultVars().contains("entityCount")); + String sparql = 
CoOccurrenceQueryBuilder.tagPopularity(20); + assertTrue(sparql.contains("COUNT(DISTINCT ?entity)")); + } + + @Test + @DisplayName("limit clamping behaves like the other builders") + void clamping() { + assertTrue(CoOccurrenceQueryBuilder.tagPopularity(0).endsWith("LIMIT 1")); + assertTrue( + CoOccurrenceQueryBuilder.tagPopularity(9999) + .endsWith("LIMIT " + CoOccurrenceQueryBuilder.MAX_LIMIT)); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CommunityComputationTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CommunityComputationTest.java new file mode 100644 index 000000000000..21dfcdbed92b --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/CommunityComputationTest.java @@ -0,0 +1,306 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.contains; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.Map; +import org.apache.jena.query.QueryFactory; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.openmetadata.service.rdf.RdfRepository; + +class CommunityComputationTest { + + @Nested + @DisplayName("GraphType parsing") + class GraphTypeParsing { + + @Test + @DisplayName("Defaults to lineage when null/blank") + void defaults() { + assertEquals( + CommunityComputation.GraphType.LINEAGE, CommunityComputation.GraphType.parse(null)); + assertEquals( + 
CommunityComputation.GraphType.LINEAGE, CommunityComputation.GraphType.parse("")); + assertEquals( + CommunityComputation.GraphType.LINEAGE, CommunityComputation.GraphType.parse(" ")); + } + + @Test + @DisplayName("Aliases for tagCoOccurrence are accepted") + void tagAliases() { + assertEquals( + CommunityComputation.GraphType.TAG_CO_OCCURRENCE, + CommunityComputation.GraphType.parse("tagCoOccurrence")); + assertEquals( + CommunityComputation.GraphType.TAG_CO_OCCURRENCE, + CommunityComputation.GraphType.parse("tags")); + assertEquals( + CommunityComputation.GraphType.TAG_CO_OCCURRENCE, + CommunityComputation.GraphType.parse("TAG")); + assertEquals( + CommunityComputation.GraphType.TAG_CO_OCCURRENCE, + CommunityComputation.GraphType.parse("tag-co-occurrence")); + } + + @Test + @DisplayName("Unknown values are rejected") + void unknown() { + assertThrows( + IllegalArgumentException.class, + () -> CommunityComputation.GraphType.parse("citation_network")); + } + } + + @Nested + @DisplayName("SPARQL well-formedness") + class WellFormed { + + @Test + @DisplayName("Lineage graph SPARQL parses with Jena and uses both upstream/downstream edges") + void lineageSparqlParses() { + String sparql = CommunityComputation.lineageGraphSparql("Table"); + QueryFactory.create(sparql); + assertTrue(sparql.contains("prov:wasDerivedFrom")); + assertTrue(sparql.contains("om:upstream")); + assertTrue(sparql.contains("om:downstream")); + assertTrue(sparql.contains("a om:Table")); + } + + @Test + @DisplayName("Tag co-occurrence SPARQL parses and groups by pair with COUNT(?shared)") + void tagSparqlParses() { + String sparql = CommunityComputation.tagCoOccurrenceSparql("Dashboard"); + QueryFactory.create(sparql); + assertTrue(sparql.contains("om:hasTag")); + assertTrue(sparql.contains("om:hasGlossaryTerm")); + assertTrue(sparql.contains("COUNT(?shared)")); + assertTrue( + sparql.contains("STR(?from) < STR(?to)"), + "Pairs must be canonicalized to avoid double-counting"); + } + + @Test + 
@DisplayName("Listing SPARQL targets the entityType-specific named graph") + void listingSparqlTargetsNamedGraph() { + String sparql = CommunityComputation.listingSparql("table", "lineage"); + QueryFactory.create(sparql); + assertTrue( + sparql.contains( + "FROM ")); + assertTrue(sparql.contains("ORDER BY DESC(?size)")); + } + + @Test + @DisplayName("Listing SPARQL rejects bad entity / graph type before emitting SPARQL") + void listingSparqlValidatesInput() { + assertThrows( + IllegalArgumentException.class, + () -> CommunityComputation.listingSparql("table OR 1=1", "lineage")); + assertThrows( + IllegalArgumentException.class, + () -> CommunityComputation.listingSparql("table", "elsewhere")); + } + } + + @Nested + @DisplayName("SPARQL JSON → adjacency parsing") + class GraphParsing { + + private static String row(String from, String to, String weight) { + String w = weight == null ? "" : ",\"weight\":{\"value\":\"" + weight + "\"}"; + return "{\"from\":{\"value\":\"" + from + "\"},\"to\":{\"value\":\"" + to + "\"}" + w + "}"; + } + + private static String body(String... rows) { + return "{\"results\":{\"bindings\":[" + String.join(",", rows) + "]}}"; + } + + @Test + @DisplayName("Empty / null / blank input → empty adjacency") + void empty() { + assertTrue(CommunityComputation.parseGraph(null).isEmpty()); + assertTrue(CommunityComputation.parseGraph("").isEmpty()); + assertTrue(CommunityComputation.parseGraph("not json").isEmpty()); + assertTrue(CommunityComputation.parseGraph("{\"results\":{\"bindings\":[]}}").isEmpty()); + } + + @Test + @DisplayName("Single edge yields one directed entry (Louvain symmetrizes internally)") + void directedSingleEdge() { + Map> g = + CommunityComputation.parseGraph(body(row("urn:a", "urn:b", "1.0"))); + assertEquals(1.0, g.get("urn:a").get("urn:b")); + // The target node is registered (so Louvain sees it) but the reverse weight is NOT added — + // Louvain.addAllEdges adds both directions to its internal adjacency. 
Duplicating here + // would double-count every edge weight. + assertTrue(g.containsKey("urn:b"), "target node must be a key so Louvain enumerates it"); + assertTrue( + g.get("urn:b") == null || !g.get("urn:b").containsKey("urn:a"), + "reverse-direction weight must not be populated by parseGraph"); + } + + @Test + @DisplayName("Missing weight defaults to 1.0") + void weightDefaults() { + Map> g = + CommunityComputation.parseGraph(body(row("urn:a", "urn:b", null))); + assertEquals(1.0, g.get("urn:a").get("urn:b")); + } + + @Test + @DisplayName("Self-loops are dropped") + void selfLoopsDropped() { + Map> g = + CommunityComputation.parseGraph(body(row("urn:a", "urn:a", "1.0"))); + assertTrue(g.isEmpty()); + } + + @Test + @DisplayName("Non-positive weights are dropped") + void nonPositiveDropped() { + Map> g = + CommunityComputation.parseGraph( + body(row("urn:a", "urn:b", "0"), row("urn:c", "urn:d", "-1"))); + assertTrue(g.isEmpty()); + } + + @Test + @DisplayName("Non-numeric weights fall back to default and the edge is kept") + void nonNumericFallback() { + Map> g = + CommunityComputation.parseGraph(body(row("urn:a", "urn:b", "garbage"))); + assertEquals(1.0, g.get("urn:a").get("urn:b")); + } + } + + @Nested + @DisplayName("End-to-end computeAndPersist") + class EndToEnd { + + @Test + @DisplayName("Empty graph: returns 0 communities, no SPARQL UPDATE") + void emptyGraph() { + RdfRepository repo = mock(RdfRepository.class); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenReturn("{\"results\":{\"bindings\":[]}}"); + CommunityComputation comp = new CommunityComputation(repo); + CommunityComputation.Result r = comp.computeAndPersist("table", "lineage"); + assertEquals(0, r.communities()); + assertEquals(0, r.membersTotal()); + verify(repo, never()).executeSparqlUpdate(anyString()); + } + + @Test + @DisplayName("Two cliques persist as two om:Community resources with correct members") + void twoCliques() { + RdfRepository repo = mock(RdfRepository.class); + 
String body = + "{\"results\":{\"bindings\":[" + + edge("urn:a", "urn:b") + + "," + + edge("urn:b", "urn:c") + + "," + + edge("urn:a", "urn:c") + + "," + + edge("urn:x", "urn:y") + + "," + + edge("urn:y", "urn:z") + + "," + + edge("urn:x", "urn:z") + + "," + + edge("urn:c", "urn:x") + + "]}}"; + when(repo.executeSparqlQuery(anyString(), anyString())).thenReturn(body); + + CommunityComputation.Result r = + new CommunityComputation(repo).computeAndPersist("table", "lineage"); + assertEquals(2, r.communities()); + assertEquals(6, r.membersTotal()); + + ArgumentCaptor update = ArgumentCaptor.forClass(String.class); + verify(repo, times(1)).executeSparqlUpdate(update.capture()); + String sparql = update.getValue(); + assertTrue(sparql.contains("om:Community")); + assertTrue(sparql.contains("om:hasMember")); + assertTrue(sparql.contains("om:communitySize")); + assertTrue(sparql.contains("om:modularity")); + assertTrue( + sparql.contains( + "WITH "), + "Must persist into lineage/table named graph: " + sparql); + assertTrue(sparql.contains("urn:a")); + assertTrue(sparql.contains("urn:z")); + } + + @Test + @DisplayName("Bad entity type is rejected before any SPARQL is sent") + void badEntityTypeRejected() { + RdfRepository repo = mock(RdfRepository.class); + assertThrows( + IllegalArgumentException.class, + () -> new CommunityComputation(repo).computeAndPersist("foo OR 1=1", "lineage")); + verify(repo, never()).executeSparqlQuery(anyString(), anyString()); + } + + @Test + @DisplayName("Unknown graphType is rejected before any SPARQL is sent") + void badGraphType() { + RdfRepository repo = mock(RdfRepository.class); + assertThrows( + IllegalArgumentException.class, + () -> new CommunityComputation(repo).computeAndPersist("table", "weather")); + verify(repo, never()).executeSparqlQuery(anyString(), anyString()); + } + + @Test + @DisplayName("SPARQL exception during extraction → empty result, no update") + void sparqlError() { + RdfRepository repo = mock(RdfRepository.class); 
+ when(repo.executeSparqlQuery(anyString(), anyString())) + .thenThrow(new RuntimeException("Fuseki down")); + CommunityComputation.Result r = + new CommunityComputation(repo).computeAndPersist("table", "lineage"); + assertEquals(0, r.communities()); + verify(repo, never()).executeSparqlUpdate(anyString()); + } + + @Test + @DisplayName("Tag-co-occurrence run uses the tagCoOccurrence named graph") + void tagsTargetsTagGraph() { + RdfRepository repo = mock(RdfRepository.class); + String body = "{\"results\":{\"bindings\":[" + edgeWithWeight("urn:a", "urn:b", 3.0) + "]}}"; + when(repo.executeSparqlQuery(anyString(), anyString())).thenReturn(body); + + new CommunityComputation(repo).computeAndPersist("table", "tagCoOccurrence"); + verify(repo) + .executeSparqlUpdate( + contains( + "")); + } + + private static String edge(String from, String to) { + return "{\"from\":{\"value\":\"" + from + "\"},\"to\":{\"value\":\"" + to + "\"}}"; + } + + private static String edgeWithWeight(String from, String to, double w) { + return "{\"from\":{\"value\":\"" + + from + + "\"},\"to\":{\"value\":\"" + + to + + "\"},\"weight\":{\"value\":\"" + + w + + "\"}}"; + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilderTest.java new file mode 100644 index 000000000000..b10aca7ed5ce --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/ImportanceQueryBuilderTest.java @@ -0,0 +1,209 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import 
org.junit.jupiter.api.Test; + +/** + * Tests for {@link ImportanceQueryBuilder}. Covers: + * + *
    + *
  1. Input validation — bad entityType / window must be rejected, not sanitized silently; + * out-of-range limits are clamped into [1, 100] rather than rejected. + *
  2. SPARQL well-formedness — every produced query must parse with Jena (catches typos in + * string interpolation). + *
  3. Score formula — the projected SELECT contains the right predicates and the BIND for + * score blends usage and downstream count with the right weights. + *
+ */ +class ImportanceQueryBuilderTest { + + @Nested + @DisplayName("Input validation") + class InputValidation { + + @Test + @DisplayName("Null entityType is rejected") + void nullEntityType() { + assertThrows( + IllegalArgumentException.class, () -> ImportanceQueryBuilder.build(null, "daily", 10)); + } + + @Test + @DisplayName("Blank entityType is rejected") + void blankEntityType() { + assertThrows( + IllegalArgumentException.class, () -> ImportanceQueryBuilder.build(" ", "daily", 10)); + } + + @Test + @DisplayName("Non-alphanumeric entityType is rejected (defends against SPARQL injection)") + void injectionAttemptRejected() { + assertThrows( + IllegalArgumentException.class, + () -> ImportanceQueryBuilder.build("table> ; DROP --", "daily", 10)); + assertThrows( + IllegalArgumentException.class, + () -> ImportanceQueryBuilder.build("table OR 1=1", "daily", 10)); + assertThrows( + IllegalArgumentException.class, + () -> ImportanceQueryBuilder.build("ta'ble", "daily", 10)); + } + + @Test + @DisplayName("Unknown window is rejected") + void unknownWindow() { + assertThrows( + IllegalArgumentException.class, + () -> ImportanceQueryBuilder.build("table", "yearly", 10)); + } + + @Test + @DisplayName("Window is normalized to lowercase") + void windowLowercased() { + String q = ImportanceQueryBuilder.build("table", "DAILY", 10); + assertTrue(q.contains("usageDailyPercentile")); + } + + @Test + @DisplayName("Empty / null window defaults to daily") + void windowDefaults() { + String q1 = ImportanceQueryBuilder.build("table", null, 10); + String q2 = ImportanceQueryBuilder.build("table", "", 10); + assertTrue(q1.contains("usageDailyPercentile")); + assertTrue(q2.contains("usageDailyPercentile")); + } + + @Test + @DisplayName("Limit below 1 is clamped to 1") + void limitClampedLow() { + String q = ImportanceQueryBuilder.build("table", "daily", -5); + assertTrue(q.endsWith("LIMIT 1"), "Got: " + q); + } + + @Test + @DisplayName("Limit above 100 is clamped to 100") + void 
limitClampedHigh() { + String q = ImportanceQueryBuilder.build("table", "daily", 9999); + assertTrue(q.endsWith("LIMIT 100")); + } + + @Test + @DisplayName("Default limit is 20") + void defaultLimit() { + assertEquals(20, ImportanceQueryBuilder.DEFAULT_LIMIT); + } + } + + @Nested + @DisplayName("Class-name capitalization") + class ClassCapitalization { + + @Test + @DisplayName("entityType is capitalized to match the OWL class") + void capitalize() { + assertTrue(ImportanceQueryBuilder.build("table", "daily", 10).contains("a om:Table")); + assertTrue(ImportanceQueryBuilder.build("dashboard", "daily", 10).contains("a om:Dashboard")); + assertTrue(ImportanceQueryBuilder.build("pipeline", "daily", 10).contains("a om:Pipeline")); + } + + @Test + @DisplayName("Already-capitalized entityType stays capitalized") + void alreadyCapitalized() { + assertTrue(ImportanceQueryBuilder.build("Table", "daily", 10).contains("a om:Table")); + } + } + + @Nested + @DisplayName("SPARQL well-formedness") + class WellFormed { + + @Test + @DisplayName("Generated query parses as a valid SPARQL Query (Jena)") + void parsesWithJena() { + Query q = QueryFactory.create(ImportanceQueryBuilder.build("table", "daily", 20)); + assertTrue(q.isSelectType(), "Expected SELECT query"); + assertTrue(q.getResultVars().contains("entity")); + assertTrue(q.getResultVars().contains("score")); + assertTrue(q.getResultVars().contains("usagePct")); + assertTrue(q.getResultVars().contains("downstreamCount")); + } + + @Test + @DisplayName("Generated query uses ORDER BY DESC(score)") + void orderByScore() { + String body = ImportanceQueryBuilder.build("table", "daily", 20); + assertTrue(body.contains("ORDER BY DESC(?score)"), "Got: " + body); + } + } + + @Nested + @DisplayName("Score formula") + class ScoreFormula { + + @Test + @DisplayName("Score blends usage (0.6) and downstream (0.4) with centrality reserved at 0.0") + void weightsExpressed() { + String body = ImportanceQueryBuilder.build("table", "daily", 20); + 
assertTrue(body.contains("0.6 * ?usageNorm"), "Got: " + body); + assertTrue(body.contains("0.4 * ?downstreamNorm"), "Got: " + body); + assertTrue( + body.contains("0.0 * ?centralityNorm"), + "Centrality term must be reserved at 0.0 until 3.1.b ships its PageRank fallback"); + } + + @Test + @DisplayName("Usage percentile is divided by 100 to land in 0-1") + void usageNormalizedTo01() { + assertTrue( + ImportanceQueryBuilder.build("table", "daily", 20) + .contains("COALESCE(?usagePct, 0.0) / 100.0")); + } + + @Test + @DisplayName("Downstream count is normalized by max-downstream-count via subquery") + void downstreamNormalizedByMax() { + String body = ImportanceQueryBuilder.build("table", "daily", 20); + assertTrue(body.contains("MAX(?dc) AS ?maxDownstream")); + assertTrue(body.contains("xsd:double(?downstreamCount) / xsd:double(?maxDownstream)")); + assertTrue( + body.contains("IF(?maxDownstream > 0,"), + "Must guard against division by zero when no entity has downstream lineage"); + } + } + + @Nested + @DisplayName("Window selection") + class WindowSelection { + + @Test + @DisplayName("daily window queries usageDailyPercentile") + void daily() { + assertTrue( + ImportanceQueryBuilder.build("table", "daily", 20) + .contains("om:usageDailyPercentile ?usagePct")); + } + + @Test + @DisplayName("weekly window queries usageWeeklyPercentile") + void weekly() { + assertTrue( + ImportanceQueryBuilder.build("table", "weekly", 20) + .contains("om:usageWeeklyPercentile ?usagePct")); + } + + @Test + @DisplayName("monthly window queries usageMonthlyPercentile") + void monthly() { + assertTrue( + ImportanceQueryBuilder.build("table", "monthly", 20) + .contains("om:usageMonthlyPercentile ?usagePct")); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathBuilderTest.java new file mode 100644 index 000000000000..c082f13fbcaf 
--- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathBuilderTest.java @@ -0,0 +1,274 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +/** + * Tests for {@link LineagePathBuilder}. Covers: + * + *
    + *
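  1. Input validation — bad URI / direction values must throw, not silently sanitize; hop + * budgets are clamped instead (null or below 1 falls back to DEFAULT_MAX_HOPS, values above + * HARD_MAX_HOPS are capped). + *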
  2. SPARQL well-formedness — every produced query must parse with Jena. + *
  3. Predicate UNIONs change with direction so the caller can swap walk modes without + * branching elsewhere. + *
+ */ +class LineagePathBuilderTest { + + @Nested + @DisplayName("URI validation") + class UriValidation { + + @Test + @DisplayName("Null URI is rejected") + void nullUri() { + assertThrows( + IllegalArgumentException.class, () -> LineagePathBuilder.validateNodeUri("from", null)); + } + + @Test + @DisplayName("Blank URI is rejected") + void blankUri() { + assertThrows( + IllegalArgumentException.class, () -> LineagePathBuilder.validateNodeUri("from", " ")); + } + + @Test + @DisplayName("Relative URI is rejected") + void relativeUri() { + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.validateNodeUri("from", "/relative/path")); + } + + @Test + @DisplayName("Non-http scheme is rejected") + void nonHttpScheme() { + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.validateNodeUri("from", "file:///etc/passwd")); + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.validateNodeUri("from", "ftp://example.com/x")); + } + + @Test + @DisplayName("URI with angle bracket is rejected (defense against SPARQL injection)") + void angleBracketRejected() { + assertThrows( + IllegalArgumentException.class, + () -> + LineagePathBuilder.validateNodeUri( + "from", "https://example.com/x> ; DROP GRAPH LineagePathBuilder.validateNodeUri("from", "https://x.com/y\nINSERT")); + } + + @Test + @DisplayName("Malformed URI is rejected") + void malformedUri() { + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.validateNodeUri("from", "https://[badhost")); + } + + @Test + @DisplayName("Valid http(s) URIs are accepted and trimmed") + void valid() { + assertEquals( + "https://open-metadata.org/instance/Table/abc", + LineagePathBuilder.validateNodeUri( + "from", " https://open-metadata.org/instance/Table/abc ")); + assertEquals( + "http://localhost:3030/foo", + LineagePathBuilder.validateNodeUri("from", "http://localhost:3030/foo")); + } + } + + @Nested + @DisplayName("Direction parsing") + class 
DirectionParsing { + + @Test + @DisplayName("Defaults to upstream when null/blank") + void defaultsToUpstream() { + assertEquals(LineagePathBuilder.Direction.UPSTREAM, LineagePathBuilder.Direction.parse(null)); + assertEquals(LineagePathBuilder.Direction.UPSTREAM, LineagePathBuilder.Direction.parse("")); + assertEquals(LineagePathBuilder.Direction.UPSTREAM, LineagePathBuilder.Direction.parse(" ")); + } + + @Test + @DisplayName("Recognized values map case-insensitively") + void recognized() { + assertEquals( + LineagePathBuilder.Direction.UPSTREAM, LineagePathBuilder.Direction.parse("UPSTREAM")); + assertEquals( + LineagePathBuilder.Direction.DOWNSTREAM, + LineagePathBuilder.Direction.parse("Downstream")); + assertEquals(LineagePathBuilder.Direction.BOTH, LineagePathBuilder.Direction.parse(" both ")); + } + + @Test + @DisplayName("Unknown direction is rejected") + void unknown() { + assertThrows( + IllegalArgumentException.class, () -> LineagePathBuilder.Direction.parse("sideways")); + } + } + + @Nested + @DisplayName("Hop budget clamping") + class HopBudgetClamping { + + @Test + @DisplayName("Null and < 1 fall back to default") + void defaults() { + assertEquals(LineagePathBuilder.DEFAULT_MAX_HOPS, LineagePathBuilder.clampMaxHops(null)); + assertEquals(LineagePathBuilder.DEFAULT_MAX_HOPS, LineagePathBuilder.clampMaxHops(0)); + assertEquals(LineagePathBuilder.DEFAULT_MAX_HOPS, LineagePathBuilder.clampMaxHops(-9)); + } + + @Test + @DisplayName("Values within [1, HARD_MAX_HOPS] are passed through") + void passthrough() { + assertEquals(1, LineagePathBuilder.clampMaxHops(1)); + assertEquals(7, LineagePathBuilder.clampMaxHops(7)); + assertEquals( + LineagePathBuilder.HARD_MAX_HOPS, + LineagePathBuilder.clampMaxHops(LineagePathBuilder.HARD_MAX_HOPS)); + } + + @Test + @DisplayName("Values above HARD_MAX_HOPS are clamped down") + void clampedHigh() { + assertEquals(LineagePathBuilder.HARD_MAX_HOPS, LineagePathBuilder.clampMaxHops(9999)); + } + } + + @Nested + 
@DisplayName("Frontier SPARQL well-formedness") + class FrontierSparql { + + @Test + @DisplayName("Empty frontier is rejected before SPARQL is built") + void emptyFrontier() { + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.frontierQuery(List.of(), LineagePathBuilder.Direction.UPSTREAM)); + assertThrows( + IllegalArgumentException.class, + () -> LineagePathBuilder.frontierQuery(null, LineagePathBuilder.Direction.UPSTREAM)); + } + + @Test + @DisplayName("Bad URI in the frontier is rejected before SPARQL is built") + void badFrontierUri() { + assertThrows( + IllegalArgumentException.class, + () -> + LineagePathBuilder.frontierQuery( + List.of("not a uri"), LineagePathBuilder.Direction.UPSTREAM)); + } + + @Test + @DisplayName("Upstream query parses and contains both upstream predicates") + void upstreamQueryParses() { + String sparql = + LineagePathBuilder.frontierQuery( + List.of("https://x.com/a"), LineagePathBuilder.Direction.UPSTREAM); + Query q = QueryFactory.create(sparql); + assertTrue(q.isSelectType()); + assertTrue(q.getResultVars().contains("from")); + assertTrue(q.getResultVars().contains("to")); + assertTrue(q.getResultVars().contains("predicate")); + assertTrue(sparql.contains("prov:wasDerivedFrom")); + assertTrue(sparql.contains("om:upstream")); + } + + @Test + @DisplayName("Downstream query inverts prov:wasDerivedFrom") + void downstreamInverts() { + String sparql = + LineagePathBuilder.frontierQuery( + List.of("https://x.com/a"), LineagePathBuilder.Direction.DOWNSTREAM); + QueryFactory.create(sparql); + assertTrue(sparql.contains("?to prov:wasDerivedFrom ?from")); + assertTrue(sparql.contains("om:downstream")); + assertTrue( + sparql.contains("\"^prov:wasDerivedFrom\""), + "Inverted edge must be labelled as ^prov:wasDerivedFrom so callers can render direction"); + } + + @Test + @DisplayName("Both direction emits all four predicate variants") + void bothEmitsAll() { + String sparql = + LineagePathBuilder.frontierQuery( + 
List.of("https://x.com/a"), LineagePathBuilder.Direction.BOTH); + QueryFactory.create(sparql); + assertTrue(sparql.contains("prov:wasDerivedFrom")); + assertTrue(sparql.contains("om:upstream")); + assertTrue(sparql.contains("om:downstream")); + assertTrue(sparql.contains("^prov:wasDerivedFrom")); + } + + @Test + @DisplayName("Multi-node frontier produces VALUES with all URIs") + void multiNodeValues() { + String sparql = + LineagePathBuilder.frontierQuery( + List.of("https://x.com/a", "https://x.com/b", "https://x.com/c"), + LineagePathBuilder.Direction.UPSTREAM); + QueryFactory.create(sparql); + assertTrue(sparql.contains("")); + assertTrue(sparql.contains("")); + assertTrue(sparql.contains("")); + } + + @Test + @DisplayName("Self-loops are filtered out") + void selfLoopsFiltered() { + String sparql = + LineagePathBuilder.frontierQuery( + List.of("https://x.com/a"), LineagePathBuilder.Direction.UPSTREAM); + assertTrue( + sparql.contains("FILTER(?to != ?from)"), + "Self-loops shouldn't pollute BFS — must be filtered server-side"); + } + } + + @Nested + @DisplayName("Types SPARQL") + class TypesSparql { + + @Test + @DisplayName("Empty input is rejected") + void empty() { + assertThrows(IllegalArgumentException.class, () -> LineagePathBuilder.typesQuery(List.of())); + assertThrows(IllegalArgumentException.class, () -> LineagePathBuilder.typesQuery(null)); + } + + @Test + @DisplayName("Only om: types are returned (filter is present)") + void omFilter() { + String sparql = LineagePathBuilder.typesQuery(List.of("https://x.com/a")); + QueryFactory.create(sparql); + assertTrue(sparql.contains("STRSTARTS"), "Must filter to om-namespaced types"); + assertTrue(sparql.contains("https://open-metadata.org/ontology/")); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathFinderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathFinderTest.java new file mode 100644 index 
000000000000..a8ae15df2a15 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LineagePathFinderTest.java @@ -0,0 +1,423 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.openmetadata.service.rdf.RdfRepository; + +/** + * Tests for {@link LineagePathFinder}. Two kinds of tests: + * + *
    + *
  1. Pure parsing — {@link LineagePathFinder#parseFrontierResult} and + * {@link LineagePathFinder#parseTypesResult} given hand-built SPARQL JSON. + *
  2. End-to-end BFS — {@link LineagePathFinder#findPath} against a mocked {@link RdfRepository}. + * The mock dispatches on SPARQL shape (frontier vs types) and on which frontier URIs are + * embedded in the query, so each test reads as a tiny graph definition. + *
+ */ +class LineagePathFinderTest { + + private static final String EMPTY = "{\"results\":{\"bindings\":[]}}"; + + private static String frontierRow(String from, String to, String predicate) { + return "{\"from\":{\"value\":\"" + + from + + "\"},\"to\":{\"value\":\"" + + to + + "\"},\"predicate\":{\"value\":\"" + + predicate + + "\"}}"; + } + + private static String frontierBody(String... rows) { + return "{\"results\":{\"bindings\":[" + String.join(",", rows) + "]}}"; + } + + private static String typesRow(String node, String type) { + return "{\"node\":{\"value\":\"" + node + "\"},\"type\":{\"value\":\"" + type + "\"}}"; + } + + private static String typesBody(String... rows) { + return "{\"results\":{\"bindings\":[" + String.join(",", rows) + "]}}"; + } + + @Nested + @DisplayName("Frontier result parsing") + class FrontierParsing { + + @Test + @DisplayName("Empty / null / blank input → empty map") + void empty() { + Set visited = new HashSet<>(); + assertTrue(LineagePathFinder.parseFrontierResult(null, visited).isEmpty()); + assertTrue(LineagePathFinder.parseFrontierResult("", visited).isEmpty()); + assertTrue(LineagePathFinder.parseFrontierResult("not json", visited).isEmpty()); + assertTrue( + LineagePathFinder.parseFrontierResult("{\"results\":{\"bindings\":[]}}", visited) + .isEmpty()); + } + + @Test + @DisplayName("Already-visited 'to' nodes are dropped") + void visitedDropped() { + Set visited = Set.of("urn:b"); + Map next = + LineagePathFinder.parseFrontierResult( + frontierBody(frontierRow("urn:a", "urn:b", "prov:wasDerivedFrom")), visited); + assertTrue(next.isEmpty(), "Already-visited target must not be re-added to next frontier"); + } + + @Test + @DisplayName("First parent wins when multiple frontier rows mention same target") + void firstParentWins() { + Set visited = new HashSet<>(); + Map next = + LineagePathFinder.parseFrontierResult( + frontierBody( + frontierRow("urn:a", "urn:b", "prov:wasDerivedFrom"), + frontierRow("urn:c", "urn:b", 
"om:upstream")), + visited); + assertEquals("urn:a", next.get("urn:b").parent()); + assertEquals("prov:wasDerivedFrom", next.get("urn:b").predicate()); + } + + @Test + @DisplayName("Rows with missing fields are skipped") + void partialRows() { + String partial = "{\"results\":{\"bindings\":[{\"from\":{\"value\":\"urn:a\"}}]}}"; + assertTrue(LineagePathFinder.parseFrontierResult(partial, new HashSet<>()).isEmpty()); + } + } + + @Nested + @DisplayName("Types result parsing") + class TypesParsing { + + @Test + @DisplayName("Empty / null / blank input → empty map") + void empty() { + assertTrue(LineagePathFinder.parseTypesResult(null).isEmpty()); + assertTrue(LineagePathFinder.parseTypesResult("").isEmpty()); + assertTrue(LineagePathFinder.parseTypesResult("garbage").isEmpty()); + } + + @Test + @DisplayName("Multiple types per node are aggregated and deduplicated") + void multipleTypes() { + String body = + typesBody( + typesRow("urn:t", "https://open-metadata.org/ontology/Table"), + typesRow("urn:t", "https://open-metadata.org/ontology/DataAsset"), + typesRow("urn:t", "https://open-metadata.org/ontology/Table")); + Map> result = LineagePathFinder.parseTypesResult(body); + List types = result.get("urn:t"); + assertEquals(2, types.size(), "Duplicate types must be dropped"); + assertTrue(types.contains("https://open-metadata.org/ontology/Table")); + assertTrue(types.contains("https://open-metadata.org/ontology/DataAsset")); + } + } + + @Nested + @DisplayName("End-to-end BFS") + class EndToEnd { + + private static final String A = "https://open-metadata.org/instance/Table/a"; + private static final String B = "https://open-metadata.org/instance/Table/b"; + private static final String C = "https://open-metadata.org/instance/Table/c"; + private static final String D = "https://open-metadata.org/instance/Table/d"; + + private RdfRepository mockRepo() { + RdfRepository repo = mock(RdfRepository.class); + lenient().when(repo.executeSparqlQuery(anyString(), 
anyString())).thenReturn(EMPTY); + return repo; + } + + @Test + @DisplayName("from == to: trivial single-node path returned immediately") + void trivialIdentity() { + RdfRepository repo = mockRepo(); + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, A, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(0, path.hops()); + assertEquals(1, path.nodes().size()); + assertEquals(A, path.nodes().get(0).node()); + assertNull(path.nodes().get(0).predicate()); + } + + @Test + @DisplayName("Direct neighbour found in one hop") + void directNeighbour() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, B, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(1, path.hops()); + assertEquals(List.of(A, B), nodeUris(path)); + assertNull(path.nodes().get(0).predicate()); + assertEquals("prov:wasDerivedFrom", path.nodes().get(1).predicate()); + } + + @Test + @DisplayName("Multi-hop A → B → C → D resolves with three predicate-tagged hops") + void multiHop() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + if (sparql.contains("<" + B + ">")) { + return frontierBody(frontierRow(B, C, "om:upstream")); + } + if (sparql.contains("<" + C + ">")) { + return frontierBody(frontierRow(C, D, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + 
LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, D, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(3, path.hops()); + assertEquals(List.of(A, B, C, D), nodeUris(path)); + assertEquals("prov:wasDerivedFrom", path.nodes().get(1).predicate()); + assertEquals("om:upstream", path.nodes().get(2).predicate()); + assertEquals("prov:wasDerivedFrom", path.nodes().get(3).predicate()); + } + + @Test + @DisplayName("BFS prefers shorter paths even when a longer one is also reachable") + void bfsShortest() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody( + frontierRow(A, B, "prov:wasDerivedFrom"), + frontierRow(A, D, "prov:wasDerivedFrom")); + } + if (sparql.contains("<" + B + ">") || sparql.contains("<" + D + ">")) { + return frontierBody(frontierRow(B, C, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, D, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(1, path.hops(), "BFS must take the direct A→D edge, not the A→B→D detour"); + } + + @Test + @DisplayName("Cycle A → B → A does not loop forever; resolves to nearest path") + void cycleSafe() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + if (sparql.contains("<" + B + ">")) { + return frontierBody( + frontierRow(B, A, "prov:wasDerivedFrom"), + frontierRow(B, C, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new 
LineagePathFinder(repo).findPath(A, C, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(2, path.hops()); + assertEquals(List.of(A, B, C), nodeUris(path)); + } + + @Test + @DisplayName("Disconnected target: BFS exhausts the frontier and reports found=false") + void disconnected() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, D, LineagePathBuilder.Direction.UPSTREAM, 6); + assertFalse(path.found()); + assertEquals(0, path.hops()); + assertTrue(path.nodes().isEmpty()); + assertEquals(A, path.from()); + assertEquals(D, path.to()); + } + + @Test + @DisplayName("maxHops budget is honoured; deeper targets are not found") + void maxHopsBudget() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + if (sparql.contains("<" + B + ">")) { + return frontierBody(frontierRow(B, C, "prov:wasDerivedFrom")); + } + if (sparql.contains("<" + C + ">")) { + return frontierBody(frontierRow(C, D, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, D, LineagePathBuilder.Direction.UPSTREAM, 2); + assertFalse(path.found(), "Three hops cannot fit in a budget of two"); + assertEquals(2, path.maxHops()); + } + + @Test + @DisplayName("SPARQL exception during frontier expansion → not-found, no exception bubbles") + void sparqlError() { + 
RdfRepository repo = mock(RdfRepository.class); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenThrow(new RuntimeException("Fuseki down")); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, B, LineagePathBuilder.Direction.UPSTREAM, 6); + assertFalse(path.found()); + } + + @Test + @DisplayName("Bad URI: validation fires before any SPARQL is sent") + void badUri() { + RdfRepository repo = mock(RdfRepository.class); + LineagePathFinder finder = new LineagePathFinder(repo); + assertThrows( + IllegalArgumentException.class, + () -> finder.findPath("not a uri", B, LineagePathBuilder.Direction.UPSTREAM, 6)); + assertThrows( + IllegalArgumentException.class, + () -> finder.findPath(A, "ftp://x.com/y", LineagePathBuilder.Direction.UPSTREAM, 6)); + } + + @Test + @DisplayName("Type decoration: each path node carries its om: rdf:types") + void typeDecoration() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) { + return typesBody( + typesRow(A, "https://open-metadata.org/ontology/Table"), + typesRow(A, "https://open-metadata.org/ontology/DataAsset"), + typesRow(B, "https://open-metadata.org/ontology/Table")); + } + if (sparql.contains("<" + A + ">") && sparql.contains("?from ?to ?predicate")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, B, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found()); + assertEquals(2, path.nodes().get(0).rdfTypes().size()); + assertEquals(1, path.nodes().get(1).rdfTypes().size()); + } + + @Test + @DisplayName("Type decoration failure does not break the path response") + void typeDecorationFailure() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String 
sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) { + throw new RuntimeException("Fuseki blip on type query"); + } + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + + LineagePathFinder.Path path = + new LineagePathFinder(repo).findPath(A, B, LineagePathBuilder.Direction.UPSTREAM, 6); + assertTrue(path.found(), "Path must still be returned even if type decoration blows up"); + assertEquals(2, path.nodes().size()); + assertNotNull(path.nodes().get(0).rdfTypes()); + assertTrue(path.nodes().get(0).rdfTypes().isEmpty()); + } + + @Test + @DisplayName("Direction defaults to upstream when null is passed") + void directionDefault() { + RdfRepository repo = mockRepo(); + when(repo.executeSparqlQuery(anyString(), anyString())) + .thenAnswer( + inv -> { + String sparql = inv.getArgument(0); + if (sparql.contains("?node ?type")) return typesBody(); + if (sparql.contains("<" + A + ">")) { + return frontierBody(frontierRow(A, B, "prov:wasDerivedFrom")); + } + return EMPTY; + }); + LineagePathFinder.Path path = new LineagePathFinder(repo).findPath(A, B, null, 6); + assertTrue(path.found()); + assertEquals("upstream", path.direction()); + } + } + + private static List nodeUris(LineagePathFinder.Path path) { + return path.nodes().stream().map(LineagePathFinder.Hop::node).toList(); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LouvainTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LouvainTest.java new file mode 100644 index 000000000000..a3c49bca8e73 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/LouvainTest.java @@ -0,0 +1,233 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static 
org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +class LouvainTest { + + private static Map> g(Object... edges) { + Map> graph = new LinkedHashMap<>(); + for (int i = 0; i < edges.length; i += 3) { + String from = (String) edges[i]; + String to = (String) edges[i + 1]; + double w = ((Number) edges[i + 2]).doubleValue(); + graph.computeIfAbsent(from, k -> new HashMap<>()).put(to, w); + graph.computeIfAbsent(to, k -> new HashMap<>()); + } + return graph; + } + + @Nested + @DisplayName("Constructor / input guards") + class Guards { + + @Test + @DisplayName("maxIterations < 1 is rejected") + void badMaxIterations() { + assertThrows(IllegalArgumentException.class, () -> new Louvain(0)); + assertThrows(IllegalArgumentException.class, () -> new Louvain(-3)); + } + + @Test + @DisplayName("Null and empty graphs return empty result with modularity 0") + void emptyInputs() { + Louvain.Result r1 = new Louvain().compute(null); + assertTrue(r1.communityByNode().isEmpty()); + assertEquals(0.0, r1.modularity()); + Louvain.Result r2 = new Louvain().compute(Map.of()); + assertTrue(r2.communityByNode().isEmpty()); + } + + @Test + @DisplayName("Graph with only self-loops produces singletons (self-loops ignored)") + void onlySelfLoops() { + Map> graph = + new LinkedHashMap<>( + Map.of( + "a", new HashMap<>(Map.of("a", 1.0)), + "b", new HashMap<>(Map.of("b", 1.0)))); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(2, r.communityCount(), "Self-loops carry no community signal"); + } + + @Test + @DisplayName("Negative weights are clamped to zero") + void negativeWeights() { + Louvain.Result r = new Louvain().compute(g("a", "b", -100.0)); + assertEquals(2, r.communityCount(), 
"Negative-weight edges must not pull nodes together"); + } + } + + @Nested + @DisplayName("Topology → community structure") + class Topology { + + @Test + @DisplayName("Triangle (three nodes, three edges): everyone joins one community") + void triangle() { + Map> graph = g("a", "b", 1.0, "b", "c", 1.0, "a", "c", 1.0); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(1, r.communityCount()); + } + + @Test + @DisplayName("Two cliques connected by a single light edge: two communities") + void twoCliques() { + Map> graph = + g( + "a", "b", 1.0, "b", "c", 1.0, "a", "c", 1.0, "x", "y", 1.0, "y", "z", 1.0, "x", "z", + 1.0, "c", "x", 0.01); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(2, r.communityCount()); + assertEquals(r.communityByNode().get("a"), r.communityByNode().get("b")); + assertEquals(r.communityByNode().get("a"), r.communityByNode().get("c")); + assertEquals(r.communityByNode().get("x"), r.communityByNode().get("y")); + assertEquals(r.communityByNode().get("x"), r.communityByNode().get("z")); + assertNotEquals(r.communityByNode().get("a"), r.communityByNode().get("x")); + } + + @Test + @DisplayName("Disconnected components yield distinct communities") + void disconnectedComponents() { + Map> graph = g("a", "b", 1.0, "c", "d", 1.0, "e", "f", 1.0); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(3, r.communityCount()); + } + + @Test + @DisplayName("Star (hub + leaves) collapses to a single community") + void star() { + Map> graph = new LinkedHashMap<>(); + graph.put("hub", new HashMap<>()); + for (int i = 0; i < 6; i++) { + graph.computeIfAbsent("hub", k -> new HashMap<>()).put("leaf-" + i, 1.0); + graph.put("leaf-" + i, new HashMap<>()); + } + Louvain.Result r = new Louvain().compute(graph); + assertEquals(1, r.communityCount()); + } + + @Test + @DisplayName("Heavy edges pull nodes together against light competing edges") + void edgeWeightsRespected() { + Map> graph = g("a", "b", 100.0, "a", "c", 0.1, "c", 
"d", 100.0); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(r.communityByNode().get("a"), r.communityByNode().get("b")); + assertEquals(r.communityByNode().get("c"), r.communityByNode().get("d")); + assertNotEquals(r.communityByNode().get("a"), r.communityByNode().get("c")); + } + } + + @Nested + @DisplayName("Symmetrization") + class Symmetrization { + + @Test + @DisplayName("Asymmetric input is treated as undirected") + void asymmetricInput() { + Map> graph = new LinkedHashMap<>(); + graph.put("a", new HashMap<>(Map.of("b", 5.0))); + graph.put("b", new HashMap<>()); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(1, r.communityCount(), "Single edge a→b should still pull a, b together"); + } + + @Test + @DisplayName("Both-directions input doesn't double-influence") + void bothDirectionsSum() { + Map> graph = g("a", "b", 5.0, "b", "a", 5.0); + Louvain.Result r = new Louvain().compute(graph); + assertEquals(1, r.communityCount()); + } + } + + @Nested + @DisplayName("Determinism") + class Determinism { + + @Test + @DisplayName("Repeated runs on the same input produce the same partition") + void deterministic() { + Map> graph = + g( + "a", "b", 1.0, "b", "c", 1.0, "c", "a", 1.0, "x", "y", 1.0, "y", "z", 1.0, "z", "x", + 1.0, "a", "x", 0.05); + Louvain.Result r1 = new Louvain().compute(graph); + Louvain.Result r2 = new Louvain().compute(graph); + assertEquals(r1.communityByNode(), r2.communityByNode()); + assertEquals(r1.modularity(), r2.modularity()); + } + + @Test + @DisplayName("Community ids are dense [0..k-1] in discovery order") + void denseIds() { + Map> graph = g("a", "b", 1.0, "c", "d", 1.0); + Louvain.Result r = new Louvain().compute(graph); + List ids = r.communityByNode().values().stream().distinct().sorted().toList(); + assertEquals(List.of(0, 1), ids); + } + } + + @Nested + @DisplayName("Modularity behaviour") + class Modularity { + + @Test + @DisplayName("Tight clusters produce higher modularity than mixed input") + 
void clustersHaveHigherQ() { + Map> tight = + g( + "a", "b", 1.0, "b", "c", 1.0, "c", "a", 1.0, "x", "y", 1.0, "y", "z", 1.0, "z", "x", + 1.0, "a", "x", 0.05); + Map> mixed = + g( + "a", "b", 1.0, "a", "c", 1.0, "a", "d", 1.0, "a", "e", 1.0, "a", "f", 1.0, "a", "g", + 1.0); + double qTight = new Louvain().compute(tight).modularity(); + double qMixed = new Louvain().compute(mixed).modularity(); + assertTrue( + qTight > qMixed, + "Two-clique partition must score higher modularity than a star (got tight=" + + qTight + + ", mixed=" + + qMixed + + ")"); + } + } + + @Nested + @DisplayName("Result helpers") + class ResultHelpers { + + @Test + @DisplayName("membersByCommunity is the inverse view of communityByNode") + void membersByCommunity() { + Map> graph = + g( + "a", "b", 1.0, "b", "c", 1.0, "a", "c", 1.0, "x", "y", 1.0, "y", "z", 1.0, "x", "z", + 1.0, "c", "x", 0.01); + Louvain.Result r = new Louvain().compute(graph); + Map> members = r.membersByCommunity(); + assertEquals(2, members.size()); + int totalMembers = members.values().stream().mapToInt(List::size).sum(); + assertEquals(6, totalMembers); + } + + @Test + @DisplayName("Iteration count is non-zero whenever any edge exists") + void iterationsAdvance() { + Louvain.Result r = new Louvain().compute(g("a", "b", 1.0)); + assertTrue(r.iterations() >= 1); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/PageRankTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/PageRankTest.java new file mode 100644 index 000000000000..b6de7085aa87 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/PageRankTest.java @@ -0,0 +1,274 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import 
java.util.LinkedHashMap; +import java.util.Map; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +/** + * Correctness + edge-case tests for the hand-rolled PageRank implementation. + * + *

<p>Each test names the property under test and uses a small known graph so the expected + * scores can be reasoned about directly. We don't assert exact values from the literature + * (those depend on damping factor and tolerance choices) — instead we assert qualitative + * properties that must hold: + * + *
<ul> + *
  <li>scores are normalized (sum to 1.0) + *
  <li>nodes with more incoming weight rank higher + *
  <li>dangling nodes get a non-zero score (mass redistribution) + *
  <li>disconnected components both contribute to the result + *
</ul>
+ */ +class PageRankTest { + + private static Map> g() { + return new LinkedHashMap<>(); + } + + private static void edge(Map> g, String from, String to, double w) { + g.computeIfAbsent(from, k -> new HashMap<>()).put(to, w); + } + + @Nested + @DisplayName("Constructor validation") + class Construction { + + @Test + @DisplayName("Damping outside (0,1) is rejected") + void invalidDamping() { + assertThrows(IllegalArgumentException.class, () -> new PageRank(0.0, 100, 1e-6)); + assertThrows(IllegalArgumentException.class, () -> new PageRank(1.0, 100, 1e-6)); + assertThrows(IllegalArgumentException.class, () -> new PageRank(-0.5, 100, 1e-6)); + } + + @Test + @DisplayName("Non-positive maxIterations is rejected") + void invalidIterations() { + assertThrows(IllegalArgumentException.class, () -> new PageRank(0.85, 0, 1e-6)); + assertThrows(IllegalArgumentException.class, () -> new PageRank(0.85, -1, 1e-6)); + } + + @Test + @DisplayName("Non-positive tolerance is rejected") + void invalidTolerance() { + assertThrows(IllegalArgumentException.class, () -> new PageRank(0.85, 100, 0.0)); + assertThrows(IllegalArgumentException.class, () -> new PageRank(0.85, 100, -1.0)); + } + } + + @Nested + @DisplayName("Edge cases") + class EdgeCases { + + @Test + @DisplayName("Empty graph returns empty result, zero iterations") + void emptyGraph() { + PageRank.Result r = new PageRank().compute(g()); + assertTrue(r.scores().isEmpty()); + assertEquals(0, r.iterations()); + assertTrue(r.converged()); + } + + @Test + @DisplayName("Single node with no edges → score 1.0") + void singleNode() { + Map> g = g(); + g.put("A", new HashMap<>()); + PageRank.Result r = new PageRank().compute(g); + assertEquals(1.0, r.scores().get("A"), 1e-9); + } + + @Test + @DisplayName("Two disconnected nodes get equal score 0.5") + void twoDisconnected() { + Map> g = g(); + g.put("A", new HashMap<>()); + g.put("B", new HashMap<>()); + PageRank.Result r = new PageRank().compute(g); + assertEquals(0.5, 
r.scores().get("A"), 1e-9); + assertEquals(0.5, r.scores().get("B"), 1e-9); + } + + @Test + @DisplayName("Dangling target node still receives a score") + void danglingTarget() { + Map> g = g(); + edge(g, "A", "B", 1.0); + PageRank.Result r = new PageRank().compute(g); + assertTrue(r.scores().containsKey("A")); + assertTrue(r.scores().containsKey("B")); + assertTrue(r.scores().get("B") > 0); + assertTrue( + r.scores().get("B") > r.scores().get("A"), + "B has incoming edge from A so should score higher than A"); + } + + @Test + @DisplayName("Self-loop on a single node → still normalized") + void selfLoop() { + Map> g = g(); + edge(g, "A", "A", 1.0); + PageRank.Result r = new PageRank().compute(g); + assertEquals(1.0, r.scores().get("A"), 1e-9); + } + + @Test + @DisplayName("Edge with zero weight contributes nothing") + void zeroWeightEdgeIgnored() { + Map> g = g(); + edge(g, "A", "B", 0.0); + // A is effectively dangling; A and B should split mass via dangling redistribution. + PageRank.Result r = new PageRank().compute(g); + assertEquals(0.5, r.scores().get("A"), 1e-3); + assertEquals(0.5, r.scores().get("B"), 1e-3); + } + } + + @Nested + @DisplayName("Output normalization") + class Normalization { + + @Test + @DisplayName("Scores sum to 1.0 across the graph") + void scoresSumToOne() { + Map> g = g(); + edge(g, "A", "B", 1.0); + edge(g, "B", "C", 1.0); + edge(g, "C", "A", 1.0); + edge(g, "C", "B", 1.0); + PageRank.Result r = new PageRank().compute(g); + double total = 0; + for (double v : r.scores().values()) total += v; + assertEquals(1.0, total, 1e-6); + } + + @Test + @DisplayName("Symmetric graph produces equal scores") + void symmetricGraph() { + Map> g = g(); + edge(g, "A", "B", 1.0); + edge(g, "B", "A", 1.0); + PageRank.Result r = new PageRank().compute(g); + assertEquals(r.scores().get("A"), r.scores().get("B"), 1e-6); + } + } + + @Nested + @DisplayName("Ranking properties") + class RankingProperties { + + @Test + @DisplayName("Hub node (many incoming edges) 
ranks highest") + void hubRanksHighest() { + Map> g = g(); + // A, B, C, D all point at HUB. HUB has no outgoing edges. + edge(g, "A", "HUB", 1.0); + edge(g, "B", "HUB", 1.0); + edge(g, "C", "HUB", 1.0); + edge(g, "D", "HUB", 1.0); + PageRank.Result r = new PageRank().compute(g); + double hub = r.scores().get("HUB"); + for (String n : new String[] {"A", "B", "C", "D"}) { + assertTrue(hub > r.scores().get(n), "HUB > " + n + ": " + r.scores()); + } + } + + @Test + @DisplayName("Edge weight matters: heavy-weighted target outranks lightly-weighted target") + void edgeWeightMatters() { + Map> g = g(); + // SOURCE → HEAVY (weight 10), SOURCE → LIGHT (weight 0.1) + edge(g, "SOURCE", "HEAVY", 10.0); + edge(g, "SOURCE", "LIGHT", 0.1); + PageRank.Result r = new PageRank().compute(g); + assertTrue( + r.scores().get("HEAVY") > r.scores().get("LIGHT"), + "Heavy edge should outrank light edge: " + r.scores()); + } + + @Test + @DisplayName("Star topology — center outranks every leaf") + void starTopology() { + Map> g = g(); + for (int i = 0; i < 10; i++) { + edge(g, "leaf-" + i, "center", 1.0); + } + PageRank.Result r = new PageRank().compute(g); + double center = r.scores().get("center"); + for (int i = 0; i < 10; i++) { + assertTrue(center > r.scores().get("leaf-" + i)); + } + } + + @Test + @DisplayName("Two-component graph: each component ranks consistently within itself") + void twoComponents() { + Map> g = g(); + // Component 1: A → B → C (chain) + edge(g, "A", "B", 1.0); + edge(g, "B", "C", 1.0); + // Component 2: X → Y → X (cycle) + edge(g, "X", "Y", 1.0); + edge(g, "Y", "X", 1.0); + PageRank.Result r = new PageRank().compute(g); + assertTrue(r.scores().get("X") > 0); + assertTrue(r.scores().get("Y") > 0); + assertTrue(r.scores().get("C") > 0, "Dangling end of chain still gets mass"); + // Within the cycle, X and Y should be equal + assertEquals(r.scores().get("X"), r.scores().get("Y"), 1e-6); + } + } + + @Nested + @DisplayName("Convergence") + class Convergence { + + 
@Test + @DisplayName("Small graphs converge in well under maxIterations") + void convergesQuickly() { + Map> g = g(); + edge(g, "A", "B", 1.0); + edge(g, "B", "A", 1.0); + PageRank.Result r = new PageRank().compute(g); + assertTrue(r.converged()); + assertTrue(r.iterations() < 50, "Took too many iterations: " + r.iterations()); + } + + @Test + @DisplayName("Tight tolerance still converges") + void tightTolerance() { + Map> g = g(); + edge(g, "A", "B", 1.0); + edge(g, "B", "C", 1.0); + edge(g, "C", "A", 1.0); + PageRank.Result r = new PageRank(0.85, 1000, 1e-12).compute(g); + assertTrue(r.converged()); + } + + @Test + @DisplayName("maxIterations=1 returns without converging on a non-trivial graph") + void maxIterationsHonored() { + Map> g = g(); + // Hub-and-spoke needs more than 1 iteration to settle. + for (int i = 0; i < 10; i++) edge(g, "leaf-" + i, "center", 1.0); + PageRank.Result r = new PageRank(0.85, 1, 1e-12).compute(g); + assertEquals(1, r.iterations()); + } + } + + @Test + @DisplayName("nodes() helper returns the union of sources and targets") + void nodesHelper() { + Map> g = g(); + edge(g, "A", "B", 1.0); + edge(g, "B", "C", 1.0); + assertEquals(java.util.Set.of("A", "B", "C"), PageRank.nodes(g)); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/RecommendationsQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/RecommendationsQueryBuilderTest.java new file mode 100644 index 000000000000..9204d00fe957 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/insights/RecommendationsQueryBuilderTest.java @@ -0,0 +1,108 @@ +package org.openmetadata.service.rdf.insights; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; +import org.junit.jupiter.api.DisplayName; +import 
org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +class RecommendationsQueryBuilderTest { + + private static final String URI = "https://open-metadata.org/instance/Table/abc"; + + @Nested + @DisplayName("Input validation") + class InputValidation { + + @Test + @DisplayName("Bad URI is rejected before SPARQL is generated") + void badUri() { + assertThrows( + IllegalArgumentException.class, () -> RecommendationsQueryBuilder.build(null, 10)); + assertThrows( + IllegalArgumentException.class, () -> RecommendationsQueryBuilder.build(" ", 10)); + assertThrows( + IllegalArgumentException.class, + () -> RecommendationsQueryBuilder.build("ftp://x.com/y", 10)); + assertThrows( + IllegalArgumentException.class, + () -> RecommendationsQueryBuilder.build("http://x.com/y> ; DROP", 10)); + } + + @Test + @DisplayName("Limit below 1 is clamped to 1") + void limitClampedLow() { + String sparql = RecommendationsQueryBuilder.build(URI, -10); + assertTrue(sparql.endsWith("LIMIT 1")); + } + + @Test + @DisplayName("Limit above MAX_LIMIT is clamped down") + void limitClampedHigh() { + String sparql = RecommendationsQueryBuilder.build(URI, 9999); + assertTrue(sparql.endsWith("LIMIT " + RecommendationsQueryBuilder.MAX_LIMIT)); + } + } + + @Nested + @DisplayName("SPARQL well-formedness") + class WellFormed { + + @Test + @DisplayName("Generated query parses with Jena and selects the expected vars") + void parses() { + Query q = QueryFactory.create(RecommendationsQueryBuilder.build(URI, 10)); + assertTrue(q.isSelectType()); + assertTrue(q.getResultVars().contains("candidate")); + assertTrue(q.getResultVars().contains("tagOverlap")); + assertTrue(q.getResultVars().contains("glossaryOverlap")); + assertTrue(q.getResultVars().contains("lineageOverlap")); + assertTrue(q.getResultVars().contains("score")); + } + + @Test + @DisplayName("Lineage neighbour predicates cover both directions and the prov inverse") + void lineagePredicates() { + String sparql = 
RecommendationsQueryBuilder.build(URI, 10); + assertTrue(sparql.contains("om:upstream")); + assertTrue(sparql.contains("om:downstream")); + assertTrue(sparql.contains("prov:wasDerivedFrom")); + assertTrue(sparql.contains("^prov:wasDerivedFrom")); + } + + @Test + @DisplayName("Score formula uses the documented weights and adds three terms") + void scoreFormula() { + String sparql = RecommendationsQueryBuilder.build(URI, 10); + assertTrue(sparql.contains(Double.toString(RecommendationsQueryBuilder.WEIGHT_TAG))); + assertTrue(sparql.contains(Double.toString(RecommendationsQueryBuilder.WEIGHT_GLOSSARY))); + assertTrue(sparql.contains(Double.toString(RecommendationsQueryBuilder.WEIGHT_LINEAGE))); + assertTrue(sparql.contains("ORDER BY DESC(?score)")); + } + + @Test + @DisplayName("Each sub-SELECT excludes the seed itself") + void excludesSeed() { + String sparql = RecommendationsQueryBuilder.build(URI, 10); + long filterCount = + sparql + .lines() + .filter(line -> line.contains("FILTER(?candidate != <" + URI + ">)")) + .count(); + assertTrue(filterCount >= 3, "All three sub-SELECTs must filter out the seed itself"); + } + + @Test + @DisplayName("Outer GROUP BY/SUM combines per-dimension partial counts") + void groupBySum() { + String sparql = RecommendationsQueryBuilder.build(URI, 10); + assertTrue(sparql.contains("SUM(?t)")); + assertTrue(sparql.contains("SUM(?g)")); + assertTrue(sparql.contains("SUM(?l)")); + assertTrue(sparql.contains("GROUP BY ?candidate")); + } + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/translator/RdfUsageMapperTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/translator/RdfUsageMapperTest.java new file mode 100644 index 000000000000..d33b95881a2c --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/translator/RdfUsageMapperTest.java @@ -0,0 +1,144 @@ +package org.openmetadata.service.rdf.translator; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +class RdfUsageMapperTest { + + private static final String OM = "https://open-metadata.org/ontology/"; + + private ObjectMapper mapper; + private Model model; + private Resource entity; + + @BeforeEach + void setUp() { + mapper = new ObjectMapper(); + model = ModelFactory.createDefaultModel(); + entity = model.createResource("https://open-metadata.org/entity/table/abc"); + } + + @Test + @DisplayName("Full usage summary emits count + percentile triples for daily/weekly/monthly") + void fullUsageSummary() { + ObjectNode usage = mapper.createObjectNode(); + putStats(usage, "dailyStats", 1234, 92.5); + putStats(usage, "weeklyStats", 8500, 88.0); + putStats(usage, "monthlyStats", 35_000, 90.1); + usage.put("date", "2026-04-29"); + + RdfUsageMapper.emitUsageSummary(usage, entity, model); + + assertCount("usageDailyCount", 1234); + assertCount("usageWeeklyCount", 8500); + assertCount("usageMonthlyCount", 35_000); + assertPercentile("usageDailyPercentile", 92.5); + assertPercentile("usageWeeklyPercentile", 88.0); + assertPercentile("usageMonthlyPercentile", 90.1); + + Property usageDate = model.createProperty(OM, "usageDate"); + assertTrue(model.contains(entity, usageDate)); + assertEquals("2026-04-29", model.getProperty(entity, usageDate).getString()); + } + + @Test + @DisplayName("Null usageSummary is a no-op") + void nullUsage() { + RdfUsageMapper.emitUsageSummary(null, entity, model); + 
assertEquals(0, model.size()); + } + + @Test + @DisplayName("Non-object usageSummary (string, array) is a no-op") + void nonObjectUsage() { + RdfUsageMapper.emitUsageSummary(mapper.getNodeFactory().textNode("oops"), entity, model); + RdfUsageMapper.emitUsageSummary(mapper.createArrayNode(), entity, model); + assertEquals(0, model.size()); + } + + @Test + @DisplayName("Missing percentileRank is allowed — count is still emitted") + void countWithoutPercentile() { + ObjectNode usage = mapper.createObjectNode(); + ObjectNode dailyStats = mapper.createObjectNode(); + dailyStats.put("count", 42); + usage.set("dailyStats", dailyStats); + + RdfUsageMapper.emitUsageSummary(usage, entity, model); + + assertCount("usageDailyCount", 42); + assertFalse( + model.contains(entity, model.createProperty(OM, "usageDailyPercentile")), + "Percentile must not be emitted when not present"); + } + + @Test + @DisplayName("Non-numeric count or percentile is silently skipped") + void nonNumericValuesSkipped() { + ObjectNode usage = mapper.createObjectNode(); + ObjectNode bad = mapper.createObjectNode(); + bad.put("count", "not-a-number"); + bad.put("percentileRank", "very high"); + usage.set("dailyStats", bad); + + RdfUsageMapper.emitUsageSummary(usage, entity, model); + assertEquals(0, model.size(), "Non-numeric stats must be ignored, not coerced"); + } + + @Test + @DisplayName("Only weekly stats present — daily / monthly predicates absent") + void onlyWeekly() { + ObjectNode usage = mapper.createObjectNode(); + putStats(usage, "weeklyStats", 100, 50.0); + + RdfUsageMapper.emitUsageSummary(usage, entity, model); + + assertFalse(model.contains(entity, model.createProperty(OM, "usageDailyCount"))); + assertCount("usageWeeklyCount", 100); + assertFalse(model.contains(entity, model.createProperty(OM, "usageMonthlyCount"))); + } + + @Test + @DisplayName("Date is emitted as xsd:date typed literal") + void datePresent() { + ObjectNode usage = mapper.createObjectNode(); + usage.put("date", 
"2026-04-29"); + RdfUsageMapper.emitUsageSummary(usage, entity, model); + + Property usageDate = model.createProperty(OM, "usageDate"); + assertTrue( + model.contains(entity, usageDate), + "Date should be present even when no stats are recorded"); + assertEquals("2026-04-29", model.getProperty(entity, usageDate).getString()); + } + + private void putStats(ObjectNode usage, String key, long count, double percentile) { + ObjectNode stats = mapper.createObjectNode(); + stats.put("count", count); + stats.put("percentileRank", percentile); + usage.set(key, stats); + } + + private void assertCount(String predicate, long expected) { + Property p = model.createProperty(OM, predicate); + assertTrue(model.contains(entity, p), "Expected predicate " + predicate); + assertEquals(expected, model.getProperty(entity, p).getLong()); + } + + private void assertPercentile(String predicate, double expected) { + Property p = model.createProperty(OM, predicate); + assertTrue(model.contains(entity, p), "Expected predicate " + predicate); + assertEquals(expected, model.getProperty(entity, p).getDouble(), 1e-9); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/OntologyDocumentTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/OntologyDocumentTest.java new file mode 100644 index 000000000000..e6c0483e39ec --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/OntologyDocumentTest.java @@ -0,0 +1,70 @@ +package org.openmetadata.service.resources.rdf; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import jakarta.ws.rs.core.Response; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import 
org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +class OntologyDocumentTest { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String OWL_VERSION_INFO = "http://www.w3.org/2002/07/owl#versionInfo"; + + @Test + @DisplayName( + "Ontology endpoint serves Turtle by default with the bumped version and core classes") + void testServeTurtle() { + Response response = OntologyDocument.serve("turtle"); + assertEquals(200, response.getStatus()); + String body = response.getEntity().toString(); + assertNotNull(body); + + Model parsed = ModelFactory.createDefaultModel(); + RDFDataMgr.read( + parsed, + new java.io.ByteArrayInputStream(body.getBytes(java.nio.charset.StandardCharsets.UTF_8)), + Lang.TURTLE); + + assertTrue( + parsed.contains( + parsed.createResource(OM_NS), + parsed.createProperty(OWL_VERSION_INFO), + parsed.createLiteral("1.1.0")), + "Ontology document should declare owl:versionInfo \"1.1.0\" on the om: ontology"); + + assertTrue( + parsed.containsResource(parsed.createResource(OM_NS + "Column")), + "Core om:Column class must be present in the served ontology"); + assertTrue( + parsed.containsResource(parsed.createResource(OM_NS + "TableConstraint")), + "Newly added om:TableConstraint class must be present"); + assertTrue( + parsed.containsResource(parsed.createResource(OM_NS + "LineageDetails")), + "Newly declared om:LineageDetails class must be present"); + } + + @Test + @DisplayName("Ontology endpoint can render the same document as JSON-LD") + void testServeJsonLd() { + Response response = OntologyDocument.serve("jsonld"); + assertEquals(200, response.getStatus()); + assertEquals("application/ld+json", response.getMediaType().toString()); + String body = response.getEntity().toString(); + assertTrue(body.contains("@context") || body.contains("@graph")); + } + + @Test + @DisplayName("Unknown format defaults to Turtle") + void testUnknownFormatFallsBackToTurtle() { + Response response 
= OntologyDocument.serve("nonsense"); + assertEquals(200, response.getStatus()); + assertEquals("text/turtle", response.getMediaType().toString()); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/RdfShaclValidatorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/RdfShaclValidatorTest.java new file mode 100644 index 000000000000..a97d5167693f --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/resources/rdf/RdfShaclValidatorTest.java @@ -0,0 +1,105 @@ +package org.openmetadata.service.resources.rdf; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.shacl.ValidationReport; +import org.apache.jena.vocabulary.RDF; +import org.apache.jena.vocabulary.RDFS; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +class RdfShaclValidatorTest { + + private static final String OM_NS = "https://open-metadata.org/ontology/"; + private static final String BASE = "https://open-metadata.org/"; + + @Test + @DisplayName("A column-lineage edge whose om:fromColumn is a string literal violates the shape") + void testColumnLineageRejectsLiteralFromColumn() { + Model model = ModelFactory.createDefaultModel(); + Resource lineage = model.createResource(BASE + "lineageDetails/x/y/colLineage/1"); + lineage.addProperty(RDF.type, model.createResource(OM_NS + "ColumnLineage")); + // Wrong: literal where the shape requires om:Column. 
+ lineage.addProperty(model.createProperty(OM_NS, "fromColumn"), "service.db.s.t.col_a"); + lineage.addProperty( + model.createProperty(OM_NS, "toColumn"), + model.createResource(BASE + "entity/column/service.db.s.target.col_b")); + + ValidationReport report = RdfShaclValidator.validate(model); + assertFalse( + report.conforms(), + "Literal om:fromColumn should violate ColumnLineageShape (om:Column class constraint)"); + } + + @Test + @DisplayName("Properly-shaped column-lineage with URI references conforms") + void testColumnLineageAcceptsUriReferences() { + Model model = ModelFactory.createDefaultModel(); + Resource fromCol = model.createResource(BASE + "entity/column/service.db.s.t.col_a"); + fromCol.addProperty(RDF.type, model.createResource(OM_NS + "Column")); + fromCol.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), "service.db.s.t.col_a"); + fromCol.addProperty(RDFS.label, "col_a"); + + Resource toCol = model.createResource(BASE + "entity/column/service.db.s.target.col_b"); + toCol.addProperty(RDF.type, model.createResource(OM_NS + "Column")); + toCol.addProperty( + model.createProperty(OM_NS, "fullyQualifiedName"), "service.db.s.target.col_b"); + toCol.addProperty(RDFS.label, "col_b"); + + Resource lineage = model.createResource(BASE + "lineageDetails/x/y/colLineage/1"); + lineage.addProperty(RDF.type, model.createResource(OM_NS + "ColumnLineage")); + lineage.addProperty(model.createProperty(OM_NS, "fromColumn"), fromCol); + lineage.addProperty(model.createProperty(OM_NS, "toColumn"), toCol); + lineage.addProperty(model.createProperty(OM_NS, "fromColumnFqn"), "service.db.s.t.col_a"); + lineage.addProperty(model.createProperty(OM_NS, "toColumnFqn"), "service.db.s.target.col_b"); + + ValidationReport report = RdfShaclValidator.validate(model); + assertTrue( + report.conforms(), + "URI-based column lineage with both endpoints typed as om:Column should conform: " + + reportSummary(report)); + } + + @Test + @DisplayName("A TableConstraint missing 
constraintType violates TableConstraintShape") + void testTableConstraintRequiresType() { + Model model = ModelFactory.createDefaultModel(); + Resource constraint = model.createResource(BASE + "entity/table/t/constraint/0"); + constraint.addProperty(RDF.type, model.createResource(OM_NS + "TableConstraint")); + Resource col = model.createResource(BASE + "entity/column/service.db.s.t.id"); + col.addProperty(RDF.type, model.createResource(OM_NS + "Column")); + col.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), "service.db.s.t.id"); + col.addProperty(RDFS.label, "id"); + constraint.addProperty(model.createProperty(OM_NS, "hasConstrainedColumn"), col); + + ValidationReport report = RdfShaclValidator.validate(model); + assertFalse( + report.conforms(), + "TableConstraint without om:constraintType should violate TableConstraintShape minCount=1"); + } + + @Test + @DisplayName("GlossaryTerm without skos:inScheme violates GlossaryTermShape") + void testGlossaryTermRequiresInScheme() { + Model model = ModelFactory.createDefaultModel(); + Resource term = model.createResource(BASE + "entity/glossaryTerm/123"); + term.addProperty(RDF.type, model.createResource(OM_NS + "GlossaryTerm")); + term.addProperty(RDFS.label, "Customer"); + term.addProperty(model.createProperty(OM_NS, "fullyQualifiedName"), "BusinessTerms.Customer"); + + ValidationReport report = RdfShaclValidator.validate(model); + assertFalse( + report.conforms(), + "GlossaryTerm missing skos:inScheme must be flagged so we surface broken glossary memberships"); + } + + private static String reportSummary(ValidationReport report) { + StringBuilder sb = new StringBuilder(); + report.getEntries().forEach(e -> sb.append(e).append("\n")); + return sb.toString(); + } +} diff --git a/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/customOntology.json b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/customOntology.json new file mode 100644 index 
000000000000..be6fa7d6c7cc --- /dev/null +++ b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/customOntology.json @@ -0,0 +1,105 @@ +{ + "$id": "https://open-metadata.org/schema/api/configuration/rdf/customOntology.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "CustomOntology", + "description": "A user-authored extension to the canonical OpenMetadata ontology. Custom classes and properties live in the om-extension: namespace and never collide with the read-only canonical om: namespace.", + "type": "object", + "javaType": "org.openmetadata.schema.api.configuration.rdf.CustomOntology", + "definitions": { + "customClass": { + "type": "object", + "javaType": "org.openmetadata.schema.api.configuration.rdf.CustomOntologyClass", + "description": "A user-defined OWL class.", + "properties": { + "uri": { + "description": "Full URI of the class. Must start with the om-extension: namespace.", + "type": "string", + "pattern": "^https://open-metadata\\.org/ontology-extension/[A-Za-z][A-Za-z0-9_-]*$" + }, + "label": { + "description": "Human-readable label (rdfs:label).", + "type": "string" + }, + "description": { + "description": "Markdown description of the class.", + "type": "string" + }, + "subClassOf": { + "description": "Parent class URIs. May reference canonical om: classes (e.g. om:DataAsset) or other custom classes within this same extension. Must not be empty.", + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + } + }, + "required": ["uri", "subClassOf"], + "additionalProperties": false + }, + "customProperty": { + "type": "object", + "javaType": "org.openmetadata.schema.api.configuration.rdf.CustomOntologyProperty", + "description": "A user-defined OWL ObjectProperty or DatatypeProperty.", + "properties": { + "uri": { + "description": "Full URI of the property. 
Must start with the om-extension: namespace.", + "type": "string", + "pattern": "^https://open-metadata\\.org/ontology-extension/[A-Za-z][A-Za-z0-9_-]*$" + }, + "label": { + "description": "Human-readable label.", + "type": "string" + }, + "description": { + "description": "Markdown description of the property.", + "type": "string" + }, + "type": { + "description": "OWL property type.", + "type": "string", + "enum": ["ObjectProperty", "DatatypeProperty"] + }, + "domain": { + "description": "URI of the property's rdfs:domain (the class instances this property applies to).", + "type": "string" + }, + "range": { + "description": "URI of the property's rdfs:range. For DatatypeProperty, an XSD datatype URI; for ObjectProperty, a class URI.", + "type": "string" + }, + "subPropertyOf": { + "description": "Optional parent properties.", + "type": "array", + "items": { "type": "string" }, + "default": [] + } + }, + "required": ["uri", "type", "domain", "range"], + "additionalProperties": false + } + }, + "properties": { + "name": { + "description": "Stable identifier for the extension. Lowercase letters, digits, hyphen.", + "type": "string", + "pattern": "^[a-z][a-z0-9-]{1,62}[a-z0-9]$" + }, + "displayName": { "type": "string" }, + "description": { + "description": "Markdown description of the extension. 
Should explain why these classes/properties are needed.", + "type": "string" + }, + "classes": { + "description": "Custom OWL classes defined by this extension.", + "type": "array", + "items": { "$ref": "#/definitions/customClass" }, + "default": [] + }, + "properties": { + "description": "Custom OWL properties defined by this extension.", + "type": "array", + "items": { "$ref": "#/definitions/customProperty" }, + "default": [] + } + }, + "required": ["name"], + "additionalProperties": false +} diff --git a/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/inferenceRule.json b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/inferenceRule.json new file mode 100644 index 000000000000..405daafc5064 --- /dev/null +++ b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdf/inferenceRule.json @@ -0,0 +1,54 @@ +{ + "$id": "https://open-metadata.org/schema/api/configuration/rdf/inferenceRule.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "InferenceRule", + "description": "A SPARQL CONSTRUCT rule that materializes derived triples in the OpenMetadata knowledge graph (e.g. transitive lineage, PII propagation, tag inheritance).", + "type": "object", + "javaType": "org.openmetadata.schema.api.configuration.rdf.InferenceRule", + "properties": { + "name": { + "description": "Stable identifier for the rule (used as primary key). Lowercase letters, digits, hyphen.", + "type": "string", + "pattern": "^[a-z][a-z0-9-]{1,62}[a-z0-9]$" + }, + "displayName": { + "description": "Human-readable name.", + "type": "string" + }, + "description": { + "description": "What the rule does and why it is enabled. Markdown.", + "type": "string" + }, + "ruleType": { + "description": "Body language. CONSTRUCT is a SPARQL CONSTRUCT query that produces new triples. 
RDFS is a placeholder for future Jena-RDFS rule format.", + "type": "string", + "enum": ["CONSTRUCT", "RDFS"], + "default": "CONSTRUCT" + }, + "ruleBody": { + "description": "The rule body. For ruleType=CONSTRUCT, a SPARQL CONSTRUCT query that emits the inferred triples.", + "type": "string", + "minLength": 16 + }, + "enabled": { + "description": "Whether the rule is currently active. Disabled rules are loaded but not applied.", + "type": "boolean", + "default": true + }, + "priority": { + "description": "Execution order hint. Lower numbers run first. Rules at the same priority run in name order.", + "type": "integer", + "default": 100, + "minimum": 0, + "maximum": 10000 + }, + "tags": { + "description": "Free-form labels (e.g. 'lineage', 'security', 'governance') for filtering in admin UI.", + "type": "array", + "items": { "type": "string" }, + "default": [] + } + }, + "required": ["name", "ruleBody"], + "additionalProperties": false +} diff --git a/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdfConfiguration.json b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdfConfiguration.json index ec778d921d22..252f61f98d57 100644 --- a/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdfConfiguration.json +++ b/openmetadata-spec/src/main/resources/json/schema/api/configuration/rdfConfiguration.json @@ -105,6 +105,29 @@ "description": "Cache inferred triples for better query performance (requires more storage)", "type": "boolean", "default": false + }, + "federation": { + "description": "Controls federated SPARQL access (SERVICE clauses) to external endpoints. Federation is disabled by default; SERVICE clauses are rejected unless the target URI is in the allowlist.", + "type": "object", + "javaType": "org.openmetadata.schema.api.configuration.rdf.SparqlFederationConfig", + "properties": { + "enabled": { + "description": "Master switch for federated SPARQL. 
When false, every SERVICE clause is rejected regardless of allowlist contents.", + "type": "boolean", + "default": false + }, + "allowedEndpoints": { + "description": "External SPARQL endpoint URIs that may appear in SERVICE clauses. Compared verbatim against the URI in the SERVICE clause; trailing slashes matter.", + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "default": [] + } + }, + "additionalProperties": false, + "default": null } }, "required": ["enabled", "storageType"], diff --git a/openmetadata-spec/src/main/resources/rdf/contexts/ai.jsonld b/openmetadata-spec/src/main/resources/rdf/contexts/ai.jsonld new file mode 100644 index 000000000000..67f6bd883dda --- /dev/null +++ b/openmetadata-spec/src/main/resources/rdf/contexts/ai.jsonld @@ -0,0 +1,97 @@ +{ + "@context": [ + "./base.jsonld", + { + "LLMModel": { + "@id": "om:LLMModel", + "@type": "om:Entity" + }, + "AIApplication": { + "@id": "om:AIApplication", + "@type": "om:Entity" + }, + "McpServer": { + "@id": "om:McpServer", + "@type": "om:Entity" + }, + "AgentExecution": { + "@id": "om:AgentExecution", + "@type": ["om:Entity", "prov:Activity"] + }, + "McpExecution": { + "@id": "om:McpExecution", + "@type": ["om:Entity", "prov:Activity"] + }, + "PromptTemplate": { + "@id": "om:PromptTemplate", + "@type": "om:Entity" + }, + "modelType": { + "@id": "om:modelType", + "@type": "xsd:string" + }, + "modelProvider": { + "@id": "om:modelProvider", + "@type": "xsd:string" + }, + "modelVersion": { + "@id": "om:modelVersion", + "@type": "xsd:string" + }, + "applicationType": { + "@id": "om:applicationType", + "@type": "xsd:string" + }, + "developmentStage": { + "@id": "om:developmentStage", + "@type": "xsd:string" + }, + "trainingDatasets": { + "@id": "om:hasTrainingDataset", + "@type": "@id", + "@container": "@set" + }, + "validationDatasets": { + "@id": "om:hasValidationDataset", + "@type": "@id", + "@container": "@set" + }, + "models": { + "@id": "om:usesModel", + "@type": "@id", + 
"@container": "@set" + }, + "mcpServers": { + "@id": "om:usesMcpServer", + "@type": "@id", + "@container": "@set" + }, + "tools": { + "@id": "om:usesTool", + "@type": "@id", + "@container": "@set" + }, + "promptTemplates": { + "@id": "om:hasPromptTemplate", + "@type": "@id", + "@container": "@set" + }, + "application": { + "@id": "om:executedBy", + "@type": "@id" + }, + "executionStatus": { + "@id": "om:executionStatus", + "@type": "xsd:string" + }, + "startTime": { + "@id": "prov:startedAtTime", + "@type": "xsd:dateTime" + }, + "endTime": { + "@id": "prov:endedAtTime", + "@type": "xsd:dateTime" + } + } + ] +} diff --git a/openmetadata-spec/src/main/resources/rdf/contexts/automation.jsonld b/openmetadata-spec/src/main/resources/rdf/contexts/automation.jsonld new file mode 100644 index 000000000000..46691e77c992 --- /dev/null +++ b/openmetadata-spec/src/main/resources/rdf/contexts/automation.jsonld @@ -0,0 +1,47 @@ +{ + "@context": [ + "./base.jsonld", + { + "Workflow": { + "@id": "om:Workflow", + "@type": "om:Entity" + }, + "Automation": { + "@id": "om:Automation", + "@type": "om:Entity" + }, + "WorkflowDefinition": { + "@id": "om:WorkflowDefinition", + "@type": "om:Entity" + }, + "WorkflowInstance": { + "@id": "om:WorkflowInstance", + "@type": ["om:Entity", "prov:Activity"] + }, + "workflowType": { + "@id": "om:workflowType", + "@type": "xsd:string" + }, + "status": { + "@id": "om:hasStatus", + "@type": "xsd:string" + }, + "request": { + "@id": "om:automationRequest", + "@type": "@json" + }, + "response": { + "@id": "om:automationResponse", + "@type": "@json" + }, + "triggerType": { + "@id": "om:triggerType", + "@type": "xsd:string" + }, + "scheduleType": { + "@id": "om:scheduleType", + "@type": "xsd:string" + } + } + ] +} diff --git a/openmetadata-spec/src/main/resources/rdf/ontology/CHANGELOG.md b/openmetadata-spec/src/main/resources/rdf/ontology/CHANGELOG.md new file mode 100644 index 000000000000..8e9964d57a39 --- /dev/null +++ 
b/openmetadata-spec/src/main/resources/rdf/ontology/CHANGELOG.md @@ -0,0 +1,62 @@ +# OpenMetadata Ontology Changelog + +The canonical ontology lives in `openmetadata.ttl`. The PROV-aligned extension lives in +`openmetadata-prov.ttl`. SHACL shapes live in `../shapes/openmetadata-shapes.ttl`. JSON-LD contexts +live in `../contexts/`. + +The version recorded here is the value of `owl:versionInfo` on the `om:` ontology resource. + +## 1.1.0 — 2026-04-28 + +Knowledge-graph fidelity pass. Most changes are additive or domain corrections; existing consumers +that referenced the corrected domains were not actually relying on them, since the prior +declaration did not match what the mapper emitted. The exceptions are the predicate remappings listed under the "Changed" headings below. + +### Added — Column resources and column lineage + +- `om:Column` resources are now first-class named resources at FQN-derived URIs + (`baseUri + "entity/column/" + URLEncoded(FQN)`). Previously columns were blank nodes, + unreachable from SPARQL. +- `om:fromColumn` and `om:toColumn` (column lineage) are now URI references to `om:Column` + resources, not FQN string literals. The original FQN strings are retained as + `om:fromColumnFqn` / `om:toColumnFqn` for back-compatibility with consumers that match + by string. +- `om:LineageDetails` class declared (was used by the mapper but undeclared). +- `om:hasColumnLineage`, `om:transformFunction` declared. +- `om:hasChildColumn` (subproperty of `om:hasColumn`) for nested struct/map/union columns. +- Domain of `om:fromColumn` / `om:toColumn` corrected from `om:Column` to `om:ColumnLineage`. + +### Added — Table constraints + +- `om:TableConstraint` class. +- `om:hasConstraint` (`om:Table` → `om:TableConstraint`). +- `om:constraintType`, `om:relationshipType` (datatype properties). +- `om:hasConstrainedColumn`, `om:hasReferredColumn` (object properties on the constraint). 
+- `om:references` (`om:Column` → `om:Column`) — direct FK edges between source and referred + columns, paired positionally from `TableConstraint.columns` and `referredColumns`. +- `om:isUnique` datatype property on columns. +- Per-column `constraint` enum (`PRIMARY_KEY`, `UNIQUE`, `NOT_NULL`, `NULL`) now maps to the + corresponding `om:isPrimaryKey` / `om:isUnique` / `om:isNullable` triples. + +### Changed — SKOS hierarchy + +JSON-LD context `governance.jsonld`: + +- `glossary` (on a glossary term) now maps to `skos:inScheme` (was `om:belongsToGlossary`). +- `classification` (on a tag) now maps to `skos:inScheme` (was `om:belongsToClassification`). +- `parent` (on a glossary term or tag) now maps to `skos:broader` (previously unmapped). +- `children` now maps to `skos:narrower`. The prior `childTerms` alias has been removed; it + referenced a JSON field that does not exist on `GlossaryTerm`, so the mapping never fired. + +The OpenMetadata-specific predicates `om:belongsToGlossary` and `om:belongsToClassification` +were not used outside this single context file; no SPARQL queries reference them. + +### Changed — JSON-LD lineage context + +- `fromColumns` and `toColumn` in `lineage.jsonld` now map to `om:fromColumnFqn` / + `om:toColumnFqn` (datatype properties), not to `om:fromColumns` / `om:toColumn` (which + collided with the new object-typed predicates). + +## 1.0.0 — 2025-08-24 + +Initial ontology release. 
diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AppRouter/AuthenticatedAppRouter.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AppRouter/AuthenticatedAppRouter.tsx index 7150f47f6d25..ac038955f8e4 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AppRouter/AuthenticatedAppRouter.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AppRouter/AuthenticatedAppRouter.tsx @@ -208,6 +208,12 @@ const OntologyExplorerPage = withSuspenseFallback( ) ); +const SparqlPlaygroundPage = withSuspenseFallback( + React.lazy( + () => import('../../pages/SparqlPlayground/SparqlPlayground.component') + ) +); + const WorkflowsListPage = withSuspenseFallback( React.lazy( () => import('../../pages/WorkflowDefinitions/WorkflowsPage/WorkflowsPage') @@ -387,6 +393,10 @@ const AuthenticatedAppRouter: FunctionComponent = () => { element={} path={ROUTES.EXPLORE_WITH_TAB} /> + } + path={ROUTES.SPARQL_PLAYGROUND} + /> } path={ROUTES.ONTOLOGY_EXPLORER} diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx index 94ad866066b9..58bc921d5176 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx @@ -798,29 +798,27 @@ const KnowledgeGraph: React.FC = ({ ); - const knowledgeGraph = loading ? ( + const metadataModeBody = loading ? (
+ ) : hasNoData ? ( + + } + size={SIZE.X_SMALL} + type={ERROR_PLACEHOLDER_TYPE.CUSTOM}> + {t('message.no-knowledge-graph-data')} + + ) : ( graphCanvas ); - if (hasNoData && !loading) { - return ( - - } - size={SIZE.X_SMALL} - type={ERROR_PLACEHOLDER_TYPE.CUSTOM}> - {t('message.no-knowledge-graph-data')} - - - ); - } + const knowledgeGraph = metadataModeBody; if (!entity) { return ( @@ -858,9 +856,6 @@ const KnowledgeGraph: React.FC = ({ data-testid="knowledge-graph-controls" justify="between"> - - {t('label.view-entity', { entity: t('label.mode') }) + ':'} - = [ + { value: 'json', label: 'JSON (SELECT/ASK)' }, + { value: 'csv', label: 'CSV (SELECT)' }, + { value: 'tsv', label: 'TSV (SELECT)' }, + { value: 'xml', label: 'XML (SELECT/ASK)' }, + { value: 'turtle', label: 'Turtle (CONSTRUCT/DESCRIBE)' }, + { value: 'jsonld', label: 'JSON-LD (CONSTRUCT/DESCRIBE)' }, + { value: 'ntriples', label: 'N-Triples (CONSTRUCT/DESCRIBE)' }, + { value: 'rdfxml', label: 'RDF-XML (CONSTRUCT/DESCRIBE)' }, +]; + +const INFERENCE_OPTIONS: ReadonlyArray<{ + value: SparqlPlaygroundInference; + label: string; +}> = [ + { value: 'none', label: 'none' }, + { value: 'rdfs', label: 'rdfs' }, + { value: 'owl', label: 'owl' }, + { value: 'custom', label: 'custom' }, +]; + +const FORMAT_EXTENSIONS: Record = { + json: 'json', + csv: 'csv', + tsv: 'tsv', + xml: 'xml', + turtle: 'ttl', + jsonld: 'jsonld', + ntriples: 'nt', + rdfxml: 'rdf', +}; +/** + * Reads the saved-query list from localStorage under SPARQL_PLAYGROUND_STORAGE_KEY. + * + * Returns an empty array when nothing is stored, when the stored JSON is not an array, + * or when reading or parsing throws. Entries are kept only if they are non-null objects + * with a string id and a string query; name, format, inference and savedAt are NOT validated here. 
+ */ +function loadSavedQueries(): SavedSparqlQuery[] { + try { + const raw = window.localStorage.getItem(SPARQL_PLAYGROUND_STORAGE_KEY); + if (!raw) { + return []; + } + const parsed: unknown = JSON.parse(raw); + if (!Array.isArray(parsed)) { + return []; + } + + return parsed.filter( + (q): q is SavedSparqlQuery => + typeof q === 'object' && + q !== null && + typeof (q as SavedSparqlQuery).id === 'string' && + typeof (q as SavedSparqlQuery).query === 'string' + ); + } catch { + return []; + } +} +/** + * Serializes the saved-query list to JSON and writes it to localStorage + * under SPARQL_PLAYGROUND_STORAGE_KEY. + */ +function persistSavedQueries(queries: SavedSparqlQuery[]): 
void { + try { + window.localStorage.setItem( + SPARQL_PLAYGROUND_STORAGE_KEY, + JSON.stringify(queries) + ); + } catch { + /* Best-effort persistence: setItem throws when the storage quota is exceeded or storage is unavailable. This function is called from a React effect, so the error is swallowed here, matching the guarded read in loadSavedQueries; the in-memory list keeps working. */ + } +} + +/** + * Offers body to the user as a file download named filename. + * + * Wraps body in a Blob of type contentType, points a hidden temporary anchor at an + * object URL for that Blob and clicks the anchor. The anchor is removed and the object + * URL revoked from a 100 ms timer, presumably so the browser has picked up the download first. + */ +function downloadAsFile( + body: string, + contentType: string, + filename: string +): void { + const blob = new Blob([body], { type: contentType }); + const url = window.URL.createObjectURL(blob); + const link = document.createElement('a'); + link.href = url; + link.download = filename; + link.style.display = 'none'; + document.body.appendChild(link); + link.click(); + setTimeout(() => { + document.body.removeChild(link); + window.URL.revokeObjectURL(url); + }, 100); +} + +const initialQuery = `${DEFAULT_SPARQL_PREFIXES}\n\nSELECT ?s ?p ?o WHERE {\n ?s ?p ?o\n} LIMIT 10`; + +const SparqlPlayground: React.FC = () => { + const { t } = useTranslation(); + const [query, setQuery] = useState(initialQuery); + const [format, setFormat] = useState('json'); + const [inference, setInference] = useState('none'); + const [running, setRunning] = useState(false); + const [result, setResult] = useState(null); + const [errorMessage, setErrorMessage] = useState(null); + const [savedQueries, setSavedQueries] = useState(() => + loadSavedQueries() + ); + + useEffect(() => { + persistSavedQueries(savedQueries); + }, [savedQueries]); + + const handleRun = useCallback(async () => { + if (!query.trim()) { + 
setErrorMessage(t('label.sparql-empty-query-error')); + + return; + } + setRunning(true); + setErrorMessage(null); + setResult(null); + try { + const r = await runSparqlQuery({ query, format, inference }); + setResult(r); + } catch (e) { + const message = isAxiosError(e) + ? typeof e.response?.data === 'string' + ? 
e.response.data + : e.message + : (e as Error).message; + setErrorMessage(message); + showErrorToast(message); + } finally { + setRunning(false); + } + }, [query, format, inference, t]); +/* Prompts for a name and appends the current query, format and inference to the saved list; does nothing when the prompt is cancelled or the name is blank. The id comes from crypto.randomUUID() when available, otherwise from a timestamp plus a random base-36 suffix. */ + const handleSaveCurrent = useCallback(() => { + const name = window.prompt(t('label.sparql-save-prompt')); + if (!name || !name.trim()) { + return; + } + const id = + typeof crypto !== 'undefined' && 'randomUUID' in crypto + ? crypto.randomUUID() + : `${Date.now()}-${Math.random().toString(36).slice(2)}`; + setSavedQueries((prev) => [ + ...prev, + { + id, + name: name.trim(), + query, + format, + inference, + savedAt: Date.now(), + }, + ]); + showSuccessToast(t('message.sparql-query-saved')); + }, [query, format, inference, t]); +/* Restores the query, format and inference of a saved entry into the editor state. */ + const handleLoadSaved = useCallback((saved: SavedSparqlQuery) => { + setQuery(saved.query); + setFormat(saved.format); + setInference(saved.inference); + }, []); +/* Removes the saved entry whose id matches. */ + const handleDeleteSaved = useCallback((id: string) => { + setSavedQueries((prev) => prev.filter((q) => q.id !== id)); + }, []); +/* Replaces the editor text with a sample query; format and inference are left as they are. */ + const handleLoadSample = useCallback( + (sample: (typeof SAMPLE_SPARQL_QUERIES)[number]) => { + setQuery(sample.query); + }, + [] + ); +/* Downloads the body of the last result as sparql-result.EXT, where EXT is looked up in FORMAT_EXTENSIONS by the result format; no-op when there is no result. 
*/ + const handleDownload = useCallback(() => { + if (!result) { + return; + } + const filename = `sparql-result.${FORMAT_EXTENSIONS[result.format]}`; + downloadAsFile(result.body, result.contentType, filename); + }, [result]); +/* Prepends DEFAULT_SPARQL_PREFIXES to the query unless the text already contains the literal PREFIX om: declaration. */ + const handleInjectPrefixes = useCallback(() => { + if (query.includes('PREFIX om:')) { + return; + } + setQuery(`${DEFAULT_SPARQL_PREFIXES}\n\n${query}`); + }, [query]); +/* Table view model: built only for json-format results that carry a parsed body; null otherwise, in which case the raw result body is rendered instead. */ + const tabularResult = useMemo(() => { + if (!result || result.format !== 'json' || !result.parsed) { + return null; + } + const vars = result.parsed.head?.vars ?? []; + const rows = result.parsed.results?.bindings ?? []; + + return { vars, rows }; + }, [result]); + + return ( + +
+ , + url: '/', + activeTitle: true, + }, + { + name: t('label.sparql-playground'), + url: '', + }, + ]} + /> + + +
+ + {t('label.sparql-playground')} + + + {t('label.beta')} + +
+ + {t('message.sparql-playground-subtitle')} + +
+ +
+ +
+ ({ + id: o.value, + label: o.label, + }))} + size="sm" + value={inference} + onChange={(key) => + setInference(String(key) as SparqlPlaygroundInference) + } + /> + + + +
+ +
+ +
+ + {errorMessage ? ( +
+ {errorMessage} +
+ ) : null} + + {result ? ( +
+
+ + {t('label.format')}: {result.format} + + + {t('label.duration')}: {result.durationMs}ms + + +
+ {tabularResult ? ( +
+ + + + {tabularResult.vars.map((v) => ( + + ))} + + + + {tabularResult.rows.map((row, idx) => ( + + {tabularResult.vars.map((v) => { + const binding = row[v] as Binding | undefined; + + return ( + + ); + })} + + ))} + +
+ {v} +
+ {binding?.value ?? ''} +
+ {tabularResult.rows.length === 0 ? ( +
+ {t('message.sparql-no-rows')} +
+ ) : null} +
+ ) : ( +
+                    {result.body}
+                  
+ )} +
+ ) : null} +
+ + + + {t('label.sample-queries')} + +
    + {SAMPLE_SPARQL_QUERIES.map((sample) => ( +
  • + +
  • + ))} +
+ + + {t('label.saved-queries')} + + {savedQueries.length === 0 ? ( + + {t('message.sparql-no-saved-queries')} + + ) : ( +
    + {savedQueries.map((saved) => ( +
  • + + +
  • + ))} +
+ )} +
+
+
+
+ ); +}; + +export default SparqlPlayground; diff --git a/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.interface.ts b/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.interface.ts new file mode 100644 index 000000000000..01e068aea732 --- /dev/null +++ b/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.interface.ts @@ -0,0 +1,101 @@ +/* + * Copyright 2026 Collate. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { + SparqlPlaygroundFormat, + SparqlPlaygroundInference, +} from '../../rest/rdfAPI'; + +export interface SavedSparqlQuery { + id: string; + name: string; + query: string; + format: SparqlPlaygroundFormat; + inference: SparqlPlaygroundInference; + savedAt: number; +} + +export const SPARQL_PLAYGROUND_STORAGE_KEY = + 'om.sparql-playground.savedQueries'; + +export const DEFAULT_SPARQL_PREFIXES = [ + 'PREFIX om: ', + 'PREFIX dcat: ', + 'PREFIX dct: ', + 'PREFIX prov: ', + 'PREFIX skos: ', + 'PREFIX foaf: ', + 'PREFIX rdfs: ', + 'PREFIX xsd: ', + 'PREFIX dqv: ', +].join('\n'); + +/** + * Sample queries displayed in the SPARQL Playground sidebar. + * + * The visible label is an i18n key (resolved by the component via `t(nameKey)`); the + * `query` body is intentionally not translated — SPARQL is a structured language and translating + * a query template would corrupt its semantics. 
Add a new sample by appending an entry here and + * adding the matching `label.sparql-sample-*` key to en-us.json (then `yarn i18n` to sync). + */ +export const SAMPLE_SPARQL_QUERIES: ReadonlyArray<{ + nameKey: string; + query: string; +}> = [ + { + nameKey: 'label.sparql-sample-tables-tagged-pii', + query: `${DEFAULT_SPARQL_PREFIXES} + +SELECT ?table ?tableFqn ?tagFqn WHERE { + ?table a om:Table ; + om:fullyQualifiedName ?tableFqn ; + om:hasTag ?tag . + ?tag om:tagFQN ?tagFqn . + FILTER(STRSTARTS(?tagFqn, "PII.")) +} LIMIT 50`, + }, + { + nameKey: 'label.sparql-sample-fk-references', + query: `${DEFAULT_SPARQL_PREFIXES} + +SELECT ?fromCol ?fromFqn ?toCol ?toFqn WHERE { + ?fromCol om:references ?toCol ; + om:fullyQualifiedName ?fromFqn . + ?toCol om:fullyQualifiedName ?toFqn . +} LIMIT 50`, + }, + { + nameKey: 'label.sparql-sample-upstream-lineage', + query: `${DEFAULT_SPARQL_PREFIXES} + +SELECT ?upstream ?upstreamFqn WHERE { + ?downstream om:fullyQualifiedName "service.db.schema.target_table" . + ?downstream prov:wasDerivedFrom+ ?upstream . + ?upstream om:fullyQualifiedName ?upstreamFqn . +} LIMIT 100`, + }, + { + nameKey: 'label.sparql-sample-low-completeness', + query: `${DEFAULT_SPARQL_PREFIXES} + +SELECT ?table ?fqn ?metric ?value WHERE { + ?table a om:Table ; + om:fullyQualifiedName ?fqn ; + om:hasColumn ?column . + ?column dqv:hasQualityMeasurement ?m . + ?m dqv:isMeasurementOf ?metric ; + dqv:value ?value . + FILTER(?metric = om:NullProportionMetric && ?value > 0.05) +} LIMIT 50`, + }, +]; diff --git a/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.test.tsx b/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.test.tsx new file mode 100644 index 000000000000..b922c7ecf7b3 --- /dev/null +++ b/openmetadata-ui/src/main/resources/ui/src/pages/SparqlPlayground/SparqlPlayground.test.tsx @@ -0,0 +1,199 @@ +/* + * Copyright 2026 Collate. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { + act, + fireEvent, + render, + screen, + waitFor, +} from '@testing-library/react'; +import React from 'react'; +import { runSparqlQuery } from '../../rest/rdfAPI'; +import SparqlPlayground from './SparqlPlayground.component'; +import { + SAMPLE_SPARQL_QUERIES, + SPARQL_PLAYGROUND_STORAGE_KEY, +} from './SparqlPlayground.interface'; + +jest.mock('../../rest/rdfAPI', () => ({ + runSparqlQuery: jest.fn(), +})); + +jest.mock('../../utils/ToastUtils', () => ({ + showErrorToast: jest.fn(), + showSuccessToast: jest.fn(), +})); + +jest.mock('../../components/PageLayoutV1/PageLayoutV1', () => { + const Mock: React.FC = ({ children }) => ( +
{children}
+ ); + + return Mock; +}); + +jest.mock( + '../../components/common/TitleBreadcrumb/TitleBreadcrumb.component', + () => { + const Mock: React.FC = () =>
; + + return Mock; + } +); + +jest.mock('../../components/Database/SchemaEditor/SchemaEditor', () => { + const Mock: React.FC<{ + value?: string; + onChange?: (v: string) => void; + }> = ({ value, onChange }) => ( +