Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,14 @@ The static `Prov::` calls are a convenience facade. Under a dependency-injection
| PROV-XML | yes | yes |
| PROV-JSONLD | yes | no (would require an RDF-aware parser) |

### Output ordering

PROV serializations are unordered (a document is a set of records, namespaces a set of declarations), so ordering never affects meaning. For stable, readable output every serializer always sorts namespace declarations: the `prov`/`xsd` built-ins first, then the rest alphabetically by prefix. Records keep the order you added them by default; pass `sortRecords: true` to a serializer to order them into PROV-DM concept order instead (elements first, then relations in component order, each group sorted by identifier):

```php
$json = new JsonSerializer(sortRecords: true)->serialize($doc);
```

### PROV-N notes

The PROV-N parser accepts two convenience extensions beyond the published grammar, so input that parses here is not necessarily canonical PROV-N: line (`//`) and block (`/* */`) comments, and optional commas between a relation's arguments. Output always uses the canonical form.
Expand Down
91 changes: 59 additions & 32 deletions src/Attribute/Attributes.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,72 @@
readonly class Attributes implements \Countable, \IteratorAggregate
{
/**
* @param array<string, list<\Prov\Identifier\QualifiedName|\Prov\Attribute\Literal|string|int|float|bool>> $data
* @var array<string, list<\Prov\Identifier\QualifiedName|\Prov\Attribute\Literal|string|int|float|bool>>
* Values keyed by the full URI of their attribute key.
* @param array<string, \Prov\Identifier\QualifiedName> $keys
*/
private array $data;

/**
* @var array<string, \Prov\Identifier\QualifiedName>
* QualifiedName key objects, keyed by the same URIs as $data. Entries may
* be omitted; `keys()` and iteration then derive a QualifiedName from the
* URI itself, minting a prefix. All library construction paths populate
* this map, so original prefixes are preserved unless an instance is
* constructed directly from raw URI-keyed data.
*/
public function __construct(
private array $data = [],
private array $keys = [],
) {}
private array $keys;

/**
* PROV-DM models a record's attributes as a set of attribute-value pairs, so
* identical pairs are collapsed here: any key holding more than one value is
* deduplicated by canonical value identity, keeping the first occurrence.
* Deduplicating in the constructor (rather than only in `with()`/`from()`)
* makes the set semantics hold for every construction path, including the
* deserializers that build directly via `new Attributes(...)`. Single-valued
* keys (the common case) skip the work entirely.
*
* @param array<string, list<\Prov\Identifier\QualifiedName|\Prov\Attribute\Literal|string|int|float|bool>> $data
* Values keyed by the full URI of their attribute key.
* @param array<string, \Prov\Identifier\QualifiedName> $keys
* QualifiedName key objects, keyed by the same URIs as $data.
*/
public function __construct(array $data = [], array $keys = [])
{
    // Only keys that actually hold several values can contain duplicate
    // attribute-value pairs; single-valued (and empty) lists pass through
    // untouched. array_map() with a single array keeps the URI keys intact.
    $this->data = array_map(
        static fn ($values) => count($values) > 1 ? self::dedupeValues($values) : $values,
        $data,
    );
    $this->keys = $keys;
}

/**
* Collapses values that share a canonical identity, keeping the first
* occurrence. Identity comes from `\Prov\Attribute\ValueIdentity`, so a bare
* scalar and the typed Literal it round-trips to count as one value, and the
* dedup agrees with `\Prov\Operation\DocumentComparator`'s equality. Blank-node
* references dedup by their raw `_:bN` URI, so distinct anonymous references
* stay distinct.
*
* @param list<\Prov\Identifier\QualifiedName|\Prov\Attribute\Literal|string|int|float|bool> $values
*
* @return list<\Prov\Identifier\QualifiedName|\Prov\Attribute\Literal|string|int|float|bool>
*/
private static function dedupeValues(array $values): array
{
    // Index by canonical signature; `??=` only writes the first value seen
    // for a signature, and PHP arrays keep insertion order, so the result
    // lists first occurrences in their original relative order.
    $firstBySignature = [];
    foreach ($values as $candidate) {
        $firstBySignature[ValueIdentity::signature($candidate)] ??= $candidate;
    }

    // Drop the signature keys to hand back a plain list.
    return array_values($firstBySignature);
}

/**
* Shared empty instance. Safe to reuse because Attributes is immutable.
Expand All @@ -62,32 +115,6 @@ public function with(QualifiedName $key, QualifiedName|Literal|string|int|float|
return new self($data, $keys);
}

/**
* Returns a new Attributes instance with every value of `$other` appended.
*
* Because the bag is a multimap, a key present in both instances keeps all
* of its values: the merge appends rather than overwrites, promoting a
* single value to multiple under the same key. Key objects already present
* win; otherwise `$other`'s key object is carried over.
*/
public function merge(self $other): self
{
    // Nothing to append: the receiver itself is the result.
    if ($other->data === []) {
        return $this;
    }

    $mergedData = $this->data;
    $mergedKeys = $this->keys;

    foreach ($other->data as $uri => $incoming) {
        // Append (never overwrite) so a key present on both sides keeps
        // every value from both.
        foreach ($incoming as $item) {
            $mergedData[$uri][] = $item;
        }

        // A key object already known on this side wins; otherwise adopt
        // the one carried by $other, if it has one.
        if (isset($other->keys[$uri])) {
            $mergedKeys[$uri] ??= $other->keys[$uri];
        }
    }

    return new self($mergedData, $mergedKeys);
}

/**
* Creates an Attributes instance from an array of [QualifiedName, value] pairs.
*
Expand Down
126 changes: 126 additions & 0 deletions src/Attribute/ValueIdentity.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
<?php

declare(strict_types=1);

namespace Prov\Attribute;

use Prov\Identifier\QualifiedName;

/**
* Canonical identity string for an attribute value.
*
* Maps each value to a token such that two values denoting the same PROV-DM
* value collapse: a bare scalar and the canonical xsd:* Literal it round-trips
* to, and an untyped Literal and an explicit xsd:string Literal, all sign
* identically. `Attributes` uses this to dedup identical attribute-value pairs
* (PROV-DM models a record's attributes as a set of pairs), and
* `Prov\Operation\DocumentComparator` uses it for semantic equality, so the two
* never disagree about whether two values are the same.
*
* @internal
*/
final class ValueIdentity
{
    public const string XSD_STRING_URI = 'http://www.w3.org/2001/XMLSchema#string';
    public const string XML_LITERAL_URI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';

    /**
     * Returns the canonical identity token for an attribute value.
     *
     * Tokens are prefixed `qn:` for qualified names and `lit:` for literals
     * and native scalars. Two values produce the same token exactly when this
     * class considers them the same value.
     *
     * @param ?array<string, string> $blankLabels
     *   When null, blank-node references sign by their raw `_:bN` URI, so two
     *   distinct anonymous references stay distinct (the dedup semantics used at
     *   construction). When an array, a blank reference signs by its canonical
     *   label (or a `_:?` mask while labels are still being computed), so blank
     *   nodes compare up to renaming (the equality semantics used by the
     *   comparator).
     *
     * @return string The identity token.
     */
    public static function signature(
        QualifiedName|Literal|string|int|float|bool $value,
        ?array $blankLabels = null,
    ): string {
        if ($value instanceof QualifiedName) {
            $uri = $value->getUri();
            if (str_starts_with($uri, '_:')) {
                // Blank node: raw URI without a label map, otherwise the
                // canonical label, masked while that label is still unknown.
                $uri = $blankLabels === null ? $uri : $blankLabels[$uri] ?? '_:?';
            }
            return 'qn:' . $uri;
        }
        if ($value instanceof Literal) {
            $datatype = $value->datatype !== null ? self::normalizeDatatypeUri($value->datatype->getUri()) : null;
            // PROV-DM default: a literal without an explicit datatype and without a language
            // tag is an xsd:string. Normalize so bare strings and xsd:string-typed literals
            // sign identically regardless of format.
            if ($datatype === null && $value->languageTag === null) {
                return 'lit:' . $value->value . '^^' . self::XSD_STRING_URI;
            }
            // Only rdf:XMLLiteral values get whitespace normalization; every
            // other lexical form is compared verbatim.
            $literalValue = $datatype === self::XML_LITERAL_URI
                ? self::normalizeXmlLiteral($value->value)
                : $value->value;
            $sig = 'lit:' . $literalValue;
            if ($datatype !== null) {
                $sig .= '^^' . $datatype;
            }
            if ($value->languageTag !== null) {
                $sig .= '@' . $value->languageTag;
            }
            return $sig;
        }
        if (is_string($value)) {
            return 'lit:' . $value . '^^' . self::XSD_STRING_URI;
        }
        // Native scalars sign identically to the canonical xsd:* Literal a round-trip
        // through PROV-N/XML produces, so a value stays equal across formats. The token is
        // built inline to avoid allocating a Literal and QualifiedName on this hot path.
        if (is_bool($value)) {
            return 'lit:' . ($value ? 'true' : 'false') . '^^http://www.w3.org/2001/XMLSchema#boolean';
        }
        if (is_int($value)) {
            return 'lit:' . $value . '^^http://www.w3.org/2001/XMLSchema#int';
        }
        return 'lit:' . Literal::formatFloat($value) . '^^http://www.w3.org/2001/XMLSchema#float';
    }

    /**
     * The PROV-XML fixtures declare xsd: without a trailing `#` while PROV-JSON fixtures
     * declare it with one. Both point at the same W3C XSD namespace. Normalize so
     * `.../XMLSchemastring` and `.../XMLSchema#string` compare equal.
     *
     * The `#` is inserted only when the XSD namespace is followed directly by
     * a local name (or by nothing). URIs that merely share the textual prefix,
     * such as `.../XMLSchema-instance` or `.../XMLSchema-datatypes#string`,
     * belong to other namespaces and are returned unchanged.
     *
     * @param string $uri Datatype URI as found in a parsed document.
     *
     * @return string The URI with the XSD namespace in its `#`-terminated form.
     */
    public static function normalizeDatatypeUri(string $uri): string
    {
        $withoutHash = 'http://www.w3.org/2001/XMLSchema';
        if (!str_starts_with($uri, $withoutHash)) {
            return $uri;
        }
        $rest = substr($uri, strlen($withoutHash));
        // An already-hashed URI has '#' here and falls through untouched, as
        // does anything that is not the start of a local name (e.g. '-').
        if ($rest === '' || preg_match('/^[A-Za-z_]/', $rest) === 1) {
            return $withoutHash . '#' . $rest;
        }
        return $uri;
    }

    /**
     * Strips inter-element whitespace from an rdf:XMLLiteral value so that the same
     * XML fragment serialized compactly (PROV-JSON) or pretty-printed (PROV-XML)
     * signs identically. Returns the input unchanged if it doesn't parse as XML.
     *
     * @param string $value Lexical form of the XML literal (a fragment, not
     *   necessarily a single-rooted document).
     *
     * @return string The re-serialized fragment, or `$value` on parse failure.
     */
    private static function normalizeXmlLiteral(string $value): string
    {
        // Collect libxml errors internally instead of emitting warnings; the
        // previous setting is restored in the finally block.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument();
            $doc->preserveWhiteSpace = false;
            // Wrap in a synthetic root so a multi-node fragment still parses.
            if (!$doc->loadXML('<r xmlns:_="_">' . $value . '</r>', LIBXML_NONET)) {
                return $value;
            }
            $root = $doc->documentElement;
            if ($root === null) {
                return $value;
            }
            $out = '';
            foreach ($root->childNodes as $child) {
                if (!$child instanceof \DOMNode) {
                    continue;
                }
                // saveXML() returns false on failure. Compare strictly: a
                // truthiness test would also discard the valid output "0".
                $xml = $doc->saveXML($child);
                if ($xml !== false) {
                    $out .= $xml;
                }
            }
            return $out;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }
}
9 changes: 7 additions & 2 deletions src/Builder/BundleBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,18 @@ public function build(): Bundle
{
$this->markBuilt();

$records = $this->records;
if ($this->autoDeclareEntities) {
$records = [...$records, ...self::autoDeclaredEntities($records)];
}

$namespaces = $this->namespaceManager->getRegisteredNamespaces();
if (!$this->keepUnusedNamespaces) {
$usedUris = self::collectReferencedUris($this->records);
$usedUris = self::collectReferencedUris($records);
$usedUris[$this->identifier->getUri()] = true;
$namespaces = self::pruneNamespaces($namespaces, $usedUris);
}

return new Bundle(identifier: $this->identifier, records: $this->records, namespaces: $namespaces);
return new Bundle(identifier: $this->identifier, records: $records, namespaces: $namespaces);
}
}
17 changes: 14 additions & 3 deletions src/Builder/DocumentBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ public function withBundle(QualifiedName|string $identifier, callable $callback)
if ($this->keepUnusedNamespaces) {
$bundleBuilder->keepUnusedNamespaces();
}
if ($this->autoDeclareEntities) {
$bundleBuilder->autoDeclareEntities();
}
$this->bundles[] = $bundleBuilder->build();
return $this;
}
Expand Down Expand Up @@ -107,24 +110,32 @@ public function build(): Document
{
$this->markBuilt();

$records = $this->records;
if ($this->autoDeclareEntities) {
$records = [...$records, ...self::autoDeclaredEntities($records)];
}

$bundles = $this->bundles;
foreach ($this->bundleBuilders as $bb) {
if ($this->keepUnusedNamespaces) {
$bb->keepUnusedNamespaces();
}
if ($this->autoDeclareEntities) {
$bb->autoDeclareEntities();
}
$bundles[] = $bb->build();
}

$namespaces = $this->namespaceManager->getRegisteredNamespaces();
if (!$this->keepUnusedNamespaces) {
$usedUris = self::collectReferencedUris($this->records);
$usedUris = self::collectReferencedUris($records);
foreach ($bundles as $bundle) {
$usedUris[$bundle->identifier->getUri()] = true;
$usedUris[$bundle->identifier->namespace->uri] = true;
$usedUris = self::collectReferencedUris($bundle->records, $usedUris);
}
$namespaces = self::pruneNamespaces($namespaces, $usedUris);
}

return new Document(records: $this->records, bundles: $bundles, namespaces: $namespaces);
return new Document(records: $records, bundles: $bundles, namespaces: $namespaces);
}
}
Loading