From eea76b5e08bc3383f4a61fac3f054a7a9c577c64 Mon Sep 17 00:00:00 2001 From: Michael Nelson Date: Thu, 29 Jan 2026 17:22:57 +1100 Subject: [PATCH 1/2] anthropic: adds a schema pre-processor for compatibility. Not yet used, it'll be integrated in the next branch. --- pyproject.toml | 3 +- requirements-dev.txt | 4 +- requirements.txt | 4 +- .../knowledge_service/anthropic/__init__.py | 2 + .../anthropic/schema_preprocessor.py | 607 ++++++++++++++ .../tests/test_schema_preprocessor.py | 764 ++++++++++++++++++ 6 files changed, 1379 insertions(+), 5 deletions(-) create mode 100644 src/julee/services/knowledge_service/anthropic/schema_preprocessor.py create mode 100644 src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py diff --git a/pyproject.toml b/pyproject.toml index da36c6a2..2c20d5c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ # Object Storage "minio>=7.0.0", # AI/ML Services - "anthropic>=0.66.0", + "anthropic>=0.76.0", # Utilities "click>=0.8.0", "Jinja2>=3.0.0", @@ -55,6 +55,7 @@ dev = [ "pytest-asyncio>=1.0.0", "pytest-cov>=4.1.0", "pytest-xdist>=3.5.0", + "pytest-subtests>=0.12.0", "hypothesis>=6.0.0", "factory-boy>=3.2.0", # Type checking diff --git a/requirements-dev.txt b/requirements-dev.txt index c9b0a750..cc92b665 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile --extra=dev --output-file=requirements-dev.txt pyproject.toml @@ -10,7 +10,7 @@ annotated-doc==0.0.4 # via fastapi annotated-types==0.7.0 # via pydantic -anthropic==0.75.0 +anthropic==0.76.0 # via julee (pyproject.toml) anyio==4.12.0 # via diff --git a/requirements.txt b/requirements.txt index 59c452bc..c6d3bb83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with 
"""
Schema preprocessor for Anthropic structured outputs compatibility.

This module provides utilities to transform JSON schemas to be compatible
with Anthropic's structured outputs API, which has certain limitations
compared to the full JSON Schema specification.

For full details on limitations, see:
https://platform.claude.com/docs/en/build-with-claude/structured-outputs#json-schema-limitations
"""

import copy
import logging
from collections.abc import Iterator
from typing import Any

logger = logging.getLogger(__name__)


class AnthropicSchemaPreprocessor:
    """
    Preprocessor to make JSON schemas compatible with Anthropic structured outputs.

    Limitations of Anthropic's structured outputs handled by this class:
    - minItems only supports values 0 or 1 (not 2, 3, etc.)
    - prefixItems is not supported at all
    - uniqueItems is not supported for array types
    - contains is not supported for array types
    - Numerical constraints (minimum, maximum, multipleOf, etc.) are not supported
    - String constraints (minLength, maxLength) are not supported
    - additionalProperties must be false (not true or a schema)
    - Recursive schemas are not supported (self-referencing definitions)

    This preprocessor transforms schemas to work around these limitations
    while preserving as much validation intent as possible.

    Traversal is keyword-aware: the values of ``properties``, ``$defs`` and
    similar keywords are maps from a user-chosen name to a schema, so the map
    itself is never treated as a schema (otherwise a property that happens to
    be called ``minimum`` would be deleted). Instance data stored under
    ``const``, ``default``, ``enum`` and ``examples`` is left untouched.

    Known limitation: only definitions that reference *themselves* are
    detected as recursive; cycles that pass through another definition
    (A -> B -> A) are not flattened.
    """

    # Keywords removed outright because Anthropic rejects them.
    _NUMERICAL_CONSTRAINTS = (
        "minimum",
        "maximum",
        "multipleOf",
        "exclusiveMinimum",
        "exclusiveMaximum",
    )
    _STRING_CONSTRAINTS = ("minLength", "maxLength")

    # Keywords whose value is a list of sub-schemas that may carry ``contains``.
    _COMBINATORS = ("allOf", "anyOf", "oneOf")

    # Keywords that hold named, reusable definitions (checked in this order).
    _DEFINITION_KEYWORDS = ("$defs", "definitions")

    # Keywords whose value maps an arbitrary name to a schema.
    _SCHEMA_MAP_KEYWORDS = frozenset(
        {"properties", "patternProperties", "$defs", "definitions", "dependentSchemas"}
    )

    # Keywords whose value is instance data (or plain names), never a schema.
    _DATA_KEYWORDS = frozenset(
        {"const", "default", "enum", "examples", "dependentRequired"}
    )

    def make_compatible(
        self, schema: dict[str, Any]
    ) -> tuple[dict[str, Any], list[str]]:
        """
        Transform schema to be compatible with Anthropic structured outputs.

        The input is never mutated. When no change is required the *same*
        object is returned (no copy is made); otherwise a modified deep copy
        is returned.

        Args:
            schema: Original JSON schema dictionary

        Returns:
            Tuple of (compatible_schema, list_of_changes)
            - compatible_schema: Modified schema that works with Anthropic
            - list_of_changes: Human-readable list of changes made

        Example:
            >>> preprocessor = AnthropicSchemaPreprocessor()
            >>> original = {"type": "string", "minLength": 5, "maxLength": 10}
            >>> compatible, changes = preprocessor.make_compatible(original)
            >>> for change in changes:
            ...     print(change)
            root.minLength: removed (not supported by Anthropic)
            root.maxLength: removed (not supported by Anthropic)
        """
        try:
            # Quick scan to see if we need to make any changes
            if not self._needs_processing(schema):
                return schema, []

            # Make deep copy only if changes are needed
            compatible_schema = copy.deepcopy(schema)
            changes: list[str] = []

            # Process the schema recursively
            self._process_schema_recursively(compatible_schema, "", changes)

            if changes:
                logger.info(
                    f"Schema modified for Anthropic compatibility: {len(changes)} changes made"
                )
                for change in changes:
                    logger.debug(f"Schema change: {change}")

            return compatible_schema, changes

        except Exception as e:
            # Deliberate best-effort boundary: a malformed schema must not
            # break the caller, so fall back to the untouched original.
            logger.warning(
                f"Schema preprocessing failed: {e}. Using original schema.",
                exc_info=True,
            )
            return schema, []

    @staticmethod
    def _is_array_schema(schema: dict[str, Any]) -> bool:
        """Return True for an explicit array type or a schema with ``items``."""
        return schema.get("type") == "array" or "items" in schema

    @staticmethod
    def _is_object_schema(schema: dict[str, Any]) -> bool:
        """Return True for an explicit object type or a schema with ``properties``."""
        return schema.get("type") == "object" or "properties" in schema

    @staticmethod
    def _refers_to(ref: Any, target_ref: str) -> bool:
        """
        Return True if ``ref`` is a ``$ref`` string naming ``target_ref``.

        Only the last path segment is compared (e.g. "#/$defs/Criterion"
        matches "Criterion"). Non-string and empty values never match.
        """
        return isinstance(ref, str) and bool(ref) and ref.split("/")[-1] == target_ref

    def _iter_subschemas(
        self, schema: dict[str, Any], path: str
    ) -> Iterator[tuple[dict[str, Any], str]]:
        """
        Yield ``(sub_schema, sub_path)`` for every direct sub-schema.

        - Data keywords are skipped entirely.
        - For schema-map keywords each dict entry of the map is yielded,
          but never the map itself.
        - Any other dict value is yielded as a schema, and dict members of
          list values are yielded with an ``[index]`` suffix.

        Args:
            schema: Schema dictionary whose children are enumerated
            path: JSON path of ``schema`` ("" for the root)
        """
        for key, value in schema.items():
            if key in self._DATA_KEYWORDS:
                continue

            current_path = f"{path}.{key}" if path else key

            if key in self._SCHEMA_MAP_KEYWORDS and isinstance(value, dict):
                for name, sub_schema in value.items():
                    if isinstance(sub_schema, dict):
                        yield sub_schema, f"{current_path}.{name}"
            elif isinstance(value, dict):
                yield value, current_path
            elif isinstance(value, list):
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        yield item, f"{current_path}[{index}]"

    def _needs_processing(self, schema: dict[str, Any]) -> bool:
        """
        Quick scan to determine if schema needs processing.

        Mirrors the checks applied by the ``_fix_*`` methods and walks the
        same sub-schemas as ``_process_schema_recursively``.

        Returns True if any unsupported constraints found anywhere in schema.
        """
        if not isinstance(schema, dict):
            return False

        # Same type guard as _fix_min_items_constraint: comparing a
        # non-numeric value (e.g. "3") with 1 would raise TypeError.
        min_items = schema.get("minItems")
        if isinstance(min_items, int) and min_items > 1:
            return True

        # Array constraints
        if "prefixItems" in schema or "uniqueItems" in schema:
            return True
        if self._has_contains_in_array_schema(schema):
            return True

        # Numerical and string constraints
        if any(name in schema for name in self._NUMERICAL_CONSTRAINTS):
            return True
        if any(name in schema for name in self._STRING_CONSTRAINTS):
            return True

        # additionalProperties constraint - object schemas need explicit false
        if (
            self._is_object_schema(schema)
            and schema.get("additionalProperties") is not False
        ):
            return True

        # Check for recursive schema definitions
        if self._has_recursive_definitions(schema):
            return True

        # Check all nested schemas
        return any(
            self._needs_processing(sub_schema)
            for sub_schema, _ in self._iter_subschemas(schema, "")
        )

    def _process_schema_recursively(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Recursively process schema to fix incompatibilities.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of changes
            changes: List to accumulate change descriptions
        """
        if not isinstance(schema, dict):
            return

        # Fix constraints at current level
        self._fix_min_items_constraint(schema, path, changes)
        self._fix_prefix_items_constraint(schema, path, changes)
        self._fix_unique_items_constraint(schema, path, changes)
        self._fix_contains_constraint(schema, path, changes)
        self._fix_numerical_constraints(schema, path, changes)
        self._fix_string_constraints(schema, path, changes)
        self._fix_additional_properties_constraint(schema, path, changes)
        self._fix_recursive_schemas(schema, path, changes)

        # Snapshot the children after the fixes above: they may have added
        # (items), replaced (flattened definitions) or removed sub-schemas.
        for sub_schema, sub_path in list(self._iter_subschemas(schema, path)):
            self._process_schema_recursively(sub_schema, sub_path, changes)

    def _fix_min_items_constraint(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix minItems values that are incompatible with Anthropic.

        Anthropic only supports minItems values of 0 or 1.
        Integer values > 1 are reduced to 1; non-integer values are ignored.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        min_items = schema.get("minItems")

        if isinstance(min_items, int) and min_items > 1:
            schema["minItems"] = 1

            location = path if path else "root"
            changes.append(
                f"{location}.minItems: reduced from {min_items} to 1 "
                f"(max supported by Anthropic)"
            )

    def _fix_prefix_items_constraint(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix prefixItems constraint that is incompatible with Anthropic.

        Anthropic does not support prefixItems. We remove it and, when the
        schema has no ``items`` of its own, reuse the first prefix item as a
        generic ``items`` schema.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        if "prefixItems" not in schema:
            return

        prefix_items = schema.pop("prefixItems")

        # If there's no existing items constraint, create a generic one
        if "items" not in schema and isinstance(prefix_items, list) and prefix_items:
            schema["items"] = prefix_items[0]

        location = path if path else "root"
        changes.append(f"{location}.prefixItems: removed (not supported by Anthropic)")

    def _fix_unique_items_constraint(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix uniqueItems constraint that is incompatible with Anthropic.

        Anthropic does not support uniqueItems for array types. We remove it
        entirely as it's a validation constraint that can't be enforced.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        if "uniqueItems" in schema:
            schema.pop("uniqueItems")

            location = path if path else "root"
            changes.append(
                f"{location}.uniqueItems: removed (not supported by Anthropic)"
            )

    def _fix_contains_constraint(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix contains constraint that is incompatible with Anthropic.

        Anthropic does not support the 'contains' property for array types.
        For array schemas we remove it both directly and from the immediate
        members of allOf/anyOf/oneOf.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        if not self._is_array_schema(schema):
            return

        location = path if path else "root"

        # Remove direct contains property
        if "contains" in schema:
            schema.pop("contains")
            changes.append(
                f"{location}.contains: removed (not supported by Anthropic for array types)"
            )

        # Remove contains from nested combinators
        for combinator in self._COMBINATORS:
            sub_schemas = schema.get(combinator)
            if not isinstance(sub_schemas, list):
                continue
            for i, sub_schema in enumerate(sub_schemas):
                if isinstance(sub_schema, dict) and "contains" in sub_schema:
                    sub_schema.pop("contains")
                    changes.append(
                        f"{location}.{combinator}[{i}].contains: removed (not supported by Anthropic for array types)"
                    )

    def _remove_keywords(
        self,
        schema: dict[str, Any],
        path: str,
        changes: list[str],
        keywords: tuple[str, ...],
    ) -> None:
        """Pop each of ``keywords`` present in ``schema`` and record the removal."""
        location = path if path else "root"

        for constraint in keywords:
            if constraint in schema:
                schema.pop(constraint)
                changes.append(
                    f"{location}.{constraint}: removed (not supported by Anthropic)"
                )

    def _fix_numerical_constraints(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix numerical constraints that are incompatible with Anthropic.

        Anthropic does not support numerical validation constraints.
        We remove them entirely.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        self._remove_keywords(schema, path, changes, self._NUMERICAL_CONSTRAINTS)

    def _fix_string_constraints(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix string length constraints that are incompatible with Anthropic.

        Anthropic does not support minLength or maxLength constraints.
        We remove them entirely.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        self._remove_keywords(schema, path, changes, self._STRING_CONSTRAINTS)

    def _fix_additional_properties_constraint(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix additionalProperties constraint that is incompatible with Anthropic.

        Anthropic requires all object schemas to explicitly have
        additionalProperties: false. We add or set it to false for all
        object schemas.

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        if (
            self._is_object_schema(schema)
            and schema.get("additionalProperties") is not False
        ):
            schema["additionalProperties"] = False
            location = path if path else "root"
            changes.append(
                f"{location}.additionalProperties: set to false (required by Anthropic for object types)"
            )

    def _has_recursive_definitions(self, schema: dict[str, Any]) -> bool:
        """
        Check if schema contains recursive definitions.

        Both ``$defs`` and ``definitions`` are inspected (a schema may carry
        either or both); containers that are not dicts are ignored.

        Returns True if any definition contains a self-reference.
        """
        for definitions_key in self._DEFINITION_KEYWORDS:
            definitions = schema.get(definitions_key)
            if not isinstance(definitions, dict):
                continue
            for def_name, definition in definitions.items():
                if self._has_self_reference(definition, def_name, set()):
                    return True
        return False

    def _has_self_reference(
        self, schema_part: dict[str, Any], target_ref: str, visited: set[str]
    ) -> bool:
        """
        Recursively check if a schema part references the target definition.

        References are not followed into other definitions, so there is no
        cycle to guard against; ``visited`` is accepted only to keep the
        signature stable and is not consulted.

        Args:
            schema_part: Schema fragment to search
            target_ref: Name of the definition being looked for
            visited: Unused, retained for backward compatibility
        """
        return self._contains_recursive_ref(schema_part, target_ref)

    def _fix_recursive_schemas(
        self, schema: dict[str, Any], path: str, changes: list[str]
    ) -> None:
        """
        Fix recursive schema definitions by flattening to maximum 3 levels.

        Anthropic does not support recursive schemas. We flatten them by:
        1. Detecting self-referencing definitions in ``$defs``/``definitions``
        2. Replacing recursive references with inline flattened structures
        3. Limiting inlining to 3 levels, then substituting a copy of the
           definition with its self-referencing parts removed

        Args:
            schema: Schema dictionary to modify in-place
            path: JSON path for tracking location of change
            changes: List to accumulate change descriptions
        """
        for definitions_key in self._DEFINITION_KEYWORDS:
            definitions = schema.get(definitions_key)
            if not isinstance(definitions, dict) or not definitions:
                continue

            location = f"{path}.{definitions_key}" if path else definitions_key

            # Iterate over a snapshot because entries are replaced in the loop.
            for def_name, definition in list(definitions.items()):
                if not self._has_self_reference(definition, def_name, set()):
                    continue

                definitions[def_name] = self._flatten_recursive_definition(
                    definition, def_name, max_depth=3
                )
                changes.append(
                    f"{location}.{def_name}: flattened recursive references to 3 levels (not supported by Anthropic)"
                )

    def _flatten_recursive_definition(
        self, definition: dict[str, Any], target_ref: str, max_depth: int
    ) -> dict[str, Any]:
        """
        Flatten a recursive definition to a specified maximum depth.

        Args:
            definition: The schema definition to flatten
            target_ref: The name of the definition being flattened
            max_depth: Maximum levels of recursion to preserve

        Returns:
            Flattened schema definition
        """
        return self._flatten_recursive_part(
            definition, target_ref, max_depth, 0, definition
        )

    def _flatten_recursive_part(
        self,
        schema_part: dict[str, Any],
        target_ref: str,
        max_depth: int,
        current_depth: int,
        original_definition: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Recursively flatten a part of a schema.

        When we encounter a self-reference:
        - If depth < max_depth: inline the definition and continue
        - If depth >= max_depth: use the original definition with its
          self-referencing parts removed

        Keys that sit next to a self ``$ref`` are overlaid on the inlined
        definition regardless of where they appear relative to ``$ref``.
        References to other definitions are kept as-is.
        """
        if not isinstance(schema_part, dict):
            return schema_part

        if self._refers_to(schema_part.get("$ref"), target_ref):
            if current_depth >= max_depth:
                result = self._get_original_without_recursion(
                    original_definition, target_ref
                )
            else:
                result = self._flatten_recursive_part(
                    original_definition,
                    target_ref,
                    max_depth,
                    current_depth + 1,
                    original_definition,
                )
        else:
            result = {}

        for key, value in schema_part.items():
            if key == "$ref":
                # A self reference was expanded above; anything else is kept.
                if not self._refers_to(value, target_ref):
                    result[key] = value
            elif isinstance(value, dict):
                result[key] = self._flatten_recursive_part(
                    value, target_ref, max_depth, current_depth, original_definition
                )
            elif isinstance(value, list):
                result[key] = [
                    self._flatten_recursive_part(
                        item, target_ref, max_depth, current_depth, original_definition
                    )
                    if isinstance(item, dict)
                    else item
                    for item in value
                ]
            else:
                result[key] = value

        return result

    def _get_original_without_recursion(
        self, original_definition: dict[str, Any], target_ref: str
    ) -> dict[str, Any]:
        """
        Get the original definition but with recursive references removed.

        Individual entries of ``properties`` that contain a reference to
        ``target_ref`` are dropped (and removed from ``required``); every
        other property is preserved. Any other top-level keyword whose value
        contains such a reference is dropped as a whole, as is a top-level
        self ``$ref``.

        The returned dict is new, but the values it keeps are the same
        objects held by ``original_definition``.
        """
        result: dict[str, Any] = {}
        # A list rather than a set: ``required`` entries of a malformed
        # schema may be unhashable, and ``in`` on a list only needs ``==``.
        dropped_properties: list[Any] = []

        for key, value in original_definition.items():
            if key == "$ref" and self._refers_to(value, target_ref):
                continue

            if key == "properties" and isinstance(value, dict):
                kept: dict[str, Any] = {}
                for name, sub_schema in value.items():
                    if isinstance(sub_schema, dict) and self._contains_recursive_ref(
                        sub_schema, target_ref
                    ):
                        dropped_properties.append(name)
                    else:
                        kept[name] = sub_schema
                result[key] = kept
            elif isinstance(value, dict):
                if not self._contains_recursive_ref(value, target_ref):
                    result[key] = value
            elif isinstance(value, list):
                if not any(
                    isinstance(item, dict)
                    and self._contains_recursive_ref(item, target_ref)
                    for item in value
                ):
                    result[key] = value
            else:
                # Primitive values are always safe
                result[key] = value

        # ``required`` must not name properties that no longer exist.
        required = result.get("required")
        if dropped_properties and isinstance(required, list):
            result["required"] = [
                name for name in required if name not in dropped_properties
            ]

        return result

    def _contains_recursive_ref(
        self, schema_part: dict[str, Any], target_ref: str
    ) -> bool:
        """Check if a schema part contains a recursive reference to target_ref."""
        if not isinstance(schema_part, dict):
            return False

        # Direct $ref check
        if self._refers_to(schema_part.get("$ref"), target_ref):
            return True

        # Check nested structures
        for value in schema_part.values():
            if isinstance(value, dict):
                if self._contains_recursive_ref(value, target_ref):
                    return True
            elif isinstance(value, list):
                if any(
                    isinstance(item, dict)
                    and self._contains_recursive_ref(item, target_ref)
                    for item in value
                ):
                    return True

        return False

    def _has_contains_in_array_schema(self, schema: dict[str, Any]) -> bool:
        """Check if schema has contains property and is an array schema."""
        if not self._is_array_schema(schema):
            return False

        # Check direct contains
        if "contains" in schema:
            return True

        # Check contains in the immediate members of nested combinators
        for combinator in self._COMBINATORS:
            sub_schemas = schema.get(combinator)
            if isinstance(sub_schemas, list) and any(
                isinstance(sub_schema, dict) and "contains" in sub_schema
                for sub_schema in sub_schemas
            ):
                return True

        return False
a/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py b/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py new file mode 100644 index 00000000..fcddb0ba --- /dev/null +++ b/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py @@ -0,0 +1,764 @@ +""" +Tests for AnthropicSchemaPreprocessor. + +This module tests the schema preprocessing functionality that makes JSON schemas +compatible with Anthropic's structured outputs limitations. +""" + +from unittest.mock import patch + +import pytest + +from julee.services.knowledge_service.anthropic.schema_preprocessor import ( + AnthropicSchemaPreprocessor, +) + + +class TestAnthropicSchemaPreprocessor: + """Test cases for AnthropicSchemaPreprocessor.""" + + @pytest.fixture + def preprocessor(self) -> AnthropicSchemaPreprocessor: + """Create a preprocessor instance for testing.""" + return AnthropicSchemaPreprocessor() + + def test_leaves_compatible_schemas_unchanged( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that compatible schemas are left unchanged.""" + # Schema with minItems 0 (default) + schema1 = { + "type": "object", + "properties": {"tags": {"type": "array", "items": {"type": "string"}}}, + } + + compatible1, changes1 = preprocessor.make_compatible(schema1) + assert compatible1 == schema1 + assert changes1 == [] + + # Schema with minItems 1 (valid) + schema2 = { + "type": "object", + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": {"type": "string"}, + } + }, + } + + compatible2, changes2 = preprocessor.make_compatible(schema2) + assert compatible2 == schema2 + assert changes2 == [] + + # Schema with minItems 0 (explicit) + schema3 = { + "type": "array", + "minItems": 0, + "items": {"type": "number"}, + } + + compatible3, changes3 = preprocessor.make_compatible(schema3) + assert compatible3 == schema3 + assert changes3 == [] + + def test_reduces_high_min_items_to_one( + self, 
preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that minItems > 1 are reduced to 1.""" + schema = { + "type": "object", + "properties": { + "tags": { + "type": "array", + "minItems": 3, + "maxItems": 10, + "items": {"type": "string"}, + } + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Should modify minItems but preserve other constraints + expected = { + "type": "object", + "properties": { + "tags": { + "type": "array", + "minItems": 1, # Changed from 3 + "maxItems": 10, # Preserved + "items": {"type": "string"}, # Preserved + } + }, + } + + assert compatible == expected + assert len(changes) == 1 + assert "properties.tags.minItems" in changes[0] + assert "reduced from 3 to 1" in changes[0] + + def test_handles_multiple_min_items_violations( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test schema with multiple minItems violations.""" + schema = { + "type": "object", + "properties": { + "tags": {"type": "array", "minItems": 2}, + "categories": {"type": "array", "minItems": 5}, + "normal": {"type": "array", "minItems": 1}, # Should be unchanged + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Both violations should be fixed + assert compatible["properties"]["tags"]["minItems"] == 1 + assert compatible["properties"]["categories"]["minItems"] == 1 + assert compatible["properties"]["normal"]["minItems"] == 1 # Unchanged + + assert len(changes) == 2 + assert any("properties.tags.minItems" in change for change in changes) + assert any("properties.categories.minItems" in change for change in changes) + + def test_handles_nested_schemas( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test minItems fixes in deeply nested schema structures.""" + schema = { + "type": "object", + "properties": { + "user": { + "type": "object", + "properties": { + "profile": { + "type": "object", + "properties": { + "skills": { + "type": "array", + "minItems": 4, + "items": 
{"type": "string"}, + } + }, + } + }, + } + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + assert ( + compatible["properties"]["user"]["properties"]["profile"]["properties"][ + "skills" + ]["minItems"] + == 1 + ) + + assert len(changes) == 1 + assert ( + "properties.user.properties.profile.properties.skills.minItems" + in changes[0] + ) + + def test_handles_definitions_and_defs( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test minItems fixes in definitions and $defs sections.""" + schema = { + "type": "object", + "definitions": { + "TagList": {"type": "array", "minItems": 3, "items": {"type": "string"}} + }, + "$defs": { + "CategoryList": { + "type": "array", + "minItems": 2, + "items": {"type": "string"}, + } + }, + "properties": { + "tags": {"$ref": "#/definitions/TagList"}, + "categories": {"$ref": "#/$defs/CategoryList"}, + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + assert compatible["definitions"]["TagList"]["minItems"] == 1 + assert compatible["$defs"]["CategoryList"]["minItems"] == 1 + + assert len(changes) == 2 + assert any("definitions.TagList.minItems" in change for change in changes) + assert any("$defs.CategoryList.minItems" in change for change in changes) + + def test_handles_array_of_schemas( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test schemas within arrays (like allOf, anyOf).""" + schema = { + "allOf": [ + {"type": "array", "minItems": 2}, + {"type": "array", "minItems": 3}, + ] + } + + compatible, changes = preprocessor.make_compatible(schema) + + assert compatible["allOf"][0]["minItems"] == 1 + assert compatible["allOf"][1]["minItems"] == 1 + + assert len(changes) == 2 + assert any("allOf[0].minItems" in change for change in changes) + assert any("allOf[1].minItems" in change for change in changes) + + def test_preserves_other_array_constraints( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that other array 
constraints are preserved.""" + schema = { + "type": "array", + "minItems": 5, + "maxItems": 10, + "uniqueItems": True, + "items": {"type": "string", "pattern": "^[A-Z]+$"}, + "description": "List of uppercase strings", + } + + compatible, changes = preprocessor.make_compatible(schema) + + # minItems and uniqueItems should change + expected = { + "type": "array", + "minItems": 1, # reduced from 5 + "maxItems": 10, + # uniqueItems removed (not supported by Anthropic) + "items": {"type": "string", "pattern": "^[A-Z]+$"}, + "description": "List of uppercase strings", + } + + assert compatible == expected + assert len(changes) == 2 + assert any("minItems: reduced from 5 to 1" in change for change in changes) + assert any( + "uniqueItems: removed (not supported by Anthropic)" in change + for change in changes + ) + + def test_handles_root_level_array( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test schema where root is an array with minItems.""" + schema = { + "type": "array", + "minItems": 4, + "items": {"type": "string"}, + } + + compatible, changes = preprocessor.make_compatible(schema) + + assert compatible["minItems"] == 1 + assert len(changes) == 1 + assert "root.minItems" in changes[0] + + def test_quick_scan_optimization( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that compatible schemas are quickly detected without deep copying.""" + large_compatible_schema = { + "type": "object", + "properties": { + f"field_{i}": { + "type": "array", + "minItems": 1 if i % 2 == 0 else 0, + "items": {"type": "string"}, + } + for i in range(100) + }, + } + + with patch("copy.deepcopy") as mock_deepcopy: + compatible, changes = preprocessor.make_compatible(large_compatible_schema) + + # Should not call deepcopy since no changes needed + mock_deepcopy.assert_not_called() + assert compatible == large_compatible_schema + assert changes == [] + + def test_error_handling_preserves_original_schema( + self, preprocessor: 
AnthropicSchemaPreprocessor + ) -> None: + """Test that errors in processing return original schema safely.""" + schema = {"type": "array", "minItems": 3} + + # Mock an error in the processing + with patch.object( + preprocessor, + "_process_schema_recursively", + side_effect=ValueError("Test error"), + ): + compatible, changes = preprocessor.make_compatible(schema) + + # Should return original schema unchanged + assert compatible == schema + assert changes == [] + + def test_non_integer_min_items_ignored( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that non-integer minItems values are ignored.""" + schema = { + "type": "array", + "minItems": "3", # String instead of int + "items": {"type": "string"}, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Should be unchanged since minItems is not an integer + assert compatible == schema + assert changes == [] + + def test_complex_real_world_schema( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test with a complex schema similar to real assembly specifications.""" + schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "type": { + "type": "array", + "minItems": 2, + "maxItems": 5, + "items": {"type": "string"}, + }, + "credentialSubject": { + "type": "object", + "properties": { + "conformityClaim": { + "type": "array", + "minItems": 1, # Should be unchanged + "items": { + "type": "object", + "properties": { + "assessmentCriteria": { + "type": "array", + "minItems": 3, # Should be changed + "items": {"type": "string"}, + } + }, + }, + } + }, + }, + }, + "$defs": { + "TagArray": { + "type": "array", + "minItems": 4, # Should be changed + "items": {"type": "string"}, + } + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Check the changes + assert compatible["properties"]["type"]["minItems"] == 1 + assert ( + compatible["properties"]["credentialSubject"]["properties"][ + 
"conformityClaim" + ]["minItems"] + == 1 + ) # Unchanged + assert ( + compatible["properties"]["credentialSubject"]["properties"][ + "conformityClaim" + ]["items"]["properties"]["assessmentCriteria"]["minItems"] + == 1 + ) + assert compatible["$defs"]["TagArray"]["minItems"] == 1 + + # Should have 3 changes (not 4, since one was already 1) + assert len(changes) == 3 + assert any("properties.type.minItems" in change for change in changes) + assert any("assessmentCriteria.minItems" in change for change in changes) + assert any("$defs.TagArray.minItems" in change for change in changes) + + def test_removes_unsupported_prefix_items( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test that prefixItems constraint is removed.""" + schema = { + "type": "array", + "prefixItems": [ + {"type": "string"}, + {"type": "number"}, + {"type": "boolean"}, + ], + "maxItems": 3, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # prefixItems should be removed + assert "prefixItems" not in compatible + # Other constraints should be preserved + assert compatible["maxItems"] == 3 + # Should add generic items schema based on first prefixItem + assert compatible["items"] == {"type": "string"} + + assert len(changes) == 1 + assert "prefixItems: removed" in changes[0] + assert "not supported by Anthropic" in changes[0] + + def test_prefix_items_with_existing_items_schema( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test prefixItems removal when items schema already exists.""" + schema = { + "type": "array", + "prefixItems": [ + {"type": "string"}, + {"type": "number"}, + ], + "items": {"type": "object"}, # Existing items schema + } + + compatible, changes = preprocessor.make_compatible(schema) + + # prefixItems should be removed + assert "prefixItems" not in compatible + # Existing items schema should be preserved + assert compatible["items"] == {"type": "object"} + + assert len(changes) == 1 + assert "prefixItems: removed" in 
changes[0] + + def test_handles_both_min_items_and_prefix_items( + self, preprocessor: AnthropicSchemaPreprocessor + ) -> None: + """Test schema with both minItems and prefixItems violations.""" + schema = { + "type": "array", + "minItems": 4, + "prefixItems": [ + {"type": "string"}, + {"type": "number"}, + ], + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Both constraints should be fixed + assert compatible["minItems"] == 1 + assert "prefixItems" not in compatible + assert compatible["items"] == {"type": "string"} + + assert len(changes) == 2 + assert any("minItems" in change for change in changes) + assert any("prefixItems" in change for change in changes) + + def test_logging_behavior( + self, preprocessor: AnthropicSchemaPreprocessor, caplog + ) -> None: + """Test that appropriate log messages are generated.""" + schema = { + "type": "array", + "minItems": 3, + } + + with caplog.at_level("INFO"): + compatible, changes = preprocessor.make_compatible(schema) + + # Should log info about changes made + assert "Schema modified for Anthropic compatibility" in caplog.text + assert "1 changes made" in caplog.text + + def test_constraint_removal_table_based(self, preprocessor, subtests): + """Test constraint removal using table-based approach for better maintainability.""" + # Format: (test_name, input_schema, expected_removals, expected_changes_count) + test_cases = [ + # Array constraints + ( + "uniqueItems_true", + {"type": "array", "uniqueItems": True, "items": {"type": "string"}}, + ["uniqueItems"], + 1, + ), + ( + "uniqueItems_false", + {"type": "array", "uniqueItems": False, "items": {"type": "number"}}, + ["uniqueItems"], + 1, + ), + ( + "contains_constraint", + { + "type": "array", + "contains": {"type": "string"}, + "items": {"type": "string"}, + }, + ["contains"], + 1, + ), + ( + "contains_non_array_preserved", + { + "type": "object", + "properties": {"data": {"type": "string"}}, + "contains": {"type": "string"}, # This should be 
preserved + }, + [], # No removals expected + 0, # No changes expected + ), + ( + "contains_nested_allOf", + { + "type": "array", + "items": {"type": "string"}, + "allOf": [ + { + "contains": { + "const": "DigitalProductPassport", + "minContains": 1, + } + }, + { + "contains": { + "const": "VerifiableCredential", + "minContains": 1, + } + }, + ], + }, + [], # No direct removals, but nested contains should be removed + 2, # Two nested contains removals + ), + # Numerical constraints + ( + "minimum_maximum", + {"type": "number", "minimum": 0, "maximum": 100}, + ["minimum", "maximum"], + 2, + ), + ( + "multipleOf", + {"type": "integer", "multipleOf": 5}, + ["multipleOf"], + 1, + ), + ( + "exclusive_constraints", + {"type": "number", "exclusiveMinimum": 0, "exclusiveMaximum": 100}, + ["exclusiveMinimum", "exclusiveMaximum"], + 2, + ), + # String constraints + ( + "string_length_constraints", + {"type": "string", "minLength": 5, "maxLength": 50}, + ["minLength", "maxLength"], + 2, + ), + # additionalProperties constraints + ( + "additionalProperties_true", + {"type": "object", "additionalProperties": True}, + [], # Not removed, but changed to false + 1, + ), + ( + "additionalProperties_false", + {"type": "object", "additionalProperties": False}, + [], # No changes - already false + 0, + ), + ( + "additionalProperties_schema", + {"type": "object", "additionalProperties": {"type": "string"}}, + [], # Not removed, but changed to false + 1, + ), + ( + "additionalProperties_missing", + {"type": "object", "properties": {"name": {"type": "string"}}}, + [], # Not removed, but additionalProperties added as false + 1, + ), + # Multiple constraint types together + ( + "multiple_constraints", + { + "type": "object", + "properties": { + "name": {"type": "string", "minLength": 1, "maxLength": 100}, + "age": {"type": "number", "minimum": 0, "maximum": 150}, + "tags": {"type": "array", "minItems": 2, "uniqueItems": True}, + }, + "additionalProperties": True, + }, + ["minLength", 
"maxLength", "minimum", "maximum", "uniqueItems"], + 7, # 5 removals + 1 minItems reduction + 1 additionalProperties fix + ), + ] + + for test_name, schema, expected_removals, expected_changes_count in test_cases: + with subtests.test(test_name=test_name): + compatible, changes = preprocessor.make_compatible(schema) + + # Check that expected constraints were removed + for constraint in expected_removals: + assert self._constraint_not_in_schema(compatible, constraint), ( + f"{constraint} should be removed in {test_name}" + ) + + # Check additionalProperties is set to false when present and not already false + if "additionalProperties" in schema: + assert self._get_additional_properties_value(compatible) is False, ( + f"additionalProperties should be false in {test_name}" + ) + + # Check expected number of changes + assert len(changes) == expected_changes_count, ( + f"Expected {expected_changes_count} changes but got {len(changes)} in {test_name}: {changes}" + ) + + def test_nested_constraint_removal(self, preprocessor): + """Test constraint removal in deeply nested schemas.""" + schema = { + "type": "object", + "properties": { + "metadata": { + "type": "object", + "properties": { + "tags": {"type": "array", "uniqueItems": True, "minItems": 2}, + "score": {"type": "number", "minimum": 0, "maximum": 100}, + "name": {"type": "string", "minLength": 1, "maxLength": 50}, + }, + "additionalProperties": True, + }, + }, + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Check all nested constraints were fixed + metadata_props = compatible["properties"]["metadata"]["properties"] + assert "uniqueItems" not in metadata_props["tags"] + assert metadata_props["tags"]["minItems"] == 1 + assert "minimum" not in metadata_props["score"] + assert "maximum" not in metadata_props["score"] + assert "minLength" not in metadata_props["name"] + assert "maxLength" not in metadata_props["name"] + assert compatible["properties"]["metadata"]["additionalProperties"] is False 
+ + # Should have 7 changes: uniqueItems, minItems, minimum, maximum, minLength, maxLength, additionalProperties + assert len(changes) == 7 + + def _constraint_not_in_schema(self, schema: dict, constraint: str) -> bool: + """Recursively check that a constraint is not present anywhere in schema.""" + if isinstance(schema, dict): + if constraint in schema: + return False + for value in schema.values(): + if isinstance(value, dict): + if not self._constraint_not_in_schema(value, constraint): + return False + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + if not self._constraint_not_in_schema(item, constraint): + return False + return True + + def _get_additional_properties_value(self, schema: dict) -> object: + """Get additionalProperties value from schema, recursively checking nested objects.""" + if isinstance(schema, dict): + if "additionalProperties" in schema: + return schema["additionalProperties"] + for value in schema.values(): + if isinstance(value, dict): + result = self._get_additional_properties_value(value) + if result is not None: + return result + return None + + def test_detects_recursive_schemas(self, preprocessor): + """Test that recursive schema references are detected.""" + schema = { + "$defs": { + "Criterion": { + "type": "object", + "properties": { + "id": {"type": "string"}, + "subCriterion": { + "type": "array", + "items": {"$ref": "#/$defs/Criterion"}, + }, + }, + } + } + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Should detect and fix the recursion + assert len(changes) == 1 + assert "flattened recursive references" in changes[0] + assert "Criterion" in changes[0] + + def test_flattens_criterion_schema(self, preprocessor): + """Test flattening of the real Criterion schema from assembly specs.""" + schema = { + "$defs": { + "Criterion": { + "type": "object", + "properties": { + "id": {"type": "string"}, + "name": {"type": "string"}, + "subCriterion": { + "type": "array", + "items": 
{"$ref": "#/$defs/Criterion"}, + }, + }, + } + } + } + + compatible, changes = preprocessor.make_compatible(schema) + + # Check that recursion was removed + criterion_def = compatible["$defs"]["Criterion"] + sub_criterion = criterion_def["properties"]["subCriterion"]["items"] + + # Should no longer have a $ref to itself + assert "$ref" not in sub_criterion + # Should preserve the full object structure at depth limit + assert sub_criterion["type"] == "object" + # Should have the original properties (minus recursive ones) + assert "id" in sub_criterion["properties"] + assert "name" in sub_criterion["properties"] + + def test_no_changes_for_non_recursive_schemas(self, preprocessor): + """Test that non-recursive schemas are left unchanged.""" + schema = { + "$defs": { + "Simple": { + "type": "object", + "properties": { + "id": {"type": "string"}, + "name": {"type": "string"}, + }, + } + } + } + + compatible, changes = preprocessor.make_compatible(schema) + + # No changes should be made + assert len(changes) == 0 + assert compatible == schema From 887a6087690b86a1b7ee27789202db17ccbc1e07 Mon Sep 17 00:00:00 2001 From: Michael Nelson Date: Mon, 2 Feb 2026 09:47:25 +1100 Subject: [PATCH 2/2] Remove unsupported 'binary' format from string fields. 
--- .../anthropic/schema_preprocessor.py | 42 +++++++++++++++++++ .../tests/test_schema_preprocessor.py | 12 ++++++ 2 files changed, 54 insertions(+) diff --git a/src/julee/services/knowledge_service/anthropic/schema_preprocessor.py b/src/julee/services/knowledge_service/anthropic/schema_preprocessor.py index 14ea5bcd..491c1f24 100644 --- a/src/julee/services/knowledge_service/anthropic/schema_preprocessor.py +++ b/src/julee/services/knowledge_service/anthropic/schema_preprocessor.py @@ -103,6 +103,10 @@ def _needs_processing(self, schema: dict[str, Any]) -> bool: ): return True + # String format constraints + if self._has_unsupported_string_format(schema): + return True + # Numerical constraints numerical_constraints = { "minimum", @@ -159,6 +163,7 @@ def _process_schema_recursively( self._fix_contains_constraint(schema, path, changes) self._fix_numerical_constraints(schema, path, changes) self._fix_string_constraints(schema, path, changes) + self._fix_string_format_constraints(schema, path, changes) self._fix_additional_properties_constraint(schema, path, changes) self._fix_recursive_schemas(schema, path, changes) @@ -344,6 +349,33 @@ def _fix_string_constraints( ) changes.append(change_msg) + def _fix_string_format_constraints( + self, schema: dict[str, Any], path: str, changes: list[str] + ) -> None: + """ + Fix string format constraints that are incompatible with Anthropic. + + Anthropic does not support certain string formats like 'binary'. + We remove unsupported formats entirely. 
+ + Args: + schema: Schema dictionary to modify in-place + path: JSON path for tracking location of change + changes: List to accumulate change descriptions + """ + # List of string formats not supported by Anthropic + unsupported_formats = ["binary"] + + if ( + schema.get("type") == "string" + and "format" in schema + and schema["format"] in unsupported_formats + ): + removed_format = schema.pop("format") + location = path if path else "root" + change_msg = f"{location}.format: removed '{removed_format}' (not supported by Anthropic)" + changes.append(change_msg) + def _fix_additional_properties_constraint( self, schema: dict[str, Any], path: str, changes: list[str] ) -> None: @@ -585,6 +617,16 @@ def _contains_recursive_ref( return False + def _has_unsupported_string_format(self, schema: dict[str, Any]) -> bool: + """Check if schema has unsupported string format.""" + unsupported_formats = ["binary"] + + return ( + schema.get("type") == "string" + and "format" in schema + and schema["format"] in unsupported_formats + ) + def _has_contains_in_array_schema(self, schema: dict[str, Any]) -> bool: """Check if schema has contains property and is an array schema.""" # Check if this is an array schema (explicit type or has items property) diff --git a/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py b/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py index fcddb0ba..056d4d5a 100644 --- a/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py +++ b/src/julee/services/knowledge_service/anthropic/tests/test_schema_preprocessor.py @@ -564,6 +564,18 @@ def test_constraint_removal_table_based(self, preprocessor, subtests): ["minLength", "maxLength"], 2, ), + ( + "string_binary_format", + {"type": "string", "format": "binary", "description": "Base64 data"}, + ["format"], + 1, + ), + ( + "string_supported_format_preserved", + {"type": "string", "format": "uri", "description": "A URI"}, + [], 
# No removals - supported format + 0, + ), # additionalProperties constraints ( "additionalProperties_true",