@@ -4233,3 +4233,155 @@ fn test_schema_to_is_not_null_predicate(
42334233) {
42344234 assert_eq ! ( schema_to_is_not_null_predicate( & schema) , expected) ;
42354235}
4236+
4237+ /// Verify that `read_actions` correctly handles null values in map fields across all
4238+ /// action types. The Delta protocol allows null values in `partitionValues` maps (a null
4239+ /// partition value means the partition column is null for that file) and in `tags` maps.
4240+ ///
4241+ /// Spark defaults all `Map[String, String]` types to `valueContainsNull = true`, and
4242+ /// checkpoint writing calls `schema.asNullable` which forces all maps nullable. The
4243+ /// schema must match this behavior.
4244+ ///
4245+ /// This test reads JSON actions through `DefaultEngine` + `InMemory` store +
4246+ /// `log_segment.read_actions()`, then re-validates the resulting Arrow `StructArray` with
4247+ /// `StructArray::try_new`. Without the fix, non-nullable map value fields cause:
4248+ /// "Found unmasked nulls for non-nullable StructArray field 'value'"
4249+ #[ rstest]
4250+ // remove.partitionValues.month: null
4251+ #[ case:: remove_partition_values(
4252+ "remove" ,
4253+ "partitionValues" ,
4254+ r#"{"remove":{"path":"file.parquet","deletionTimestamp":1000,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"year":"2024","month":null},"size":100}}"#
4255+ ) ]
4256+ // remove.tags.key2: null
4257+ #[ case:: remove_tags(
4258+ "remove" ,
4259+ "tags" ,
4260+ r#"{"remove":{"path":"file.parquet","deletionTimestamp":1000,"dataChange":true,"tags":{"key1":"val1","key2":null}}}"#
4261+ ) ]
4262+ // add.partitionValues.month: null
4263+ #[ case:: add_partition_values(
4264+ "add" ,
4265+ "partitionValues" ,
4266+ r#"{"add":{"path":"file.parquet","partitionValues":{"year":"2024","month":null},"size":100,"modificationTime":1000,"dataChange":true}}"#
4267+ ) ]
4268+ // add.tags.key2: null
4269+ #[ case:: add_tags(
4270+ "add" ,
4271+ "tags" ,
4272+ r#"{"add":{"path":"file.parquet","partitionValues":{},"size":100,"modificationTime":1000,"dataChange":true,"tags":{"key1":"val1","key2":null}}}"#
4273+ ) ]
4274+ // cdc.partitionValues.month: null
4275+ #[ case:: cdc_partition_values(
4276+ "cdc" ,
4277+ "partitionValues" ,
4278+ r#"{"cdc":{"path":"file.parquet","partitionValues":{"year":"2024","month":null},"size":100,"dataChange":false}}"#
4279+ ) ]
4280+ // cdc.tags.key2: null
4281+ #[ case:: cdc_tags(
4282+ "cdc" ,
4283+ "tags" ,
4284+ r#"{"cdc":{"path":"file.parquet","partitionValues":{},"size":100,"dataChange":false,"tags":{"key1":"val1","key2":null}}}"#
4285+ ) ]
4286+ // sidecar.tags.key2: null
4287+ #[ case:: sidecar_tags(
4288+ "sidecar" ,
4289+ "tags" ,
4290+ r#"{"sidecar":{"path":"sidecar.parquet","sizeInBytes":100,"modificationTime":1000,"tags":{"key1":"val1","key2":null}}}"#
4291+ ) ]
4292+ // checkpointMetadata.tags.key2: null
4293+ #[ case:: checkpoint_metadata_tags(
4294+ "checkpointMetadata" ,
4295+ "tags" ,
4296+ r#"{"checkpointMetadata":{"version":0,"tags":{"key1":"val1","key2":null}}}"#
4297+ ) ]
4298+ // Known issues: these map fields don't yet have #[allow_null_container_values].
4299+ // commitInfo.operationParameters.description: null
4300+ #[ should_panic( expected = "StructArray re-validation failed" ) ]
4301+ #[ case:: commit_info_operation_parameters_known_issue(
4302+ "commitInfo" ,
4303+ "operationParameters" ,
4304+ r#"{"commitInfo":{"timestamp":1000,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","description":null}}}"#
4305+ ) ]
4306+ // metaData.configuration.key2: null
4307+ #[ should_panic( expected = "StructArray re-validation failed" ) ]
4308+ #[ case:: metadata_configuration_known_issue(
4309+ "metaData" ,
4310+ "configuration" ,
4311+ r#"{"metaData":{"id":"test","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{"key1":"val1","key2":null},"createdTime":1000}}"#
4312+ ) ]
4313+ #[ tokio:: test]
4314+ async fn read_actions_with_null_map_values (
4315+ #[ case] action_name : & str ,
4316+ #[ case] map_field : & str ,
4317+ #[ case] json_action : & str ,
4318+ ) {
4319+ use crate :: arrow:: array:: { Array , AsArray , MapArray , StructArray } ;
4320+
4321+ let store = Arc :: new ( InMemory :: new ( ) ) ;
4322+ let log_root = Url :: parse ( "memory:///_delta_log/" ) . unwrap ( ) ;
4323+
4324+ // Write a single commit file with the action containing null map values.
4325+ store
4326+ . put (
4327+ & delta_path_for_version ( 0 , "json" ) ,
4328+ json_action. to_string ( ) . into ( ) ,
4329+ )
4330+ . await
4331+ . unwrap ( ) ;
4332+
4333+ // Build engine and read actions -- same as DeltaActionExtractor::get_actions.
4334+ let engine = DefaultEngineBuilder :: new ( store) . build ( ) ;
4335+ let log_segment =
4336+ LogSegment :: for_table_changes ( engine. storage_handler ( ) . as_ref ( ) , log_root, 0 , Some ( 0 ) )
4337+ . unwrap ( ) ;
4338+
4339+ // Use all_actions_schema to cover sidecar and checkpointMetadata (checkpoint-only actions).
4340+ let action_schema = get_all_actions_schema ( ) . clone ( ) ;
4341+ let action_batches = log_segment
4342+ . read_actions ( & engine, action_schema)
4343+ . expect ( "read_actions should succeed" ) ;
4344+
4345+ // Iterate batches and verify the map value field is nullable.
4346+ let mut found = false ;
4347+ for batch_result in action_batches {
4348+ let actions_batch = batch_result. expect ( "Iterating action batches should succeed" ) ;
4349+
4350+ let data_any = actions_batch. actions . into_any ( ) ;
4351+ let arrow_data = data_any
4352+ . downcast_ref :: < ArrowEngineData > ( )
4353+ . expect ( "ArrowEngineData" ) ;
4354+ let rb = arrow_data. record_batch ( ) ;
4355+
4356+ let Some ( action_col) = rb. column_by_name ( action_name) else {
4357+ continue ;
4358+ } ;
4359+ let action_struct = action_col
4360+ . as_struct_opt ( )
4361+ . unwrap_or_else ( || panic ! ( "{action_name} column should be a struct" ) ) ;
4362+ let map_col = action_struct
4363+ . column_by_name ( map_field)
4364+ . unwrap_or_else ( || panic ! ( "{action_name}.{map_field} not found" ) ) ;
4365+ let map_array = map_col
4366+ . as_any ( )
4367+ . downcast_ref :: < MapArray > ( )
4368+ . unwrap_or_else ( || panic ! ( "{action_name}.{map_field} should be a MapArray" ) ) ;
4369+ // Re-validate the entries StructArray with its own schema, same as what Arrow's
4370+ // IPC deserializer does. Without the fix, this fails with:
4371+ // "Found unmasked nulls for non-nullable StructArray field 'value'"
4372+ let entries = map_array. entries ( ) ;
4373+ StructArray :: try_new (
4374+ entries. fields ( ) . clone ( ) ,
4375+ entries. columns ( ) . to_vec ( ) ,
4376+ entries. nulls ( ) . cloned ( ) ,
4377+ )
4378+ . unwrap_or_else ( |e| {
4379+ panic ! (
4380+ "{action_name}.{map_field} entries StructArray re-validation failed: {e}. \
4381+ This means the schema has non-nullable value field but the data has nulls."
4382+ )
4383+ } ) ;
4384+ found = true ;
4385+ }
4386+ assert ! ( found, "Should have found a {action_name} action batch" ) ;
4387+ }
0 commit comments