|
| 1 | +//! Utilities for Unity Catalog catalog-managed table creation. |
| 2 | +//! |
| 3 | +//! These utilities help connectors create UC-managed tables by providing the required properties |
| 4 | +//! for both the Delta log (disk) and the UC server registration. |
| 5 | +//! |
| 6 | +//! # Usage |
| 7 | +//! |
| 8 | +//! ```ignore |
| 9 | +//! // Step 1: Get staging info from UC |
| 10 | +//! let staging_info = my_uc_client.get_staging_table(..); |
| 11 | +//! |
| 12 | +//! // Step 2: Build and commit the create-table transaction |
| 13 | +//! let disk_props = get_required_properties_for_disk(staging_info.table_id); |
| 14 | +//! let create_table_txn = kernel::create_table(path, schema, "MyApp/1.0") |
| 15 | +//! .with_table_properties(disk_props) |
| 16 | +//! .build(engine, committer); |
| 17 | +//! let result = create_table_txn.commit(engine); |
| 18 | +//! |
| 19 | +//! // Step 3: Finalize table in UC |
| 20 | +//! let snapshot = /* load post-commit snapshot at version 0 */; |
| 21 | +//! let uc_props = get_final_required_properties_for_uc(&snapshot, engine)?; |
| 22 | +//! my_uc_client.create_table(.., uc_props); |
| 23 | +//! ``` |
| 24 | +
|
| 25 | +use std::collections::HashMap; |
| 26 | + |
| 27 | +use delta_kernel::{Engine, Snapshot}; |
| 28 | + |
| 29 | +use crate::constants::{ |
| 30 | + CATALOG_MANAGED_FEATURE_KEY, FEATURE_SUPPORTED, METASTORE_LAST_COMMIT_TIMESTAMP, |
| 31 | + METASTORE_LAST_UPDATE_VERSION, UC_TABLE_ID_KEY, VACUUM_PROTOCOL_CHECK_FEATURE_KEY, |
| 32 | +}; |
| 33 | + |
| 34 | +/// Returns the table properties that must be written to disk (in `000.json`) for a UC |
| 35 | +/// catalog-managed table creation. |
| 36 | +/// |
| 37 | +/// These properties must be persisted in the Delta log so that the table is recognized as |
| 38 | +/// catalog-managed. Note: ICT enablement is handled automatically by kernel's CREATE TABLE |
| 39 | +/// when the `catalogManaged` feature is present. |
| 40 | +pub fn get_required_properties_for_disk(uc_table_id: &str) -> HashMap<String, String> { |
| 41 | + [ |
| 42 | + (CATALOG_MANAGED_FEATURE_KEY, FEATURE_SUPPORTED), |
| 43 | + (VACUUM_PROTOCOL_CHECK_FEATURE_KEY, FEATURE_SUPPORTED), |
| 44 | + (UC_TABLE_ID_KEY, uc_table_id), |
| 45 | + ] |
| 46 | + .into_iter() |
| 47 | + .map(|(k, v)| (k.to_string(), v.to_string())) |
| 48 | + .collect() |
| 49 | +} |
| 50 | + |
| 51 | +/// Extracts the properties that must be sent to the UC server when finalizing a table creation. |
| 52 | +/// |
| 53 | +/// These properties are derived from the post-commit snapshot (after `000.json` has |
| 54 | +/// been written). The connector should pass these to the UC `create_table` API. |
| 55 | +/// |
| 56 | +/// # Properties returned |
| 57 | +/// |
| 58 | +/// - All entries from `Metadata.configuration` (includes `io.unitycatalog.tableId`, user props) |
| 59 | +/// - `delta.minReaderVersion` and `delta.minWriterVersion` |
| 60 | +/// - `delta.feature.<name> = "supported"` for every reader and writer table feature |
| 61 | +/// - `delta.lastUpdateVersion` -- the snapshot version |
| 62 | +/// - `delta.lastCommitTimestamp` -- the snapshot's in-commit timestamp (requires ICT enabled) |
| 63 | +/// - `clusteringColumns` -- JSON-serialized clustering columns (if clustering is enabled) |
| 64 | +/// |
| 65 | +/// # Clustering columns |
| 66 | +/// |
| 67 | +/// Clustering columns are returned as logical column names. When column mapping is enabled, |
| 68 | +/// the physical names stored in domain metadata are converted to logical names using the |
| 69 | +/// table schema. |
| 70 | +pub fn get_final_required_properties_for_uc( |
| 71 | + snapshot: &Snapshot, |
| 72 | + engine: &dyn Engine, |
| 73 | +) -> delta_kernel::DeltaResult<HashMap<String, String>> { |
| 74 | + if snapshot.version() != 0 { |
| 75 | + return Err(delta_kernel::Error::generic(format!( |
| 76 | + "get_final_required_properties_for_uc is only valid for version 0 (table creation) \ |
| 77 | + snapshots, but snapshot is at version {}", |
| 78 | + snapshot.version() |
| 79 | + ))); |
| 80 | + } |
| 81 | + |
| 82 | + // Start with metadata configuration (user + delta properties) |
| 83 | + let mut properties = snapshot.metadata_configuration().clone(); |
| 84 | + |
| 85 | + // Protocol-derived properties (versions + feature signals) |
| 86 | + properties.extend(snapshot.get_protocol_derived_properties()); |
| 87 | + |
| 88 | + // UC-specific properties |
| 89 | + properties.insert( |
| 90 | + METASTORE_LAST_UPDATE_VERSION.to_string(), |
| 91 | + snapshot.version().to_string(), |
| 92 | + ); |
| 93 | + let timestamp = snapshot.get_in_commit_timestamp(engine)?.ok_or_else(|| { |
| 94 | + delta_kernel::Error::generic( |
| 95 | + "In-commit timestamp is required for UC catalog-managed tables but was not found", |
| 96 | + ) |
| 97 | + })?; |
| 98 | + properties.insert( |
| 99 | + METASTORE_LAST_COMMIT_TIMESTAMP.to_string(), |
| 100 | + timestamp.to_string(), |
| 101 | + ); |
| 102 | + |
| 103 | + // Clustering columns as logical names (if present) |
| 104 | + if let Some(columns) = snapshot.get_logical_clustering_columns(engine)? { |
| 105 | + let column_arrays: Vec<Vec<&str>> = columns |
| 106 | + .iter() |
| 107 | + .map(|c| c.path().iter().map(|s| s.as_str()).collect()) |
| 108 | + .collect(); |
| 109 | + let json = serde_json::to_string(&column_arrays).map_err(|e| { |
| 110 | + delta_kernel::Error::generic(format!("Failed to serialize clustering columns: {e}")) |
| 111 | + })?; |
| 112 | + properties.insert("clusteringColumns".to_string(), json); |
| 113 | + } |
| 114 | + |
| 115 | + Ok(properties) |
| 116 | +} |
| 117 | + |
| 118 | +#[cfg(test)] |
| 119 | +mod tests { |
| 120 | + use super::*; |
| 121 | + |
| 122 | + use std::sync::Arc; |
| 123 | + |
| 124 | + use delta_kernel::committer::{CommitMetadata, CommitResponse, Committer, PublishMetadata}; |
| 125 | + use delta_kernel::engine::default::DefaultEngineBuilder; |
| 126 | + use delta_kernel::object_store::memory::InMemory; |
| 127 | + use delta_kernel::schema::{DataType, StructField, StructType}; |
| 128 | + use delta_kernel::snapshot::Snapshot; |
| 129 | + use delta_kernel::transaction::create_table::create_table; |
| 130 | + use delta_kernel::transaction::data_layout::DataLayout; |
| 131 | + use delta_kernel::{DeltaResult, Engine, FileMeta, FilteredEngineData}; |
| 132 | + |
| 133 | + /// A mock catalog committer that writes directly to the published path. |
| 134 | + struct MockCatalogCommitter; |
| 135 | + impl Committer for MockCatalogCommitter { |
| 136 | + fn commit( |
| 137 | + &self, |
| 138 | + engine: &dyn Engine, |
| 139 | + actions: Box<dyn Iterator<Item = DeltaResult<FilteredEngineData>> + Send + '_>, |
| 140 | + commit_metadata: CommitMetadata, |
| 141 | + ) -> DeltaResult<CommitResponse> { |
| 142 | + let path = commit_metadata.published_commit_path()?; |
| 143 | + engine |
| 144 | + .json_handler() |
| 145 | + .write_json_file(&path, Box::new(actions), false)?; |
| 146 | + Ok(CommitResponse::Committed { |
| 147 | + file_meta: FileMeta::new(path, commit_metadata.in_commit_timestamp(), 0), |
| 148 | + }) |
| 149 | + } |
| 150 | + fn is_catalog_committer(&self) -> bool { |
| 151 | + true |
| 152 | + } |
| 153 | + fn publish(&self, _: &dyn Engine, _: PublishMetadata) -> DeltaResult<()> { |
| 154 | + Ok(()) |
| 155 | + } |
| 156 | + } |
| 157 | + |
| 158 | + #[test] |
| 159 | + fn test_get_required_properties_for_disk() { |
| 160 | + let props = get_required_properties_for_disk("my-uc-table-123"); |
| 161 | + assert_eq!(props.len(), 3); |
| 162 | + assert_eq!(props["delta.feature.catalogManaged"], "supported"); |
| 163 | + assert_eq!(props["delta.feature.vacuumProtocolCheck"], "supported"); |
| 164 | + assert_eq!(props["io.unitycatalog.tableId"], "my-uc-table-123"); |
| 165 | + } |
| 166 | + |
| 167 | + #[tokio::test] |
| 168 | + async fn test_get_final_required_properties_for_uc() { |
| 169 | + let storage = Arc::new(InMemory::new()); |
| 170 | + let engine = DefaultEngineBuilder::new(storage).build(); |
| 171 | + let table_path = "memory:///test_table/"; |
| 172 | + let schema = Arc::new( |
| 173 | + StructType::try_new(vec![ |
| 174 | + StructField::new("id", DataType::INTEGER, false), |
| 175 | + StructField::new("region", DataType::STRING, true), |
| 176 | + ]) |
| 177 | + .unwrap(), |
| 178 | + ); |
| 179 | + |
| 180 | + // Create a UC catalog-managed table with clustering |
| 181 | + let disk_props = get_required_properties_for_disk("test-table-id-456"); |
| 182 | + let _ = create_table(table_path, schema, "Test/1.0") |
| 183 | + .with_table_properties(disk_props) |
| 184 | + .with_data_layout(DataLayout::clustered(["region"])) |
| 185 | + .build(&engine, Box::new(MockCatalogCommitter)) |
| 186 | + .unwrap() |
| 187 | + .commit(&engine) |
| 188 | + .unwrap(); |
| 189 | + |
| 190 | + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); |
| 191 | + assert_eq!(snapshot.version(), 0); |
| 192 | + let uc_props = get_final_required_properties_for_uc(&snapshot, &engine).unwrap(); |
| 193 | + |
| 194 | + // Protocol-derived properties |
| 195 | + assert_eq!(uc_props["delta.minReaderVersion"], "3"); |
| 196 | + assert_eq!(uc_props["delta.minWriterVersion"], "7"); |
| 197 | + assert_eq!(uc_props["delta.feature.catalogManaged"], "supported"); |
| 198 | + assert_eq!(uc_props["delta.feature.vacuumProtocolCheck"], "supported"); |
| 199 | + assert_eq!(uc_props["delta.feature.inCommitTimestamp"], "supported"); |
| 200 | + assert_eq!(uc_props["delta.feature.clustering"], "supported"); |
| 201 | + |
| 202 | + // Metadata configuration |
| 203 | + assert_eq!(uc_props["io.unitycatalog.tableId"], "test-table-id-456"); |
| 204 | + |
| 205 | + // UC-specific properties |
| 206 | + assert_eq!(uc_props["delta.lastUpdateVersion"], "0"); |
| 207 | + let timestamp: i64 = uc_props["delta.lastCommitTimestamp"] |
| 208 | + .parse() |
| 209 | + .expect("timestamp should be a valid i64"); |
| 210 | + assert!( |
| 211 | + timestamp > 0, |
| 212 | + "ICT timestamp should be non-zero, got {timestamp}" |
| 213 | + ); |
| 214 | + |
| 215 | + // Clustering columns: serialized as [[col1], [col2]] (array of path arrays) |
| 216 | + let parsed: Vec<Vec<String>> = |
| 217 | + serde_json::from_str(&uc_props["clusteringColumns"]).unwrap(); |
| 218 | + assert_eq!(parsed, vec![vec!["region"]]); |
| 219 | + } |
| 220 | + |
| 221 | + #[tokio::test] |
| 222 | + async fn test_clustering_columns_serialization_multiple_and_nested() { |
| 223 | + let storage = Arc::new(InMemory::new()); |
| 224 | + let engine = DefaultEngineBuilder::new(storage).build(); |
| 225 | + let table_path = "memory:///test_clustering_ser/"; |
| 226 | + let address_struct = StructType::new_unchecked(vec![ |
| 227 | + StructField::new("city", DataType::STRING, true), |
| 228 | + StructField::new("zip", DataType::STRING, true), |
| 229 | + ]); |
| 230 | + let schema = Arc::new( |
| 231 | + StructType::try_new(vec![ |
| 232 | + StructField::new("id", DataType::INTEGER, false), |
| 233 | + StructField::new("region", DataType::STRING, true), |
| 234 | + StructField::new("address", DataType::Struct(Box::new(address_struct)), true), |
| 235 | + ]) |
| 236 | + .unwrap(), |
| 237 | + ); |
| 238 | + |
| 239 | + use delta_kernel::expressions::ColumnName; |
| 240 | + |
| 241 | + let disk_props = get_required_properties_for_disk("test-table-id"); |
| 242 | + let _ = create_table(table_path, schema, "Test/1.0") |
| 243 | + .with_table_properties(disk_props) |
| 244 | + .with_data_layout(DataLayout::Clustered { |
| 245 | + columns: vec![ |
| 246 | + ColumnName::new(["region"]), |
| 247 | + ColumnName::new(["address", "city"]), |
| 248 | + ], |
| 249 | + }) |
| 250 | + .build(&engine, Box::new(MockCatalogCommitter)) |
| 251 | + .unwrap() |
| 252 | + .commit(&engine) |
| 253 | + .unwrap(); |
| 254 | + |
| 255 | + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); |
| 256 | + let uc_props = get_final_required_properties_for_uc(&snapshot, &engine).unwrap(); |
| 257 | + |
| 258 | + // Clustering columns serialized as array of path arrays: |
| 259 | + // [["region"], ["address", "city"]] |
| 260 | + let raw_json = &uc_props["clusteringColumns"]; |
| 261 | + let parsed: Vec<Vec<String>> = serde_json::from_str(raw_json).unwrap(); |
| 262 | + assert_eq!( |
| 263 | + parsed, |
| 264 | + vec![vec!["region"], vec!["address", "city"]], |
| 265 | + "Raw JSON: {raw_json}" |
| 266 | + ); |
| 267 | + } |
| 268 | + |
| 269 | + #[tokio::test] |
| 270 | + async fn test_get_final_required_properties_for_uc_rejects_non_zero_version() { |
| 271 | + let storage = Arc::new(InMemory::new()); |
| 272 | + let engine = DefaultEngineBuilder::new(storage).build(); |
| 273 | + let table_path = "memory:///test_version_check/"; |
| 274 | + let schema = Arc::new( |
| 275 | + StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)]).unwrap(), |
| 276 | + ); |
| 277 | + |
| 278 | + // Create a table (version 0) and append (version 1) |
| 279 | + let disk_props = get_required_properties_for_disk("test-table-id"); |
| 280 | + let _ = create_table(table_path, schema, "Test/1.0") |
| 281 | + .with_table_properties(disk_props) |
| 282 | + .build(&engine, Box::new(MockCatalogCommitter)) |
| 283 | + .unwrap() |
| 284 | + .commit(&engine) |
| 285 | + .unwrap(); |
| 286 | + let v0_snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); |
| 287 | + let result = v0_snapshot |
| 288 | + .transaction(Box::new(MockCatalogCommitter), &engine) |
| 289 | + .unwrap() |
| 290 | + .commit(&engine) |
| 291 | + .unwrap(); |
| 292 | + assert!(result.is_committed()); |
| 293 | + |
| 294 | + // Load snapshot at version 1 |
| 295 | + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); |
| 296 | + assert_eq!(snapshot.version(), 1); |
| 297 | + |
| 298 | + // Should fail because version != 0 |
| 299 | + let err = get_final_required_properties_for_uc(&snapshot, &engine).unwrap_err(); |
| 300 | + assert!( |
| 301 | + err.to_string().contains("version 0"), |
| 302 | + "expected version 0 error, got: {err}" |
| 303 | + ); |
| 304 | + } |
| 305 | +} |
0 commit comments