From c879746167f5fcccc34e85862b843878e4e0cf03 Mon Sep 17 00:00:00 2001 From: Beforerr Date: Thu, 4 Jun 2026 22:22:04 -0700 Subject: [PATCH] feat: juliac --trim compatibility for the metadata path + trim test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the "open + inspect metadata" workload build cleanly under `juliac --trim=safe`: - Annotate every record-parsing function's record-size argument as `::Type{T} where T`. Julia does not specialize on bare/`::Type` value args (Type/Function/Vararg heuristic), so they were boxed as abstract `Type`, making every `sizeof`/`read_be` a dynamic call the trim verifier rejects. - Move file-level compression from the `CDFDataset{CT,FST}` type parameter to a plain field. It is metadata only (the buffer is already decompressed) and nothing dispatches on it; as a type param the version×compression product was a 6-way union exceeding the split limit, forcing everything through abstract `CDFDataset`. - Replace `open(f, name, mode) do` (varargs splat via `_apply_iterate`, trim-hostile) with explicit open + try/finally. - Reorder `getproperty` so real field accesses short-circuit before the lazy `attrib`/`adr` branches, so a trim build that never reads attributes does not drag in the heterogeneous attribute machinery. - Branch on the version literal so the record-size type reaches the loader as a concrete type parameter rather than a widened `Type` ternary. Behavior-preserving: Aqua, JET, and the existing suite pass. Add test/trim.jl (HTTP.jl/SciML-style): invoke JuliaC from the active test project, build a `--trim=safe` probe in a temp directory, run it. Skipped on Julia versions before 1.12, where trim compilation is unavailable. Generic variable *data* reading through `ds[name]` stays out of scope — eltype, ndims and VDR type are read from the file at runtime and cannot be statically resolved. The typed `read!(ds, name, dest)` / `read(ds, name, Array{T, N})` entry points added here take eltype and ndims from the caller instead, so they are exercised by the trim probe. 
--- README.md | 21 ++++++------ benchmark/benchmarks.jl | 2 -- src/dataset.jl | 58 ++++++++++++++++---------------- src/decompress.jl | 2 +- src/loading/attribute.jl | 4 +-- src/loading/variable.jl | 46 +++++++++++++++++++++++-- src/parsing.jl | 43 +++--------------------- src/precompile.jl | 4 +-- src/records/adr.jl | 2 +- src/records/aedr.jl | 2 +- src/records/ccr.jl | 5 ++- src/records/cdr.jl | 2 +- src/records/cpr.jl | 5 ++- src/records/cvvr.jl | 4 +-- src/records/records.jl | 10 +++--- src/records/vdr.jl | 24 ++++++++++++++ src/records/vvr.jl | 4 +-- src/records/vxr.jl | 2 +- test/Project.toml | 3 +- test/cdf_trim_probe.jl | 17 ++++++++++ test/runtests.jl | 37 +++++++++++++++++++-- test/trim.jl | 72 ++++++++++++++++++++++++++++++++++++++++ 22 files changed, 259 insertions(+), 110 deletions(-) create mode 100644 test/cdf_trim_probe.jl create mode 100644 test/trim.jl diff --git a/README.md b/README.md index 2070d13..3450e8b 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,18 @@ # CommonDataFormat.jl +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliaspacephysics.github.io/CommonDataFormat.jl/dev/) [![DOI](https://zenodo.org/badge/1057373325.svg)](https://doi.org/10.5281/zenodo.17517061) [![version](https://juliahub.com/docs/General/CommonDataFormat/stable/version.svg)](https://juliahub.com/ui/Packages/General/CommonDataFormat) [![Build Status](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl) -A Julia package for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. 
See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for a high-level interface. - -**Installation**: at the Julia REPL, run `using Pkg; Pkg.add("CommonDataFormat")` - -**Documentation**: [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliaspacephysics.github.io/CommonDataFormat.jl/dev/) - -## Features - -- **Pure Julia implementation** - No external dependencies on CDF libraries -- **Efficient data access** - Memory-mapped access for data and attributes, super fast decompression using [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl) -- **[DiskArrays.jl](https://github.com/JuliaIO/DiskArrays.jl) integration** - Lazy representation of data on hard disk with AbstractDiskArray interface +A Julia package for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for high-level interfaces. 
## Quick Start ```julia +using Pkg; Pkg.add("CommonDataFormat") using CommonDataFormat # Load a CDF file @@ -38,6 +30,13 @@ println("Variables: ", keys(cdf)) var = cdf["temperature"] ``` +## Features + +- **Pure Julia implementation** - No external dependencies on CDF libraries +- **Efficient data access** - Memory-mapped access for data and attributes, super fast decompression using [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl) +- **[DiskArrays.jl](https://github.com/JuliaIO/DiskArrays.jl) integration** - Lazy representation of data on hard disk with AbstractDiskArray interface + + ## Elsewhere - [CDFpp](https://github.com/SciQLop/CDFpp): A modern C++ header only cdf library with Python bindings diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 015deab..719ffdb 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -4,10 +4,8 @@ using Downloads const SUITE = BenchmarkGroup() -# elb file is committed to the repo, so it ships with the package at any rev const ELX_FILE = joinpath(pkgdir(CommonDataFormat), "data", "elb_l2_epdef_20210914_v01.cdf") -# mms file (50 MB) is gitignored; download once and cache across revisions function download_data(url, filename = basename(url)) dir = joinpath(tempdir(), "CommonDataFormat_benchmark_data") mkpath(dir) diff --git a/src/dataset.jl b/src/dataset.jl index ebb253f..06aa7e8 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,14 +1,15 @@ -struct CDFDataset{CT, FST} +struct CDFDataset{FST} filename::String cdr::CDR{FST} gdr::GDR{FST} buffer::Vector{UInt8} + compression::CompressionType end Base.parent(cdf::CDFDataset) = getfield(cdf, :buffer) GDR(cdf::CDFDataset) = getfield(cdf, :gdr) filename(cdf::CDFDataset) = getfield(cdf, :filename) -recordsize_type(::CDFDataset{CT, RS}) where {CT, RS} = RS +recordsize_type(::CDFDataset{RS}) where {RS} = RS """ CDFDataset(filename) @@ -22,21 +23,27 @@ cdf = CDFDataset("data.cdf") """ function CDFDataset(filename) fname = String(filename) - 
return open(fname, "r") do io + # `open(f, name, mode) do` form: routes through varargs splatting (`_apply_iterate`) which `juliac --trim` can't resolve. + io = open(fname, "r") + try buffer = Mmap.mmap(io) magic_bytes = read_be(buffer, 1, UInt32) @assert validate_cdf_magic(magic_bytes) + return is_cdf_v3(magic_bytes) ? _load_dataset(fname, buffer, Int64) : + _load_dataset(fname, buffer, Int32) + finally + close(io) + end +end - FieldSizeType = is_cdf_v3(magic_bytes) ? Int64 : Int32 - compression = NoCompression - if is_compressed(read_be(buffer, 5, UInt32)) - buffer, compression = decompress_bytes(buffer, FieldSizeType) - end - # Parse CDF header - cdr = CDR(buffer, 8, FieldSizeType) - gdr = GDR(buffer, Int(cdr.gdr_offset), FieldSizeType) - return CDFDataset{compression, FieldSizeType}(fname, cdr, gdr, buffer) +function _load_dataset(fname, buffer, ::Type{FieldSizeType}) where {FieldSizeType} + compression = NoCompression + if is_compressed(read_be(buffer, 5, UInt32)) + buffer, compression = decompress_bytes(buffer, FieldSizeType) end + cdr = CDR(buffer, 8, FieldSizeType) + gdr = GDR(buffer, Int(cdr.gdr_offset), FieldSizeType) + return CDFDataset{FieldSizeType}(fname, cdr, gdr, buffer, compression) end is_big_endian_encoding(cdf::CDFDataset) = is_big_endian_encoding(cdf.cdr.encoding) @@ -45,23 +52,16 @@ is_compressed(magic_numbers::UInt32) = magic_numbers != 0x0000FFFF majority(cdf::CDFDataset) = majority(cdf.cdr) # Convenience accessors for the dataset with lazy loading -@inline function Base.getproperty(cdf::CDFDataset{CT, FST}, name::Symbol) where {CT, FST} +@inline function Base.getproperty(cdf::CDFDataset, name::Symbol) + # Real fields FIRST so internal accesses (`cdf.cdr`, `cdf.gdr`, …) short-circuit and + # never traverse the lazy `attrib` branches below. 
name in fieldnames(CDFDataset) && return getfield(cdf, name) - if name === :version - return version(cdf.cdr) - elseif name === :majority - return majority(cdf) - elseif name === :compression - return CT - elseif name === :adr - return ADR(parent(cdf), GDR(cdf).ADRhead, recordsize_type(cdf)) - elseif name === :attrib - return attrib(cdf) - elseif name === :vattrib - return attrib(cdf; predicate = !is_global) - else - throw(ArgumentError("Unknown property $name")) - end + name === :version && return version(getfield(cdf, :cdr)) + name === :majority && return majority(cdf) + name === :adr && return ADR(parent(cdf), GDR(cdf).ADRhead, recordsize_type(cdf)) + name === :attrib && return attrib(cdf) + name === :vattrib && return attrib(cdf; predicate = !is_global) + throw(ArgumentError("Unknown property $name")) end function find_vdr(cdf::CDFDataset, var_name::String) @@ -134,5 +134,5 @@ function Base.show(io::IO, m::MIME"text/plain", cdf::CDFDataset) return end -OffsetsIterator(cdf::CDFDataset) = +OffsetsIterator(cdf::CDFDataset) = OffsetsIterator{recordsize_type(cdf)}(cdf.buffer, cdf.gdr.ADRhead) diff --git a/src/decompress.jl b/src/decompress.jl index cee4d9d..ab728c2 100644 --- a/src/decompress.jl +++ b/src/decompress.jl @@ -2,7 +2,7 @@ include("decompress/rle.jl") include("decompress/gzip.jl") -function decompress_bytes(buffer, RecordSizeType) +function decompress_bytes(buffer, ::Type{RecordSizeType}) where {RecordSizeType} ccr = CCR(buffer, 8, RecordSizeType) cpr = CPR(buffer, Int(ccr.cpr_offset), RecordSizeType) compression = CompressionType(cpr.compression_type) diff --git a/src/loading/attribute.jl b/src/loading/attribute.jl index bf5a97e..30eb959 100644 --- a/src/loading/attribute.jl +++ b/src/loading/attribute.jl @@ -2,7 +2,7 @@ # Handles loading of ADR (Attribute Descriptor Record) and AEDR (Attribute Entry Descriptor Record) chains # Load all attribute entries for a given attribute from its AEDRs. 
-@inline function load_attribute_entries(buffer::Vector{UInt8}, adr, RecordSizeType, needs_byte_swap) +@inline function load_attribute_entries(buffer::Vector{UInt8}, adr, ::Type{RecordSizeType}, needs_byte_swap) where {RecordSizeType} head = max(adr.AgrEDRhead, adr.AzEDRhead) offsets = OffsetsIterator{RecordSizeType}(buffer, head) return map(offsets) do offset @@ -122,7 +122,7 @@ function _get_attributes(name, value, cdf) return value end -@inline function _search_aedr_entries(source, aedr_head, RecordSizeType, needs_byte_swap, target_varnum) +@inline function _search_aedr_entries(source, aedr_head, ::Type{RecordSizeType}, needs_byte_swap, target_varnum) where {RecordSizeType} aedr_head == 0 && return nothing offset = Int(aedr_head) _num_offset = 13 + 2 * sizeof(RecordSizeType) diff --git a/src/loading/variable.jl b/src/loading/variable.jl index 2ae956a..e362dea 100644 --- a/src/loading/variable.jl +++ b/src/loading/variable.jl @@ -48,6 +48,48 @@ function variable(cdf::CDFDataset, name) ) end +""" + read!(ds::CDFDataset, name, dest::AbstractArray{T, N}) -> dest + +Read the full contents of variable `name` into the preallocated `dest`. + +Statically-typed entry point: `T` and `N` come from `dest` instead of the file, so — +unlike `ds[name]`, whose type is only known at runtime — the call chain is resolvable +at compile time and survives `juliac --trim`. +""" +function Base.read!(ds::CDFDataset, name::String, dest::AbstractArray{T, N}) where {T, N} + vdr = find_vdr(ds, name) + isnothing(vdr) && throw(KeyError(name)) + return _read_full!(dest, ds, name, vdr) +end + +""" + read(ds::CDFDataset, name, ::Type{Array{T, N}}) -> Array{T, N} + +Allocating variant of [`read!`](@ref): read the full contents of variable `name` into a +freshly allocated `Array{T, N}`. 
+""" +function Base.read(ds::CDFDataset, name::String, ::Type{Array{T, N}}) where {T, N} + vdr = find_vdr(ds, name) + isnothing(vdr) && throw(KeyError(name)) + dims = (map(Int, record_sizes(vdr, Val(N - 1)))..., Int(vdr.max_rec) + 1) + return _read_full!(Array{T, N}(undef, dims), ds, name, vdr) +end + +function _read_full!(dest::AbstractArray{T, N}, ds, name, vdr) where {T, N} + Base.require_one_based_indexing(dest) + Tfile = julia_type(vdr.data_type, vdr.num_elems) + T === Tfile || throw(ArgumentError("element type mismatch for \"$name\": file has $Tfile, destination has $T")) + dims = (map(Int, record_sizes(vdr, Val(N - 1)))..., Int(vdr.max_rec) + 1) + size(dest) == dims || throw(DimensionMismatch("variable \"$name\" has size $dims, destination has size $(size(dest))")) + var = CDFVariable{T, N, typeof(vdr), typeof(ds)}(name, vdr, ds, dims) + DiskArrays.readblock!(var, dest, axes(dest)...) + return dest +end + +@inline _record_view(A::AbstractArray{<:Any, M}, r) where {M} = + view(A, ntuple(_ -> Colon(), M - 1)..., r) + function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, ranges::Vararg{AbstractUnitRange{<:Integer}, N}; nbuffers = nthreads()) where {T, N} N > 0 && @boundscheck checkbounds(var, ranges...) 
isempty(dest) && return dest @@ -100,7 +142,7 @@ function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, r if is_full_record && entry.first >= first_rec && entry.last <= last_rec # full entry dest_range = dst_src_ranges(first_rec, last_rec, entry)[1] - dest_view = selectdim(dest, N, dest_range) + dest_view = _record_view(dest, dest_range) total_elems = record_size * length(entry) decompressor = take!(decompressors()) load_cvvr_data!(dest_view, 1, buffer, entry.offset, total_elems, RecordSizeType, compression; decompressor) @@ -109,7 +151,7 @@ function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, r else # partial entry (dest_range, local_range) = dst_src_ranges(first_rec, last_rec, entry) - dest_view = selectdim(dest, N, dest_range) + dest_view = _record_view(dest, dest_range) n_records = length(entry) total_elems = record_size * n_records chunk = Vector{T}(undef, total_elems) diff --git a/src/parsing.jl b/src/parsing.jl index 2dfa603..b983251 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -21,6 +21,11 @@ end return ntuple(j -> read_be(v, i + (j - 1) * S, T), n) end +@inline function read_be(v::Vector{UInt8}, i, ::Val{M}, T) where {M} + S = sizeof(T) + return ntuple(j -> read_be(v, i + (j - 1) * S, T), Val(M)) +end + @inline function read_be_i(v::Vector{UInt8}, i, T::Base.DataType) return read_be(v, i, T), i + _sizeof(T) end @@ -32,58 +37,20 @@ end const name_bytes_buffer = Vector{UInt8}(undef, 256) -""" - @read_be_fields buffer pos T1 T2 ... - -Unrolls sequential big-endian reads starting at `pos` within `buffer`. -Returns a tuple of the parsed values and the updated position, mirroring -`read_be_i` but without the runtime `ntuple`/offset bookkeeping. - -# Example - -```julia -values, next = @read_be_fields buf pos UInt32 Int16 -``` -""" -macro read_be_fields(buffer, pos, Ts...) 
- isempty(Ts) && error("@read_be_fields requires at least one field type") - - types = flatten_field_types(__module__, Ts) - buf = esc(buffer) - start = esc(pos) - pos_sym = gensym(:pos) - value_syms = [gensym(:field) for _ in types] - - stmts = Any[:(local $pos_sym = $start)] - for (sym, T) in zip(value_syms, types) - Tesc = esc(T) - push!(stmts, :(local $sym = read_be($buf, $pos_sym, $Tesc))) - push!(stmts, :($pos_sym += _sizeof($Tesc))) - end - - tuple_expr = Expr(:tuple, value_syms...) - push!(stmts, :(($tuple_expr, $pos_sym))) - - return Expr(:block, stmts...) -end - # Optimized version using loop unrolling for better performance @generated function read_be_fields(buffer::Vector{UInt8}, pos::Integer, ::Type{SType}, ::Val{indxs}) where {SType, indxs} exprs = Expr[] value_syms = [gensym(:field) for _ in 1:length(indxs)] pos_sym = gensym(:pos) - # Initialize position push!(exprs, :(local $pos_sym = pos)) - # Read each field for (i, idx) in enumerate(indxs) T = fieldtype(SType, idx) push!(exprs, :(local $(value_syms[i]) = read_be(buffer, $pos_sym, $T))) push!(exprs, :($pos_sym += _sizeof($T))) end - # Return tuple of values and final position tuple_expr = Expr(:tuple, value_syms...) 
push!(exprs, :(($tuple_expr, $pos_sym))) diff --git a/src/precompile.jl b/src/precompile.jl index 361b587..691b748 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,6 +1,6 @@ -precompile(Array, (CDFVariable{TT2000, 1, VDR{Int64}, CDFDataset{NoCompression, Int64}},)) +precompile(Array, (CDFVariable{TT2000, 1, VDR{Int64}, CDFDataset{Int64}},)) for T in (Float32, Float64), i in 1:3 - precompile(Array, (CDFVariable{T, i, VDR{Int64}, CDFDataset{NoCompression, Int64}},)) + precompile(Array, (CDFVariable{T, i, VDR{Int64}, CDFDataset{Int64}},)) end PrecompileTools.@setup_workload begin diff --git a/src/records/adr.jl b/src/records/adr.jl index e8c5064..f45334c 100644 --- a/src/records/adr.jl +++ b/src/records/adr.jl @@ -29,7 +29,7 @@ is_global(buffer, offset, ::Type{Int64}) = read_be(buffer, offset + 29, Int32) = Load an Attribute Descriptor Record from the buffer at the specified position. """ -@inline function ADR(buffer::Vector{UInt8}, offset, RecordSizeType) +@inline function ADR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}) where {RecordSizeType} pos = check_record_type(4, buffer, offset, RecordSizeType) # Read ADR fields fields, pos = read_be_fields(buffer, pos, ADR{RecordSizeType, String}, Val(1:11)) diff --git a/src/records/aedr.jl b/src/records/aedr.jl index 4499e4a..e90b345 100644 --- a/src/records/aedr.jl +++ b/src/records/aedr.jl @@ -21,7 +21,7 @@ struct AEDR{FST, A} Value::A # This consists of the number of elements (specified by the NumElems field) of the data type (specified by the DataType field). This can be thought of as a 1-dimensional array of values (stored contiguously). The size of this field is the product of the number of elements and the size in bytes of each element. 
end -function load_aedr_data(buffer::Vector{UInt8}, offset, RecordSizeType, needs_byte_swap) +function load_aedr_data(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}, needs_byte_swap) where {RecordSizeType} _datatype_offset = 9 + 2 * sizeof(RecordSizeType) _numelems_offset = 17 + 2 * sizeof(RecordSizeType) _data_offset = 41 + 2 * sizeof(RecordSizeType) diff --git a/src/records/ccr.jl b/src/records/ccr.jl index 1dd6373..e20a7c3 100644 --- a/src/records/ccr.jl +++ b/src/records/ccr.jl @@ -1,5 +1,4 @@ struct CCR <: Record - header::Header cpr_offset::UInt64 uncompressed_size::UInt64 # uSize Size of the CDF in its uncompressed form. This byte count does NOT include the 8-byte magic numbers, and 16-byte checksum if it exists. rfu_a::RInt32 @@ -7,7 +6,7 @@ struct CCR <: Record data_length::Int end -@inline function CCR(buffer::Vector{UInt8}, offset, RecordSizeType) +@inline function CCR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}) where {RecordSizeType} pos = offset + 1 header = Header(buffer, pos, RecordSizeType) @assert header.record_type == 10 "Invalid CCR record type" @@ -18,7 +17,7 @@ end record_end = offset + header.record_size data_length = record_end - data_offset @assert data_length >= 0 "Invalid CCR data length" - return CCR(header, UInt64(cpr_offset), UInt64(uncompressed_size), rfu_a, data_offset, data_length) + return CCR(UInt64(cpr_offset), UInt64(uncompressed_size), rfu_a, data_offset, data_length) end @inline function data_view(ccr::CCR, buffer::Vector{UInt8}) diff --git a/src/records/cdr.jl b/src/records/cdr.jl index 01db15a..f808156 100644 --- a/src/records/cdr.jl +++ b/src/records/cdr.jl @@ -27,7 +27,7 @@ is_cdf_v3(cdr::CDR) = cdr.version == 3 Load a CDF Descriptor Record from the IO stream at the specified offset. This follows the CDF specification for CDR record structure. 
""" -@inline function CDR(buffer::Vector{UInt8}, offset, FieldSizeT) +@inline function CDR(buffer::Vector{UInt8}, offset, ::Type{FieldSizeT}) where {FieldSizeT} pos = check_record_type(1, buffer, offset, FieldSizeT) # Read remaining CDR fields in order as per CDF specification fields, pos = read_be_fields(buffer, pos, CDR{FieldSizeT}, Val(1:9)) diff --git a/src/records/cpr.jl b/src/records/cpr.jl index 2c03919..036dc7e 100644 --- a/src/records/cpr.jl +++ b/src/records/cpr.jl @@ -7,10 +7,9 @@ struct CPR <: Record # parameters::Tuple{Vararg{Int32}} end -@inline function CPR(buffer::Vector{UInt8}, offset, FieldSizeT) +@inline function CPR(buffer::Vector{UInt8}, offset, ::Type{FieldSizeT}) where {FieldSizeT} pos = check_record_type(11, buffer, offset, FieldSizeT) - fields, pos = @read_be_fields(buffer, pos, fieldtypes(CPR)...) - # parameter_count, pos = read_be_i(buffer, pos, Int32) + fields, pos = read_be_fields(buffer, pos, CPR, Val(1:3)) # parameters = read_be(buffer, pos, parameter_count, Int32) return CPR(fields...) 
end diff --git a/src/records/cvvr.jl b/src/records/cvvr.jl index 6521e3e..211796a 100644 --- a/src/records/cvvr.jl +++ b/src/records/cvvr.jl @@ -3,7 +3,7 @@ struct CVVR <: Record data_offset::Int end -@inline function CVVR(buffer::Vector{UInt8}, offset, RecordSizeType; check = false) +@inline function CVVR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}; check = false) where {RecordSizeType} pos = offset + 1 + sizeof(RecordSizeType) record_type, pos = read_be_i(buffer, pos, Int32) check && @assert record_type == 13 "Invalid CVVR record type" @@ -12,7 +12,7 @@ end return CVVR(Int64(cSize), pos) end -function load_cvvr_data!(data, pos, src::Vector{UInt8}, offset, N, RecordSizeType, compression::CompressionType; decompressor = Decompressor()) +function load_cvvr_data!(data, pos, src::Vector{UInt8}, offset, N, ::Type{RecordSizeType}, compression::CompressionType; decompressor = Decompressor()) where {RecordSizeType} cvvr = CVVR(src, offset, RecordSizeType) decompress_bytes!(decompressor, data, pos, src, cvvr.data_offset, N, cvvr.cSize, compression) return diff --git a/src/records/records.jl b/src/records/records.jl index 00336ac..7e7629c 100644 --- a/src/records/records.jl +++ b/src/records/records.jl @@ -11,22 +11,22 @@ struct Header record_type::Int32 end -@inline function Header(buf::Vector{UInt8}, pos, FieldSizeT) +@inline function Header(buf::Vector{UInt8}, pos, ::Type{FieldSizeT}) where {FieldSizeT} record_size = Int64(read_be(buf, pos, FieldSizeT)) record_type = read_be(buf, pos + sizeof(FieldSizeT), Int32) return Header(record_size, record_type) end -get_record_type(buffer, offset, FieldSizeT) = read_be(buffer, offset + sizeof(FieldSizeT) + 1, Int32) +get_record_type(buffer, offset, ::Type{FieldSizeT}) where {FieldSizeT} = read_be(buffer, offset + sizeof(FieldSizeT) + 1, Int32) -@inline function check_record_type(record_type::Integer, buffer, offset, FieldSizeT) +@inline function check_record_type(record_type::Integer, buffer, offset, ::Type{FieldSizeT}) 
where {FieldSizeT} pos = offset + sizeof(FieldSizeT) + 1 header_type = read_be(buffer, pos, Int32) @assert header_type == record_type return pos + sizeof(Int32) end -@inline function check_record_type(record_types, buffer, offset, FieldSizeT) +@inline function check_record_type(record_types, buffer, offset, ::Type{FieldSizeT}) where {FieldSizeT} pos = offset + sizeof(FieldSizeT) + 1 header_type = read_be(buffer, pos, Int32) @assert header_type in record_types @@ -44,7 +44,6 @@ include("cpr.jl") include("ccr.jl") include("cvvr.jl") -# Utility functions to decode CDR flags """ decode_cdr_flags(flags::UInt32) @@ -65,7 +64,6 @@ function decode_cdr_flags(flags) ) end -# Pretty printing for CDR structure function Base.show(io::IO, cdr::CDR) flag_info = decode_cdr_flags(cdr.flags) diff --git a/src/records/vdr.jl b/src/records/vdr.jl index 89bd2e9..05dfdfc 100644 --- a/src/records/vdr.jl +++ b/src/records/vdr.jl @@ -89,6 +89,30 @@ end return read_be(vdr.buffer, vdr.pos, vdr.num_dims, Int32) end +# Static-arity variants for the typed `read!` path: the caller supplies the dimension +# count via `Val`, so tuple lengths stay inferable under `juliac --trim`. They avoid the +# runtime-length tuples (and `collect`/logical indexing for rVDR) of the methods above. 
+function record_sizes(vdr::VDR, ::Val{M}) where {M} + vdr.num_dims == M || + throw(DimensionMismatch("variable has $(vdr.num_dims) dimensions, expected $M")) + return read_be(vdr.buffer, vdr.pos, Val(M), Int32) +end + +function record_sizes(vdr::rVDR, ::Val{M}) where {M} + gdr = vdr.gdr + buf = vdr.buffer + sizes_pos = gdr.pos + sizeof(Int64) + 3 * sizeof(Int32) # mirrors `r_dim_sizes` + sizes = zeros(Int32, M) + count = 0 + for i in 1:Int(gdr.r_num_dims) + read_be(buf, vdr.pos + (i - 1) * 4, Int32) == 0 && continue + count += 1 + count <= M && (sizes[count] = read_be(buf, sizes_pos + (i - 1) * 4, Int32)) + end + count == M || throw(DimensionMismatch("variable has $count dimensions, expected $M")) + return ntuple(i -> sizes[i], Val(M)) +end + function Base.size(vdr::AbstractVDR) records = vdr.max_rec + 1 dims = (record_sizes(vdr)..., records) diff --git a/src/records/vvr.jl b/src/records/vvr.jl index 5388553..4866957 100644 --- a/src/records/vvr.jl +++ b/src/records/vvr.jl @@ -9,7 +9,7 @@ struct VVR{T} data::Vector{T} # Raw variable data end -@inline function VVR(buffer::Vector{UInt8}, offset, RecordSizeType, data) +@inline function VVR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}, data) where {RecordSizeType} pos = offset + 1 header = Header(buffer, pos, RecordSizeType) @assert header.record_type == 7 "Invalid VVR record type" @@ -25,7 +25,7 @@ function _copy_to!(dest, doffs, src, soffs, N) end end -function load_vvr_data!(data::Vector{T}, pos, src::Vector{UInt8}, offset, N, RecordSizeType) where {T} +function load_vvr_data!(data::Vector{T}, pos, src::Vector{UInt8}, offset, N, ::Type{RecordSizeType}) where {T, RecordSizeType} src_start = offset + 1 + sizeof(RecordSizeType) + sizeof(Int32) _copy_to!(data, pos, src, src_start, N) return diff --git a/src/records/vxr.jl b/src/records/vxr.jl index bf7b05c..89df75c 100644 --- a/src/records/vxr.jl +++ b/src/records/vxr.jl @@ -18,7 +18,7 @@ end Load a Variable Index Record from the source at the specified 
offset. """ -function VXR(source::Vector{UInt8}, offset, FieldSizeT) +function VXR(source::Vector{UInt8}, offset, ::Type{FieldSizeT}) where {FieldSizeT} pos = check_record_type(6, source, offset, FieldSizeT) vxr_next, pos = read_be_i(source, pos, FieldSizeT) n_entries, pos = read_be_i(source, pos, Int32) diff --git a/test/Project.toml b/test/Project.toml index 4d88f49..2b76fbb 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,5 +4,6 @@ CommonDataFormat = "a9737db6-c05c-4e48-868b-6bc41491d9d9" CommonDataModel = "1fbeeb36-5f17-413c-809b-666fb144f157" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" +JuliaC = "acedd4c2-ced6-4a15-accc-2607eb759ba2" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/cdf_trim_probe.jl b/test/cdf_trim_probe.jl new file mode 100644 index 0000000..7bf6323 --- /dev/null +++ b/test/cdf_trim_probe.jl @@ -0,0 +1,17 @@ +using CommonDataFormat +import CommonDataFormat as CDF + +function (@main)(args::Vector{String}) + ds = CDFDataset(args[1]) + ok = ds.version == (3, 9, 0) && + ds.majority == CDF.Row && + ds.compression == CDF.NoCompression && + length(keys(ds)) > 0 + # Typed data reads: z-variable (VDR) 1D + 2D with row-major swap. + v = read!(ds, "var", Vector{Float64}(undef, 101)) + ok &= v[1] == 1.0 + m = read!(ds, "var2d_counter", Matrix{Float64}(undef, 10, 10)) + ok &= m[1, 1] == 0.0 && m[2, 1] == 1.0 && m[10, 10] == 99.0 + ok &= read(ds, "var2d_counter", Matrix{Float64}) == m + return ok ? 
0 : 1 +end diff --git a/test/runtests.jl b/test/runtests.jl index 010ac03..fb1bb75 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,9 +14,18 @@ include("staticstring.jl") Aqua.test_all(CommonDataFormat) end +const RUN_JET_TESTS = isempty(VERSION.prerelease) + @testset "JET" begin - using JET - JET.test_package(CommonDataFormat; target_modules = [CommonDataFormat]) + if RUN_JET_TESTS + using Pkg; Pkg.add("JET"); Pkg.instantiate() + using JET + JET.test_package(CommonDataFormat; target_modules = [CommonDataFormat]) + end +end + +@testset "Trim" begin + include("trim.jl") end @testset "Fill Value" begin @@ -38,6 +47,30 @@ end display(ds["var"]) end +@testset "read! typed entry point" begin + ds = CDFDataset(data_path("a_cdf.cdf")) + v = read!(ds, "var", Vector{Float64}(undef, 101)) + @test v == ds["var"][:] + m = read!(ds, "var2d_counter", Matrix{Float64}(undef, 10, 10)) + @test m == reshape(0.0:99.0, 10, 10) + @test_throws KeyError read!(ds, "nonexistent", zeros(1)) + @test_throws ArgumentError read!(ds, "var", zeros(Float32, 101)) + @test_throws DimensionMismatch read!(ds, "var", zeros(100)) + @test_throws DimensionMismatch read!(ds, "var", zeros(101, 1)) + + @test read(ds, "var", Vector{Float64}) == v + @test read(ds, "var2d_counter", Matrix{Float64}) == m + @test_throws KeyError read(ds, "nonexistent", Vector{Float64}) + @test_throws ArgumentError read(ds, "var", Vector{Float32}) + @test_throws DimensionMismatch read(ds, "var", Matrix{Float64}) + + rds = CDFDataset(data_path("ac_h0_mfi_20230102_v07.cdf")) + rvar = rds["BGSEc"] + b = read!(rds, "BGSEc", Matrix{eltype(rvar)}(undef, size(rvar))) + @test b == rvar[:, :] + @test read(rds, "BGSEc", Matrix{eltype(rvar)}) == b +end + @testset "Compressed cdf file (gzip)" begin compressed = CDFDataset(data_path("a_compressed_cdf.cdf")) reference = CDFDataset(data_path("a_cdf.cdf")) diff --git a/test/trim.jl b/test/trim.jl new file mode 100644 index 0000000..72614c0 --- /dev/null +++ b/test/trim.jl @@ -0,0 +1,72 
@@ +# `juliac --trim` compatibility test. +# +# Builds a small static executable that opens a CDF and reads its metadata, then runs it. +# This guards the "open + inspect" path against regressions in static reachability: a +# stray dynamic dispatch (e.g. a record-size type passed as a value instead of `::Type{T}`, +# or `getproperty` dragging in the heterogeneous attribute machinery) fails the verifier. +# +# Data reading is covered through the typed entry point `read!(ds, name, dest)`, where +# eltype/ndims come from `dest` instead of the file. The generic `ds[name]` path stays +# out of scope: its element type is only known at runtime, so it cannot be statically +# resolved. + +using Test + +const TRIM_SUPPORTED = VERSION >= v"1.12.0-rc1" +const JULIAC_ENTRYPOINT_EXPR = "using JuliaC; if isdefined(JuliaC, :main); JuliaC.main(ARGS); else JuliaC._main_cli(ARGS); end" + +function trim_project_path() + active_project = Base.active_project() + active_project !== nothing && isfile(active_project) && return dirname(active_project) + return normpath(joinpath(@__DIR__, "..")) +end + +function run_and_capture(cmd::Cmd) + mktemp() do path, io + exit_code = try + run(pipeline(ignorestatus(cmd), stdout = io, stderr = io)).exitcode + catch + -1 + end + close(io) + return exit_code, read(path, String) + end +end + +function trim_verify_totals(output) + m = match(r"Trim verify finished with\s+(\d+)\s+errors,\s+(\d+)\s+warnings\.", output) + m !== nothing && return parse(Int, m.captures[1]), parse(Int, m.captures[2]) + errors = length(collect(eachmatch(r"Verifier error #\d+:", output))) + warnings = length(collect(eachmatch(r"Verifier warning #\d+:", output))) + return errors, warnings +end + +if !TRIM_SUPPORTED + @info "JuliaC trim compilation unavailable before Julia 1.12; skipping --trim build test" + @test_skip false +else + mktempdir() do dir + juliac_project = trim_project_path() + app_project = pkgdir(CommonDataFormat) + data_file = data_path("a_cdf.cdf") + probe = 
joinpath(@__DIR__, "cdf_trim_probe.jl") + exe = "cdf_trim_probe" + + julia = joinpath(Sys.BINDIR, Base.julia_exename()) + build = `$julia --startup-file=no --history-file=no --code-coverage=none --project=$juliac_project -e $JULIAC_ENTRYPOINT_EXPR -- --output-exe $exe --project=$app_project --experimental --trim=safe $probe` + build_exit, build_output = run_and_capture(Cmd(build; dir = dir)) + trim_errors, trim_warnings = trim_verify_totals(build_output) + (build_exit == 0 && trim_errors == 0 && trim_warnings == 0) || print(build_output) + @test build_exit == 0 + @test trim_errors == 0 + @test trim_warnings == 0 + + if build_exit == 0 + exe_path = Sys.iswindows() ? joinpath(dir, "$exe.exe") : joinpath(dir, exe) + run_cmd = Cmd(`$exe_path $data_file`; dir = dir) + run_exit, run_output = run_and_capture(run_cmd) + run_exit == 0 || print(run_output) + @test run_exit == 0 + end + end +end