diff --git a/src/dataset.jl b/src/dataset.jl index dd541f1..da9a9d4 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -74,7 +74,7 @@ function _vdr_at(cdf::CDFDataset{FST}, offset::Int) where {FST} record_type = read_be(buffer, offset + 1 + sizeof(FST), Int32) @assert record_type in (8, 3) return record_type == 8 ? VDR{FST}(buffer, offset) : - rVDR{FST}(buffer, offset, GDR(cdf)) + rVDR{FST}(buffer, offset) end function find_vdr(cdf::CDFDataset, var_name::String) diff --git a/src/loading/variable.jl b/src/loading/variable.jl index 1cde6d1..c412ad8 100644 --- a/src/loading/variable.jl +++ b/src/loading/variable.jl @@ -45,7 +45,7 @@ end # Branch over dimension count so each leaf builds dims tuple at compile time statically function _variable(cdf, name, vdr) - M = num_record_dims(vdr) + M = num_record_dims(vdr, cdf) return Base.Cartesian.@nif 12 d -> (M == d - 1) d -> ( d == 12 ? throw(ArgumentError("variable has $M dimensions; the CDF format allows at most 10")) : _variable(cdf, name, vdr, Val(d - 1)) @@ -53,7 +53,7 @@ function _variable(cdf, name, vdr) end function _variable(cdf, name, vdr, ::Val{M}) where {M} - dims = (map(Int, record_sizes(vdr, Val(M)))..., Int(vdr.max_rec) + 1) + dims = (map(Int, record_sizes(vdr, cdf, Val(M)))..., Int(vdr.max_rec) + 1) code = Int(vdr.data_type) if code == 51 || code == 52 # CHAR/UCHAR: eltype depends on runtime num_elems T = StaticString{Int(vdr.num_elems), UInt8} @@ -91,7 +91,7 @@ freshly allocated `Array{T, N}`. function Base.read(ds::CDFDataset, name::String, ::Type{Array{T, N}}) where {T, N} vdr = find_vdr(ds, name) isnothing(vdr) && throw(KeyError(name)) - dims = (map(Int, record_sizes(vdr, Val(N - 1)))..., Int(vdr.max_rec) + 1) + dims = (map(Int, record_sizes(vdr, ds, Val(N - 1)))..., Int(vdr.max_rec) + 1) return _read_full!(Array{T, N}(undef, dims), ds, name, vdr) end @@ -99,7 +99,7 @@ function _read_full!(dest::AbstractArray{T, N}, ds, name, vdr) where {T, N} Base.require_one_based_indexing(dest) Tfile = julia_type(vdr.data_type, vdr.num_elems) T === Tfile || throw(ArgumentError("element type mismatch for \"$name\": file has $Tfile, destination has $T")) - dims = (map(Int, record_sizes(vdr, Val(N - 1)))..., Int(vdr.max_rec) + 1) + dims = (map(Int, record_sizes(vdr, ds, Val(N - 1)))..., Int(vdr.max_rec) + 1) size(dest) == dims || throw(DimensionMismatch("variable \"$name\" has size $dims, destination has size $(size(dest))")) var = CDFVariable{T, N, typeof(vdr), typeof(ds)}(name, vdr, ds, dims) DiskArrays.readblock!(var, dest, axes(dest)...) @@ -115,10 +115,10 @@ function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, r buffer = parent(var.parentdataset) RecordSizeType = recordsize_type(var.parentdataset) - entries, vvr_type = read_vvrs(var.vdr) + entries, vvr_type = read_vvrs(var.vdr, var.parentdataset) isempty(entries) && return dest compression = if !isempty(entries) # # vvr records is the ultimative source - vvr_type == VVR_ ? NoCompression : variable_compression(var.vdr) + vvr_type == VVR_ ? NoCompression : variable_compression(var.vdr, var.parentdataset) else NoCompression end @@ -220,10 +220,10 @@ function collect_vxr_entries!(entries::Vector{VVREntry}, src, offset, ::Type{Fie return vvr_type end -function variable_compression(vdr::AbstractVDR{FieldSizeT}) where {FieldSizeT} +function variable_compression(vdr::AbstractVDR{FieldSizeT}, cdf) where {FieldSizeT} offset_value = Int(vdr.cpr_or_spr_offset) if is_compressed(vdr) && offset_value != 0 - buffer = vdr.buffer + buffer = parent(cdf) cpr = CPR(buffer, offset_value, FieldSizeT) return CompressionType(cpr.compression_type) end diff --git a/src/parsing.jl b/src/parsing.jl index e7584e6..4cecf6d 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -35,26 +35,40 @@ end return ntuple(j -> read_be(v, i + (j - 1) * S, T), n), i + n * S end -# Optimized version using loop unrolling -@generated function read_be_fields(buffer::Vector{UInt8}, pos::Integer, ::Type{SType}, ::Val{indxs}) where {SType, indxs} - exprs = Expr[] - value_syms = [gensym(:field) for _ in 1:length(indxs)] - pos_sym = gensym(:pos) - - push!(exprs, :(local $pos_sym = pos)) - - for (i, idx) in enumerate(indxs) - T = fieldtype(SType, idx) - push!(exprs, :(local $(value_syms[i]) = read_be(buffer, $pos_sym, $T))) - push!(exprs, :($pos_sym += _sizeof($T))) +function field_layout(SType, indxs) + offsets = Int[] + types = Any[] + pos = 0 + for idx in indxs + push!(offsets, pos) + push!(types, fieldtype(SType, idx)) + pos += _sizeof(types[end]) end + return offsets, types, pos +end - tuple_expr = Expr(:tuple, value_syms...) - push!(exprs, :(($tuple_expr, $pos_sym))) +@generated function read_be_fields(buffer::Vector{UInt8}, pos::Integer, ::Type{SType}, ::Val{indxs}) where {SType, indxs} + offsets, types, total = field_layout(SType, indxs) + values = [:(read_be(buffer, pos + $(offsets[i]), $(types[i]))) for i in eachindex(offsets)] + return :(($(Expr(:tuple, values...)), pos + $total)) +end - return Expr(:block, exprs...) +@generated function write_be_fields(buffer::Vector{UInt8}, pos::Integer, ::Type{SType}, ::Val{indxs}, values::Tuple) where {SType, indxs} + offsets, types, total = field_layout(SType, indxs) + exprs = [ + :(write_be(buffer, pos + $(offsets[i]), convert($(types[i]), values[$i]))) + for i in eachindex(offsets) + ] + return Expr(:block, exprs..., :(pos + $total)) end +@inline function write_be(v::Vector{UInt8}, i, x) + GC.@preserve v unsafe_store!(convert(Ptr{typeof(x)}, pointer(v, i)), hton(x)) + return i + sizeof(x) +end + +@inline write_be(v::Vector{UInt8}, i, ::RInt32) = i + _sizeof(RInt32) + function flatten_field_types(mod, args) types = Any[] for arg in args diff --git a/src/records/gdr.jl b/src/records/gdr.jl index 4f7cdf4..af179e4 100644 --- a/src/records/gdr.jl +++ b/src/records/gdr.jl @@ -27,10 +27,3 @@ end fields, pos = read_be_fields(buffer, pos, GDR{FST}, Val(1:9)) return GDR(fields..., pos) end - -function r_dim_sizes(gdr::GDR, buffer::Vector{UInt8}) - pos = gdr.pos + sizeof(Int64) + sizeof(Int32) + sizeof(Int32) + sizeof(Int32) - r_num_dims = gdr.r_num_dims - @assert r_num_dims >= 0 - return read_be(buffer, pos, r_num_dims, Int32) -end diff --git a/src/records/vdr.jl b/src/records/vdr.jl index 2a5f894..4513fcf 100644 --- a/src/records/vdr.jl +++ b/src/records/vdr.jl @@ -19,8 +19,6 @@ struct rVDR{FST} <: AbstractVDR{FST} # blocking_factor::Int32 # name::S # Variable name pos::Int - buffer::Vector{UInt8} - gdr::GDR{FST} end """ @@ -45,7 +43,6 @@ struct VDR{FST} <: AbstractVDR{FST} # name::S # Variable name num_dims::Int32 # Number of dimensions pos::Int - buffer::Vector{UInt8} # z_dim_sizes::Tuple{Vararg{Int32}} # Dimension sizes (z-variables only) # dim_varys::Tuple{Vararg{Int32}} # Dimension variance flags end @@ -62,46 +59,34 @@ Load a z-Variable Descriptor Record from the buffer at the specified offset. pos = FieldSizeT == Int64 ? offset + 340 + 1 : offset + 128 + 1 z_num_dims, pos = read_be_i(buffer, pos, Int32) - return VDR{FieldSizeT}(fields..., z_num_dims, pos, buffer) + return VDR{FieldSizeT}(fields..., z_num_dims, pos) end """ - rVDR{FieldSizeT}(buffer, offset, gdr) + rVDR{FieldSizeT}(buffer, offset) Load an r-Variable Descriptor Record from the buffer at the specified offset. """ -@inline function rVDR{FieldSizeT}(buffer::Vector{UInt8}, offset, gdr) where {FieldSizeT} +@inline function rVDR{FieldSizeT}(buffer::Vector{UInt8}, offset) where {FieldSizeT} pos = check_record_type(3, buffer, offset, FieldSizeT) fields, pos = read_be_fields(buffer, pos, rVDR{FieldSizeT}, Val(1:13)) pos = FieldSizeT == Int64 ? offset + 340 + 1 : offset + 128 + 1 - return rVDR{FieldSizeT}(fields..., pos, buffer, gdr) + return rVDR{FieldSizeT}(fields..., pos) end -@inline function record_sizes(vdr::rVDR) - gdr = vdr.gdr - buffer = vdr.buffer - dim_varys = collect(read_be(buffer, vdr.pos, gdr.r_num_dims, Int32)) - return r_dim_sizes(gdr, buffer)[dim_varys .!= 0] -end - -@inline function record_sizes(vdr::VDR) - return read_be(vdr.buffer, vdr.pos, vdr.num_dims, Int32) -end - # Static-arity variants for the typed `read!` path: the caller supplies the dimension -# count via `Val`, so tuple lengths stay inferable under `juliac --trim`. They avoid the -# runtime-length tuples (and `collect`/logical indexing for rVDR) of the methods above. -function record_sizes(vdr::VDR, ::Val{M}) where {M} +# count via `Val`, so tuple lengths stay inferable under `juliac --trim`. +function record_sizes(vdr::VDR, cdf, ::Val{M}) where {M} vdr.num_dims == M || throw(DimensionMismatch("variable has $(vdr.num_dims) dimensions, expected $M")) - return read_be(vdr.buffer, vdr.pos, Val(M), Int32) + return read_be(parent(cdf), vdr.pos, Val(M), Int32) end -function record_sizes(vdr::rVDR, ::Val{M}) where {M} - gdr = vdr.gdr - buf = vdr.buffer - sizes_pos = gdr.pos + sizeof(Int64) + 3 * sizeof(Int32) # mirrors `r_dim_sizes` +function record_sizes(vdr::rVDR, cdf, ::Val{M}) where {M} + gdr = GDR(cdf) + buf = parent(cdf) + sizes_pos = gdr.pos + sizeof(Int64) + 3 * sizeof(Int32) sizes = zeros(Int32, M) count = 0 for i in 1:Int(gdr.r_num_dims) @@ -113,24 +98,18 @@ function record_sizes(vdr::rVDR, ::Val{M}) where {M} return ntuple(i -> sizes[i], Val(M)) end -num_record_dims(vdr::VDR) = Int(vdr.num_dims) -function num_record_dims(vdr::rVDR) +num_record_dims(vdr::VDR, cdf) = Int(vdr.num_dims) +function num_record_dims(vdr::rVDR, cdf) n = 0 - for i in 1:Int(vdr.gdr.r_num_dims) - n += read_be(vdr.buffer, vdr.pos + (i - 1) * 4, Int32) != 0 + for i in 1:Int(GDR(cdf).r_num_dims) + n += read_be(parent(cdf), vdr.pos + (i - 1) * 4, Int32) != 0 end return n end -function Base.size(vdr::AbstractVDR) - records = vdr.max_rec + 1 - dims = (record_sizes(vdr)..., records) - return Int.(dims) -end - function Base.show(io::IO, vdr::AbstractVDR) - print(io, "VDR: ", Base.dims2string(size(vdr)), " (", CDFDataType(vdr.data_type), ")") + print(io, "VDR: ", CDFDataType(vdr.data_type)) is_nrv(vdr) && print(io, " [NRV]") is_compressed(vdr) && print(io, " [compressed]") return @@ -141,10 +120,10 @@ end # 1 Whether or not a pad value is specified for this variable. Set indicates that a pad value has been specified. Clear indicates that a pad value has not been specified. The PadValue field described below is only present if a pad value has been specified. # 2 Whether or not a compression method might be applied to this variable data. Set indicates that a compression is chosen by the user and the data might be compressed, depending on the data size and content. If the compressed data becomes larger than its uncompressed data, no compression is applied and the data are stored as uncompressed, even the compression bit is set. The compressed data is stored in Compressed Variable Value Record (CVVR) while uncompressed data go into Variable Value Record (VVR). Clear indicates that a compression will not be used. The CPRorSPRoffset field provides the offset of the Compressed Parameters Record if this compression bit is set and the compression used. -function read_vvrs(vdr::AbstractVDR{FieldSizeT}) where {FieldSizeT} +function read_vvrs(vdr::AbstractVDR{FieldSizeT}, cdf) where {FieldSizeT} vxr_head = vdr.vxr_head entries = Vector{VVREntry}() - src = vdr.buffer + src = parent(cdf) sizehint!(entries, 1) vvr_type = collect_vxr_entries!(entries, src, Int(vxr_head), FieldSizeT) vvr_type = @something vvr_type VVR_ diff --git a/src/variable.jl b/src/variable.jl index d79dd03..bb6f05a 100644 --- a/src/variable.jl +++ b/src/variable.jl @@ -46,7 +46,7 @@ function _eachchunk(var::CDFVariable) end function _eachchunk_vvrs(var::CDFVariable) - vvrs, _ = read_vvrs(var.vdr) + vvrs, _ = read_vvrs(var.vdr, var.parentdataset) N = ndims(var) chunks = ntuple(N) do i if i != N diff --git a/test/parsing_test.jl b/test/parsing_test.jl new file mode 100644 index 0000000..6ecc932 --- /dev/null +++ b/test/parsing_test.jl @@ -0,0 +1,31 @@ +using CommonDataFormat: read_be_fields, write_be_fields, field_layout, RInt32, CDR, GDR + +@testset "write_be_fields round-trip" begin + buf = zeros(UInt8, 64) + vals = ( + Int64(0x1122334455667788), Int32(3), Int32(9), Int32(2), Int32(15), + RInt32(), RInt32(), Int32(0), Int32(1), + ) + endw = write_be_fields(buf, 1, CDR{Int64}, Val(1:9), vals) + fields, endr = read_be_fields(buf, 1, CDR{Int64}, Val(1:9)) + @test endw == endr + @test fields == vals + + # read fields from a real file, re-emit, compare bytes (reserved fields are + # skipped by the writer, so mask them out using the schema itself) + ds = CDFDataset(data_path("a_cdf.cdf")) + buffer = parent(ds) + for SType in (CDR{Int64}, GDR{Int64}) + offset = SType <: CDR ? 8 : Int(ds.cdr.gdr_offset) + pos = offset + 1 + 8 + 4 # past record size + record type + fields, endpos = read_be_fields(buffer, pos, SType, Val(1:9)) + out = zeros(UInt8, endpos - pos) + @test write_be_fields(out, 1, SType, Val(1:9), fields) == length(out) + 1 + offsets, types, total = field_layout(SType, 1:9) + mask = trues(total) + for (o, T) in zip(offsets, types) + T == RInt32 && (mask[(o + 1):(o + 4)] .= false) + end + @test out[mask] == buffer[pos:(endpos - 1)][mask] + end +end diff --git a/test/runtests.jl b/test/runtests.jl index a3593c3..a9ae543 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,6 +8,7 @@ include("comprehensive_test.jl") include("cdf2_test.jl") include("CommonDataModelExt_test.jl") include("decompress_test.jl") +include("parsing_test.jl") @testset "StaticString" begin include("staticstring.jl") end