diff --git a/README.md b/README.md index 3450e8b..03ce1ff 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Build Status](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl) -A Julia package for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for high-level interfaces. +A pure Julia implementation for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for high-level interfaces. ## Quick Start @@ -32,7 +32,6 @@ var = cdf["temperature"] ## Features -- **Pure Julia implementation** - No external dependencies on CDF libraries - **Efficient data access** - Memory-mapped access for data and attributes, super fast decompression using [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl) - **[DiskArrays.jl](https://github.com/JuliaIO/DiskArrays.jl) integration** - Lazy representation of data on hard disk with AbstractDiskArray interface diff --git a/test/perf_test_ntoh.jl b/benchmark/perf_test_ntoh.jl similarity index 100% rename from test/perf_test_ntoh.jl rename to benchmark/perf_test_ntoh.jl diff --git a/justfile b/justfile index f5d993d..f73fbff 100644 --- a/justfile +++ b/justfile @@ -8,10 +8,3 @@ perf: @time var2 = ds["elb_pef_hs_epa_spec"] @time Array(var2) @time Array(ds["elb_pef_fs_time"]) - -snoop: - #!/usr/bin/env -S julia --threads=auto --project=. -i - using SnoopCompileCore - invs = @snoop_invalidations using CommonDataFormat - using SnoopCompile, AbstractTrees - trees = invalidation_trees(invs) \ No newline at end of file diff --git a/src/enums.jl b/src/enums.jl index a8b1771..e46207c 100644 --- a/src/enums.jl +++ b/src/enums.jl @@ -45,24 +45,18 @@ Base.:(==)(x::T, y::RecordType) where {T <: Integer} = x == T(y) Base.:(==)(x::DataType, y::T) where {T <: Integer} = T(x) == y Base.:(==)(x::T, y::DataType) where {T <: Integer} = x == T(y) -const type_map = Dict( - CDF_INT1 => Int8, - CDF_INT2 => Int16, - CDF_INT4 => Int32, - CDF_INT8 => Int64, - CDF_UINT1 => UInt8, - CDF_UINT2 => UInt16, - CDF_UINT4 => UInt32, - CDF_REAL4 => Float32, - CDF_REAL8 => Float64, - CDF_BYTE => Int8, - CDF_FLOAT => Float32, - CDF_DOUBLE => Float64, +# code → eltype for fixed-size types ordered roughly by frequency +const CODE_TYPE_PAIRS = ( + (21, Float32), (22, Float64), (44, Float32), (45, Float64), + (33, TT2000), (31, Epoch), (32, Epoch16), + (1, Int8), (2, Int16), (4, Int32), (8, Int64), + (11, UInt8), (12, UInt16), (14, UInt32), (41, Int8), +) + +const type_map = Dict{DataType, Type}( + Dict(DataType(c) => T for (c, T) in CODE_TYPE_PAIRS)..., CDF_CHAR => UInt8, CDF_UCHAR => UInt8, - CDF_EPOCH => Epoch, - CDF_EPOCH16 => Epoch16, - CDF_TIME_TT2000 => TT2000 ) function julia_type(cdf_type, num_elems) diff --git a/src/loading/variable.jl b/src/loading/variable.jl index e362dea..2c9bb1d 100644 --- a/src/loading/variable.jl +++ b/src/loading/variable.jl @@ -40,22 +40,41 @@ end function variable(cdf::CDFDataset, name) vdr = find_vdr(cdf, name) isnothing(vdr) && throw(KeyError(name)) - T = julia_type(vdr.data_type, vdr.num_elems) - dims = (record_sizes(vdr)..., vdr.max_rec + 1) - N = vdr isa VDR ? vdr.num_dims + 1 : length(dims) - return CDFVariable{T, N, typeof(vdr), typeof(cdf)}( - name, vdr, cdf, dims + return _variable(cdf, name, vdr) +end + +# Branch over dimension count so each leaf builds dims tuple at compile time statically +function _variable(cdf, name, vdr) + M = num_record_dims(vdr) + return Base.Cartesian.@nif 12 d -> (M == d - 1) d -> ( + d == 12 ? throw(ArgumentError("variable has $M dimensions; the CDF format allows at most 10")) : + _variable(cdf, name, vdr, Val(d - 1)) + ) +end + +function _variable(cdf, name, vdr, ::Val{M}) where {M} + dims = (map(Int, record_sizes(vdr, Val(M)))..., Int(vdr.max_rec) + 1) + code = Int(vdr.data_type) + if code == 51 || code == 52 # CHAR/UCHAR: eltype depends on runtime num_elems + T = StaticString{Int(vdr.num_elems), UInt8} + return CDFVariable{T, M + 1, typeof(vdr), typeof(cdf)}(name, vdr, cdf, dims) + end + # Branch to static constructor per element type + return Base.Cartesian.@nif( + 16, + d -> code == CODE_TYPE_PAIRS[d][1], + d -> _construct(cdf, name, vdr, dims, CODE_TYPE_PAIRS[d][2]), + d -> throw(ArgumentError("unsupported CDF data type $code")) ) end +@inline _construct(cdf, name, vdr, dims::NTuple{N, Int}, ::Type{T}) where {N, T} = + CDFVariable{T, N, typeof(vdr), typeof(cdf)}(name, vdr, cdf, dims) + """ read!(ds::CDFDataset, name, dest::AbstractArray{T, N}) -> dest Read the full contents of variable `name` into the preallocated `dest`. - -Statically-typed entry point: `T` and `N` come from `dest` instead of the file, so — -unlike `ds[name]`, whose type is only known at runtime — the call chain is resolvable -at compile time and survives `juliac --trim`. """ function Base.read!(ds::CDFDataset, name::String, dest::AbstractArray{T, N}) where {T, N} vdr = find_vdr(ds, name) diff --git a/src/records/vdr.jl b/src/records/vdr.jl index 05dfdfc..3738fe7 100644 --- a/src/records/vdr.jl +++ b/src/records/vdr.jl @@ -113,6 +113,16 @@ function record_sizes(vdr::rVDR, ::Val{M}) where {M} return ntuple(i -> sizes[i], Val(M)) end +num_record_dims(vdr::VDR) = Int(vdr.num_dims) +function num_record_dims(vdr::rVDR) + n = 0 + for i in 1:Int(vdr.gdr.r_num_dims) + n += read_be(vdr.buffer, vdr.pos + (i - 1) * 4, Int32) != 0 + end + return n +end + + function Base.size(vdr::AbstractVDR) records = vdr.max_rec + 1 dims = (record_sizes(vdr)..., records) diff --git a/src/staticstring.jl b/src/staticstring.jl index 7cc5b7d..9bf8d72 100644 --- a/src/staticstring.jl +++ b/src/staticstring.jl @@ -1,6 +1,5 @@ # https://github.com/mkitti/StaticStrings.jl # https://github.com/JuliaPy/PythonCall.jl/blob/main/src/Utils/Utils.jl -using Base: between struct StaticString{N, T} <: AbstractString codeunits::NTuple{N, T} @@ -8,12 +7,10 @@ struct StaticString{N, T} <: AbstractString end function Base.iterate(x::StaticString{N, UInt8}, i::Int = 1) where {N} - i > N && return + i > ncodeunits(x) && return cs = x.codeunits c = @inbounds cs[i] - if all(iszero, (cs[j] for j in i:N)) - return - elseif (c & 0x80) == 0x00 + if (c & 0x80) == 0x00 return (reinterpret(Char, UInt32(c) << 24), i + 1) elseif (c & 0x40) == 0x00 nothing @@ -56,12 +53,25 @@ function Base.iterate(x::StaticString{N, UInt8}, i::Int = 1) where {N} throw(StringIndexError(x, i)) end -function Base.String(x::StaticString{N, T}) where {N, T} - b = Base.StringVector(N) - return String(b .= x.codeunits) +function Base.String(x::StaticString) + n = ncodeunits(x) + b = Base.StringVector(n) + @inbounds for i in 1:n + b[i] = x.codeunits[i] + end + return String(b) end -@inline Base.ncodeunits(::StaticString{N}) where {N} = N +# CDF CHAR values are fixed-width null-padded; the string ends at the trailing-null run +# so length/collect/String agree with iterate truncating there. +@inline function Base.ncodeunits(s::StaticString{N}) where {N} + cs = s.codeunits + n = N + while n > 0 && iszero(@inbounds cs[n]) + n -= 1 + end + return n +end Base.codeunit(::StaticString{N, T}) where {N, T} = T Base.@propagate_inbounds Base.codeunit(s::StaticString, i::Int) = s.codeunits[i] @@ -71,23 +81,4 @@ function StaticString(cu::Base.CodeUnits{T}) where {T} end StaticString(s::AbstractString) = StaticString(codeunits(s)) -Base.isvalid(s::StaticString, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i -Base.thisind(s::StaticString, i::Int) = _thisind_str(s, i) - -@inline function _thisind_str(s, i::Int) - i == 0 && return 0 - n = ncodeunits(s) - i == n + 1 && return i - @boundscheck between(i, 1, n) || throw(BoundsError(s, i)) - @inbounds b = codeunit(s, i) - (b & 0xc0 == 0x80) & (i - 1 > 0) || return i - @inbounds b = codeunit(s, i - 1) - between(b, 0b11000000, 0b11110111) && return i - 1 - (b & 0xc0 == 0x80) & (i - 2 > 0) || return i - @inbounds b = codeunit(s, i - 2) - between(b, 0b11100000, 0b11110111) && return i - 2 - (b & 0xc0 == 0x80) & (i - 3 > 0) || return i - @inbounds b = codeunit(s, i - 3) - between(b, 0b11110000, 0b11110111) && return i - 3 - return i -end +Base.isvalid(s::StaticString, i::Int) = checkbounds(Bool, s, i) && Base._thisind_str(s, i) == i diff --git a/test/debug.jl b/test/debug.jl deleted file mode 100644 index cc7e686..0000000 --- a/test/debug.jl +++ /dev/null @@ -1,3 +0,0 @@ -using CommonDataFormat -ds=CDFDataset("data/mms1_scm_srvy_l2_scsrvy_20190301_v2.2.0.cdf") -ds["mms1_scm_acb_gse_scsrvy_srvy_l2"] \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index fb1bb75..88ab575 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,7 +7,9 @@ include("epochs_test.jl") include("comprehensive_test.jl") include("cdf2_test.jl") include("CommonDataModelExt_test.jl") -include("staticstring.jl") +@testset "StaticString" begin + include("staticstring.jl") +end @testset "Aqua" begin using Aqua diff --git a/test/staticstring.jl b/test/staticstring.jl index 20695a3..5989110 100644 --- a/test/staticstring.jl +++ b/test/staticstring.jl @@ -1,11 +1,11 @@ using CommonDataFormat: StaticString using Test -@testset "StaticString" begin +@testset "Basic operations" begin s = "Hello, World!" ss = StaticString(s) - StaticString(codeunits(s)) + @test StaticString(codeunits(s)) == s @test ss == s @test String(ss) == s @test !isempty(ss) @@ -15,3 +15,21 @@ using Test @test codeunit(ss) == UInt8 end + +@testset "Null padding and UTF-8" begin + # null padding: iterate/length/collect/String must agree + pad = StaticString{8, UInt8}((UInt8('a'), UInt8('b'), zeros(UInt8, 6)...)) + @test ncodeunits(pad) == 2 + @test length(pad) == 2 + @test collect(pad) == ['a', 'b'] + @test String(pad) == "ab" + @test pad == "ab" + @test isempty(StaticString{4, UInt8}(ntuple(_ -> 0x00, 4))) + + # multi-byte UTF-8 indexing + s = StaticString("héllo") + @test collect(s) == collect("héllo") + @test thisind(s, 3) == 2 + @test isvalid(s, 2) && !isvalid(s, 3) + @test length(s) == 5 && ncodeunits(s) == 6 +end