Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,26 +1,18 @@
# CommonDataFormat.jl

[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliaspacephysics.github.io/CommonDataFormat.jl/dev/)
[![DOI](https://zenodo.org/badge/1057373325.svg)](https://doi.org/10.5281/zenodo.17517061)
[![version](https://juliahub.com/docs/General/CommonDataFormat/stable/version.svg)](https://juliahub.com/ui/Packages/General/CommonDataFormat)

[![Build Status](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml?query=branch%3Amain)
[![Coverage](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl)

A Julia package for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for a high-level interface.

**Installation**: at the Julia REPL, run `using Pkg; Pkg.add("CommonDataFormat")`

**Documentation**: [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliaspacephysics.github.io/CommonDataFormat.jl/dev/)

## Features

- **Pure Julia implementation** - No external dependencies on CDF libraries
- **Efficient data access** - Memory-mapped access for data and attributes, super fast decompression using [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl)
- **[DiskArrays.jl](https://github.com/JuliaIO/DiskArrays.jl) integration** - Lazy representation of data on hard disk with AbstractDiskArray interface
A Julia package for reading [Common Data Format (CDF)](https://cdf.gsfc.nasa.gov/) files, widely used in space physics for storing multidimensional data arrays and metadata. See [CDFDatasets.jl](https://github.com/JuliaSpacePhysics/CDFDatasets.jl) for high-level interfaces.

## Quick Start

```julia
using Pkg; Pkg.add("CommonDataFormat")
using CommonDataFormat

# Load a CDF file
Expand All @@ -38,6 +30,13 @@ println("Variables: ", keys(cdf))
var = cdf["temperature"]
```

## Features

- **Pure Julia implementation** - No external dependencies on CDF libraries
- **Efficient data access** - Memory-mapped access for data and attributes, super fast decompression using [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl)
- **[DiskArrays.jl](https://github.com/JuliaIO/DiskArrays.jl) integration** - Lazy, on-disk representation of data through the `AbstractDiskArray` interface


## Elsewhere

- [CDFpp](https://github.com/SciQLop/CDFpp): A modern, header-only C++ CDF library with Python bindings
Expand Down
2 changes: 0 additions & 2 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ using Downloads

const SUITE = BenchmarkGroup()

# elb file is committed to the repo, so it ships with the package at any rev
const ELX_FILE = joinpath(pkgdir(CommonDataFormat), "data", "elb_l2_epdef_20210914_v01.cdf")

# mms file (50 MB) is gitignored; download once and cache across revisions
function download_data(url, filename = basename(url))
dir = joinpath(tempdir(), "CommonDataFormat_benchmark_data")
mkpath(dir)
Expand Down
58 changes: 29 additions & 29 deletions src/dataset.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
struct CDFDataset{CT, FST}
# In-memory handle for one opened CDF file.
# `FST` is the integer type used for the file's size/offset fields; the loader selects
# `Int64` when the magic number identifies a CDF v3 file and `Int32` otherwise.
struct CDFDataset{FST}
# Path the dataset was opened from (stored as a `String`).
filename::String
# `CDR` record parsed from `buffer` at offset 8.
cdr::CDR{FST}
# `GDR` record parsed from `buffer` at the offset given by `cdr.gdr_offset`.
gdr::GDR{FST}
# Raw file bytes: the memory-mapped file, or the decompressed bytes when the file is compressed.
buffer::Vector{UInt8}
# Compression reported for the file; `NoCompression` when the file is not compressed.
compression::CompressionType
end

Base.parent(cdf::CDFDataset) = getfield(cdf, :buffer)
GDR(cdf::CDFDataset) = getfield(cdf, :gdr)
filename(cdf::CDFDataset) = getfield(cdf, :filename)
recordsize_type(::CDFDataset{CT, RS}) where {CT, RS} = RS
recordsize_type(::CDFDataset{RS}) where {RS} = RS

"""
CDFDataset(filename)
Expand All @@ -22,21 +23,27 @@ cdf = CDFDataset("data.cdf")
"""
function CDFDataset(filename)
fname = String(filename)
return open(fname, "r") do io
# Avoid the `open(f, name, mode) do … end` form: it routes through varargs splatting (`_apply_iterate`), which `juliac --trim` can't resolve.
io = open(fname, "r")
try
buffer = Mmap.mmap(io)
magic_bytes = read_be(buffer, 1, UInt32)
@assert validate_cdf_magic(magic_bytes)
return is_cdf_v3(magic_bytes) ? _load_dataset(fname, buffer, Int64) :
_load_dataset(fname, buffer, Int32)
finally
close(io)
end
end

FieldSizeType = is_cdf_v3(magic_bytes) ? Int64 : Int32
compression = NoCompression
if is_compressed(read_be(buffer, 5, UInt32))
buffer, compression = decompress_bytes(buffer, FieldSizeType)
end
# Parse CDF header
cdr = CDR(buffer, 8, FieldSizeType)
gdr = GDR(buffer, Int(cdr.gdr_offset), FieldSizeType)
return CDFDataset{compression, FieldSizeType}(fname, cdr, gdr, buffer)
# Build a `CDFDataset{FieldSizeType}` from the raw bytes of the file `fname`.
#
# The `UInt32` at byte 5 tells whether the file is compressed. If it is, the bytes are
# decompressed first and every record is parsed from the decompressed bytes; otherwise
# `buffer` is used as-is and the dataset is tagged `NoCompression`.
# `FieldSizeType` is taken as a `::Type{…}` argument so each width gets its own
# specialised method.
function _load_dataset(fname, buffer, ::Type{FieldSizeType}) where {FieldSizeType}
    bytes, compression = if is_compressed(read_be(buffer, 5, UInt32))
        decompress_bytes(buffer, FieldSizeType)
    else
        (buffer, NoCompression)
    end
    # The CDR sits right after the 8 magic bytes and points to the GDR.
    cdr = CDR(bytes, 8, FieldSizeType)
    gdr = GDR(bytes, Int(cdr.gdr_offset), FieldSizeType)
    return CDFDataset{FieldSizeType}(fname, cdr, gdr, bytes, compression)
end

is_big_endian_encoding(cdf::CDFDataset) = is_big_endian_encoding(cdf.cdr.encoding)
Expand All @@ -45,23 +52,16 @@ is_compressed(magic_numbers::UInt32) = magic_numbers != 0x0000FFFF
majority(cdf::CDFDataset) = majority(cdf.cdr)

# Convenience accessors for the dataset with lazy loading
@inline function Base.getproperty(cdf::CDFDataset{CT, FST}, name::Symbol) where {CT, FST}
@inline function Base.getproperty(cdf::CDFDataset, name::Symbol)
# Real fields FIRST so internal accesses (`cdf.cdr`, `cdf.gdr`, …) short-circuit and
# never traverse the lazy `attrib` branches below.
name in fieldnames(CDFDataset) && return getfield(cdf, name)
if name === :version
return version(cdf.cdr)
elseif name === :majority
return majority(cdf)
elseif name === :compression
return CT
elseif name === :adr
return ADR(parent(cdf), GDR(cdf).ADRhead, recordsize_type(cdf))
elseif name === :attrib
return attrib(cdf)
elseif name === :vattrib
return attrib(cdf; predicate = !is_global)
else
throw(ArgumentError("Unknown property $name"))
end
name === :version && return version(getfield(cdf, :cdr))
name === :majority && return majority(cdf)
name === :adr && return ADR(parent(cdf), GDR(cdf).ADRhead, recordsize_type(cdf))
name === :attrib && return attrib(cdf)
name === :vattrib && return attrib(cdf; predicate = !is_global)
throw(ArgumentError("Unknown property $name"))
end

function find_vdr(cdf::CDFDataset, var_name::String)
Expand Down Expand Up @@ -134,5 +134,5 @@ function Base.show(io::IO, m::MIME"text/plain", cdf::CDFDataset)
return
end

OffsetsIterator(cdf::CDFDataset) =
OffsetsIterator(cdf::CDFDataset) =
OffsetsIterator{recordsize_type(cdf)}(cdf.buffer, cdf.gdr.ADRhead)
2 changes: 1 addition & 1 deletion src/decompress.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ include("decompress/rle.jl")
include("decompress/gzip.jl")


function decompress_bytes(buffer, RecordSizeType)
function decompress_bytes(buffer, ::Type{RecordSizeType}) where {RecordSizeType}
ccr = CCR(buffer, 8, RecordSizeType)
cpr = CPR(buffer, Int(ccr.cpr_offset), RecordSizeType)
compression = CompressionType(cpr.compression_type)
Expand Down
4 changes: 2 additions & 2 deletions src/loading/attribute.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Handles loading of ADR (Attribute Descriptor Record) and AEDR (Attribute Entry Descriptor Record) chains

# Load all attribute entries for a given attribute from its AEDRs.
@inline function load_attribute_entries(buffer::Vector{UInt8}, adr, RecordSizeType, needs_byte_swap)
@inline function load_attribute_entries(buffer::Vector{UInt8}, adr, ::Type{RecordSizeType}, needs_byte_swap) where {RecordSizeType}
head = max(adr.AgrEDRhead, adr.AzEDRhead)
offsets = OffsetsIterator{RecordSizeType}(buffer, head)
return map(offsets) do offset
Expand Down Expand Up @@ -122,7 +122,7 @@ function _get_attributes(name, value, cdf)
return value
end

@inline function _search_aedr_entries(source, aedr_head, RecordSizeType, needs_byte_swap, target_varnum)
@inline function _search_aedr_entries(source, aedr_head, ::Type{RecordSizeType}, needs_byte_swap, target_varnum) where {RecordSizeType}
aedr_head == 0 && return nothing
offset = Int(aedr_head)
_num_offset = 13 + 2 * sizeof(RecordSizeType)
Expand Down
46 changes: 44 additions & 2 deletions src/loading/variable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,48 @@ function variable(cdf::CDFDataset, name)
)
end

"""
    read!(ds::CDFDataset, name::String, dest::AbstractArray{T, N}) -> dest

Fill the preallocated array `dest` with the complete contents of the variable `name`
and return `dest`.

This is the statically-typed way to read a variable: the element type `T` and the rank
`N` are taken from `dest` rather than discovered in the file. `ds[name]` has a type that
is only known at runtime; here the whole call chain can be resolved at compile time, so
it survives `juliac --trim`.

# Throws
- `KeyError` if `ds` has no variable called `name`.
- `ArgumentError` if the element type stored in the file is not `T`.
- `DimensionMismatch` if `size(dest)` differs from the size of the variable.
"""
function Base.read!(ds::CDFDataset, name::String, dest::AbstractArray{T, N}) where {T, N}
    vdr = find_vdr(ds, name)
    if isnothing(vdr)
        throw(KeyError(name))
    end
    # Type and shape validation, plus the actual read, live in the shared worker.
    return _read_full!(dest, ds, name, vdr)
end

"""
    read(ds::CDFDataset, name::String, ::Type{Array{T, N}}) -> Array{T, N}

Allocating variant of [`read!`](@ref): read the full contents of variable `name` into a
freshly allocated `Array{T, N}` and return it.

The array is sized from the variable's record dimensions (the leading `N - 1` axes)
followed by its record count, `max_rec + 1`, as the last axis.

# Throws
- `KeyError` if `ds` has no variable called `name`.
- `ArgumentError` if the element type stored in the file is not `T`. This is checked
  before the destination is allocated, so a wrong `T` never costs a full-size buffer.
"""
function Base.read(ds::CDFDataset, name::String, ::Type{Array{T, N}}) where {T, N}
    vdr = find_vdr(ds, name)
    isnothing(vdr) && throw(KeyError(name))
    # Reject an element-type mismatch up front: the destination can be very large, and
    # allocating it only to throw afterwards is pure waste. Same check and message as the
    # shared worker, which re-validates cheaply.
    Tfile = julia_type(vdr.data_type, vdr.num_elems)
    T === Tfile || throw(ArgumentError("element type mismatch for \"$name\": file has $Tfile, destination has $T"))
    dims = (map(Int, record_sizes(vdr, Val(N - 1)))..., Int(vdr.max_rec) + 1)
    return _read_full!(Array{T, N}(undef, dims), ds, name, vdr)
end

# Shared worker for the typed `read!` / `read` entry points: validate `dest` against the
# variable described by `vdr`, then read every record of the variable into it.
#
# `dest` must use one-based indexing, have the element type stored in the file, and have
# exactly the variable's size (record dimensions followed by `max_rec + 1` records).
# Throws `ArgumentError` on an element-type mismatch and `DimensionMismatch` on a size
# mismatch. Returns `dest`.
function _read_full!(dest::AbstractArray{T, N}, ds, name, vdr) where {T, N}
    Base.require_one_based_indexing(dest)

    Tfile = julia_type(vdr.data_type, vdr.num_elems)
    if T !== Tfile
        throw(ArgumentError("element type mismatch for \"$name\": file has $Tfile, destination has $T"))
    end

    record_dims = map(Int, record_sizes(vdr, Val(N - 1)))
    dims = (record_dims..., Int(vdr.max_rec) + 1)
    if size(dest) != dims
        throw(DimensionMismatch("variable \"$name\" has size $dims, destination has size $(size(dest))"))
    end

    # Build the variable with type parameters taken from `dest`, not from the file.
    var = CDFVariable{T, N, typeof(vdr), typeof(ds)}(name, vdr, ds, dims)
    DiskArrays.readblock!(var, dest, axes(dest)...)
    return dest
end

# View of `A` restricted to index/range `r` along its last dimension, keeping every
# leading dimension whole (`view(A, :, …, :, r)`).
#
# The number of leading colons is passed as `Val(M - 1)` so the tuple length is fixed in
# the type domain: the view's type is then inferable for any rank, without depending on
# constant propagation through the `Integer` form of `ntuple`.
@inline _record_view(A::AbstractArray{<:Any, M}, r) where {M} =
    view(A, ntuple(_ -> Colon(), Val(M - 1))..., r)

function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, ranges::Vararg{AbstractUnitRange{<:Integer}, N}; nbuffers = nthreads()) where {T, N}
N > 0 && @boundscheck checkbounds(var, ranges...)
isempty(dest) && return dest
Expand Down Expand Up @@ -100,7 +142,7 @@ function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, r
if is_full_record && entry.first >= first_rec && entry.last <= last_rec
# full entry
dest_range = dst_src_ranges(first_rec, last_rec, entry)[1]
dest_view = selectdim(dest, N, dest_range)
dest_view = _record_view(dest, dest_range)
total_elems = record_size * length(entry)
decompressor = take!(decompressors())
load_cvvr_data!(dest_view, 1, buffer, entry.offset, total_elems, RecordSizeType, compression; decompressor)
Expand All @@ -109,7 +151,7 @@ function DiskArrays.readblock!(var::CDFVariable{T, N}, dest::AbstractArray{T}, r
else
# partial entry
(dest_range, local_range) = dst_src_ranges(first_rec, last_rec, entry)
dest_view = selectdim(dest, N, dest_range)
dest_view = _record_view(dest, dest_range)
n_records = length(entry)
total_elems = record_size * n_records
chunk = Vector{T}(undef, total_elems)
Expand Down
43 changes: 5 additions & 38 deletions src/parsing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ end
return ntuple(j -> read_be(v, i + (j - 1) * S, T), n)
end

# Read `M` consecutive big-endian values of type `T` from `v`, the first one starting at
# byte index `i`, and return them as a tuple. The count is given as `Val(M)` so the
# length of the returned tuple is part of the call's type.
@inline function read_be(v::Vector{UInt8}, i, ::Val{M}, T) where {M}
    width = sizeof(T)
    # The j-th value starts `(j - 1) * width` bytes after `i`.
    nth(j) = read_be(v, i + (j - 1) * width, T)
    return ntuple(nth, Val(M))
end

@inline function read_be_i(v::Vector{UInt8}, i, T::Base.DataType)
return read_be(v, i, T), i + _sizeof(T)
end
Expand All @@ -32,58 +37,20 @@ end

const name_bytes_buffer = Vector{UInt8}(undef, 256)

"""
@read_be_fields buffer pos T1 T2 ...

Unrolls sequential big-endian reads starting at `pos` within `buffer`.
Returns a tuple of the parsed values and the updated position, mirroring
`read_be_i` but without the runtime `ntuple`/offset bookkeeping.

# Example

```julia
values, next = @read_be_fields buf pos UInt32 Int16
```
"""
macro read_be_fields(buffer, pos, Ts...)
isempty(Ts) && error("@read_be_fields requires at least one field type")

types = flatten_field_types(__module__, Ts)
buf = esc(buffer)
start = esc(pos)
pos_sym = gensym(:pos)
value_syms = [gensym(:field) for _ in types]

stmts = Any[:(local $pos_sym = $start)]
for (sym, T) in zip(value_syms, types)
Tesc = esc(T)
push!(stmts, :(local $sym = read_be($buf, $pos_sym, $Tesc)))
push!(stmts, :($pos_sym += _sizeof($Tesc)))
end

tuple_expr = Expr(:tuple, value_syms...)
push!(stmts, :(($tuple_expr, $pos_sym)))

return Expr(:block, stmts...)
end

# Optimized version using loop unrolling for better performance
@generated function read_be_fields(buffer::Vector{UInt8}, pos::Integer, ::Type{SType}, ::Val{indxs}) where {SType, indxs}
exprs = Expr[]
value_syms = [gensym(:field) for _ in 1:length(indxs)]
pos_sym = gensym(:pos)

# Initialize position
push!(exprs, :(local $pos_sym = pos))

# Read each field
for (i, idx) in enumerate(indxs)
T = fieldtype(SType, idx)
push!(exprs, :(local $(value_syms[i]) = read_be(buffer, $pos_sym, $T)))
push!(exprs, :($pos_sym += _sizeof($T)))
end

# Return tuple of values and final position
tuple_expr = Expr(:tuple, value_syms...)
push!(exprs, :(($tuple_expr, $pos_sym)))

Expand Down
4 changes: 2 additions & 2 deletions src/precompile.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
precompile(Array, (CDFVariable{TT2000, 1, VDR{Int64}, CDFDataset{NoCompression, Int64}},))
precompile(Array, (CDFVariable{TT2000, 1, VDR{Int64}, CDFDataset{Int64}},))
for T in (Float32, Float64), i in 1:3
precompile(Array, (CDFVariable{T, i, VDR{Int64}, CDFDataset{NoCompression, Int64}},))
precompile(Array, (CDFVariable{T, i, VDR{Int64}, CDFDataset{Int64}},))
end

PrecompileTools.@setup_workload begin
Expand Down
2 changes: 1 addition & 1 deletion src/records/adr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ is_global(buffer, offset, ::Type{Int64}) = read_be(buffer, offset + 29, Int32) =

Load an Attribute Descriptor Record from the buffer at the specified position.
"""
@inline function ADR(buffer::Vector{UInt8}, offset, RecordSizeType)
@inline function ADR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}) where {RecordSizeType}
pos = check_record_type(4, buffer, offset, RecordSizeType)
# Read ADR fields
fields, pos = read_be_fields(buffer, pos, ADR{RecordSizeType, String}, Val(1:11))
Expand Down
2 changes: 1 addition & 1 deletion src/records/aedr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct AEDR{FST, A}
Value::A # This consists of the number of elements (specified by the NumElems field) of the data type (specified by the DataType field). This can be thought of as a 1-dimensional array of values (stored contiguously). The size of this field is the product of the number of elements and the size in bytes of each element.
end

function load_aedr_data(buffer::Vector{UInt8}, offset, RecordSizeType, needs_byte_swap)
function load_aedr_data(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}, needs_byte_swap) where {RecordSizeType}
_datatype_offset = 9 + 2 * sizeof(RecordSizeType)
_numelems_offset = 17 + 2 * sizeof(RecordSizeType)
_data_offset = 41 + 2 * sizeof(RecordSizeType)
Expand Down
5 changes: 2 additions & 3 deletions src/records/ccr.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
struct CCR <: Record
header::Header
cpr_offset::UInt64
uncompressed_size::UInt64 # uSize Size of the CDF in its uncompressed form. This byte count does NOT include the 8-byte magic numbers, and 16-byte checksum if it exists.
rfu_a::RInt32
data_offset::Int
data_length::Int
end

@inline function CCR(buffer::Vector{UInt8}, offset, RecordSizeType)
@inline function CCR(buffer::Vector{UInt8}, offset, ::Type{RecordSizeType}) where {RecordSizeType}
pos = offset + 1
header = Header(buffer, pos, RecordSizeType)
@assert header.record_type == 10 "Invalid CCR record type"
Expand All @@ -18,7 +17,7 @@ end
record_end = offset + header.record_size
data_length = record_end - data_offset
@assert data_length >= 0 "Invalid CCR data length"
return CCR(header, UInt64(cpr_offset), UInt64(uncompressed_size), rfu_a, data_offset, data_length)
return CCR(UInt64(cpr_offset), UInt64(uncompressed_size), rfu_a, data_offset, data_length)
end

@inline function data_view(ccr::CCR, buffer::Vector{UInt8})
Expand Down
2 changes: 1 addition & 1 deletion src/records/cdr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ is_cdf_v3(cdr::CDR) = cdr.version == 3
Load a CDF Descriptor Record from `buffer` at the specified offset.
This follows the CDF specification for CDR record structure.
"""
@inline function CDR(buffer::Vector{UInt8}, offset, FieldSizeT)
@inline function CDR(buffer::Vector{UInt8}, offset, ::Type{FieldSizeT}) where {FieldSizeT}
pos = check_record_type(1, buffer, offset, FieldSizeT)
# Read remaining CDR fields in order as per CDF specification
fields, pos = read_be_fields(buffer, pos, CDR{FieldSizeT}, Val(1:9))
Expand Down
5 changes: 2 additions & 3 deletions src/records/cpr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ struct CPR <: Record
# parameters::Tuple{Vararg{Int32}}
end

@inline function CPR(buffer::Vector{UInt8}, offset, FieldSizeT)
@inline function CPR(buffer::Vector{UInt8}, offset, ::Type{FieldSizeT}) where {FieldSizeT}
pos = check_record_type(11, buffer, offset, FieldSizeT)
fields, pos = @read_be_fields(buffer, pos, fieldtypes(CPR)...)
# parameter_count, pos = read_be_i(buffer, pos, Int32)
fields, pos = read_be_fields(buffer, pos, CPR, Val(1:3))
# parameters = read_be(buffer, pos, parameter_count, Int32)
return CPR(fields...)
end
Loading
Loading