diff --git a/Project.toml b/Project.toml index f1caec7c..5b40a674 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,12 @@ name = "UnROOT" uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9" authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"] -version = "0.8.22" +version = "0.8.23" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" ArraysOfArrays = "65a8f2f4-9b39-5baf-92e2-a9cc46fdf018" +BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561" CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" @@ -28,6 +29,7 @@ xrootdgo_jll = "9d84c17e-11f2-50ef-8cc9-e9701362097f" [compat] AbstractTrees = "^0.3.0, 0.4" ArraysOfArrays = "^0.5.3, ^0.6" +BitIntegers = "^0.2.6" CodecLz4 = "^0.3.0, ^0.4.0" CodecXz = "^0.6.0, ^0.7.0" CodecZstd = "^0.6.0, ^0.7.0" diff --git a/src/RNTuple/bootstrap.jl b/src/RNTuple/bootstrap.jl index 39aa27cc..02ceb1d8 100644 --- a/src/RNTuple/bootstrap.jl +++ b/src/RNTuple/bootstrap.jl @@ -180,9 +180,3 @@ function _rntuple_read(io, ::Type{RNTupleListNoFrame{T}}) where T seek(io, end_pos) return res end - -primitive type Switch <: Integer 64 end -Base.show(io::IO, ::Type{Switch}) = print(io, "Switch") -Base.:&(x::Switch, y::Switch) = Switch(UInt64(x) & UInt64(y)) -Base.Int64(x::Switch) = reinterpret(Int64, x) -Base.UInt64(x::Switch) = reinterpret(UInt64, x) diff --git a/src/RNTuple/constants.jl b/src/RNTuple/constants.jl index 1728a15b..525d3422 100644 --- a/src/RNTuple/constants.jl +++ b/src/RNTuple/constants.jl @@ -1,7 +1,12 @@ +# the signed ones are not used +@define_integers 64 SignedSwitch Switch +@define_integers 32 SignedIndex32 Index32 +@define_integers 64 SignedIndex64 Index64 + #https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md const rntuple_col_type_dict = ( - Int64, - Int32, + Index64, + Index32, Switch, # Switch UInt8, UInt8, # char diff --git a/src/RNTuple/fieldcolumn_reading.jl 
b/src/RNTuple/fieldcolumn_reading.jl index 55266dbf..efaf2be0 100644 --- a/src/RNTuple/fieldcolumn_reading.jl +++ b/src/RNTuple/fieldcolumn_reading.jl @@ -1,3 +1,28 @@ +""" + _field_output_type(::Type{F}) where F + +This function is used in two ways: + +- provide an output type prediction for each "field" in RNTuple so we can +achieve type stability +- it's also used to enforce the type stability in [`read_field`](@ref): + +``` + # this is basically a type assertion for `res` + return res::_field_output_type(field) +``` +""" +function _field_output_type() end + +""" + read_field(io, field::F, page_list) where F + +Read a field from the `io` stream. The `page_list` is a list of PageLinks for the +current cluster group. The type stability is achieved by type asserting +based on type `F` via [`_field_output_type`](@ref) function. +""" +function read_field() end + _field_output_type(x::T) where T = _field_output_type(T) function _field_output_type(::Type{StdArrayField{N, T}}) where {N, T<:LeafField} @@ -30,7 +55,30 @@ function read_field(io, field::StringField{O, T}, page_list) where {O, T} return res::_field_output_type(field) end -_field_output_type(::Type{LeafField{T}}) where {T} = Base.ReinterpretArray{T, 1, UInt8, Vector{UInt8}, false} +const T_Reinter{T} = Base.ReinterpretArray{T, 1, UInt8, Vector{UInt8}, false} + +struct CardinalityVector{T} <: AbstractVector{T} + contents::T_Reinter{T} +end +Base.length(ary::CardinalityVector) = length(ary.contents) +Base.size(ary::CardinalityVector) = (length(ary.contents), ) +Base.IndexStyle(::CardinalityVector) = IndexLinear() +function Base.getindex(ary::CardinalityVector{T}, i::Int) where {T} + ary.contents[i] - get(ary.contents, i-1, zero(T)) +end + + +_field_output_type(::Type{RNTupleCardinality{T}}) where {T} = CardinalityVector{T} +function read_field(io, field::RNTupleCardinality{T}, page_list) where T + nbits = field.nbits + pages = page_list[field.content_col_idx] + bytes = read_pagedesc(io, pages, nbits) +
contents = reinterpret(T, bytes) + res = CardinalityVector(contents) + return res::_field_output_type(field) +end + +_field_output_type(::Type{LeafField{T}}) where {T} = T_Reinter{T} function read_field(io, field::LeafField{T}, page_list) where T nbits = field.nbits pages = page_list[field.content_col_idx] @@ -55,7 +103,7 @@ function read_field(io, field::LeafField{Bool}, page_list) return res::_field_output_type(field) end -_field_output_type(::Type{VectorField{O, T}}) where {O, T} = VectorOfVectors{eltype(_field_output_type(T)), _field_output_type(T), Vector{Int32}, Vector{Tuple{}}} +_field_output_type(::Type{VectorField{O, T}}) where {O, T} = VectorOfVectors{eltype(_field_output_type(T)), _field_output_type(T), Vector{eltype(O)}, Vector{Tuple{}}} function read_field(io, field::VectorField{O, T}, page_list) where {O, T} offset = read_field(io, field.offset_col, page_list) content = read_field(io, field.content_col, page_list) @@ -71,9 +119,12 @@ function _field_output_type(::Type{StructField{N, T}}) where {N, T} types2 = Tuple{_field_output_type.(T.types)...} StructArray{NamedTuple{N, types}, 1, NamedTuple{N, types2}, Int64} end + """ - Since each field of the struct is stored in a separate field of the RNTuple, - this function returns a `StructArray` for efficiency / performance reason. + read_field(io, field::StructField{N, T}, page_list) where {N, T} + +Since each field of the struct is stored in a separate field of the RNTuple, +this function returns a `StructArray` to maximize efficiency. 
""" function read_field(io, field::StructField{N, T}, page_list) where {N, T} contents = (read_field(io, col, page_list) for col in field.content_cols) @@ -100,14 +151,13 @@ function Base.getindex(ary::UnionVector, i::Int) end function _split_switch_bits(content) - kindex = Int64.(content) .& 0x00000000000FFFFF .+ 1 - tags = Int8.(UInt64.(content) .>> 44) + kindex = content .& 0x00000000000FFFFF .+ 1 + tags = Int8.(content .>> 44) return kindex, tags end function _field_output_type(::Type{UnionField{S, T}}) where {S, T} - type = Union{eltype.(_field_output_type.(T.types))...} - type2 = Tuple{_field_output_type.(T.types)...} - return UnionVector{type, type2} + types = _field_output_type.(T.types) + return UnionVector{Union{eltype.(types)...}, Tuple{types...}} end function read_field(io, field::UnionField{S, T}, page_list) where {S, T} switch = read_field(io, field.switch_col, page_list) diff --git a/src/RNTuple/fieldcolumn_schema.jl b/src/RNTuple/fieldcolumn_schema.jl index c678bdfc..d3b54d2f 100644 --- a/src/RNTuple/fieldcolumn_schema.jl +++ b/src/RNTuple/fieldcolumn_schema.jl @@ -28,6 +28,7 @@ end """ struct LeafField{T} content_col_idx::Int + nbits::Int end Base case of field nesting, this links to a column in the RNTuple by 0-based index. @@ -39,15 +40,33 @@ struct LeafField{T} nbits::Int end +""" + struct RNTupleCardinality{T} + content_col_idx::Int + nbits::Int + end + +Special field. The cardinality is basically a counter, but the data column is +a leaf column of Index32 or Index64. To get a number from Cardinality, one needs to +compute `ary[i] - ary[i-1]`. +""" +struct RNTupleCardinality{T} + content_col_idx::Int + nbits::Int +end +RNTupleCardinality(l::LeafField{T}) where T = RNTupleCardinality{T}(l.content_col_idx, l.nbits) + +Base.eltype(::Type{LeafField{T}}) where T = T + function _search_col_type(field_id, column_records, col_id::Int...) 
if length(col_id) == 2 && - # String is the only known leaf field that splits in column records column_records[col_id[1]].type == 2 && column_records[col_id[2]].type == 5 return StringField(LeafField{Int32}(col_id[1], 32), LeafField{Char}(col_id[2], 8)) elseif length(col_id) == 1 record = column_records[only(col_id)] - return LeafField{rntuple_col_type_dict[record.type]}(only(col_id), record.nbits) + LeafType = rntuple_col_type_dict[record.type] + return LeafField{LeafType}(only(col_id), record.nbits) else error("un-handled base case, report issue to authors") end @@ -82,7 +101,13 @@ function _parse_field(field_id, field_records, column_records, alias_columns, :: # field_id in 0-based index field = field_records[field_id + 1] if iszero(field.repetition) - return _search_col_type(field_id, column_records, alias_columns) + res = _search_col_type(field_id, column_records, alias_columns) + if eltype(res) <: Union{Index32, Index64} + # https://github.com/root-project/root/pull/12127 + return RNTupleCardinality(res) + else + return res + end else # `std::array<>` for some reason splits in Field records and pretent to be a leaf field element_idx = findlast(field_records) do field diff --git a/src/UnROOT.jl b/src/UnROOT.jl index 81e31e85..24dd7073 100644 --- a/src/UnROOT.jl +++ b/src/UnROOT.jl @@ -15,6 +15,7 @@ using Mixers, Parameters, Memoization, LRUCache import IterTools: groupby using LibDeflate: zlib_decompress!, Decompressor, crc32 +using BitIntegers: @define_integers import Tables, PrettyTables diff --git a/src/iteration.jl b/src/iteration.jl index 2ad1187d..5027ce01 100644 --- a/src/iteration.jl +++ b/src/iteration.jl @@ -218,6 +218,10 @@ function Base.show(io::IO, evt::LazyEvent) show(io, collect(evt)) end +function Base.show(io::IO, ::Type{<:LazyEvent}) + print(io, "UnROOT.LazyEvent") +end + struct LazyTree{T<:NamedTuple} <: AbstractVector{LazyEvent{T}} treetable::T end diff --git a/test/rntuple_tests.jl b/test/rntuple_tests.jl index 11c8101d..7023e88a 100644 
--- a/test/rntuple_tests.jl +++ b/test/rntuple_tests.jl @@ -42,7 +42,7 @@ end sample = schema2.vector_tuple_int32_string @test sample isa UnROOT.VectorField - @test sample.offset_col isa UnROOT.LeafField{Int32} + @test sample.offset_col isa UnROOT.LeafField{UnROOT.Index32} @test sample.offset_col.content_col_idx == 9 @test sample.content_col isa UnROOT.StructField @@ -110,6 +110,16 @@ end @test df.array_lv[5] == fill((pt=5.0, eta=5.0, phi=5.0, mass=5.0), 3) end +@testset "RNTupleCardinality" begin + f1 = UnROOT.samplefile("RNTuple/Run2012BC_DoubleMuParked_Muons_rntuple_1000evts.root") + t = LazyTree(f1, "Events") + @test t.nMuon == + length.(t.Muon_pt) == + length.(t.Muon_eta) == + length.(t.Muon_mass) == + length.(t.Muon_charge) +end + @testset "RNTuple Type stability" begin f1 = UnROOT.samplefile("RNTuple/test_ntuple_int_5e4.root") t = LazyTree(f1, "ntuple") diff --git a/test/samples/RNTuple/Run2012BC_DoubleMuParked_Muons_rntuple_1000evts.root b/test/samples/RNTuple/Run2012BC_DoubleMuParked_Muons_rntuple_1000evts.root new file mode 100644 index 00000000..1ce0ed56 Binary files /dev/null and b/test/samples/RNTuple/Run2012BC_DoubleMuParked_Muons_rntuple_1000evts.root differ