Skip to content

Commit

Permalink
handle RNTupleCardinality field and add test file (#209)
Browse files Browse the repository at this point in the history
* handle RNTupleCardinality field and add test file
  • Loading branch information
Moelf authored Jan 28, 2023
1 parent 591787b commit 175220a
Show file tree
Hide file tree
Showing 9 changed files with 113 additions and 22 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
name = "UnROOT"
uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9"
authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"]
version = "0.8.22"
version = "0.8.23"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
ArraysOfArrays = "65a8f2f4-9b39-5baf-92e2-a9cc46fdf018"
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
Expand All @@ -28,6 +29,7 @@ xrootdgo_jll = "9d84c17e-11f2-50ef-8cc9-e9701362097f"
[compat]
AbstractTrees = "^0.3.0, 0.4"
ArraysOfArrays = "^0.5.3, ^0.6"
BitIntegers = "^0.2.6"
CodecLz4 = "^0.3.0, ^0.4.0"
CodecXz = "^0.6.0, ^0.7.0"
CodecZstd = "^0.6.0, ^0.7.0"
Expand Down
6 changes: 0 additions & 6 deletions src/RNTuple/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,3 @@ function _rntuple_read(io, ::Type{RNTupleListNoFrame{T}}) where T
seek(io, end_pos)
return res
end

# 64-bit primitive integer type used as the element type of RNTuple "Switch" columns.
primitive type Switch <: Integer 64 end

# Print the type itself as the bare name `Switch`.
Base.show(io::IO, ::Type{Switch}) = print(io, "Switch")

# Bit-pattern-preserving conversions to the built-in 64-bit integer types.
Base.Int64(x::Switch) = reinterpret(Int64, x)
Base.UInt64(x::Switch) = reinterpret(UInt64, x)

# Bitwise AND of two `Switch` values. No `Switch(::UInt64)` constructor is
# defined for this primitive type, so the result is built with `reinterpret`
# rather than by calling `Switch(...)`.
Base.:&(x::Switch, y::Switch) = reinterpret(Switch, UInt64(x) & UInt64(y))
9 changes: 7 additions & 2 deletions src/RNTuple/constants.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Custom fixed-width integer types for RNTuple columns, created with the
# `@define_integers` macro. Each call takes a bit width followed by two type
# names (presumably the signed and the unsigned variant, in that order -- see
# the BitIntegers documentation to confirm).
# the signed ones are not used
@define_integers 64 SignedSwitch Switch
@define_integers 32 SignedIndex32 Index32
@define_integers 64 SignedIndex64 Index64

#https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md
const rntuple_col_type_dict = (
Int64,
Int32,
Index64,
Index32,
Switch, # Switch
UInt8,
UInt8, # char
Expand Down
68 changes: 59 additions & 9 deletions src/RNTuple/fieldcolumn_reading.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
"""
_field_output_type(::Type{F}) where F
This is function is used in two ways:
- provide a output type prediction for each "field" in RNTuple so we can
achieve type stability
- it's also used to enforce the type stability in [`read_field`](@ref):
```
# this is basically a type assertion for `res`
return res::_field_output_type(field)
```
"""
function _field_output_type() end

"""
read_field(io, field::F, page_list) where F
Read a field from the `io` stream. The `page_list` is a list of PageLinks for the
current cluster group. The type stability is achieved by type asserting
based on type `F` via [`_field_output_type`](@ref) function.
"""
function read_field() end

# Instance fallback: forward a value to the method defined on its type.
_field_output_type(x) = _field_output_type(typeof(x))

function _field_output_type(::Type{StdArrayField{N, T}}) where {N, T<:LeafField}
Expand Down Expand Up @@ -30,7 +55,30 @@ function read_field(io, field::StringField{O, T}, page_list) where {O, T}
return res::_field_output_type(field)
end

_field_output_type(::Type{LeafField{T}}) where {T} = Base.ReinterpretArray{T, 1, UInt8, Vector{UInt8}, false}
# One-dimensional reinterpreted view with element type `T` over a `Vector{UInt8}`.
const T_Reinter{T} = Base.ReinterpretArray{T, 1, UInt8, Vector{UInt8}, false}

"""
    CardinalityVector{T} <: AbstractVector{T}

Lazy vector of differences over `contents`: element `i` is
`contents[i] - contents[i-1]`, where the missing predecessor of the first
element is taken to be `zero(T)`. Only reading is implemented here (no
`setindex!` method is defined).
"""
struct CardinalityVector{T} <: AbstractVector{T}
    contents::T_Reinter{T}
end

Base.length(ary::CardinalityVector) = length(ary.contents)
Base.size(ary::CardinalityVector) = (length(ary.contents),)

# The AbstractArray interface expects `IndexStyle` to be declared on the array
# *type*; the generic instance method then forwards to it. Declaring it only on
# instances would leave `IndexStyle(typeof(ary))` at the `IndexCartesian` default.
Base.IndexStyle(::Type{<:CardinalityVector}) = IndexLinear()

function Base.getindex(ary::CardinalityVector{T}, i::Int) where {T}
    # `get` yields `zero(T)` when `i - 1` is out of bounds, i.e. for the first element.
    previous = get(ary.contents, i - 1, zero(T))
    return ary.contents[i] - previous
end


# A cardinality field is returned as a `CardinalityVector` with the same element type.
_field_output_type(::Type{RNTupleCardinality{T}}) where {T} = CardinalityVector{T}

"""
    read_field(io, field::RNTupleCardinality{T}, page_list) where T

Read the pages selected by `field.content_col_idx` from `page_list` with
`read_pagedesc`, reinterpret the resulting bytes as `T`, and wrap them in a
`CardinalityVector`. The result is type-asserted against
`_field_output_type(field)`.
"""
function read_field(io, field::RNTupleCardinality{T}, page_list) where T
    column_pages = page_list[field.content_col_idx]
    raw = read_pagedesc(io, column_pages, field.nbits)
    wrapped = CardinalityVector(reinterpret(T, raw))
    return wrapped::_field_output_type(field)
end

# A plain leaf field is returned as the reinterpreted byte view `T_Reinter{T}`.
function _field_output_type(::Type{LeafField{T}}) where {T}
    return T_Reinter{T}
end
function read_field(io, field::LeafField{T}, page_list) where T
nbits = field.nbits
pages = page_list[field.content_col_idx]
Expand All @@ -55,7 +103,7 @@ function read_field(io, field::LeafField{Bool}, page_list)
return res::_field_output_type(field)
end

_field_output_type(::Type{VectorField{O, T}}) where {O, T} = VectorOfVectors{eltype(_field_output_type(T)), _field_output_type(T), Vector{Int32}, Vector{Tuple{}}}
_field_output_type(::Type{VectorField{O, T}}) where {O, T} = VectorOfVectors{eltype(_field_output_type(T)), _field_output_type(T), Vector{eltype(O)}, Vector{Tuple{}}}
function read_field(io, field::VectorField{O, T}, page_list) where {O, T}
offset = read_field(io, field.offset_col, page_list)
content = read_field(io, field.content_col, page_list)
Expand All @@ -71,9 +119,12 @@ function _field_output_type(::Type{StructField{N, T}}) where {N, T}
types2 = Tuple{_field_output_type.(T.types)...}
StructArray{NamedTuple{N, types}, 1, NamedTuple{N, types2}, Int64}
end

"""
Since each field of the struct is stored in a separate field of the RNTuple,
this function returns a `StructArray` for efficiency / performance reason.
read_field(io, field::StructField{N, T}, page_list) where {N, T}
Since each field of the struct is stored in a separate field of the RNTuple,
this function returns a `StructArray` to maximize efficiency.
"""
function read_field(io, field::StructField{N, T}, page_list) where {N, T}
contents = (read_field(io, col, page_list) for col in field.content_cols)
Expand All @@ -100,14 +151,13 @@ function Base.getindex(ary::UnionVector, i::Int)
end

"""
    _split_switch_bits(content)

Split packed 64-bit switch values into their two components.

Returns the tuple `(kindex, tags)` where, element-wise,

- `kindex` is the low 44 bits of `content` plus one, and
- `tags` is `content` shifted right by 44 bits, converted to `Int8`.
"""
function _split_switch_bits(content)
    # The tag starts at bit 44, so the index occupies all of the low 44 bits.
    # A narrower mask would silently truncate indices >= 2^20.
    kindex = (content .& 0x00000fffffffffff) .+ 1
    tags = Int8.(content .>> 44)
    return kindex, tags
end
# Output type of a union field: a `UnionVector` parameterised by the union of
# the alternatives' element types and by the tuple of the alternatives' output
# container types.
function _field_output_type(::Type{UnionField{S, T}}) where {S, T}
    # Compute the per-alternative output types once and reuse them for both parameters.
    types = _field_output_type.(T.types)
    return UnionVector{Union{eltype.(types)...}, Tuple{types...}}
end
function read_field(io, field::UnionField{S, T}, page_list) where {S, T}
switch = read_field(io, field.switch_col, page_list)
Expand Down
31 changes: 28 additions & 3 deletions src/RNTuple/fieldcolumn_schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ end
"""
struct LeafField{T}
content_col_idx::Int
nbits::Int
end
Base case of field nesting, this links to a column in the RNTuple by 0-based index.
Expand All @@ -39,15 +40,33 @@ struct LeafField{T}
nbits::Int
end

"""
struct RNTupleCardinality{T}
content_col_idx::Int
nbits::Int
end
Special field. The cardinality is basically a counter, but the data column is
a leaf column of Index32 or Index64. To get a number from Cardinality, one needs to
compute `ary[i] - ary[i-1]`.
"""
struct RNTupleCardinality{T}
content_col_idx::Int
nbits::Int
end
RNTupleCardinality(l::LeafField{T}) where T = RNTupleCardinality{T}(l.content_col_idx, l.nbits)

# The element type of a `LeafField{T}` type is its type parameter `T`.
Base.eltype(::Type{LeafField{T}}) where T = T

function _search_col_type(field_id, column_records, col_id::Int...)
if length(col_id) == 2 &&
# String is the only known leaf field that splits in column records
column_records[col_id[1]].type == 2 &&
column_records[col_id[2]].type == 5
return StringField(LeafField{Int32}(col_id[1], 32), LeafField{Char}(col_id[2], 8))
elseif length(col_id) == 1
record = column_records[only(col_id)]
return LeafField{rntuple_col_type_dict[record.type]}(only(col_id), record.nbits)
LeafType = rntuple_col_type_dict[record.type]
return LeafField{LeafType}(only(col_id), record.nbits)
else
error("un-handled base case, report issue to authors")
end
Expand Down Expand Up @@ -82,7 +101,13 @@ function _parse_field(field_id, field_records, column_records, alias_columns, ::
# field_id in 0-based index
field = field_records[field_id + 1]
if iszero(field.repetition)
return _search_col_type(field_id, column_records, alias_columns)
res = _search_col_type(field_id, column_records, alias_columns)
if eltype(res) <: Union{Index32, Index64}
# https://github.com/root-project/root/pull/12127
return RNTupleCardinality(res)
else
return res
end
else
# `std::array<>` for some reason splits in Field records and pretends to be a leaf field
element_idx = findlast(field_records) do field
Expand Down
1 change: 1 addition & 0 deletions src/UnROOT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ using Mixers, Parameters, Memoization, LRUCache
import IterTools: groupby

using LibDeflate: zlib_decompress!, Decompressor, crc32
using BitIntegers: @define_integers

import Tables, PrettyTables

Expand Down
4 changes: 4 additions & 0 deletions src/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ function Base.show(io::IO, evt::LazyEvent)
show(io, collect(evt))
end

# Print any `LazyEvent` type (regardless of its parameters) as the short name.
Base.show(io::IO, ::Type{<:LazyEvent}) = print(io, "UnROOT.LazyEvent")

# Wrapper around a `NamedTuple` stored in `treetable`; declared as an
# `AbstractVector` whose element type is `LazyEvent{T}`.
struct LazyTree{T<:NamedTuple} <: AbstractVector{LazyEvent{T}}
treetable::T
end
Expand Down
12 changes: 11 additions & 1 deletion test/rntuple_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ end

sample = schema2.vector_tuple_int32_string
@test sample isa UnROOT.VectorField
@test sample.offset_col isa UnROOT.LeafField{Int32}
@test sample.offset_col isa UnROOT.LeafField{UnROOT.Index32}
@test sample.offset_col.content_col_idx == 9

@test sample.content_col isa UnROOT.StructField
Expand Down Expand Up @@ -110,6 +110,16 @@ end
@test df.array_lv[5] == fill((pt=5.0, eta=5.0, phi=5.0, mass=5.0), 3)
end

# The cardinality column `nMuon` must agree with the per-event lengths of the
# `Muon_*` collections read from the same sample file.
@testset "RNTupleCardinality" begin
f1 = UnROOT.samplefile("RNTuple/Run2012BC_DoubleMuParked_Muons_rntuple_1000evts.root")
t = LazyTree(f1, "Events")
# Chained `==`: every neighbouring pair of operands must compare equal.
@test t.nMuon ==
length.(t.Muon_pt) ==
length.(t.Muon_eta) ==
length.(t.Muon_mass) ==
length.(t.Muon_charge)
end

@testset "RNTuple Type stability" begin
f1 = UnROOT.samplefile("RNTuple/test_ntuple_int_5e4.root")
t = LazyTree(f1, "ntuple")
Expand Down
Binary file not shown.

2 comments on commit 175220a

@Moelf
Copy link
Member Author

@Moelf Moelf commented on 175220a Jan 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/76579

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.8.23 -m "<description of version>" 175220a682bd826f1c122022c95e4c4aaee381b1
git push origin v0.8.23

Please sign in to comment.