From 1c71bc1b34e493fd6312236a4ed3ca746e6608ab Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 15 Mar 2024 12:43:35 -0400 Subject: [PATCH 1/4] reset page index for Index32/Index64 columns --- src/RNTuple/displays.jl | 36 ++++++++++++++++++++++++----- src/RNTuple/fieldcolumn_reading.jl | 37 ++++++++++++++++++++++-------- src/RNTuple/fieldcolumn_schema.jl | 13 +++++------ src/RNTuple/footer.jl | 4 +++- src/RNTuple/header.jl | 24 +++++++++++++++---- src/RNTuple/highlevel.jl | 13 +++++++---- 6 files changed, 95 insertions(+), 32 deletions(-) diff --git a/src/RNTuple/displays.jl b/src/RNTuple/displays.jl index 1d7af4bf..37f19ead 100644 --- a/src/RNTuple/displays.jl +++ b/src/RNTuple/displays.jl @@ -1,5 +1,27 @@ +function _showwithkw(io, @nospecialize(k)) + T = typeof(k) + + print(io, T) + print(io, "(") + for i in fieldnames(T) + print(io, i, "=", repr(getfield(k, i)), ", ") + end + println(io, ")") +end + +function Base.show(io::IO, f::FieldRecord) + _showwithkw(io, f) +end + +function Base.show(io::IO, f::ColumnRecord) + _showwithkw(io, f) +end + function Base.show(io::IO, f::AliasRecord) - print(io, "AliasRecord(physical_id=$(f.physical_id), field_id=$(f.field_id))") + _showwithkw(io, f) +end +function Base.show(io::IO, f::Locator) + _showwithkw(io, f) end function Base.show(io::IO, lf::StringField) @@ -36,10 +58,11 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false) l1 = maximum(length, [f.field_name for f in header.field_records]) l2 = maximum(length, [f.type_name for f in header.field_records]) println(io, "$ind field_records: ") - for f in header.field_records + for (fidx, f) in enumerate(header.field_records) print(io, "$ind ") + print(io, "(implicit idx=$(lpad(fidx-1, 2, "0"))), ") print(io, "parent=$(lpad(Int(f.parent_field_id), 2, "0")), ") - print(io, "role=$(Int(f.struct_role)), ") + print(io, "struct_role=$(Int(f.struct_role)), ") print(io, "name=$(rpad(f.field_name, l1+1, " ")), ") print(io, "type=$(rpad(f.type_name, l2+1, " "))") println(io, "repetition=$(f.repetition)") @@ -52,7 +75,8 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false) print(io, "type=$(lpad(Int(g.type), 2, "0")), ") print(io, "nbits=$(lpad(Int(g.nbits), 2, "0")), ") print(io, "field_id=$(lpad(Int(g.field_id), 3, "0")), ") - println(io, "flags=$(g.flags)") + print(io, "flags=$(g.flags), ") + println(io, "first_ele_index=$(g.first_ele_idx)") end end end @@ -77,13 +101,13 @@ function Base.show(io::IO, rn::RNTuple) print(io, " └─ ") println(io, "Schema: ") _io = IOBuffer() - print_tree(_io, rn.schema; maxdepth=1, indicate_truncation=false) + print_tree(_io, rn.schema; maxdepth=3, indicate_truncation=true) for l in split(String(take!(_io)), '\n') print(io, " ") println(io, l) end end -Base.show(io::IO, s::RNTupleSchema) = print_tree(io, s) +Base.show(io::IO, s::RNTupleSchema) = print_tree(io, s; maxdepth=10) printnode(io::IO, s::RNTupleSchema) = print(io, "RNTupleSchema with $(length(s)) top fields") children(s::RNTupleSchema) = Dict(pairs(getfield(s, :namedtuple))) diff --git a/src/RNTuple/fieldcolumn_reading.jl b/src/RNTuple/fieldcolumn_reading.jl index 826aac59..1ac0c454 100644 --- a/src/RNTuple/fieldcolumn_reading.jl +++ b/src/RNTuple/fieldcolumn_reading.jl @@ -38,7 +38,7 @@ end _field_output_type(::Type{StringField{O, T}}) where {O, T} = Vector{String} function read_field(io, field::StringField{O, T}, page_list) where {O, T} - nbits = field.content_col.nbits + nbits = field.content_col.columnrecord.nbits pages = page_list[field.content_col.content_col_idx] offset = read_field(io, field.offset_col, page_list) @@ -65,9 +65,9 @@ end _field_output_type(::Type{RNTupleCardinality{T}}) where {T} = CardinalityVector{T} function read_field(io, field::RNTupleCardinality{T}, page_list) where T - nbits = field.leaf_field.nbits + nbits = field.leaf_field.columnrecord.nbits pages = page_list[field.leaf_field.content_col_idx] - typenum = field.leaf_field.type + typenum = field.leaf_field.columnrecord.type split = 14 <= typenum <= 21 || 26 <= typenum <= 28 delta = 14 <= typenum <= 15 bytes = read_pagedesc(io, pages, nbits; split) @@ -81,6 +81,27 @@ end _from_zigzag(n) = (n >> 1) ⊻ -(n & 1) _to_zigzag(n) = (n << 1) ⊻ (n >> 63) +function _from_zigzag!(res::AbstractVector) + @simd for i in eachindex(res) + res[i] = _from_zigzag(res[i]) + end + return res +end + +function _to_zigzag!(res::AbstractVector) + @simd for i in eachindex(res) + res[i] = _to_zigzag(res[i]) + end + return res +end + +function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T + endpoint = 0 + for pi in firstindex(pages):lastindex(pages)-1 + endpoint += pages[pi].num_elements + res[endpoint+1] -= sum(@view res[begin:endpoint]) + end +end function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T endpoint = 0 @@ -92,19 +113,17 @@ end _field_output_type(::Type{LeafField{T}}) where {T} = Vector{T} function read_field(io, field::LeafField{T}, page_list) where T - nbits = field.nbits + nbits = field.columnrecord.nbits pages = page_list[field.content_col_idx] # handle split encoding within page - typenum = field.type + typenum = field.columnrecord.type split = 14 <= typenum <= 21 || 26 <= typenum <= 28 zigzag = 26 <= typenum <= 28 delta = 14 <= typenum <= 15 bytes = read_pagedesc(io, pages, nbits; split = split) res = collect(reinterpret(T, bytes)) if zigzag - @simd for i in eachindex(res) - res[i] = _from_zigzag(res[i]) - end + _from_zigzag!(res) elseif delta # the Index32/64 resets to absolute offset page-by-page # https://github.com/JuliaHEP/UnROOT.jl/issues/312#issuecomment-1999875348 @@ -118,7 +137,7 @@ end _field_output_type(::Type{LeafField{Bool}}) = BitVector function read_field(io, field::LeafField{Bool}, page_list) - nbits = field.nbits + nbits = field.columnrecord.nbits pages = page_list[field.content_col_idx] total_num_elements = sum(p.num_elements for p in pages) diff --git a/src/RNTuple/fieldcolumn_schema.jl b/src/RNTuple/fieldcolumn_schema.jl index 2c9449a6..98e61a33 100644 --- a/src/RNTuple/fieldcolumn_schema.jl +++ b/src/RNTuple/fieldcolumn_schema.jl @@ -56,8 +56,7 @@ isvoid(::Type{<:StringField}) = false """ struct LeafField{T} content_col_idx::Int - type::Int - nbits::Int + columnrecord::ColumnRecord end Base case of field nesting, this links to a column in the RNTuple by 0-based index. @@ -68,8 +67,7 @@ The `type` field is the RNTuple spec type number, used to record split encoding. """ struct LeafField{T} content_col_idx::Int - type::Int - nbits::Int + columnrecord::ColumnRecord end Base.eltype(::Type{LeafField{T}}) where {T} = T isvoid(::Type{<:LeafField}) = false @@ -92,16 +90,17 @@ isvoid(::Type{<:RNTupleCardinality}) = false function _search_col_type(field_id, column_records, col_id::Int...) if length(col_id) == 2 && column_records[col_id[2]].type == 5 index_record = column_records[col_id[1]] + char_record = column_records[col_id[2]] index_typenum = index_record.type LeafType = rntuple_col_type_dict[index_typenum] return StringField( - LeafField{LeafType}(col_id[1], index_typenum, index_record.nbits), - LeafField{Char}(col_id[2], 5, 8) + LeafField{LeafType}(col_id[1],index_record), + LeafField{Char}(col_id[2], char_record) ) elseif length(col_id) == 1 record = column_records[only(col_id)] LeafType = rntuple_col_type_dict[record.type] - return LeafField{LeafType}(only(col_id), record.type, record.nbits) + return LeafField{LeafType}(only(col_id), record) else error("un-handled RNTuple case, report issue to UnROOT.jl") end diff --git a/src/RNTuple/footer.jl b/src/RNTuple/footer.jl index 55e382ba..9d709874 100644 --- a/src/RNTuple/footer.jl +++ b/src/RNTuple/footer.jl @@ -96,12 +96,14 @@ function split8_reinterpret!(dst, src::Vector{UInt8}) end """ - read_pagedesc(io, pagedesc::Vector{PageDescription}, nbits::Integer) + read_pagedesc(io, pagedescs::AbstractVector{PageDescription}, nbits::Integer; split=false) Read the decompressed raw bytes given a Page Description. The `nbits` need to be provided according to the element type of the column since `pagedesc` only contains `num_elements` information. +`split` is true when split encoding is needed, this is done per page. + !!! note Boolean values are always stored as bit in RNTuple, so `nbits = 1`. diff --git a/src/RNTuple/header.jl b/src/RNTuple/header.jl index 4e02e212..99ed8f05 100644 --- a/src/RNTuple/header.jl +++ b/src/RNTuple/header.jl @@ -1,6 +1,6 @@ -struct FieldRecord - field_version::UInt32 - type_version::UInt32 +Base.@kwdef struct FieldRecord + field_version::UInt32 = 0x0000 + type_version::UInt32 = 0x0000 parent_field_id::UInt32 struct_role::UInt16 flags::UInt16 @@ -16,7 +16,7 @@ function _rntuple_read(io, ::Type{FieldRecord}) parent_field_id = read(io, UInt32) struct_role = read(io, UInt16) flags = read(io, UInt16) - repetition = if flags == 0x0001 + repetition = if flags == 0x01 read(io, Int64) else 0 @@ -26,12 +26,26 @@ function _rntuple_read(io, ::Type{FieldRecord}) struct_role, flags, repetition, field_name, type_name, type_alias, field_desc) end -@SimpleStruct struct ColumnRecord +struct ColumnRecord type::UInt16 nbits::UInt16 field_id::UInt32 flags::UInt32 + first_ele_idx::Int64 end +function _rntuple_read(io, ::Type{ColumnRecord}) + type = read(io, UInt16) + nbits = read(io, UInt16) + field_id = read(io, UInt32) + flags = read(io, UInt32) + first_ele_idx = if flags == 0x08 + read(io, Int64) + else + 0 + end + ColumnRecord(type, nbits, field_id, flags, first_ele_idx) +end + @SimpleStruct struct AliasRecord physical_id::UInt32 diff --git a/src/RNTuple/highlevel.jl b/src/RNTuple/highlevel.jl index b992ae5b..a3a36d27 100644 --- a/src/RNTuple/highlevel.jl +++ b/src/RNTuple/highlevel.jl @@ -84,11 +84,14 @@ RNTupleSchema with 13 top fields ``` """ struct RNTupleSchema - namedtuple::Any + namedtuple::NamedTuple end Base.propertynames(s::RNTupleSchema) = propertynames(getfield(s, :namedtuple)) Base.getproperty(s::RNTupleSchema, sym::Symbol) = getproperty(getfield(s, :namedtuple), sym) Base.length(s::RNTupleSchema) = length(getfield(s, :namedtuple)) +function Base.getindex(s::RNTupleSchema, idx) + RNTupleSchema(getfield(s, :namedtuple)[idx]) +end function Base.getindex(rf::RNTupleField, idx::Int) tid = Threads.threadid() @@ -174,8 +177,8 @@ struct RNTuple{O} header::RNTupleHeader footer::RNTupleFooter pagelinks::Dict{Int, PageLink} - schema::Any - function RNTuple(io::O, header, footer, schema::S) where {O, S} + schema::RNTupleSchema + function RNTuple(io::O, header, footer, schema) where {O} new{O}( io, header, @@ -213,7 +216,9 @@ function LazyTree(rn::RNTuple, selection) end N = Tuple(Symbol.(filtered_names)) - T = Tuple(RNTupleField(rn, getproperty(rn.schema, k)) for k in N) + skim_schema = getfield(rn.schema, :namedtuple)[N] + new_rn = RNTuple(rn.io, rn.header, rn.footer, skim_schema) + T = Tuple(RNTupleField(new_rn, getproperty(new_rn.schema, k)) for k in N) return LazyTree(NamedTuple{N}(T)) end From bcb47a117aed2ab152ac6f5ead5eb8fe979492a2 Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 15 Mar 2024 14:17:54 -0400 Subject: [PATCH 2/4] clean up --- src/RNTuple/fieldcolumn_reading.jl | 8 -------- src/RNTuple/header.jl | 6 +++--- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/RNTuple/fieldcolumn_reading.jl b/src/RNTuple/fieldcolumn_reading.jl index 1ac0c454..f96f32ea 100644 --- a/src/RNTuple/fieldcolumn_reading.jl +++ b/src/RNTuple/fieldcolumn_reading.jl @@ -103,14 +103,6 @@ function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T end end -function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T - endpoint = 0 - for pi in firstindex(pages):lastindex(pages)-1 - endpoint += pages[pi].num_elements - res[endpoint+1] -= sum(@view res[begin:endpoint]) - end -end - _field_output_type(::Type{LeafField{T}}) where {T} = Vector{T} function read_field(io, field::LeafField{T}, page_list) where T nbits = field.columnrecord.nbits diff --git a/src/RNTuple/header.jl b/src/RNTuple/header.jl index 99ed8f05..fcda24d4 100644 --- a/src/RNTuple/header.jl +++ b/src/RNTuple/header.jl @@ -1,6 +1,6 @@ -Base.@kwdef struct FieldRecord - field_version::UInt32 = 0x0000 - type_version::UInt32 = 0x0000 +struct FieldRecord + field_version::UInt32 + type_version::UInt32 parent_field_id::UInt32 struct_role::UInt16 flags::UInt16 From 62e98ef82266fc8689b3d38cb9ba762efec13215 Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 15 Mar 2024 14:43:53 -0400 Subject: [PATCH 3/4] test schema is slim --- test/rntuple_tests.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/rntuple_tests.jl b/test/rntuple_tests.jl index d815ae31..1743dc64 100644 --- a/test/rntuple_tests.jl +++ b/test/rntuple_tests.jl @@ -210,6 +210,15 @@ end @test length(names(df4)) == 1 end +@testset "Skim the schema" begin + f1 = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root") + df_full = LazyTree(f1, "RNT:CollectionTree") + df1 = LazyTree(f1, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ") + @test 0 < length(names(df1)) < length(names(df_full)) + @test "AntiKt4TruthDressedWZJetsAux:" ∈ names(df1) + @test length(df1[!, 1].rn.schema) < length(df_full[!, 1].rn.schema) +end + @testset "Skip Recursively Empty Structs" begin f1 = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root") df = LazyTree(f1, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ") From 77184f7e732044a826030461cffe385844a15236 Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 15 Mar 2024 14:44:05 -0400 Subject: [PATCH 4/4] bump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index eb56e3c7..bc3d13f4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "UnROOT" uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9" authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"] -version = "0.10.24" +version = "0.10.25" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"