Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RNTuple] refactor display and parse column flags 0x08 #316

Merged
merged 4 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "UnROOT"
uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9"
authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"]
version = "0.10.24"
version = "0.10.25"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Expand Down
36 changes: 30 additions & 6 deletions src/RNTuple/displays.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
# Print `k` as `TypeName(field1=value1, field2=value2, )` followed by a newline.
# Every field value is rendered with `repr`; each field, including the last one,
# is followed by ", ".
function _showwithkw(io, @nospecialize(k))
    rectype = typeof(k)
    print(io, rectype, "(")
    for fname in fieldnames(rectype)
        value = getfield(k, fname)
        print(io, fname, "=", repr(value), ", ")
    end
    print(io, ")\n")
    return nothing
end

# Display a `FieldRecord` through the shared keyword-style printer `_showwithkw`.
Base.show(io::IO, f::FieldRecord) = _showwithkw(io, f)

# Display a `ColumnRecord` through the shared keyword-style printer `_showwithkw`.
Base.show(io::IO, f::ColumnRecord) = _showwithkw(io, f)

# Display an `AliasRecord` through the shared keyword-style printer `_showwithkw`,
# in the same way as the other record types. The earlier hand-written `print` of
# `physical_id`/`field_id` is dropped: together with `_showwithkw` it printed the
# record twice.
function Base.show(io::IO, f::AliasRecord)
    return _showwithkw(io, f)
end
# Display a `Locator` through the shared keyword-style printer `_showwithkw`.
Base.show(io::IO, f::Locator) = _showwithkw(io, f)

function Base.show(io::IO, lf::StringField)
Expand Down Expand Up @@ -36,10 +58,11 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false)
l1 = maximum(length, [f.field_name for f in header.field_records])
l2 = maximum(length, [f.type_name for f in header.field_records])
println(io, "$ind field_records: ")
for f in header.field_records
for (fidx, f) in enumerate(header.field_records)
print(io, "$ind ")
print(io, "(implicit idx=$(lpad(fidx-1, 2, "0"))), ")
print(io, "parent=$(lpad(Int(f.parent_field_id), 2, "0")), ")
print(io, "role=$(Int(f.struct_role)), ")
print(io, "struct_role=$(Int(f.struct_role)), ")
print(io, "name=$(rpad(f.field_name, l1+1, " ")), ")
print(io, "type=$(rpad(f.type_name, l2+1, " "))")
println(io, "repetition=$(f.repetition)")
Expand All @@ -52,7 +75,8 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false)
print(io, "type=$(lpad(Int(g.type), 2, "0")), ")
print(io, "nbits=$(lpad(Int(g.nbits), 2, "0")), ")
print(io, "field_id=$(lpad(Int(g.field_id), 3, "0")), ")
println(io, "flags=$(g.flags)")
print(io, "flags=$(g.flags), ")
println(io, "first_ele_index=$(g.first_ele_idx)")
end
end
end
Expand All @@ -77,13 +101,13 @@ function Base.show(io::IO, rn::RNTuple)
print(io, " └─ ")
println(io, "Schema: ")
_io = IOBuffer()
print_tree(_io, rn.schema; maxdepth=1, indicate_truncation=false)
print_tree(_io, rn.schema; maxdepth=3, indicate_truncation=true)
for l in split(String(take!(_io)), '\n')
print(io, " ")
println(io, l)
end
end
# Show a schema as a tree via `print_tree`, limited to `maxdepth=10`.
# Only one definition of this method is kept; a second definition with the same
# signature would merely overwrite the first.
Base.show(io::IO, s::RNTupleSchema) = print_tree(io, s; maxdepth=10)
# Label used for the schema's root node when it is printed as a tree:
# reports the number of top-level fields.
function printnode(io::IO, s::RNTupleSchema)
    return print(io, "RNTupleSchema with $(length(s)) top fields")
end
# Child nodes of a schema: the name => field pairs of the wrapped named tuple,
# collected into a `Dict`.
function children(s::RNTupleSchema)
    nt = getfield(s, :namedtuple)
    return Dict(pairs(nt))
end

Expand Down
29 changes: 20 additions & 9 deletions src/RNTuple/fieldcolumn_reading.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ end

_field_output_type(::Type{StringField{O, T}}) where {O, T} = Vector{String}
function read_field(io, field::StringField{O, T}, page_list) where {O, T}
nbits = field.content_col.nbits
nbits = field.content_col.columnrecord.nbits
pages = page_list[field.content_col.content_col_idx]

offset = read_field(io, field.offset_col, page_list)
Expand All @@ -65,9 +65,9 @@ end

_field_output_type(::Type{RNTupleCardinality{T}}) where {T} = CardinalityVector{T}
function read_field(io, field::RNTupleCardinality{T}, page_list) where T
nbits = field.leaf_field.nbits
nbits = field.leaf_field.columnrecord.nbits
pages = page_list[field.leaf_field.content_col_idx]
typenum = field.leaf_field.type
typenum = field.leaf_field.columnrecord.type
split = 14 <= typenum <= 21 || 26 <= typenum <= 28
delta = 14 <= typenum <= 15
bytes = read_pagedesc(io, pages, nbits; split)
Expand All @@ -81,6 +81,19 @@ end

"""
    _from_zigzag(n)

Decode the zigzag-encoded integer `n`: even codes map to non-negative values
(`0 -> 0`, `2 -> 1`, ...) and odd codes map to negative values
(`1 -> -1`, `3 -> -2`, ...).

The code word is a plain bit pattern, so it is shifted with the logical shift
`>>>`. With a signed `n`, the arithmetic shift `>>` would copy the top bit into
the result and decode codes whose highest bit is set incorrectly (for example
the code `-2`, i.e. all ones except the lowest bit, must decode to `typemax`).
"""
_from_zigzag(n) = (n >>> 1) ⊻ -(n & 1)
"""
    _to_zigzag(n)

Zigzag-encode the signed integer `n`: non-negative values map to even codes
(`0 -> 0`, `1 -> 2`, ...) and negative values map to odd codes
(`-1 -> 1`, `-2 -> 3`, ...).

The sign mask is obtained by shifting `n` right by its full bit width minus one
(derived from `sizeof(n)`), so the encoding is correct for every integer width
rather than only for values that fit in 64 bits.
"""
_to_zigzag(n) = (n << 1) ⊻ (n >> (8 * sizeof(n) - 1))
"""
    _from_zigzag!(res::AbstractVector)

Overwrite every element of `res` with its zigzag-decoded value (see
`_from_zigzag`) and return `res`.
"""
function _from_zigzag!(res::AbstractVector)
    res .= _from_zigzag.(res)
    return res
end

"""
    _to_zigzag!(res::AbstractVector)

Overwrite every element of `res` with its zigzag-encoded value (see
`_to_zigzag`) and return `res`.
"""
function _to_zigzag!(res::AbstractVector)
    res .= _to_zigzag.(res)
    return res
end

function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T
endpoint = 0
Expand All @@ -92,19 +105,17 @@ end

_field_output_type(::Type{LeafField{T}}) where {T} = Vector{T}
function read_field(io, field::LeafField{T}, page_list) where T
nbits = field.nbits
nbits = field.columnrecord.nbits
pages = page_list[field.content_col_idx]
# handle split encoding within page
typenum = field.type
typenum = field.columnrecord.type
split = 14 <= typenum <= 21 || 26 <= typenum <= 28
zigzag = 26 <= typenum <= 28
delta = 14 <= typenum <= 15
bytes = read_pagedesc(io, pages, nbits; split = split)
res = collect(reinterpret(T, bytes))
if zigzag
@simd for i in eachindex(res)
res[i] = _from_zigzag(res[i])
end
_from_zigzag!(res)
elseif delta
# the Index32/64 resets to absolute offset page-by-page
# https://github.com/JuliaHEP/UnROOT.jl/issues/312#issuecomment-1999875348
Expand All @@ -118,7 +129,7 @@ end

_field_output_type(::Type{LeafField{Bool}}) = BitVector
function read_field(io, field::LeafField{Bool}, page_list)
nbits = field.nbits
nbits = field.columnrecord.nbits
pages = page_list[field.content_col_idx]
total_num_elements = sum(p.num_elements for p in pages)

Expand Down
13 changes: 6 additions & 7 deletions src/RNTuple/fieldcolumn_schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ isvoid(::Type{<:StringField}) = false
"""
struct LeafField{T}
content_col_idx::Int
type::Int
nbits::Int
columnrecord::ColumnRecord
end

Base case of field nesting: this links to a column in the RNTuple by 0-based index.
Expand All @@ -68,8 +67,7 @@ The `type` field is the RNTuple spec type number, used to record split encoding.
"""
struct LeafField{T}
    # Index of the content column; readers use it to pick this field's pages
    # out of the page list.
    content_col_idx::Int
    # Column record of that column. Readers take `nbits` and the type number
    # from here, so they are no longer duplicated as separate fields (the
    # constructors in this file pass exactly these two arguments).
    columnrecord::ColumnRecord
end
Base.eltype(::Type{LeafField{T}}) where {T} = T
isvoid(::Type{<:LeafField}) = false
Expand All @@ -92,16 +90,17 @@ isvoid(::Type{<:RNTupleCardinality}) = false
function _search_col_type(field_id, column_records, col_id::Int...)
if length(col_id) == 2 && column_records[col_id[2]].type == 5
index_record = column_records[col_id[1]]
char_record = column_records[col_id[2]]
index_typenum = index_record.type
LeafType = rntuple_col_type_dict[index_typenum]
return StringField(
LeafField{LeafType}(col_id[1], index_typenum, index_record.nbits),
LeafField{Char}(col_id[2], 5, 8)
LeafField{LeafType}(col_id[1],index_record),
LeafField{Char}(col_id[2], char_record)
)
elseif length(col_id) == 1
record = column_records[only(col_id)]
LeafType = rntuple_col_type_dict[record.type]
return LeafField{LeafType}(only(col_id), record.type, record.nbits)
return LeafField{LeafType}(only(col_id), record)
else
error("un-handled RNTuple case, report issue to UnROOT.jl")
end
Expand Down
4 changes: 3 additions & 1 deletion src/RNTuple/footer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@ function split8_reinterpret!(dst, src::Vector{UInt8})
end

"""
read_pagedesc(io, pagedesc::Vector{PageDescription}, nbits::Integer)
read_pagedesc(io, pagedescs::AbstractVector{PageDescription}, nbits::Integer; split=false)

Read the decompressed raw bytes for the given page descriptions.
`nbits` needs to be provided according to the element type of the
column, since a page description only contains `num_elements` information.

Set `split` to `true` when split encoding is needed; this is handled per page.

!!! note
Boolean values are always stored as bit in RNTuple, so `nbits = 1`.

Expand Down
18 changes: 16 additions & 2 deletions src/RNTuple/header.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ function _rntuple_read(io, ::Type{FieldRecord})
parent_field_id = read(io, UInt32)
struct_role = read(io, UInt16)
flags = read(io, UInt16)
repetition = if flags == 0x0001
repetition = if flags == 0x01
read(io, Int64)
else
0
Expand All @@ -26,12 +26,26 @@ function _rntuple_read(io, ::Type{FieldRecord})
struct_role, flags, repetition, field_name, type_name, type_alias, field_desc)
end

"""
    ColumnRecord

One column record as read from the RNTuple header.

Declared as a plain `struct` with a hand-written `_rntuple_read` method, because
`first_ele_idx` is only present in the byte stream for some values of `flags`.

# Fields
- `type`: column type number.
- `nbits`: number of bits per element.
- `field_id`: id of the field this column belongs to.
- `flags`: column flags as read from the stream.
- `first_ele_idx`: first element index; `0` when it is not stored in the stream.
"""
struct ColumnRecord
    type::UInt16
    nbits::UInt16
    field_id::UInt32
    flags::UInt32
    first_ele_idx::Int64
end
"""
    _rntuple_read(io, ::Type{ColumnRecord})

Read one `ColumnRecord` from `io`: `type` and `nbits` (`UInt16` each), then
`field_id` and `flags` (`UInt32` each). When the `0x08` bit of `flags` is set,
an additional `Int64` first-element index follows in the stream; otherwise
`first_ele_idx` is `0`.

`flags` is tested as a bit mask rather than compared for equality, so a record
that sets `0x08` together with other flag bits still has its first-element
index consumed and the stream stays aligned for the next record.
"""
function _rntuple_read(io, ::Type{ColumnRecord})
    type = read(io, UInt16)
    nbits = read(io, UInt16)
    field_id = read(io, UInt32)
    flags = read(io, UInt32)
    # 0x08 marks a record that carries an explicit first-element index.
    has_first_ele_idx = !iszero(flags & 0x08)
    first_ele_idx = has_first_ele_idx ? read(io, Int64) : zero(Int64)
    return ColumnRecord(type, nbits, field_id, flags, first_ele_idx)
end


@SimpleStruct struct AliasRecord
physical_id::UInt32
Expand Down
13 changes: 9 additions & 4 deletions src/RNTuple/highlevel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,14 @@ RNTupleSchema with 13 top fields
```
"""
struct RNTupleSchema
    # The wrapped named tuple. It is declared once, as a `NamedTuple`; a second
    # declaration of the same field name is not valid. Access it with
    # `getfield(s, :namedtuple)`, since `getproperty` on a schema is forwarded
    # to this named tuple.
    namedtuple::NamedTuple
end
# The schema's property names are those of the wrapped named tuple.
function Base.propertynames(s::RNTupleSchema)
    nt = getfield(s, :namedtuple)
    return propertynames(nt)
end
# Forward `s.name` to the wrapped named tuple.
function Base.getproperty(s::RNTupleSchema, sym::Symbol)
    nt = getfield(s, :namedtuple)
    return getproperty(nt, sym)
end
# Number of entries in the wrapped named tuple.
function Base.length(s::RNTupleSchema)
    nt = getfield(s, :namedtuple)
    return length(nt)
end
# Index into the wrapped named tuple with `idx` and wrap the result in a new
# `RNTupleSchema`.
function Base.getindex(s::RNTupleSchema, idx)
    selected = getfield(s, :namedtuple)[idx]
    return RNTupleSchema(selected)
end

function Base.getindex(rf::RNTupleField, idx::Int)
tid = Threads.threadid()
Expand Down Expand Up @@ -174,8 +177,8 @@ struct RNTuple{O}
header::RNTupleHeader
footer::RNTupleFooter
pagelinks::Dict{Int, PageLink}
schema::Any
function RNTuple(io::O, header, footer, schema::S) where {O, S}
schema::RNTupleSchema
function RNTuple(io::O, header, footer, schema) where {O}
new{O}(
io,
header,
Expand Down Expand Up @@ -213,7 +216,9 @@ function LazyTree(rn::RNTuple, selection)
end

N = Tuple(Symbol.(filtered_names))
T = Tuple(RNTupleField(rn, getproperty(rn.schema, k)) for k in N)
skim_schema = getfield(rn.schema, :namedtuple)[N]
new_rn = RNTuple(rn.io, rn.header, rn.footer, skim_schema)
T = Tuple(RNTupleField(new_rn, getproperty(new_rn.schema, k)) for k in N)

return LazyTree(NamedTuple{N}(T))
end
9 changes: 9 additions & 0 deletions test/rntuple_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,15 @@ end
@test length(names(df4)) == 1
end

@testset "Skim the schema" begin
    # Open the same RNTuple twice: once in full, once restricted by a regex selection.
    rootfile = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root")
    full_tree = LazyTree(rootfile, "RNT:CollectionTree")
    skimmed_tree = LazyTree(rootfile, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ")

    # The selection keeps some, but not all, of the columns.
    n_full = length(names(full_tree))
    n_skimmed = length(names(skimmed_tree))
    @test 0 < n_skimmed < n_full
    @test "AntiKt4TruthDressedWZJetsAux:" ∈ names(skimmed_tree)

    # The schema held by the skimmed tree's first column is smaller as well.
    skimmed_schema = skimmed_tree[!, 1].rn.schema
    full_schema = full_tree[!, 1].rn.schema
    @test length(skimmed_schema) < length(full_schema)
end

@testset "Skip Recursively Empty Structs" begin
f1 = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root")
df = LazyTree(f1, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ")
Expand Down
Loading