JuliaHEP · Moelf · Mar 15, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 15, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "UnROOT"
 uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9"
 authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"]
-version = "0.10.24"
+version = "0.10.25"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

diff --git a/src/RNTuple/displays.jl b/src/RNTuple/displays.jl
@@ -1,5 +1,27 @@
+# Print `k` as `TypeName(field=value, ...)`, with every value rendered by `repr`.
+function _showwithkw(io, @nospecialize(k))
+    T = typeof(k)
+    # `join` writes the separator only between entries, so the last field is
+    # not followed by a dangling ", "; no newline is emitted, which keeps the
+    # `show` methods built on this helper usable inside containers.
+    print(io, T, "(")
+    join(io, (string(i, "=", repr(getfield(k, i))) for i in fieldnames(T)), ", ")
+    print(io, ")")
+end
+
+# Render a `FieldRecord` through the shared keyword-style printer.
+Base.show(io::IO, f::FieldRecord) =
+    _showwithkw(io, f)
+
+# Render a `ColumnRecord` through the shared keyword-style printer.
+Base.show(io::IO, f::ColumnRecord) =
+    _showwithkw(io, f)
+
 function Base.show(io::IO, f::AliasRecord)
-    print(io, "AliasRecord(physical_id=$(f.physical_id), field_id=$(f.field_id))")
+    _showwithkw(io, f)
+end
+function Base.show(io::IO, f::Locator)
+    _showwithkw(io, f)
 end
 
 function Base.show(io::IO, lf::StringField)
@@ -36,10 +58,11 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false)
         l1 = maximum(length, [f.field_name for f in header.field_records])
         l2 = maximum(length, [f.type_name for f in header.field_records])
         println(io, "$ind    field_records: ")
-        for f in header.field_records
+        for (fidx, f) in enumerate(header.field_records)
             print(io, "$ind        ")
+            print(io, "(implicit idx=$(lpad(fidx-1, 2, "0"))), ")
             print(io, "parent=$(lpad(Int(f.parent_field_id), 2, "0")), ")
-            print(io, "role=$(Int(f.struct_role)), ")
+            print(io, "struct_role=$(Int(f.struct_role)), ")
             print(io, "name=$(rpad(f.field_name, l1+1, " ")), ")
             print(io, "type=$(rpad(f.type_name, l2+1, " "))")
             println(io, "repetition=$(f.repetition)")
@@ -52,7 +75,8 @@ function Base.show(io::IO, header::RNTupleHeader, indent=0, short=false)
             print(io, "type=$(lpad(Int(g.type), 2, "0")), ")
             print(io, "nbits=$(lpad(Int(g.nbits), 2, "0")), ")
             print(io, "field_id=$(lpad(Int(g.field_id), 3, "0")), ")
-            println(io, "flags=$(g.flags)")
+            print(io, "flags=$(g.flags), ")
+            println(io, "first_ele_index=$(g.first_ele_idx)")
         end
     end
 end
@@ -77,13 +101,13 @@ function Base.show(io::IO, rn::RNTuple)
     print(io, " └─ ")
     println(io, "Schema: ")
     _io = IOBuffer()
-    print_tree(_io, rn.schema; maxdepth=1, indicate_truncation=false)
+    print_tree(_io, rn.schema; maxdepth=3, indicate_truncation=true)
     for l in split(String(take!(_io)), '\n')
         print(io, "      ")
         println(io, l)
     end
 end
-Base.show(io::IO, s::RNTupleSchema) = print_tree(io, s)
+Base.show(io::IO, s::RNTupleSchema) = print_tree(io, s; maxdepth=10)
 printnode(io::IO, s::RNTupleSchema) = print(io, "RNTupleSchema with $(length(s)) top fields")
 children(s::RNTupleSchema) = Dict(pairs(getfield(s, :namedtuple)))
 

diff --git a/src/RNTuple/fieldcolumn_reading.jl b/src/RNTuple/fieldcolumn_reading.jl
@@ -38,7 +38,7 @@ end
 
 _field_output_type(::Type{StringField{O, T}}) where {O, T} = Vector{String}
 function read_field(io, field::StringField{O, T}, page_list) where {O, T}
-    nbits = field.content_col.nbits
+    nbits = field.content_col.columnrecord.nbits
     pages = page_list[field.content_col.content_col_idx]
 
     offset = read_field(io, field.offset_col, page_list)
@@ -65,9 +65,9 @@ end
 
 _field_output_type(::Type{RNTupleCardinality{T}}) where {T} = CardinalityVector{T}
 function read_field(io, field::RNTupleCardinality{T}, page_list) where T
-    nbits = field.leaf_field.nbits
+    nbits = field.leaf_field.columnrecord.nbits
     pages = page_list[field.leaf_field.content_col_idx]
-    typenum = field.leaf_field.type
+    typenum = field.leaf_field.columnrecord.type
     split = 14 <= typenum <= 21 || 26 <= typenum <= 28
     delta = 14 <= typenum <= 15
     bytes = read_pagedesc(io, pages, nbits; split)
@@ -81,6 +81,19 @@ end
 
 _from_zigzag(n) = (n >> 1) ⊻ -(n & 1)
 _to_zigzag(n) = (n << 1) ⊻ (n >> 63)
+# Zigzag-decode every element of `res` in place and return `res`.
+function _from_zigzag!(res::AbstractVector)
+    # Fused in-place broadcast: applies `_from_zigzag` element by element.
+    res .= _from_zigzag.(res)
+    return res
+end
+
+# Zigzag-encode every element of `res` in place and return `res`.
+function _to_zigzag!(res::AbstractVector)
+    # Fused in-place broadcast: applies `_to_zigzag` element by element.
+    res .= _to_zigzag.(res)
+    return res
+end
 
 function _reset_to_incremental(res::AbstractVector, pages, ::Type{T}) where T
     endpoint = 0
@@ -92,19 +105,17 @@ end
 
 _field_output_type(::Type{LeafField{T}}) where {T} = Vector{T}
 function read_field(io, field::LeafField{T}, page_list) where T
-    nbits = field.nbits
+    nbits = field.columnrecord.nbits
     pages = page_list[field.content_col_idx]
     # handle split encoding within page
-    typenum = field.type
+    typenum = field.columnrecord.type
     split = 14 <= typenum <= 21 || 26 <= typenum <= 28
     zigzag = 26 <= typenum <= 28
     delta = 14 <= typenum <= 15
     bytes = read_pagedesc(io, pages, nbits; split = split)
     res = collect(reinterpret(T, bytes))
     if zigzag
-        @simd for i in eachindex(res)
-            res[i] = _from_zigzag(res[i])
-        end
+        _from_zigzag!(res)
     elseif delta
         # the Index32/64 resets to absolute offset page-by-page
         # https://github.com/JuliaHEP/UnROOT.jl/issues/312#issuecomment-1999875348
@@ -118,7 +129,7 @@ end
 
 _field_output_type(::Type{LeafField{Bool}}) = BitVector
 function read_field(io, field::LeafField{Bool}, page_list)
-    nbits = field.nbits
+    nbits = field.columnrecord.nbits
     pages = page_list[field.content_col_idx]
     total_num_elements = sum(p.num_elements for p in pages)
 

diff --git a/src/RNTuple/fieldcolumn_schema.jl b/src/RNTuple/fieldcolumn_schema.jl
@@ -56,8 +56,7 @@ isvoid(::Type{<:StringField}) = false
 """
     struct LeafField{T}
         content_col_idx::Int
-        type::Int
-        nbits::Int
+        columnrecord::ColumnRecord
     end
 
 Base case of field nesting, this links to a column in the RNTuple by 0-based index.
@@ -68,8 +67,7 @@ The `type` field is the RNTuple spec type number, used to record split encoding.
 """
 struct LeafField{T}
     content_col_idx::Int
-    type::Int
-    nbits::Int
+    columnrecord::ColumnRecord
 end
 Base.eltype(::Type{LeafField{T}}) where {T} = T
 isvoid(::Type{<:LeafField}) = false
@@ -92,16 +90,17 @@ isvoid(::Type{<:RNTupleCardinality}) = false
 function _search_col_type(field_id, column_records, col_id::Int...)
     if length(col_id) == 2 && column_records[col_id[2]].type == 5
         index_record = column_records[col_id[1]]
+        char_record = column_records[col_id[2]]
         index_typenum = index_record.type
         LeafType = rntuple_col_type_dict[index_typenum]
         return StringField(
-            LeafField{LeafType}(col_id[1], index_typenum, index_record.nbits),
-            LeafField{Char}(col_id[2], 5, 8)
+            LeafField{LeafType}(col_id[1],index_record),
+            LeafField{Char}(col_id[2], char_record)
         )
     elseif length(col_id) == 1
         record = column_records[only(col_id)]
         LeafType = rntuple_col_type_dict[record.type]
-        return LeafField{LeafType}(only(col_id), record.type, record.nbits)
+        return LeafField{LeafType}(only(col_id), record)
     else
         error("un-handled RNTuple case, report issue to UnROOT.jl")
     end

diff --git a/src/RNTuple/footer.jl b/src/RNTuple/footer.jl
@@ -96,12 +96,14 @@ function split8_reinterpret!(dst, src::Vector{UInt8})
 end
 
 """
-    read_pagedesc(io, pagedesc::Vector{PageDescription}, nbits::Integer)
+    read_pagedesc(io, pagedescs::AbstractVector{PageDescription}, nbits::Integer; split=false)
 
 Read the decompressed raw bytes given a Page Description. The
 `nbits` need to be provided according to the element type of the
 column since `pagedesc` only contains `num_elements` information.
 
+Set `split` to true when split encoding is needed; it is handled page by page.
+
 !!! note
     Boolean values are always stored as bit in RNTuple, so `nbits = 1`.
 

diff --git a/src/RNTuple/header.jl b/src/RNTuple/header.jl
@@ -16,7 +16,7 @@ function _rntuple_read(io, ::Type{FieldRecord})
     parent_field_id = read(io, UInt32)
     struct_role = read(io, UInt16)
     flags = read(io, UInt16)
-    repetition = if flags == 0x0001
+    repetition = if flags == 0x01
         read(io, Int64)
     else
         0
@@ -26,12 +26,26 @@ function _rntuple_read(io, ::Type{FieldRecord})
                 struct_role, flags, repetition, field_name, type_name, type_alias, field_desc)
 end
 
-@SimpleStruct struct ColumnRecord
+struct ColumnRecord
     type::UInt16
     nbits::UInt16
     field_id::UInt32
     flags::UInt32
+    first_ele_idx::Int64
 end
+# Read one `ColumnRecord` from `io`: type, nbits, field_id, flags, then the
+# optional first-element index.
+function _rntuple_read(io, ::Type{ColumnRecord})
+    type = read(io, UInt16)
+    nbits = read(io, UInt16)
+    field_id = read(io, UInt32)
+    flags = read(io, UInt32)
+    # `flags` is a bit field (per the RNTuple spec): test the 0x08 bit instead
+    # of comparing for equality, so the Int64 is consumed even if other bits are set.
+    first_ele_idx = iszero(flags & 0x08) ? Int64(0) : read(io, Int64)
+    return ColumnRecord(type, nbits, field_id, flags, first_ele_idx)
+end
+
 
 @SimpleStruct struct AliasRecord
     physical_id::UInt32

diff --git a/src/RNTuple/highlevel.jl b/src/RNTuple/highlevel.jl
@@ -84,11 +84,14 @@ RNTupleSchema with 13 top fields
 ```
 """
 struct RNTupleSchema
-    namedtuple::Any
+    namedtuple::NamedTuple
 end
 Base.propertynames(s::RNTupleSchema) = propertynames(getfield(s, :namedtuple))
 Base.getproperty(s::RNTupleSchema, sym::Symbol) = getproperty(getfield(s, :namedtuple), sym)
 Base.length(s::RNTupleSchema) = length(getfield(s, :namedtuple))
+# Index the wrapped NamedTuple with `idx` and re-wrap the result as a schema.
+Base.getindex(s::RNTupleSchema, idx) =
+    RNTupleSchema(getfield(s, :namedtuple)[idx])
 
 function Base.getindex(rf::RNTupleField, idx::Int)
     tid = Threads.threadid()
@@ -174,8 +177,8 @@ struct RNTuple{O}
     header::RNTupleHeader
     footer::RNTupleFooter
     pagelinks::Dict{Int, PageLink}
-    schema::Any
-    function RNTuple(io::O, header, footer, schema::S) where {O, S}
+    schema::RNTupleSchema
+    function RNTuple(io::O, header, footer, schema) where {O}
         new{O}(
             io,
             header,
@@ -213,7 +216,9 @@ function LazyTree(rn::RNTuple, selection)
     end
 
     N = Tuple(Symbol.(filtered_names))
-    T = Tuple(RNTupleField(rn, getproperty(rn.schema, k)) for k in N)
+    skim_schema = getfield(rn.schema, :namedtuple)[N]
+    new_rn =  RNTuple(rn.io, rn.header, rn.footer, skim_schema)
+    T = Tuple(RNTupleField(new_rn, getproperty(new_rn.schema, k)) for k in N)
 
     return LazyTree(NamedTuple{N}(T))
 end
diff --git a/test/rntuple_tests.jl b/test/rntuple_tests.jl
@@ -210,6 +210,15 @@ end
     @test length(names(df4)) == 1
 end
 
+@testset "Skim the schema" begin
+    f1 = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root")
+    df_full = LazyTree(f1, "RNT:CollectionTree")
+    df1 = LazyTree(f1, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ")
+    @test 0 < length(names(df1)) < length(names(df_full))
+    @test "AntiKt4TruthDressedWZJetsAux:" ∈ names(df1)
+    @test length(df1[!, 1].rn.schema) < length(df_full[!, 1].rn.schema)
+end
+
 @testset "Skip Recursively Empty Structs" begin
     f1 = UnROOT.samplefile("RNTuple/DAOD_TRUTH3_RC2.root")
     df = LazyTree(f1, "RNT:CollectionTree", r"AntiKt4TruthDressedWZ")