diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..7ddad2a9 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,45 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 + with: + version: '1' + # arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index f49313b6..f3896111 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -4,6 +4,11 @@ on: types: - created workflow_dispatch: + inputs: + lookback: + default: 3 +permissions: + contents: write jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 85b03cc3..aefbc611 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,9 +16,7 @@ jobs: matrix: version: - '1.6' - - '1.5' - - '1.4' - - '1.3' # 
Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'. + - '1.7' - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia. - 'nightly' os: diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 00000000..76310246 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,23 @@ +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 00000000..482ef94a --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,13 @@ +@article{ + UnROOT-2022, + doi = {10.21105/joss.04452}, + url = {https://doi.org/10.21105/joss.04452}, + year = {2022}, + publisher = {The Open Journal}, + volume = {7}, + number = {76}, + pages = {4452}, + author = {Tamás Gál and Jerry (Jiahong) Ling and Nick Amin}, + title = {UnROOT: an I/O library for the CERN ROOT file format written in Julia}, + journal = {Journal of Open Source Software} +} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..12c82239 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,18 @@ +cff-version: 1.2.0 +message: "Cite this paper whenever you use this package" +authors: +- family-names: "Gál" + given-names: "Tamás" + orcid: "https://orcid.org/0000-0001-7821-8673" +- family-names: "Ling" + given-names: "Jerry" + orcid: "https://orcid.org/0000-0002-3359-0380" +- 
family-names: "Amin" + given-names: "Nick" + orcid: "https://orcid.org/0000-0003-2560-0013" +title: "UnROOT: an I/O library for the CERN ROOT file format written in Julia" +version: "v1" +license: "MIT" +doi: "10.21105/joss.04452" +date-released: 2021-10-11 +url: "https://github.com/JuliaHEP/UnROOT.jl/" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..3b87b098 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,6 @@ +UnROOT.jl Standards +========================= + +We abide by the larger Julia Communities Code of Conduct (CoC). You can find that CoC listed here: https://julialang.org/community/standards/ + +If you have a conflict or concern that requires resolution, please contact the [Julia Community Stewards](https://julialang.org/community/stewards/). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..50a0aaec --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,39 @@ +# Contribute to UnROOT.jl + +To make your first PR to this repo: + +1. Have basic understanding of **Git**. The tutorial [Making a first Julia pull request](https://kshyatt.github.io/post/firstjuliapr/) could be helpful for learning both git and how to contribute to the Julia language projects. +2. Set up your local environment. We recommend use `Revise.jl` workflow. +3. Familiarise yourself with the source code. See [Source code organization](#source-code-organization). +4. Make changes & test them & submit PR. + +## Contribution example ideas + +### Core functionality +1. Parsing more ROOT types +2. Implement writing `.root` files + +#### Help Wanted Issues +One of the best ways to contribute is by looking at issues labelled [help wanted](https://github.com/JuliaHEP/UnROOT.jl/labels/help%20wanted). These issues are not always beginner-friendly. However, you are welcome to [ask clarifying questions](#get-help) or just browse +help wanted issues to see if there is anything that seems interesting to help with. 
+ +### Write tutorials +We can always use more tutorial on how to use UnROOT.jl efficiently and with other visualization or statistics tools in Julia for doing +HEP. + +## Contribution guidelines +- We use the GitHub issue page for any bug filing or feature request, feel free to use them. +- For usage related discussion, feel free to use [HEP tag on Julia discourse](https://discourse.julialang.org/tag/hep) or join +our [mailist](https://groups.google.com/g/julia-hep). + +### source code organization + +The following table shows how the code is organized: + +| Directory | Contents | +| ------------- | ------------- | +| docs | Documentation| +| paper | JOSS paper | +| src | Source code | +| test | Test suites | +| test/samples | .root files for tests | diff --git a/Project.toml b/Project.toml index b2c4e8b6..4ba3985d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "UnROOT" uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9" authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"] -version = "0.8.1" +version = "0.8.16" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -9,6 +9,7 @@ ArraysOfArrays = "65a8f2f4-9b39-5baf-92e2-a9cc46fdf018" CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561" CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02" @@ -16,32 +17,34 @@ LibDeflate = "9255714d-24a7-4b30-8ea3-d46a97f7e13b" LorentzVectors = "3f54b04b-17fc-5cd4-9758-90c048d965e3" Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" Mixers = "2a8e4939-dab8-5edc-8f64-72a8776f13de" +Mmap = "a63ad114-7e13-5084-954f-fe012c677804" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Tables = 
"bd369af6-aec1-5ad0-b16a-f7cc5008161c" -TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" +xrootdgo_jll = "9d84c17e-11f2-50ef-8cc9-e9701362097f" [compat] -AbstractTrees = "^0.3.0" -ArraysOfArrays = "^0.5.3" +AbstractTrees = "^0.3.0, 0.4" +ArraysOfArrays = "^0.5.3, ^0.6" CodecLz4 = "^0.3.0, ^0.4.0" CodecXz = "^0.6.0, ^0.7.0" CodecZstd = "^0.6.0, ^0.7.0" +HTTP = "^0.9.7, 1" IterTools = "^1" LRUCache = "^1.3.0" -LazyArrays = "^0.15, ^0.21, ^0.22, ^1" +LazyArrays = "^0.21, ^0.22, ^1" LibDeflate = "^0.4.1" LorentzVectors = "^0.4.0" Memoization = "^0.1.10" Mixers = "^0.1.0" Parameters = "^0.12.0" Polyester = "^0.5.3" -PrettyTables = "^1.2.0" +PrettyTables = "2" StaticArrays = "^0.12.0, ^1" Tables = "^1.0.0" -TypedTables = "^1.0.0" -julia = "^1.3" +julia = "^1.6" +xrootdgo_jll = "^0.31.1" [extras] InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -52,4 +55,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d" [targets] -test = ["Test", "Pkg", "ThreadsX", "MD5", "InteractiveUtils"] +test = ["Test", "Pkg", "ThreadsX", "MD5", "InteractiveUtils", "Polyester"] diff --git a/README.md b/README.md index b81f02c3..8377ec76 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,23 @@ [![All Contributors](https://img.shields.io/badge/all_contributors-7-orange.svg?style=flat-square)](#contributors-) -[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://tamasgal.github.io/UnROOT.jl/dev) -[![Build Status](https://github.com/tamasgal/UnROOT.jl/workflows/CI/badge.svg)](https://github.com/tamasgal/UnROOT.jl/actions) -[![Codecov](https://codecov.io/gh/tamasgal/UnROOT.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/tamasgal/UnROOT.jl) +[![JOSS](https://joss.theoj.org/papers/bab42b0c60f9dc7ef3b8d6460bc7229c/status.svg)](https://joss.theoj.org/papers/bab42b0c60f9dc7ef3b8d6460bc7229c) +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliahep.github.io/UnROOT.jl/dev) +[![Build 
Status](https://github.com/JuliaHEP/UnROOT.jl/workflows/CI/badge.svg)](https://github.com/JuliaHEP/UnROOT.jl/actions) +[![Codecov](https://codecov.io/gh/JuliaHEP/UnROOT.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaHEP/UnROOT.jl) UnROOT.jl is a reader for the [CERN ROOT](https://root.cern) file format written entirely in Julia, without any dependence on ROOT or Python. -## Quick Start (see [docs](https://tamasgal.github.io/UnROOT.jl/dev/) for more) +## Installation Guide +1. Download the latest [Julia release](https://julialang.org/downloads/) +2. Open up Julia REPL (hit `]` once to enter Pkg mode, hit backspace to exit it) +```julia +julia>] +(v1.8) pkg> add UnROOT +``` +## Quick Start (see [docs](https://JuliaHEP.github.io/UnROOT.jl/dev/) for more) + ```julia julia> using UnROOT @@ -51,23 +60,32 @@ julia> Threads.@threads for event in mytree # multi-threading ``` Only one basket per branch will be cached so you don't have to worry about running out of RAM. -At the same time, `event` inside the for-loop is not materialized until a field is accessed. If your event -is fairly small or you need all of them anyway, you can `collect(event)` first inside the loop. +At the same time, `event` inside the for-loop is not materialized until a field is accessed. This means you should avoid double-access, +see [performance tips](https://juliahep.github.io/UnROOT.jl/dev/performancetips/#Don't-%22double-access%22) + +XRootD is also supported, depending on the protocol: +- the "url" has to start with `http://` or `https://`: +- (1.6+ only) or the "url" has to start with `root://` and have another `//` to separate server and file path +```julia +julia> r = @time ROOTFile("https://scikit-hep.org/uproot3/examples/Zmumu.root") + 0.034877 seconds (5.13 k allocations: 533.125 KiB) +ROOTFile with 1 entry and 18 streamers. 
+ +julia> r = ROOTFile("root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root") +ROOTFile with 1 entry and 19 streamers. +``` ## Branch of custom struct We provide an experimental interface for hooking up UnROOT with your custom types -that only takes 2 steps, as explained [in the docs](https://tamasgal.github.io/UnROOT.jl/dev/advanced/custom_branch/). +that only takes 2 steps, as explained [in the docs](https://JuliaHEP.github.io/UnROOT.jl/dev/advanced/custom_branch/). As a show case for this functionality, the `TLorentzVector` support in UnROOT is implemented with the said plug-in system. -## Main challenges -- ROOT data is generally stored as big endian and is a - self-descriptive format, i.e. so-called streamers are stored in the files - which describe the actual structure of the data in the corresponding branches. - These streamers are read during runtime and need to be used to generate - Julia structs and `unpack` methods on the fly. -- Performance is very important for a low level I/O library. +## Support & Contributing +- Use Github issues for any bug reporting or feature request; feel free to make PRs, +bug fixing, feature tuning, quality of life, docs, examples etc. +- See `CONTRIBUTING.md` for more information and recommended workflows in contributing to this package. ## TODOs @@ -98,6 +116,7 @@ of inspiration and information for reverse engineering the ROOT binary structures. ## Behind the scene +
Some additional debug output:

@@ -190,10 +209,10 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d - - - - + + + + diff --git a/docs/make.jl b/docs/make.jl index 97c7fbaa..20b4c9b5 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -8,6 +8,7 @@ makedocs(; ), pages=[ "Introduction" => "index.md", + "Example Usage" => "exampleusage.md", "Performance Tips" => "performancetips.md", "Advanced Usage" => [ "Parse Custom Branch" => "advanced/custom_branch.md", @@ -16,11 +17,11 @@ makedocs(; "For Contributors" => "devdocs.md", "APIs" => "internalapis.md", ], - repo="https://github.com/tamasgal/UnROOT.jl/blob/{commit}{path}#L{line}", + repo="https://github.com/JuliaHEP/UnROOT.jl/blob/{commit}{path}#L{line}", sitename="UnROOT.jl", authors="Tamas Gal and contributors", ) deploydocs(; - repo="github.com/tamasgal/UnROOT.jl", + repo="github.com/JuliaHEP/UnROOT.jl", ) diff --git a/docs/src/advanced/custom_branch.md b/docs/src/advanced/custom_branch.md index 5f71367c..650ce045 100644 --- a/docs/src/advanced/custom_branch.md +++ b/docs/src/advanced/custom_branch.md @@ -3,17 +3,17 @@ It is possible to parse Branches with custom structure as long as you know how t As an example, the `TLorentzVector` is added using this mechanism and we will walk through the steps needed: ### 1. Provide a map between `fClassName` of your struct (as seen in .root) to a Julia type. -Pass a `Dict{String, Type}` to `ROOTFile(filepath; customstructs)`. The `TLorentzVector` is shipped [by default](https://github.com/tamasgal/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L21): +Pass a `Dict{String, Type}` to `ROOTFile(filepath; customstructs)`. 
The `TLorentzVector` is shipped [by default](https://github.com/JuliaHEP/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L21): ```julia ROOTFile(filepath; customstructs = Dict("TLorentzVector" => LorentzVector{Float64})) ``` -This `Dict` will subsequently be used by the `auto_T_JaggT` function [at here](https://github.com/tamasgal/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L213-L222) such that when we encounter a branch with this `fClassName`, we will return your `Type` as the detected element type of this branch. +This `Dict` will subsequently be used by the `auto_T_JaggT` function [at here](https://github.com/JuliaHEP/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L213-L222) such that when we encounter a branch with this `fClassName`, we will return your `Type` as the detected element type of this branch. ### 2. Extend the raw bytes interpreting function `UnROOT.interped_data` -By default, given a branch element type and a "jaggness" type, a general function [is defined](https://github.com/tamasgal/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L149) which will try to parse the raw bytes into Julia data structure. The `::Type{T}` will match what you have provided in the `Dict` in the previous step. +By default, given a branch element type and a "jaggness" type, a general function [is defined](https://github.com/JuliaHEP/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/root.jl#L149) which will try to parse the raw bytes into Julia data structure. The `::Type{T}` will match what you have provided in the `Dict` in the previous step. -Thus, to "teach" UnROOT how to interpret bytes for your type `T`, you would want to defined a more specific `UnROOT.interped_data` than the default one. 
Taking the `TLorentzVector` [as example](https://github.com/tamasgal/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/custom.jl#L23) again, we define a function: +Thus, to "teach" UnROOT how to interpret bytes for your type `T`, you would want to define a more specific `UnROOT.interped_data` than the default one. Taking the `TLorentzVector` [as example](https://github.com/JuliaHEP/UnROOT.jl/blob/06b692523bbff3f467f6b7fe3544e411a719bc9e/src/custom.jl#L23) again, we define a function: ```julia using LorentzVector const LVF64 = LorentzVector{Float64} @@ -24,7 +24,8 @@ function UnROOT.interped_data(rawdata, rawoffsets, ::Type{LVF64}, ::Type{J}) whe ] end -function Base.reinterpret(::Type{LVF64}, v::AbstractVector{UInt8}) where T +# VorView is defined in the `src/custom.jl` +function Base.reinterpret(::Type{LVF64}, v::VorView) where T # x,y,z,t in ROOT v4 = ntoh.(reinterpret(Float64, v[1+32:end])) # t,x,y,z in LorentzVectors.jl @@ -32,7 +33,7 @@ function Base.reinterpret(::Type{LVF64}, v::AbstractVector{UInt8}) where T end ``` -The `Base.reinterpret` function is just a helper function, you could instead write everything inside `UnROOT.interped_data`. We then builds on these, to interpret Jagged TLV branch: https://github.com/tamasgal/UnROOT.jl/blob/4747f6f5fd97ed1a872765485b4eb9e99ec5a650/src/custom.jl#L47 +The `Base.reinterpret` function is just a helper function, you could instead write everything inside `UnROOT.interped_data`. We then build on these, to interpret Jagged TLV branch: https://github.com/JuliaHEP/UnROOT.jl/blob/4747f6f5fd97ed1a872765485b4eb9e99ec5a650/src/custom.jl#L47 ### More details To expand a bit what we're doing here, the `rawdata` for a single `TLV` is always `64 bytes` long and the first `32 bytes` are TObject header which we don't care (which is why we don't care about `rawoffsets` here). The last `32 bytes` make up 4 `Float64` and we simply parse them and return a collection of (julia) `LorentzVector{Float64}`. 
diff --git a/docs/src/exampleusage.md b/docs/src/exampleusage.md new file mode 100644 index 00000000..9873dc7a --- /dev/null +++ b/docs/src/exampleusage.md @@ -0,0 +1,68 @@ +## Chunk Iteration +```julia +t = LazyTree(...) +res = 0.0 +for rang in Iterators.partition(1:lastindex(t), 10^6) + res += sum(t[rang].nMuon) # +end +res +``` +Note, `t[rang]` is eager, if you don't need all branches, it's much better to use `t.nMuon[rang]`, or limit which +branches are selected during `LazyTree()` creation time. + +This pattern works the best over network, for local files, stick with: +``` +for evt in t + ... +end +``` +usually is the best approach. + + +## Writing out `.root` files +Currently `UnROOT.jl` is focused on reading only, however, it's semi-trivial to leverage Python world +for write operation since it's not performance critical. + +You have the following choice: +- [PythonCall.jl](https://github.com/cjdoris/PythonCall.jl) -- we will demo how to use this one +- [PyCall.jl](https://github.com/JuliaPy/PyCall.jl) + +Checkout [configuration docs for PythonCall.jl](https://cjdoris.github.io/PythonCall.jl/stable/pythoncall/#pythoncall-config) + +Most importantly, you probably want to set: +```julia +ENV["JULIA_PYTHONCALL_EXE"] = readchomp(`which python`) +``` +before the `using PythonCall` line. Especially if you're using LCG or Athena or CMSSW environment. 
+ +### Write out a `TTree` +```julia +julia> using PythonCall + +julia> const up = pyimport("uproot") + +julia> pywith(up.recreate("./example.root")) do file + file["mytree"] = Dict("branch1"=>1:1000, "branch2"=>rand(1000)) + end + +# read it back with UnROOT.jl +julia> using UnROOT + +julia> LazyTree("./example.root", "mytree") + Row │ branch1 branch2 + │ Int64 Float64 +─────┼─────────────────────────────── + 1 │ 1 0.5775868298287866 + 2 │ 2 0.7245212475492369 + 3 │ 3 0.009249240901789912 + 4 │ 4 0.9010206670973542 + 5 │ 5 0.7609879879740359 + 6 │ 6 0.00916447384387542 + 7 │ 7 0.5636229077934333 + 8 │ 8 0.32617388561103156 + ⋮ │ ⋮ ⋮ +``` + +### Write out a histogram +A histogram is just a tuple of `(bincontent, binedges)`, see +[FHist.jl docs](https://moelf.github.io/FHist.jl/dev/writingtoroot/) for details. diff --git a/paper/LICENSE b/paper/LICENSE new file mode 100644 index 00000000..ffd0f5e8 --- /dev/null +++ b/paper/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Tamas Gal, Jerry Ling and Nick Amin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 00000000..f0e70f98 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,144 @@ +@article{Julia, + author = {Bezanson, Jeff and Edelman, Alan and Karpinski, Stefan and Shah, Viral B.}, + title = {Julia: A Fresh Approach to Numerical Computing}, + journal = {SIAM Review}, + volume = {59}, + number = {1}, + pages = {65-98}, + year = {2017}, + doi = {10.1137/141000671}, + URL = { https://doi.org/10.1137/141000671}, + eprint = {https://doi.org/10.1137/141000671} +} + +@article{JuliaPerformance, + title={Performance of {J}ulia for high energy physics analyses}, + author={Stanitzki, Marcel and Strube, Jan}, + journal={Computing and Software for Big Science}, + volume={5}, + number={1}, + doi = {10.1007/s41781-021-00053-3}, + pages={1--11}, + year={2021}, + publisher={Springer} +} + +@software{jim_pivarski_2021_5539722, + author = {Jim Pivarski and + Henry Schreiner and + Nicholas Smith and + Chris Burr and + Dmitry Kalinkin and + Giordon Stark and + Nikolai Hartmann and + Doug Davis and + Ryunosuke O'Neil and + Andrzej Novak and + Ben Greiner and + Beojan Stanislaus and + ChristopheRappold and + Cosmin Deaconu and + Daniel Cervenkov and + Jonas Rübenach and + Josh Bendavid and + Kilian Lieret and + Michele Peresano and + Raymond Ehlers and + Ruggero Turra and + Tamas Gal and + Alexander Held}, + title = {scikit-hep/uproot4: 4.1.3}, + month = sep, + year = 2021, + publisher = {Zenodo}, + version = {4.1.3}, + doi = {10.5281/zenodo.5539722}, + url = {https://doi.org/10.5281/zenodo.5539722} +} +@software{pivarski_jim_2018_6522027, + author = {Pivarski, Jim and + Osborne, Ianna and + Ifrim, Ioana and + Schreiner, Henry and + Hollands, Angus and + 
Biswas, Anish and + Das, Pratyush and + Roy Choudhury, Santam and + Smith, Nicholas}, + title = {Awkward Array}, + month = oct, + year = 2018, + note = {If you use this software, please cite it as below.}, + publisher = {Zenodo}, + version = {1.9.0rc4}, + doi = {10.5281/zenodo.6522027}, + url = {https://doi.org/10.5281/zenodo.6522027} +} +@Article{harris2020array, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. Oliphant}, + year = {2020}, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, + publisher = {Springer Science and Business Media {LLC}}, + url = {https://doi.org/10.1038/s41586-020-2649-2} +} +@software{reback2020pandas, + author = {{The pandas development team}}, + title = {Pandas}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {latest}, + doi = {10.5281/zenodo.3509134}, + url = {https://doi.org/10.5281/zenodo.3509134} +} +@article{Brun:1997pa, + author = "Brun, R. and Rademakers, F.", + editor = "Werlen, M. and Perret-Gallix, D.", + title = {ROOT: An object oriented data analysis framework}, + doi = "10.1016/S0168-9002(97)00048-X", + journal = "Nucl. Instrum. Meth. 
A", + volume = "389", + pages = "81--86", + year = "1997" +} + +@article{Adri_n_Mart_nez_2016, + doi = {10.1088/0954-3899/43/8/084001}, + url = {https://doi.org/10.1088%2F0954-3899%2F43%2F8%2F084001}, + year = 2016, + month = {jun}, + publisher = {{IOP} Publishing}, + volume = {43}, + number = {8}, + pages = {084001}, + author = {S Adri{\'{a} +}n-Mart{\'{\i}}nez and M Ageron and F Aharonian and S Aiello and A Albert and F Ameli and E Anassontzis and M Andre and G Androulakis and M Anghinolfi and G Anton and M Ardid and T Avgitas and G Barbarino and E Barbarito and B Baret and J Barrios-Mart{\'{\i}} and B Belhorma and A Belias and E Berbee and A van den Berg and V Bertin and S Beurthey and V van Beveren and N Beverini and S Biagi and A Biagioni and M Billault and M Bond{\`{\i}} and R Bormuth and B Bouhadef and G Bourlis and S Bourret and C Boutonnet and M Bouwhuis and C Bozza and R Bruijn and J Brunner and E Buis and J Busto and G Cacopardo and L Caillat and M Calamai and D Calvo and A Capone and L Caramete and S Cecchini and S Celli and C Champion and R Cherkaoui El Moursli and S Cherubini and T Chiarusi and M Circella and L Classen and R Cocimano and J A B Coelho and A Coleiro and S Colonges and R Coniglione and M Cordelli and A Cosquer and P Coyle and A Creusot and G Cuttone and A D'Amico and G De Bonis and G De Rosa and C De Sio and F Di Capua and I Di Palma and A F D{\'{\i}}az Garc{\'{\i}}a and C Distefano and C Donzaud and D Dornic and Q Dorosti-Hasankiadeh and E Drakopoulou and D Drouhin and L Drury and M Durocher and T Eberl and S Eichie and D van Eijk and I El Bojaddaini and N El Khayati and D Elsaesser and A Enzenhöfer and F Fassi and P Favali and P Fermani and G Ferrara and C Filippidis and G Frascadore and L A Fusco and T Gal and S Galat{\`{a}} and F Garufi and P Gay and M Gebyehu and V Giordano and N Gizani and R Gracia and K Graf and T Gr{\'{e}}goire and G Grella and R Habel and S Hallmann and H van Haren and S Harissopulos and T Heid and A Heijboer 
and E Heine and S Henry and J J Hern{\'{a}}ndez-Rey and M Hevinga and J Hofestädt and C M F Hugon and G Illuminati and C W James and P Jansweijer and M Jongen and M de Jong and M Kadler and O Kalekin and A Kappes and U F Katz and P Keller and G Kieft and D Kie{\ss}ling and E N Koffeman and P Kooijman and A Kouchner and V Kulikovskiy and R Lahmann and P Lamare and A Leisos and E Leonora and M Lindsey Clark and A Liolios and C D Llorens Alvarez and D Lo Presti and H Löhner and A Lonardo and M Lotze and S Loucatos and E Maccioni and K Mannheim and A Margiotta and A Marinelli and O Mari{\c{s}} and C Markou and J A Mart{\'{\i}}nez-Mora and A Martini and R Mele and K W Melis and T Michael and P Migliozzi and E Migneco and P Mijakowski and A Miraglia and C M Mollo and M Mongelli and M Morganti and A Moussa and P Musico and M Musumeci and S Navas and C A Nicolau and I Olcina and C Olivetto and A Orlando and A Papaikonomou and R Papaleo and G E P{\u{a}}v{\u{a}}la{\c{s}} and H Peek and C Pellegrino and C Perrina and M Pfutzner and P Piattelli and K Pikounis and G E Poma and V Popa and T Pradier and F Pratolongo and G Pühlhofer and S Pulvirenti and L Quinn and C Racca and F Raffaelli and N Randazzo and P Rapidis and P Razis and D Real and L Resvanis and J Reubelt and G Riccobene and C Rossi and A Rovelli and M Salda{\~{n}}a and I Salvadori and D F E Samtleben and A S{\'{a}}nchez Garc{\'{\i}}a and A S{\'{a}}nchez Losa and M Sanguineti and A Santangelo and D Santonocito and P Sapienza and F Schimmel and J Schmelling and V Sciacca and M Sedita and T Seitz and I Sgura and F Simeone and I Siotis and V Sipala and B Spisso and M Spurio and G Stavropoulos and J Steijger and S M Stellacci and D Stransky and M Taiuti and Y Tayalati and D T{\'{e}}zier and S Theraube and L Thompson and P Timmer and C Tönnis and L Trasatti and A Trovato and A Tsirigotis and S Tzamarias and E Tzamariudaki and B Vallage and V Van Elewyck and J Vermeulen and P Vicini and S Viola and D Vivolo and M Volkert 
and G Voulgaris and L Wiggers and J Wilms and E de Wolf and K Zachariadou and J D Zornoza and J Z{\'{u}}{\~{n}}iga}, + title = {Letter of intent for {KM}3NeT 2.0}, + journal = {Journal of Physics G: Nuclear and Particle Physics} +} +@article{Ehataht:2020ebp, + author = {Ehat\"aht, Karl}, + editor = "Doglioni, C. and Kim, D. and Stewart, G. A. and Silvestris, L. and Jackson, P. and Kamleh, W.", + collaboration = "CMS", + title = "{NANOAOD: a new compact event data format in CMS}", + doi = "10.1051/epjconf/202024506002", + journal = "EPJ Web Conf.", + volume = "245", + pages = "06002", + year = "2020" +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..62a9fdc3 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,229 @@ +--- +title: 'UnROOT: an I/O library for the CERN ROOT file format written in Julia' +tags: + - Julia + - HEP +authors: + - name: Tamás Gál + orcid: 0000-0001-7821-8673 + affiliation: "1, 2" + - name: Jerry (Jiahong) Ling + orcid: 0000-0002-3359-0380 + affiliation: "3" + - name: Nick Amin + orcid: 0000-0003-2560-0013 + affiliation: "4" +affiliations: + - name: Erlangen Centre for Astroparticle Physics + index: 1 + - name: Friedrich-Alexander-Universität Erlangen-Nürnberg + index: 2 + - name: Harvard University + index: 3 + - name: University of California, Santa Barbara + index: 4 +date: 08 October 2021 +bibliography: paper.bib +--- +# Summary +`UnROOT.jl` is a pure Julia implementation of CERN's ROOT [@Brun:1997pa] file I/O +(`.root`) software, which is fast and memory-efficient, and composes well with Julia's +high-performance iteration, array, and multi-threading interfaces. + +# Statement of Need +The High-Energy Physics (HEP) community, especially in data analysis, +has been facing the two-language +problem for a long time. Often, physicists would start prototyping with a +`Python` front-end which glues to a `C/C++/Fortran` back-end. 
Soon they would hit +a task that could be challenging to express in columnar (i.e., "vectorized") style, +a type of problem that is normally tackled with libraries like +`numpy` [@harris2020array] or `pandas` [@reback2020pandas]. This usually would lead to +them either writing `C++` kernels and interfacing them with `Python`, or porting the +prototype to `C++` and starting to maintain two code bases including the wrapper +code. Specific to HEP, `AwkwardArray` [@pivarski_jim_2018_6522027] can be seen +as a compromise between the two solutions, where the user writes in a special +columnar style that has some flexibility for addressing the jaggedness of HEP data. + +All traditional options represent increasing engineering effort for authors and +users, often in multiple programming languages. Many steps of this process are +critical, such as identifying bottlenecks and creating an architecture that is simultaneously +performant and maintainable while still being user-friendly and +logically structured. Using a `Python` front-end and dancing across language +barriers also hinders the ability to parallelize tasks down to +event level, as the existing usage often relies on chunk or even file level +parallelization. Finally, newer techniques such as automatic differentiation +also work more smoothly without language barriers, allowing physicists to develop +algorithms. With Julia's active auto diff community [^1], we expect +`UnROOT.jl` to be one of the cornerstones for physicists. + +`UnROOT.jl` attempts to solve all of the above by choosing Julia, a +high-performance language with a simple and expressive syntax [@Julia]. Julia is +designed to solve the two-language problem in general. This has also been studied for +HEP specifically [@JuliaPerformance]. Analysis software written in Julia +can freely escape to a `for-loop` whenever vectorized-style processing is not +flexible enough, without any performance degradation. 
At the same time, +`UnROOT.jl` transparently supports multi-threading and multi-processing by +providing data structures that are a subtype of `AbstractArray`, the +built-in abstract type for array-like objects, which allows easy interfacing with +array-routines from other packages, thanks to multiple dispatch, one of +the main features of Julia. + +# Features and Functionality + +The `ROOT` data format is flexible and mostly self-descriptive. Users can define +their own data structures (C++ classes) that derive from `ROOT` classes and +serialise them into directories, trees, and branches. The information about the +deserialisation is written to the output file (therefore, it's self-descriptive) but +there are some basic structures and constants needed to bootstrap the parsing +process. One of the biggest advantages of the `ROOT` data format is the ability +to store jagged structures like nested arrays of structs with different +sub-array lengths. In high-energy physics, such structures are preferred to +represent, for example, particle interactions and detector responses as signals from +different hardware components, combined into a tree of events. + +`UnROOT.jl` understands the core structure of `ROOT` files, and is able to +decompress and deserialize instances of the commonly used `TH1`, `TH2`, +`TDirectory`, `TTree`, etc. ROOT classes. All basic C++ types for `TTree` +branches are supported as well, including their nested variants. Additionally, +`UnROOT.jl` provides a way to hook into the deserialisation process of custom +types where the automatic parsing fails. At the time of this article, `UnROOT` is +already being used successfully in the data analysis of the KM3NeT neutrino telescope. +And just like `RDataFrame`, it can be directly used on "NTuple" `TTree` such as +the NANOAOD format used by the CMS collaboration [@Ehataht:2020ebp]. 
+ +Opening and loading a `TTree` lazily, i.e., without reading the whole data into +memory, is simple: + +```julia +julia> using UnROOT + +julia> f = ROOTFile("test/samples/NanoAODv5_sample.root") +ROOTFile with 2 entries and 21 streamers. +test/samples/NanoAODv5_sample.root + Events + "run" + "luminosityBlock" + "event" + "HTXS_Higgs_pt" + "HTXS_Higgs_y" + ... + +julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$"]) + Row Electron_dxy nMuon Muon_eta Muon_pt + Vector{Float32} UInt32 Vector{Float32} Vector{Float32} + + 1 [0.000371] 0 [] [] + 2 [-0.00982] 2 [0.53, 0.229] [19.9, 15.3] + 3 [] 0 [] [] + 4 [-0.00157] 0 [] [] + ... +``` + +As seen in the above example, the entries in the columns are multi-dimensional +and jagged. The `LazyTree` object acts as a table that supports sequential +or parallel iteration, selections, and filtering based on ranges or masks, and +operations on whole columns: + +```julia +for event in mytree + # ... Operate on event +end + +Threads.@threads for event in mytree # multi-threading + # ... Operate on event +end + +mytree.Muon_pt # a column as a lazy vector of vectors +``` + +The `LazyTree` is designed as `<: AbstractArray`, which makes it compose well +with the rest of the Julia ecosystem. For example, syntactic loop fusion [^2] and +Query-style tabular manipulations provided by packages like `Query.jl` [^3] without +any additional code support just work out-of-the-box. + +For example, the following code will only iterate through the tree once to find +all events with exactly two muons and two electrons, due to loop fusion: +```julia +# t <: LazyTree +findall(@. 
t.nMuon==2 & t.nElectron==2) +``` + +And query-style filtering can be done with no code addition from `UnROOT.jl`'s end +thanks to Julia's composability due to multiple dispatch [^4]: +```julia +julia> using Query, DataFrames + +julia> @from evt in mytree begin + @where length(evt.Jet_pt) > 6 + @let njets=length(evt.Jet_pt) + @let njets40=sum(evt.Jet_pt.>40) + @select {njets=njets, njets40, evt.MET_pt} + @collect DataFrame + end +``` + +# Synthetic Benchmark against C++ ROOT +Benchmarks against various C++ ROOT solutions can be found in our +benchmark repo[^5]. Here we summarize the results: + +### Single-threaded composite benchmark +| Language | Cold Run | Warmed Run | +| -------- | -------- | ---------- | +| [Julia](https://nbviewer.jupyter.org/github/Moelf/UnROOT_RDataFrame_MiniBenchmark/blob/master/UnROOT_benchmark.ipynb) | 20.58 s | 19.81 s | +| [PyROOT RDF](https://nbviewer.jupyter.org/github/Moelf/UnROOT_RDataFrame_MiniBenchmark/blob/master/RDataFrame_benchmark.ipynb) | 40.21 s | N/A | +| [Compiled C++ ROOT Loop](https://github.com/Moelf/UnROOT_RDataFrame_MiniBenchmark/tree/master/composite_benchmarks#root-rdataframe-g-compiled) | 28.16 s | N/A | +| [Compiled RDF](https://github.com/Moelf/UnROOT_RDataFrame_MiniBenchmark/blob/master/composite_benchmarks/RDataFrame_benchmark_compiled_single.cpp) | 19.82 s | N/A | + +### 4-threaded composite benchmark +| Language | Cold Run | Warmed Run | +| -------- | -------- | ---------- | +| [Julia](https://nbviewer.jupyter.org/github/Moelf/UnROOT_RDataFrame_MiniBenchmark/blob/master/UnROOT_benchmark.ipynb) | 5.46 s | 5.07 s | +| PyROOT RDF |N/A | N/A | +| Compiled C++ ROOT Loop | N/A | N/A | +| [Compiled RDF](https://github.com/Moelf/UnROOT_RDataFrame_MiniBenchmark/blob/master/composite_benchmarks/RDataFrame_benchmark_compiled_MT.cpp) | 5.64 s | N/A | + + +# Usage Comparison with Existing Software + +This section focuses on comparison with other existing ROOT I/O solutions +in the Julia universe. 
However, one honorable mention is `uproot` +[@jim_pivarski_2021_5539722], which is a purely Python-based ROOT I/O library +that played (and still plays) an important role in the development of `UnROOT.jl` as +it was at the time of this article the most complete and best documented ROOT I/O +implementation. + +- `UpROOT.jl` is a wrapper for the aforementioned `uproot` Python package and + uses `PyCall.jl` [^6] as a bridge, which means that it relies on `Python` as a + glue language. In addition to that, `uproot` itself utilises the C++ library + `AwkwardArray` [@pivarski_jim_2018_6522027] to efficiently deal with jagged + data in `ROOT` files. Most of the features of `uproot` are available in the + Julia context, but there are intrinsic performance and usability drawbacks due + to the three-language architecture. + +- `ROOT.jl` [^7] is one of the oldest Julia `ROOT` packages. It uses C++ bindings to + directly wrap the `ROOT` framework and therefore is not limited to I/O. + Unfortunately, the `Cxx.jl` [^8] package that is used to generate the C++ glue + code does not support Julia 1.4 or later. The multi-threaded features are also + limited. + +# Conclusion + +`UnROOT.jl` is an important package in high-energy physics and related +scientific fields where the `ROOT` data format is established, since the ability +to read and parse scientific data is certainly the first mandatory step to open +the window to a programming language and its package ecosystem. `UnROOT.jl` has +demonstrated tree processing speeds at the same level as the `C++` `ROOT` +framework in per-event iteration as well as the Python-based `uproot` library in +chunked iteration. 
+ +# References + + +[^1]: https://juliadiff.org/ +[^2]: https://julialang.org/blog/2017/01/moredots/ +[^3]: https://github.com/queryverse/Query.jl +[^4]: https://www.youtube.com/watch?v=kc9HwsxE1OY +[^5]: https://github.com/Moelf/UnROOT_RDataFrame_MiniBenchmark#single-threaded-composite-benchmark +[^6]: https://github.com/JuliaPy/PyCall.jl +[^7]: https://github.com/JuliaHEP/ROOT.jl +[^8]: https://github.com/JuliaInterop/Cxx.jl diff --git a/src/UnROOT.jl b/src/UnROOT.jl index f868bac0..35d50425 100644 --- a/src/UnROOT.jl +++ b/src/UnROOT.jl @@ -1,9 +1,10 @@ module UnROOT using LazyArrays +import Mmap: mmap export ROOTFile, LazyBranch, LazyTree -import Base: close, keys, get, getindex, getproperty, show, length, iterate, position, ntoh, lock, unlock, reinterpret +import Base: close, keys, get, getindex, getproperty, show, length, iterate, position, ntoh, reinterpret ntoh(b::Bool) = b import AbstractTrees: children, printnode, print_tree @@ -14,12 +15,22 @@ import IterTools: groupby import LibDeflate: zlib_decompress!, Decompressor -import Tables, TypedTables, PrettyTables +import Tables, PrettyTables -@static if VERSION < v"1.6" - Base.first(a::AbstractVector{S}, n::Integer) where S<: AbstractString = a[1:(length(a) > n ? n : end)] - Base.first(a::S, n::Integer) where S<: AbstractString = a[1:(length(a) > n ? n : end)] +""" + OffsetBuffer + +Works with seek, position of the original file. Think of it as a view of IOStream that can be +indexed with original positions. 
+""" +struct OffsetBuffer{T} + io::T + offset::Int end +Base.read(io::OffsetBuffer, nb) = Base.read(io.io, nb) +Base.seek(io::OffsetBuffer, i) = Base.seek(io.io, i - io.offset) +Base.skip(io::OffsetBuffer, i) = Base.skip(io.io, i) +Base.position(io::OffsetBuffer) = position(io.io) + io.offset function unsafe_arraycastntoh(::Type{D}, ary::Vector{S}) where {S, D} @static if VERSION < v"1.7" @@ -35,6 +46,7 @@ function unsafe_arraycastntoh(::Type{D}, ary::Vector{S}) where {S, D} end include("constants.jl") +include("streamsource.jl") include("io.jl") include("types.jl") include("utils.jl") diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 92ef2ea8..3c2e72a6 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -202,6 +202,41 @@ end primitivetype(l::TLeafI) = l.fIsUnsigned ? UInt32 : Int32 +# FIXME this should be generated and inherited from TLeaf +@with_kw struct TLeafS + # from TNamed + fName + fTitle + + # from TLeaf + fLen + fLenType + fOffset + fIsRange + fIsUnsigned + fLeafCount + + # own fields + fMinimum + fMaximum +end + +function parsefields!(io, fields, T::Type{TLeafS}) + preamble = Preamble(io, T) + parsefields!(io, fields, TLeaf) + fields[:fMinimum] = readtype(io, Int16) + fields[:fMaximum] = readtype(io, Int16) + endcheck(io, preamble) +end + +function unpack(io, tkey::TKey, refs::Dict{Int32, Any}, T::Type{TLeafS}) + @initparse + parsefields!(io, fields, T) + T(;fields...) +end + +primitivetype(l::TLeafS) = l.fIsUnsigned ? 
UInt16 : Int16 + # FIXME this should be generated and inherited from TLeaf @with_kw struct TLeafL # from TNamed @@ -395,6 +430,8 @@ primitivetype(l::TLeafD) = Float64 fMaximum end +primitivetype(l::TLeafC) = UInt8 + function parsefields!(io, fields, ::Type{T}) where {T<:TLeafC} preamble = Preamble(io, T) parsefields!(io, fields, TLeaf) diff --git a/src/custom.jl b/src/custom.jl index 0823d3ef..ccf6892e 100644 --- a/src/custom.jl +++ b/src/custom.jl @@ -31,11 +31,44 @@ end # Custom struct interpretation abstract type CustomROOTStruct end +struct FixLenVector{N, T} <: AbstractVector{T} + vec::SVector{N, T} +end +(::Type{FixLenVector{N, T}})() where {N, T} = FixLenVector(zero(SVector{N, T})) +Base.length(x::FixLenVector) = length(x.vec) +Base.length(::Type{FixLenVector{N, T}}) where {N, T} = N +Base.size(x::FixLenVector) = size(x.vec) +Base.eltype(x::FixLenVector) = eltype(x.vec) +Base.iterate(x::FixLenVector) = iterate(x.vec) +Base.iterate(x::FixLenVector, n) = iterate(x.vec, n) +Base.getindex(x::FixLenVector, n) = getindex(x.vec, n) + +#N.B this is a hack, we deal with ntoh in reinterpret step +Base.ntoh(x::FixLenVector) = x +const VorView = Union{Vector{UInt8}, SubArray{UInt8, 1, Vector{UInt8}, Tuple{UnitRange{Int64}}, true}} + +function Base.reinterpret(::Type{Vector{FixLenVector{N, T}}}, data::Vector{UInt8}) where {N,T} + vs = reinterpret(T, data) + @. vs = ntoh(vs) + return FixLenVector.(reinterpret(SVector{N, T}, vs)) +end + +function Base.reinterpret(::Type{FixLenVector{N, T}}, v::VorView) where {N, T} + vs = reinterpret(T, v) + @. 
vs = ntoh(vs) + FixLenVector(SVector{N, T}(vs)) +end +function interped_data(rawdata, rawoffsets, ::Type{T}, ::Type{Nojagg}) where {T <: FixLenVector} + n = sizeof(T) + [ + reinterpret(T, x) for x in Base.Iterators.partition(rawdata, n) + ] +end # TLorentzVector const LVF64 = LorentzVector{Float64} Base.show(io::IO, lv::LorentzVector) = print(io, "LV(x=$(lv.x), y=$(lv.y), z=$(lv.z), t=$(lv.t))") -function Base.reinterpret(::Type{LVF64}, v::AbstractVector{UInt8}) where T +function Base.reinterpret(::Type{LVF64}, v::VorView) # first 32 bytes are TObject header we don't care # x,y,z,t in ROOT v4 = reinterpret(Float64, @view v[1+32:end]) @@ -74,7 +107,7 @@ function interped_data(rawdata, rawoffsets, ::Type{Vector{LVF64}}, ::Type{Offset offset .+= 1 VectorOfVectors(real_data, offset) end -function interped_data(rawdata, rawoffsets, ::Type{LVF64}, ::Type{J}) where {T, J <: JaggType} +function interped_data(rawdata, rawoffsets, ::Type{LVF64}, ::Type{J}) where J <: JaggType # even with rawoffsets, we know each TLV is destinied to be 64 bytes [ reinterpret(LVF64, x) for x in Base.Iterators.partition(rawdata, 64) @@ -92,7 +125,7 @@ end function readtype(io::IO, T::Type{_KM3NETDAQHit}) T(readtype(io, Int32), read(io, UInt8), read(io, Int32), read(io, UInt8)) end -function interped_data(rawdata, rawoffsets, ::Type{Vector{_KM3NETDAQHit}}, ::Type{J}) where {T, J <: UnROOT.JaggType} +function interped_data(rawdata, rawoffsets, ::Type{Vector{_KM3NETDAQHit}}, ::Type{J}) where J <: UnROOT.JaggType UnROOT.splitup(rawdata, rawoffsets, _KM3NETDAQHit, skipbytes=10) end @@ -115,7 +148,7 @@ function readtype(io::IO, T::Type{_KM3NETDAQTriggeredHit}) T(dom_id, channel_id, tdc, tot, trigger_mask) end -function UnROOT.interped_data(rawdata, rawoffsets, ::Type{Vector{_KM3NETDAQTriggeredHit}}, ::Type{J}) where {T, J <: UnROOT.JaggType} +function UnROOT.interped_data(rawdata, rawoffsets, ::Type{Vector{_KM3NETDAQTriggeredHit}}, ::Type{J}) where J <: UnROOT.JaggType UnROOT.splitup(rawdata, 
rawoffsets, _KM3NETDAQTriggeredHit, skipbytes=10) end @@ -147,6 +180,6 @@ function readtype(io::IO, T::Type{_KM3NETDAQEventHeader}) T(detector_id, run, frame_index, UTC_seconds, UTC_16nanosecondcycles, trigger_counter, trigger_mask, overlays) end -function UnROOT.interped_data(rawdata, rawoffsets, ::Type{_KM3NETDAQEventHeader}, ::Type{J}) where {T, J <: UnROOT.JaggType} +function UnROOT.interped_data(rawdata, rawoffsets, ::Type{_KM3NETDAQEventHeader}, ::Type{J}) where J <: UnROOT.JaggType UnROOT.splitup(rawdata, rawoffsets, _KM3NETDAQEventHeader, jagged=false) end diff --git a/src/displays.jl b/src/displays.jl index 6c82e270..7c20437c 100644 --- a/src/displays.jl +++ b/src/displays.jl @@ -14,7 +14,6 @@ function children(f::T) where T <: Union{ROOTFile,ROOTDirectory} # then all TKeys in the file which are not for a TTree seen = Set{String}() ch = Vector{Union{TTree,TKeyNode,ROOTDirectory}}() - T === ROOTFile ? lock(f) : nothing for k in keys(f) try obj = f[k] @@ -35,7 +34,6 @@ function children(f::T) where T <: Union{ROOTFile,ROOTDirectory} push!(ch, kn) end end - T === ROOTFile ? unlock(f) : nothing ch end function children(t::TTree) @@ -61,12 +59,12 @@ function _show(io::IO, tree::LazyTree; kwargs...) _ds = displaysize(io) PrettyTables.pretty_table( io, - innertable(tree); + Tables.columns(tree); header=_hs, alignment=:l, vlines=[1], hlines=[:header], - crop_num_lines_at_beginning=2, + reserved_display_lines=2, row_number_alignment=:l, row_number_column_title="Row", show_row_number=true, @@ -77,13 +75,39 @@ function _show(io::IO, tree::LazyTree; kwargs...) 
) nothing end + + +function Base.show(io::IO, ::MIME"text/plain", br::LazyBranch) + print(io, summary(br)) + println(io, ": ") + if length(br) < 200 + Base.print_array(IOContext(io, :limit => true), br[:]) + else + head = @async br[1:100] + tail = @async br[end-99:end] + wait(head) + wait(tail) + Base.print_array(IOContext(io, :limit => true), Vcat(head.result, tail.result)) + end + nothing +end + +# stop crazy stracktrace +function Base.show(io::IO, + ::Type{<:LazyTree{<:NamedTuple{Ns, Vs}}}) where {Ns, Vs} + elip = length(Ns) > 5 ? "..." : "" + println(io, "LazyTree with $(length(Ns)) branches:") + println(io, join(first(Ns, 5), ", "), elip) +end + function Base.show(io::IO, ::MIME"text/html", tree::LazyTree) - _hs = _make_header(tree) maxrows = 10 maxcols = 30 nrow = length(tree) - t = @view innertable(tree)[1:min(maxrows,nrow)] - ncol = length(Tables.columns(t)) + t = Tables.columns(@view tree[1:min(maxrows,nrow)]) + # _hs has headers and subheaders + _hs = first.(_make_header(tree), maxcols) + ncol = length(t) withcommas(value) = reverse(join(join.(Iterators.partition(reverse(string(value)),3)),",")) write(io, "

") write(io, "$(withcommas(nrow)) rows × $(ncol) columns") @@ -94,7 +118,7 @@ function Base.show(io::IO, ::MIME"text/html", tree::LazyTree) elseif (ncol > maxcols) write(io, " (omitted printing of $(ncol-maxcols) columns)") end - write(io, "

") + println(io, "

") PrettyTables.pretty_table( io, t; @@ -103,9 +127,8 @@ function Base.show(io::IO, ::MIME"text/html", tree::LazyTree) row_number_column_title="", show_row_number=true, compact_printing=false, - filters_col = ((_,i) -> i <= maxcols,), formatters=(v, i, j) -> _treeformat(v, 100), - tf = PrettyTables.HTMLTableFormat(css = """th { color: #000; background-color: #fff; }"""), + tf = PrettyTables.HtmlTableFormat(css = """th { color: #000; background-color: #fff; }"""), backend=Val(:html), ) nothing @@ -114,7 +137,7 @@ _symtup2str(symtup, trunc=15) = collect(first.(string.(symtup), trunc)) function _make_header(t) pn = propertynames(t) header = _symtup2str(pn) - subheader = _symtup2str(Tables.columntype.(Ref(innertable(t)), pn)) + subheader = _symtup2str(eltype.(values(Tables.columns(t)))) (header, subheader) end function _treeformat(val, trunc) diff --git a/src/iteration.jl b/src/iteration.jl index 60373790..f90bf7d5 100644 --- a/src/iteration.jl +++ b/src/iteration.jl @@ -121,6 +121,8 @@ mutable struct LazyBranch{T,J,B} <: AbstractVector{T} [0:-1 for _ in 1:Threads.nthreads()]) end end +basketarray(lb::LazyBranch, ithbasket) = basketarray(lb.f, lb.b, ithbasket) +basketarray_iter(lb::LazyBranch) = basketarray_iter(lb.f, lb.b) function Base.hash(lb::LazyBranch, h::UInt) h = hash(lb.f, h) @@ -137,8 +139,6 @@ Base.firstindex(ba::LazyBranch) = 1 Base.lastindex(ba::LazyBranch) = ba.L Base.eltype(ba::LazyBranch{T,J,B}) where {T,J,B} = T -basketarray(lb::LazyBranch, ithbasket) = basketarray(lb.f, lb.b, ithbasket) -basketarray_iter(lb::LazyBranch) = basketarray_iter(lb.f, lb.b) function Base.show(io::IO, lb::LazyBranch) summary(io, lb) @@ -191,67 +191,83 @@ function Base.iterate(ba::LazyBranch{T,J,B}, idx=1) where {T,J,B} return (ba[idx], idx + 1) end -struct LazyEvent{T<:TypedTables.Table} +struct LazyEvent{T} tree::T idx::Int64 end +Base.propertynames(lt::LazyEvent) = propertynames(getfield(lt, :tree)) struct LazyTree{T} <: AbstractVector{LazyEvent{T}} treetable::T end + 
+Tables.columns(t::LazyTree) = Core.getfield(t, :treetable) +Tables.partitions(t::LazyTree) = (t[r] for r in _clusterranges(t)) + function LazyTree(path::String, x...) LazyTree(ROOTFile(path), x...) end -@inline innertable(t::LazyTree) = Core.getfield(t, :treetable) - -Base.propertynames(lt::LazyTree) = propertynames(innertable(lt)) -Base.getproperty(lt::LazyTree, s::Symbol) = getproperty(innertable(lt), s) +Base.propertynames(lt::LazyTree) = propertynames(Tables.columns(lt)) +Base.getproperty(lt::LazyTree, s::Symbol) = getproperty(Tables.columns(lt), s) Base.broadcastable(lt::LazyTree) = lt Base.IndexStyle(::Type{<:LazyTree}) = IndexLinear() -Base.getindex(lt::LazyTree, row::Int) = LazyEvent(innertable(lt), row) +Base.getindex(lt::LazyTree, row::Int) = LazyEvent(Tables.columns(lt), row) # kept lazy for broadcasting purpose -Base.getindex(lt::LazyTree, row::CartesianIndex{1}) = LazyEvent(innertable(lt), row[1]) -function Base.getindex(lt::LazyTree, rang::UnitRange) - return LazyTree(innertable(lt)[rang]) +Base.getindex(lt::LazyTree, row::CartesianIndex{1}) = LazyEvent(Tables.columns(lt), row[1]) +function Base.getindex(lt::LazyTree, rang) + bnames = propertynames(lt) + branches = asyncmap(b->getproperty(lt, b)[rang], bnames) + return LazyTree(NamedTuple{bnames}(branches)) +end + +function Base.view(lt::LazyTree, idx...) 
+ bnames = propertynames(lt) + return LazyTree(map(x->view(x, idx...), Tables.columns(lt))) end # a specific event -Base.getindex(lt::LazyTree, ::typeof(!), s::Symbol) = lt[:, s] -Base.getindex(lt::LazyTree, ::Colon, s::Symbol) = getproperty(innertable(lt), s) # the real deal +Base.getindex(lt::LazyTree, ::typeof(!), s) = lt[:, s] +Base.getindex(lt::LazyTree, ::Colon, s::Symbol) = getproperty(lt, s) +Base.getindex(lt::LazyTree, ::Colon, s::Int) = getproperty(lt, propertynames(lt)[s]) +Base.getindex(lt::LazyTree, ::Colon, ss) = LazyTree(NamedTuple(propertynames(lt)[s]=>lt[:, s] for s in ss)) Base.getindex(lt::LazyTree, row::Int, col::Symbol) = lt[:, col][row] Base.getindex(lt::LazyTree, rows::UnitRange, col::Symbol) = lt[:, col][rows] +Base.getindex(lt::LazyTree, row::Int, ::Colon) = lt[row] +Base.getindex(lt::LazyTree, row::AbstractVector, ::Colon) = lt[row] Base.getindex(lt::LazyTree, ::Colon) = lt[1:end] -Base.firstindex(lt::LazyTree) = 1 -Base.lastindex(lt::LazyTree) = length(lt) -Base.eachindex(lt::LazyTree) = 1:lastindex(lt) # allow enumerate() to be chunkable (eg with Threads.@threads) Base.step(e::Iterators.Enumerate{LazyTree{T}}) where T = 1 Base.firstindex(e::Iterators.Enumerate{LazyTree{T}}) where T = firstindex(e.itr) Base.lastindex(e::Iterators.Enumerate{LazyTree{T}}) where T = lastindex(e.itr) Base.eachindex(e::Iterators.Enumerate{LazyTree{T}}) where T = eachindex(e.itr) -Base.getindex(e::Iterators.Enumerate{LazyTree{T}}, row::Int) where T = (row, LazyEvent(innertable(e.itr), row)) +Base.getindex(e::Iterators.Enumerate{LazyTree{T}}, row::Int) where T = (row, LazyEvent(Tables.columns(e.itr), row)) # interfacing Table -Base.names(lt::LazyTree) = collect(String.(propertynames(innertable(lt)))) -Base.length(lt::LazyTree) = length(innertable(lt)) +Base.names(lt::LazyTree) = [String(x) for x in propertynames(lt)] +Base.length(lt::LazyTree) = length(first(Tables.columns(lt))) Base.ndims(::Type{<:LazyTree}) = 1 -Base.size(lt::LazyTree) = size(innertable(lt)) 
+Base.size(lt::LazyTree) = size(first(Tables.columns(lt))) # all column has the same size function LazyArrays.Vcat(ts::LazyTree...) - cs = Tables.columns.(innertable.(ts)) - LazyTree(TypedTables.Table(map(Vcat, cs...))) + branch_names = propertynames(first(ts)) + res_branches = map(branch_names) do bname + LazyArrays.Vcat(getproperty.(ts, bname)...) + end + LazyTree(NamedTuple{branch_names}(res_branches)) end Base.vcat(ts::LazyTree...) = Vcat(ts...) Base.reduce(::typeof(vcat), ts::AbstractVector{<:LazyTree}) = Vcat((ts)...) -Base.mapreduce(f::Function, ::typeof(vcat), ts::AbstractVector{<:LazyTree}) = Vcat(f.(ts)...) -Base.mapreduce(f::Function, ::typeof(Vcat), ts::AbstractVector{<:LazyTree}) = Vcat(f.(ts)...) +Base.mapreduce(f, ::typeof(vcat), ts::Vector{<:LazyTree}) = Vcat(f.(ts)...) +Base.mapreduce(f, ::typeof(Vcat), ts::Vector{<:LazyTree}) = Vcat(f.(ts)...) function getbranchnamesrecursive(obj) out = Vector{String}() for b in obj.fBranches.elements push!(out, b.fName) - for subname in getbranchnamesrecursive(b) + subs = getbranchnamesrecursive(b) + !isempty(subs) && pop!(out) + for subname in subs push!(out, "$(b.fName)/$(subname)") end end @@ -263,13 +279,17 @@ end LazyTree(f::ROOTFile, s::AbstractString, branches::Vector{Union{AbstractString, Regex}}) Constructor for `LazyTree`, which is close to an `DataFrame` (interface wise), -and a lazy `TypedTables.Table` (speed wise). Looping over a `LazyTree` is fast and type +and a lazy Table (speed wise). Looping over a `LazyTree` is fast and type stable. Internally, `LazyTree` contains a typed table whose branch are [`LazyBranch`](@ref). This means that at any given time only `N` baskets are cached, where `N` is the number of branches. !!! note Accessing with `[start:stop]` will return a `LazyTree` with concrete internal table. +!!! warning + Split branches are re-named, and the exact renaming may change. See + [Issue 156](https://github.com/JuliaHEP/UnROOT.jl/pull/156) for context. 
+ # Example ```julia julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)\$"]) @@ -286,18 +306,35 @@ julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)\ function LazyTree(f::ROOTFile, s::AbstractString, branches) tree = f[s] tree isa TTree || error("$s is not a tree name.") - if length(branches) > 30 - @warn "Your tree is quite wide, with $(length(branches)) branches, this will take compiler a moment." - end d = Dict{Symbol,LazyBranch}() - _m(s::AbstractString) = isequal(s) _m(r::Regex) = Base.Fix1(occursin, r) - branches = mapreduce(b -> filter(_m(b), getbranchnamesrecursive(tree)), ∪, branches) - SB = Symbol.(branches) - for b in SB - d[b] = f["$s/$b"] + all_bnames = getbranchnamesrecursive(tree) + res_bnames = mapreduce(∪, branches) do b + if b isa Regex + filter(_m(b), all_bnames) + elseif b isa String + expand = filter(n->startswith(n, "$b/$b"), all_bnames) + isempty(expand) ? filter(isequal(b), all_bnames) : expand + else + error("branch selection must be string or regex") + end end - return LazyTree(TypedTables.Table(d)) + for b in res_bnames + # split by `.` or `/` + norm_name = b + v = split(b, r"\.|\/") + if length(v) >= 2 # only normalize name when branches are split + head = v[1] + tail = v[2:end] + # remove duplicated info + replace!(tail, head => "") + # remove known split branch information + replace!(tail, "fCoordinates" => "") + norm_name = join([head; join(tail)], "_") + end + d[Symbol(norm_name)] = f["$s/$b"] + end + return LazyTree(NamedTuple{Tuple(keys(d))}(values(d))) end function LazyTree(f::ROOTFile, s::AbstractString) @@ -320,11 +357,13 @@ end @inline function Base.getproperty(evt::LazyEvent, s::Symbol) @inbounds getproperty(Core.getfield(evt, :tree), s)[Core.getfield(evt, :idx)] end -Base.collect(evt::LazyEvent) = @inbounds Core.getfield(evt, :tree)[Core.getfield(evt, :idx)] +Base.Tuple(evt::LazyEvent) = Tuple(getproperty(evt, s) for s in propertynames(evt)) 
+Base.NamedTuple(evt::LazyEvent) = NamedTuple{propertynames(evt)}(Tuple(evt)) +Base.collect(evt::LazyEvent) = NamedTuple(evt) function Base.iterate(tree::T, idx=1) where {T<:LazyTree} idx > length(tree) && return nothing - return LazyEvent(innertable(tree), idx), idx + 1 + return LazyEvent(Tables.columns(tree), idx), idx + 1 end function Base.getindex(ba::LazyBranch{T,J,B}, range::UnitRange) where {T,J,B} @@ -332,7 +371,7 @@ function Base.getindex(ba::LazyBranch{T,J,B}, range::UnitRange) where {T,J,B} ib2 = findfirst(x -> x > (last(range) - 1), ba.fEntry) - 1 offset = ba.fEntry[ib1] range = (first(range)-offset):(last(range)-offset) - return vcat([basketarray(ba, i) for i in ib1:ib2]...)[range] + return Vcat(asyncmap(i->basketarray(ba, i), ib1:ib2)...)[range] end _clusterranges(t::LazyTree) = _clusterranges([getproperty(t,p) for p in propertynames(t)]) @@ -358,6 +397,3 @@ function _clusterbytes(lbs::AbstractVector{<:LazyBranch}; compressed=false) end return bytes end - -Tables.columns(t::LazyTree) = Tables.columns(innertable(t)) -Tables.partitions(t::LazyTree) = (t[r] for r in _clusterranges(t)) diff --git a/src/root.jl b/src/root.jl index abe86bf4..4e24840f 100644 --- a/src/root.jl +++ b/src/root.jl @@ -2,23 +2,21 @@ struct ROOTDirectory name::AbstractString header::ROOTDirectoryHeader keys::Vector{TKey} - fobj::IOStream + fobj::SourceStream refs::Dict{Int32, Any} end struct ROOTFile - filename::AbstractString + filename::String format_version::Int32 header::FileHeader - fobj::IOStream + fobj::SourceStream tkey::TKey streamers::Streamers directory::ROOTDirectory customstructs::Dict{String, Type} - lk::ReentrantLock end function close(f::ROOTFile) - # TODO: should we take care of the lock? close(f.fobj) end function ROOTFile(f::Function, args...; pv...) @@ -29,8 +27,7 @@ function ROOTFile(f::Function, args...; pv...) 
close(rootfile) end end -lock(f::ROOTFile) = lock(f.lk) -unlock(f::ROOTFile) = unlock(f.lk) + function Base.hash(rf::ROOTFile, h::UInt) hash(rf.fobj, h) end @@ -58,45 +55,58 @@ test/samples/NanoAODv5_sample.root └─ "⋮" ``` """ +const HEAD_BUFFER_SIZE = 2048 function ROOTFile(filename::AbstractString; customstructs = Dict("TLorentzVector" => LorentzVector{Float64})) - fobj = Base.open(filename) - preamble = unpack(fobj, FilePreamble) - String(preamble.identifier) == "root" || error("Not a ROOT file!") + fobj = if startswith(filename, r"https?://") + HTTPStream(filename) + elseif startswith(filename, "root://") + sep_idx = findlast("//", filename) + baseurl = filename[8:first(sep_idx)-1] + filepath = filename[last(sep_idx):end] + XRDStream(baseurl, filepath, "go") + else + !isfile(filename) && throw(SystemError("opening file $filename", 2)) + MmapStream(filename) + end + header_bytes = read(fobj, HEAD_BUFFER_SIZE) + if header_bytes[1:4] != [0x72, 0x6f, 0x6f, 0x74] + error("$filename is not a ROOT file.") + end + head_buffer = IOBuffer(header_bytes) + preamble = unpack(head_buffer, FilePreamble) format_version = preamble.fVersion - if format_version < 1000000 + header = if format_version < 1000000 @debug "32bit ROOT file" - header = unpack(fobj, FileHeader32) + unpack(head_buffer, FileHeader32) else @debug "64bit ROOT file" - header = unpack(fobj, FileHeader64) + unpack(head_buffer, FileHeader64) end # Streamers - if header.fSeekInfo != 0 - @debug "Reading streamer info." - seek(fobj, header.fSeekInfo) - streamers = Streamers(fobj) - else - @debug "No streamer info present, skipping." 
- end + seek(fobj, header.fSeekInfo) + stream_buffer = OffsetBuffer(IOBuffer(read(fobj, 10^5)), Int(header.fSeekInfo)) + streamers = Streamers(stream_buffer) - seek(fobj, header.fBEGIN) - tkey = unpack(fobj, TKey) + seek(head_buffer, header.fBEGIN + header.fNbytesName) + dir_header = unpack(head_buffer, ROOTDirectoryHeader) + dirkey = dir_header.fSeekKeys + seek(fobj, dirkey) + tail_buffer = @async IOBuffer(read(fobj, 10^7)) - # Reading the header key for the top ROOT directory - seek(fobj, header.fBEGIN + header.fNbytesName) - dir_header = unpack(fobj, ROOTDirectoryHeader) + seek(head_buffer, header.fBEGIN) + tkey = unpack(head_buffer, TKey) - seek(fobj, dir_header.fSeekKeys) - header_key = unpack(fobj, TKey) + wait(tail_buffer) + unpack(tail_buffer.result, TKey) - n_keys = readtype(fobj, Int32) - keys = [unpack(fobj, TKey) for _ in 1:n_keys] + n_keys = readtype(tail_buffer.result, Int32) + keys = [unpack(tail_buffer.result, TKey) for _ in 1:n_keys] directory = ROOTDirectory(tkey.fName, dir_header, keys, fobj, streamers.refs) - ROOTFile(filename, format_version, header, fobj, tkey, streamers, directory, customstructs, ReentrantLock()) + ROOTFile(filename, format_version, header, fobj, tkey, streamers, directory, customstructs) end function Base.show(io::IO, f::ROOTFile) @@ -146,14 +156,8 @@ end tkey = f.directory.keys[findfirst(isequal(s), keys(f))] @debug "Retrieving $s ('$(tkey.fClassName)')" streamer = getfield(@__MODULE__, Symbol(tkey.fClassName)) - lock(f) - try - S = streamer(f.fobj, tkey, f.streamers.refs) - return S - catch - finally - unlock(f) - end + S = streamer(f.fobj, tkey, f.streamers.refs) + return S end # FIXME unify with above? 
@@ -174,10 +178,18 @@ function Base.keys(f::ROOTFile) keys(f.directory) end +function Base.haskey(f::ROOTFile, k) + haskey(f.directory, k) +end + function Base.keys(d::ROOTDirectory) [key.fName for key in d.keys] end +function Base.haskey(d::ROOTDirectory, k) + any(==(k), (key.fName for key in d.keys)) +end + Base.keys(t::TTree) = [b.fName for b in t.fBranches.elements] function Base.getindex(t::T, s::AbstractString) where {T<:Union{TTree, TBranchElement}} @@ -198,7 +210,7 @@ function Base.getindex(t::TTree, s::Vector{T}) where {T<:AbstractString} [t[n] for n in s] end -reinterpret(vt::Type{Vector{T}}, data::AbstractVector{UInt8}) where T <: Union{AbstractFloat, Integer} = reinterpret(T, data) +reinterpret(vt::Type{Vector{T}}, data::Vector{UInt8}) where T <: Union{AbstractFloat, Integer} = reinterpret(T, data) """ interped_data(rawdata, rawoffsets, ::Type{T}, ::Type{J}) where {T, J<:JaggType} @@ -364,6 +376,8 @@ function auto_T_JaggT(f::ROOTFile, branch; customstructs::Dict{String, Type}) Bool elseif elname == "unsigned int" UInt32 + elseif elname == "signed char" + Int8 elseif elname == "unsigned char" UInt8 elseif elname == "unsigned short" @@ -394,11 +408,31 @@ function auto_T_JaggT(f::ROOTFile, branch; customstructs::Dict{String, Type}) end end else + # since no classname were found, we now try to determine + # type based on leaf information + _type, _jaggtype = leaf_jaggtype(leaf, _jaggtype) + end + return _type, _jaggtype +end + +function leaf_jaggtype(leaf, _jaggtype) _type = primitivetype(leaf) + leafLen = leaf.fLen + if leafLen > 1 # treat NTuple as Nojagg since size is static + _fTitle = replace(leaf.fTitle, "[$(leafLen)]" => "") + # looking for more `[var]` + m = match(r"\[\D+\]", _fTitle) + _type = FixLenVector{Int(leafLen), _type} + if isnothing(m) + return _type, Nojagg + else + #FIXME this only handles [var][fix] case + return Vector{_type}, Nooffsetjagg + end + end _type = _jaggtype === Nojagg ? 
_type : Vector{_type} - end - return _type, _jaggtype + return _type, _jaggtype end @@ -408,9 +442,9 @@ function readbranchraw(f::ROOTFile, branch) datas = sizehint!(Vector{UInt8}(), sum(nbytes)) # maximum length if all data are UInt8 offsets = sizehint!(zeros(Int32, 1), branch.fEntries+1) # this is always Int32 position = 0 - foreach(branch.fBasketSeek) do seek - seek==0 && return - data, offset = readbasketseek(f, branch, seek) + for (seek, nb) in zip(branch.fBasketSeek, nbytes) + seek==0 && break + data, offset = readbasketseek(f, branch, seek, nb) append!(datas, data) # FIXME: assuming offset has always 0 or at least 2 elements ;) append!(offsets, (@view offset[2:end]) .+ position) @@ -434,7 +468,7 @@ end # 3GB cache for baskets """ readbasket(f::ROOTFile, branch, ith) - readbasketseek(f::ROOTFile, branch::Union{TBranch, TBranchElement}, seek_pos::Int) + readbasketseek(f::ROOTFile, branch::Union{TBranch, TBranchElement}, seek_pos::Int, nbytes) The fundamental building block of reading read data from a .root file. Read read one basket's raw bytes and offsets at a time. These raw bytes and offsets then (potentially) get @@ -442,19 +476,15 @@ processed by [`interped_data`](@ref). 
See also: [`auto_T_JaggT`](@ref), [`basketarray`](@ref) """ -readbasket(f::ROOTFile, branch, ith) = readbasketseek(f, branch, branch.fBasketSeek[ith]) +function readbasket(f::ROOTFile, branch, ith) + readbasketseek(f, branch, branch.fBasketSeek[ith], branch.fBasketBytes[ith]) +end -function readbasketseek(f::ROOTFile, branch::Union{TBranch, TBranchElement}, seek_pos::Int) - lock(f) - local basketkey, compressedbytes - try - seek(f.fobj, seek_pos) - basketkey = unpack(f.fobj, TBasketKey) - compressedbytes = compressed_datastream(f.fobj, basketkey) - catch - finally - unlock(f) - end +function readbasketseek(f::ROOTFile, branch::Union{TBranch, TBranchElement}, seek_pos::Int, nb) + local rawbuffer + rawbuffer = OffsetBuffer(IOBuffer(read_seek_nb(f.fobj, seek_pos, nb)), seek_pos) + basketkey = unpack(rawbuffer, TBasketKey) + compressedbytes = compressed_datastream(rawbuffer, basketkey) basketrawbytes = decompress_datastreambytes(compressedbytes, basketkey) diff --git a/src/streamsource.jl b/src/streamsource.jl new file mode 100644 index 00000000..8ac211b7 --- /dev/null +++ b/src/streamsource.jl @@ -0,0 +1,176 @@ +using xrootdgo_jll +import HTTP + +mutable struct XRDStream + gofile_id::Cstring # used as key to a global `map` on the Go side + seekloc::Int + size::Int +end + +mutable struct MmapStream # Mmap based + mmap_ary::Vector{UInt8} + seekloc::Int + size::Int + function MmapStream(filepath::AbstractString) + size = filesize(filepath) + new(mmap(filepath), 0, size) + end +end + +read_seek_nb(fobj::MmapStream, seek, nb) = fobj.mmap_ary[seek+1:seek+nb] + +function Base.read(fobj::MmapStream, nb::Integer) + stop = min(fobj.seekloc + nb, fobj.size) + b = fobj.mmap_ary[fobj.seekloc+1 : stop] + fobj.seekloc += nb + return b +end + +function Base.close(fobj::MmapStream) # no-op + nothing +end + +# SciToken discovery https://zenodo.org/record/3937438 +function _find_scitoken() + op1 = get(ENV, "BEARER_TOKEN", "") + op2 = get(ENV, "BEARER_TOKEN_FILE", "") + op3 = get(ENV, 
"XDG_RUNTIME_DIR", "") + uid = @static if Sys.iswindows() + "julia" + else + strip(read(`id -u`, String)) + end + op3_file = joinpath(op3, "bt_u$uid") + op4_file = "/tmp/bt_u$uid" + token = if !isempty(op1) + op1 + elseif !isempty(op2) + read(op2, String) + elseif !isempty(op3) && isfile(op3_file) + read(op3_file, String) + elseif isfile(op4_file) + read(op4_file, String) + else + "" + end + return strip(token) +end + +mutable struct HTTPStream + uri::HTTP.URI + seekloc::Int + size::Int + multipart::Bool + scitoken::String + function HTTPStream(uri::AbstractString; scitoken = _find_scitoken()) + #TODO: determin multipart support + test = HTTP.request("GET", uri, + ("Range" => "bytes=0-3", "User-Agent" => "UnROOTjl", "Authorization" => "Bearer $scitoken") + ) + @assert test.status==206 "bad network or wrong server" + @assert String(test.body)=="root" "not a root file" + multipart = false + local v + for pair in test.headers + if lowercase(pair[1]) == "content-range" + v = pair[2] + break + end + end + size = parse(Int, match(r"/(\d+)", v).captures[1]) + new(HTTP.URI(uri), 0, size, multipart, scitoken) + end +end + +const SourceStream = Union{MmapStream, HTTPStream, XRDStream} + +function Base.read(fobj::SourceStream, ::Type{T}) where T + return only(reinterpret(T, read(fobj, sizeof(T)))) +end + +function Base.position(fobj::SourceStream) + fobj.seekloc +end + +function Base.seek(fobj::SourceStream, loc) + fobj.seekloc = loc + return fobj +end + +function Base.skip(fobj::SourceStream, stride) + fobj.seekloc += stride + return fobj +end + +function Base.seekstart(fobj::SourceStream) + fobj.seekloc = 0 + return fobj +end + +function Base.close(fobj::HTTPStream) # no-op + nothing +end + +function Base.read(fobj::HTTPStream, nb::Integer) + @debug nb + b = read_seek_nb(fobj, fobj.seekloc, nb) + fobj.seekloc += nb + return b +end + +function read_seek_nb(fobj::HTTPStream, seek, nb) + stop = seek+nb-1 + hd = ("Range" => "bytes=$(seek)-$stop", "Authorization" => "Bearer 
$(fobj.scitoken)") + b = HTTP.request(HTTP.stack(), "GET", fobj.uri, hd, UInt8[]).body + return b +end + +function Base.read(fobj::SourceStream) + read(fobj, fobj.size - fobj.seekloc + 1) +end + +function XRDStream(urlbase::AbstractString, filepath::AbstractString, username::AbstractString) + file_id = @ccall xrootdgo.Open(urlbase::Cstring, filepath::Cstring, username::Cstring)::Cstring + # file_id = @threadcall((:Open, xrootdgo), Cstring, (Cstring, Cstring, Cstring), urlbase, filepath, username) + size = @ccall xrootdgo.Size(file_id::Cstring)::Int + XRDStream(file_id, 0, size) +end + +function Base.close(fobj::XRDStream) + xrootdgo.Close(fobj.gofile_id) +end + +function read_seek_nb(fobj::XRDStream, seek, nb) + buffer = Vector{UInt8}(undef, nb) + @threadcall((:ReadAt, xrootdgo), Cvoid, (Ptr{UInt8}, Cstring, Clong, Clong), buffer, fobj.gofile_id, nb, seek) + # @ccall xrootdgo.ReadAt(buffer::Ptr{UInt8}, + # fobj.gofile_id::Cstring, nb::Clong, seek::Clong)::Cvoid + return buffer +end +function _read!(ptr, fobj, nb, seekloc) + @ccall xrootdgo.ReadAt(ptr::Ptr{UInt8}, + fobj.gofile_id::Cstring, nb::Clong, seekloc::Clong)::Cvoid +end + +function _read!(ptr, fobj, nb) + _read!(ptr, fobj, nb, fobj.seekloc) +end + +function Base.read(fobj::XRDStream, ::Type{T}) where T + @debug @show T, sizeof(T) + nb = sizeof(T) + output = Ref{T}() + tko = Base.@_gc_preserve_begin output + po = Ptr{UInt8}(pointer_from_objref(output)) + _read!(po, fobj, nb, fobj.seekloc) + Base.@_gc_preserve_end tko + fobj.seekloc += nb + return output[] +end + +function Base.read(fobj::XRDStream, nb::Integer) + buffer = Vector{UInt8}(undef, nb) + GC.@preserve buffer _read!(buffer, fobj, nb, fobj.seekloc) + fobj.seekloc += nb + return buffer +end diff --git a/src/types.jl b/src/types.jl index ce0ac1ff..124a026d 100644 --- a/src/types.jl +++ b/src/types.jl @@ -151,6 +151,7 @@ function decompress_datastreambytes(compbytes, tkey) compression_header = unpack(io, CompressionHeader) cname, _, compbytes, 
uncompbytes = unpack(compression_header) rawbytes = read(io, compbytes) + @debug cname if cname == "L4" # skip checksum which is 8 bytes @@ -238,7 +239,7 @@ end const ROOTDirectoryHeader = Union{ROOTDirectoryHeader32, ROOTDirectoryHeader64} -function unpack(io::IOStream, ::Type{ROOTDirectoryHeader}) +function unpack(io, ::Type{ROOTDirectoryHeader}) fVersion = readtype(io, Int16) skip(io, -2) diff --git a/test/runtests.jl b/test/runtests.jl index 9e88ed6b..f5c3189f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,11 +4,7 @@ using StaticArrays using InteractiveUtils using MD5 -@static if VERSION > v"1.5.0" - import Pkg - Pkg.add("Polyester") - using ThreadsX, Polyester -end +using ThreadsX, Polyester const SAMPLES_DIR = joinpath(@__DIR__, "samples") @@ -91,10 +87,14 @@ end @testset "ROOTFile" begin + @test_throws SystemError ROOTFile("non_existent_fname.root") + ROOTFile(joinpath(SAMPLES_DIR, "tree_with_histos.root")) do rootfile @test 100 == rootfile.header.fBEGIN @test 1 == length(rootfile.directory.keys) @test "t1" ∈ keys(rootfile) + @test haskey(rootfile, "t1") + @test haskey(rootfile.directory, "t1") for key in keys(rootfile) rootfile[key] end @@ -300,8 +300,27 @@ end close(rootfile) end -@testset "Doubly jagged branches" begin - rootfile = ROOTFile(joinpath(SAMPLES_DIR, "tree_with_doubly_jagged.root")) +@testset "View" begin + data = LazyTree(joinpath(SAMPLES_DIR, "tree_with_jagged_array.root"), "t1") + data[1:2] + @view data[1:2] + alloc1 = @allocated v = data[3:90] + alloc2 = @allocated v = @view data[3:90] + v = @view data[3:80] + @test alloc2 < alloc1/100 + @static if VERSION >= v"1.8" + @test alloc2 < 50 + end + @test all(v.int32_array .== data.int32_array[3:80]) + + v2 = @view data[[1,3,5]] + @test v2[1].int32_array == data[1].int32_array + @test v2[2].int32_array == data[3].int32_array +end + +@testset "Doubly jagged [var][var] branches" begin + # this is vector> + rootfile = UnROOT.samplefile("tree_with_doubly_jagged.root") vvi = [[[2], [3, 
5]], [[7, 9, 11], [13]], [[17], [19], []], [], [[]]] vvf = [[[2.5], [3.5, 5.5]], [[7.5, 9.5, 11.5], [13.5]], [[17.5], [19.5], []], [], [[]]] @test UnROOT.array(rootfile, "t1/bi") == vvi @@ -313,6 +332,33 @@ end close(rootfile) end +@testset "Doubly jagged [var][fix] branches" begin + # issue #187 + # this is vector + f = UnROOT.samplefile("tree_with_varfix_doubly_jagged.root") + tree = LazyTree(f, "outtree") + @test tree.nparticles == [4,3,2] + @test length.(tree.P) == [4,3,2] + @test eltype(tree.P[1]) <: AbstractVector + # also compared to uproot + @test tree[1].P == [ + [0.9411764705882353, 0.8888888888888888, 0.8421052631578947, 0.8], + [1.0, 0.9285714285714286, 0.8666666666666667, 0.8125], + [1.1111111111111112, 1.0, 0.9090909090909091, 0.8333333333333334], + [1.4, 1.1666666666666667, 1.0, 0.875] + ] + @test tree[3].P == [ + [0.8222222222222222, + 0.8043478260869565, + 0.7872340425531915, + 0.7708333333333334], + [0.8292682926829268, + 0.8095238095238095, + 0.7906976744186046, + 0.7727272727272727] + ] +end + @testset "NanoAOD" begin rootfile = ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")) event = UnROOT.array(rootfile, "Events/event") @@ -325,6 +371,7 @@ end @test HLT_Mu3_PFJet40[1:3] == [false, true, false] tree = LazyTree(rootfile, "Events", [r"Muon_(pt|eta|phi)$", "Muon_charge", "Muon_pt"]) @test sort(propertynames(tree) |> collect) == sort([:Muon_pt, :Muon_eta, :Muon_phi, :Muon_charge]) + @test sort(names(tree)) == [String(x) for x in sort([:Muon_pt, :Muon_eta, :Muon_phi, :Muon_charge])] tree = LazyTree(rootfile, "Events", r"Muon_(pt|eta)$") @test sort(propertynames(tree) |> collect) == sort([:Muon_pt, :Muon_eta]) @test occursin("LazyEvent", repr(first(iterate(tree)))) @@ -526,16 +573,16 @@ end # Issues @testset "issues" begin - rootfile = ROOTFile(joinpath(SAMPLES_DIR, "issue7.root")) + rootfile = UnROOT.samplefile("issue7.root") @test 2 == length(keys(rootfile)) @test [1.0, 2.0, 3.0] == UnROOT.array(rootfile, "TreeD/nums") @test [1.0, 2.0, 
3.0] == UnROOT.array(rootfile, "TreeF/nums") close(rootfile) - # issue 55 - rootfile = ROOTFile(joinpath(SAMPLES_DIR, "cms_ntuple_wjet.root")) + # issue #55 and #156 + rootfile = UnROOT.samplefile("cms_ntuple_wjet.root") pts1 = UnROOT.array(rootfile, "variable/met_p4/fCoordinates/fCoordinates.fPt"; raw=false) - pts2 = LazyTree(rootfile, "variable", [r"met_p4/fCoordinates/.*", "mll"])[!, Symbol("met_p4/fCoordinates/fCoordinates.fPt")] + pts2 = LazyTree(rootfile, "variable", [r"met_p4/fCoordinates/.*", "mll"])[!, Symbol("met_p4_fPt")] pts3 = rootfile["variable/good_jets_p4/good_jets_p4.fCoordinates.fPt"] @test 24 == length(pts1) @test Float32[69.96958, 25.149912, 131.66693, 150.56802] == pts1[1:4] @@ -544,7 +591,7 @@ end close(rootfile) # issue 61 - rootfile = ROOTFile(joinpath(SAMPLES_DIR, "issue61.root")) + rootfile = UnROOT.samplefile("issue61.root") @test rootfile["Events/Jet_pt"][:] == Vector{Float32}[[], [27.324587, 24.889547, 20.853024], [], [20.33066], [], []] close(rootfile) @@ -583,12 +630,6 @@ end end @testset "Type stability" begin - function isfullystable(func) - io = IOBuffer() - print(io, (@code_typed func()).first); - typed = String(take!(io)) - return !occursin("::Any", typed) - end rootfile = ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")) t = LazyTree(rootfile, "Events", ["MET_pt"])[1:10] @@ -602,12 +643,15 @@ end end f2() = sum(t.MET_pt) - @test isfullystable(f1) - @test isfullystable(f2) + @inferred f1() + @inferred f2() close(rootfile) end +const nthreads = Threads.nthreads() +nthreads == 1 && @warn "Running on a single thread. 
Please re-run the test suite with at least two threads (`julia --threads 2 ...`)" + @testset "Parallel and enumerate interface" begin t = LazyTree(ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")), "Events", ["Muon_pt"]) @test eachindex(enumerate(t)) == eachindex(t) @@ -625,8 +669,11 @@ end if get(ENV, "CI", "false") == "true" - # Make sure CI runs with more than 1 thread - @test Threads.nthreads() > 1 + if nthreads >= 1 + @test Threads.nthreads()>1 + else + @warn "CI wasn't run with multi thread" + end end nmus = zeros(Int, Threads.nthreads()) Threads.@threads for i in 1:length(t) @@ -643,46 +690,46 @@ end @test !isempty(hash(t.Muon_pt.b)) end -@static if VERSION > v"1.5.1" - t = LazyTree(ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")), "Events", ["Muon_pt"]) - @testset "Multi threading" begin - nthreads = Threads.nthreads() - nthreads == 1 && @warn "Running on a single thread. Please re-run the test suite with at least two threads (`julia --threads 2 ...`)" - nmus = zeros(Int, nthreads) - Threads.@threads for (i, evt) in enumerate(t) - nmus[Threads.threadid()] += length(t.Muon_pt[i]) - end - @test sum(nmus) == 878 +t = LazyTree(ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")), "Events", ["Muon_pt"]) +@testset "Multi threading" begin + nmus = zeros(Int, nthreads) + Threads.@threads for (i, evt) in enumerate(t) + nmus[Threads.threadid()] += length(t.Muon_pt[i]) + end + @test sum(nmus) == 878 - nmus .= 0 - Threads.@threads for evt in t - nmus[Threads.threadid()] += length(evt.Muon_pt) - end - nthreads > 1 && @test count(>(0), nmus) > 1 # test @threads is actually threading - @test sum(nmus) == 878 + nmus .= 0 + Threads.@threads for evt in t + nmus[Threads.threadid()] += length(evt.Muon_pt) + end + if nthreads > 1 + @test count(>(0), nmus) > 1# test @threads is actually threading + end + @test sum(nmus) == 878 - nmus .= 0 - @batch for evt in t - nmus[Threads.threadid()] += length(evt.Muon_pt) - end - nthreads > 1 && @test count(>(0), nmus) > 1 
# test @threads is actually threading - @test sum(nmus) == 878 + nmus .= 0 + Threads.@threads for evt in t + nmus[Threads.threadid()] += length(evt.Muon_pt) + end + if nthreads > 1 + @test count(>(0), nmus) > 1 + end + @test sum(nmus) == 878 - nmus .= 0 - t_dummy = LazyTree(ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")), "Events", ["Muon_pt"]) - @batch for evt in vcat(t,t_dummy) # avoid using the same underlying file handler - nmus[Threads.threadid()] += length(evt.Muon_pt) - end - @test sum(nmus) == 2*878 - - for j in 1:3 - inds = [Vector{Int}() for _ in 1:nthreads] - Threads.@threads for (i, evt) in enumerate(t) - push!(inds[Threads.threadid()], i) - end - @test sum([length(inds[i] ∩ inds[j]) for i=1:length(inds), j=1:length(inds) if j>i]) == 0 + nmus .= 0 + t_dummy = LazyTree(ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")), "Events", ["Muon_pt"]) + @batch for evt in vcat(t,t_dummy) # avoid using the same underlying file handler + nmus[Threads.threadid()] += length(evt.Muon_pt) + end + @test sum(nmus) == 2*878 + + for j in 1:3 + inds = [Vector{Int}() for _ in 1:nthreads] + Threads.@threads for (i, evt) in enumerate(t) + push!(inds[Threads.threadid()], i) end + @test sum([length(inds[i] ∩ inds[j]) for i=1:length(inds), j=1:length(inds) if j>i]) == 0 end end @@ -704,6 +751,20 @@ end @test all(onesrow .== 1) end +@testset "C-array types" begin + tree = LazyTree(UnROOT.samplefile("issue165_multiple_baskets.root"), "arrays") + ele = tree.carr[3] + @test length(tree.carr) == 3 + @test length(ele) == 9 + @test eltype(ele) == Float64 + @test length(typeof(ele)) == 9 + @test all(ele .≈ + [0.7775048011809144, 0.8664217530127716, 0.4918492038230641, + 0.24464299401484568, 0.38991686533667, 0.15690925771226608, + 0.3850047958013624, 0.9268160513261408, 0.9298329730191421]) + @test all(ele .== [ele...]) +end + @testset "basketarray_iter()" begin f = UnROOT.samplefile("tree_with_vector_multiple_baskets.root") t = LazyTree(f,"t1") @@ -712,6 +773,17 @@ end 
@test length(UnROOT.basketarray(t.b1, 1)) == 1228 end +@testset "SourceStream remote" begin + r = ROOTFile("root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root") + @test r["Events"].fEntries == 29308627 + show(devnull, r) # test display + + t = LazyTree("https://scikit-hep.org/uproot3/examples/Zmumu.root", "events") + @test t.eta1[1] ≈ -1.21769 + @test t.eta1[end] ≈ -1.57044 + show(devnull, t) # test display +end + @testset "Cluster ranges" begin t = LazyTree(UnROOT.samplefile("tree_with_clusters.root"),"t1"); @test all(UnROOT._clusterbytes(t; compressed=true) .< 10000) @@ -734,24 +806,27 @@ end s2 = sum(tt.nMuon) @test s2 == 2*s1 alloc1 = @allocated sum(length, t.Muon_pt) - alloc2 = @allocated sum(evt->length(evt.nMuon), tt) + alloc2 = @allocated sum(evt->length(evt.Muon_pt), tt) @test alloc2 < 2.1 * alloc1 close(rootfile) end -@static if VERSION > v"1.5.0" - @testset "Broadcast fusion" begin - rootfile = ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")) - t = LazyTree(rootfile, "Events", "nMuon") - testf(evt) = evt.nMuon == 4 - testf2(evt) = evt.nMuon == 4 - alloc1 = @allocated a1 = testf.(t) - alloc1 += @allocated a2 = testf2.(t) - alloc1 += @allocated idx1 = findall(a1 .& a2) - alloc2 = @allocated idx2 = findall(@. testf(t) & testf2(t)) - @assert !isempty(idx1) - @test idx1 == idx2 - # compiler optimization is good on 1.8 - @test alloc1 > 1.4*alloc2 - end +@testset "Broadcast fusion" begin + rootfile = ROOTFile(joinpath(SAMPLES_DIR, "NanoAODv5_sample.root")) + t = LazyTree(rootfile, "Events", "nMuon") + @test t[2] == t[CartesianIndex(2)] + testf(evt) = evt.nMuon == 4 + testf2(evt) = evt.nMuon == 4 + # precompile + testf.(t) + testf2.(t) + findall(@. testf(t) & testf2(t)) + ########## + alloc1 = @allocated a1 = testf.(t) + alloc1 += @allocated a2 = testf2.(t) + alloc1 += @allocated idx1 = findall(a1 .& a2) + alloc2 = @allocated idx2 = findall(@. 
testf(t) & testf2(t)) + @assert !isempty(idx1) + @test idx1 == idx2 + @test alloc1 > 1.9*alloc2 end diff --git a/test/samples/issue165.root b/test/samples/issue165.root new file mode 100644 index 00000000..298fc3ba Binary files /dev/null and b/test/samples/issue165.root differ diff --git a/test/samples/issue165_multiple_baskets.root b/test/samples/issue165_multiple_baskets.root new file mode 100644 index 00000000..24e78bff Binary files /dev/null and b/test/samples/issue165_multiple_baskets.root differ diff --git a/test/samples/issue61.py b/test/samples/issue61.py index a19d5147..e26d4ca7 100644 --- a/test/samples/issue61.py +++ b/test/samples/issue61.py @@ -1,6 +1,6 @@ import ROOT as r -# https://github.com/tamasgal/UnROOT.jl/issues/61 +# https://github.com/JuliaHEP/UnROOT.jl/issues/61 # needs to be run in a specific environment to trigger the issue # in the first place diff --git a/test/samples/tree_with_varfix_doubly_jagged.C b/test/samples/tree_with_varfix_doubly_jagged.C new file mode 100644 index 00000000..d16264c5 --- /dev/null +++ b/test/samples/tree_with_varfix_doubly_jagged.C @@ -0,0 +1,27 @@ +#include "TFile.h" +#include "TTree.h" + +int maketree(){ + TFile f("tree_with_varfix_doubly_jagged.root", "RECREATE", ""); + TTree tree = TTree("outtree", "outtree"); + int nparticles{}; + double P[100][4]; + tree.Branch("nparticles", &nparticles, "nparticles/I"); + tree.Branch("P", P, "P[nparticles][4]/D"); + double counter1 = 1; + double counter2 = 1; + for (auto ev = 0; ev<3; ++ev){ + nparticles = 4-ev; + for (auto i = nparticles; i>=0; --i){ + counter1 += 3; + for (auto j = 0; j<=3; ++j){ + P[i][j] = counter1 / (counter2); + counter2++; + } + } + tree.Fill(); + } + f.Write(); + f.Close(); + return 0; +} diff --git a/test/samples/tree_with_varfix_doubly_jagged.root b/test/samples/tree_with_varfix_doubly_jagged.root new file mode 100644 index 00000000..465c7ea4 Binary files /dev/null and b/test/samples/tree_with_varfix_doubly_jagged.root differ

Tamas Gal

💻 📖 🚇 🔣 ⚠️

Jerry Ling

💻 ⚠️ 🔣 📖

Johannes Schumann

💻 ⚠️ 🔣

Nick Amin

💻 ⚠️ 🔣

Tamas Gal

💻 📖 🚇 🔣 ⚠️

Jerry Ling

💻 ⚠️ 🔣 📖

Johannes Schumann

💻 ⚠️ 🔣

Nick Amin

💻 ⚠️ 🔣

Mosè Giordano

🚇

Oliver Schulz

🤔

Misha Mikhasenko

🔣