From 4a9f9f7f0c9282ea58ed0bd476db78e7a06afa54 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Fri, 8 Oct 2021 14:58:16 +0200 Subject: [PATCH 01/25] Add skeleton --- paper/LICENSE | 21 +++++++++++++++++++++ paper/paper.bib | 0 paper/paper.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 paper/LICENSE create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/paper/LICENSE b/paper/LICENSE new file mode 100644 index 00000000..ffd0f5e8 --- /dev/null +++ b/paper/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Tamas Gal, Jerry Ling and Nick Amin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 00000000..e69de29b diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..10f7541e --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,47 @@ +--- +title: 'UnROOT: an I/O library for the CERN ROOT file format written in Julia' +tags: + - Julia + - HEP +authors: + - name: Tamás Gál + orcid: 0000-0001-7821-8673 + affiliation: "1, 2" + - name: Jerry Ling + orcid: 0000-0002-3359-0380 + affiliation: "3" + - name: Nick Amin + orcid: 0000- + affiliation: "4" +affiliations: + - name: Erlangen Centre for Astroparticle Physics + index: 1 + - name: Friedrich-Alexander-Universität Erlangen-Nürnberg + index: 2 + - name: Harvard University + index: 3 +date: 08 October 2021 +bibliography: paper.bib +--- +# Summary + + +# Statement of need + + +# Features and Functionality + + +# Comparison with existing software + +Julia and other languages... + +- UpROOT.jl +- ROOT.jl +- uproot +- ... + +# Conclusion + +# References + From 1ac738e43c4b1ad9a0a63be0da32e42eafefa5f1 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 09:01:03 +0200 Subject: [PATCH 02/25] Add open journal PDF generator GHA --- .github/workflows/draft-pdf.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/draft-pdf.yml diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 00000000..cf6d3dbf --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,23 @@ +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. 
+ paper-path: paper.md + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper.pdf From 6cc3c02fbda03f48edd79db54f582d78bbf72a8e Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 09:02:51 +0200 Subject: [PATCH 03/25] Fix path to source file --- .github/workflows/draft-pdf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index cf6d3dbf..420f58ee 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -12,7 +12,7 @@ jobs: with: journal: joss # This should be the path to the paper within your repo. - paper-path: paper.md + paper-path: paper/paper.md - name: Upload uses: actions/upload-artifact@v1 with: From 0a26130ea29eff1da0198091a5d40fc530b92d5e Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 09:04:48 +0200 Subject: [PATCH 04/25] Set output path --- .github/workflows/draft-pdf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index 420f58ee..76310246 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -20,4 +20,4 @@ jobs: # This is the output path where Pandoc will write the compiled # PDF. 
Note, this should be the same directory as the input # paper.md - path: paper.pdf + path: paper/paper.pdf From 56c438acc6497dfdf304ebff3e5295b1b9c57be5 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 9 Oct 2021 11:09:14 -0400 Subject: [PATCH 05/25] summary and statement of need --- paper/paper.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 10f7541e..6436a7ab 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -7,7 +7,7 @@ authors: - name: Tamás Gál orcid: 0000-0001-7821-8673 affiliation: "1, 2" - - name: Jerry Ling + - name: Jerry (Jiahong) Ling orcid: 0000-0002-3359-0380 affiliation: "3" - name: Nick Amin @@ -24,9 +24,27 @@ date: 08 October 2021 bibliography: paper.bib --- # Summary - +`UnROOT.jl` is a pure Julia implementation of CERN ROOT files I/O (`.root`) that is fast, +memory-efficient, and composes well with Julia's high-performance iteration, array, and +multi-threading interfaces. # Statement of need +The High-Energy Physics (HEP) community has been troubled by the two-language problem +for a long time. Often, physicists would start prototyping with a `Python` front-end +which glues to a `C++` back-end. Soon they will hit a task which is extremely hard to +express in columnar (i.e. "vectorized") style. This usually leads to either writing +`C++` kernel and interface it with `Python`, or, porting the prototype to `C++` and +start to maintain two code bases. Both options are engineering challenging for physicists +who are not also software engineering. + +Using `Python` front-end and dancing across language barriers also hinders the ability +to parallelize the tasks which are conceptually trivial most of the time. + +`UnROOT.jl` attempts to solve all of the above by choosing Julia, a high-performance +language with simple and expressive syntax. Users can freely escape to a `for-loop` +should vectorized-style shows inflexibility, without any performance degradation. 
+At the same time, `UnROOT.jl` transparently support multi-threading/multi-processing +by simply being a subtype of `AbstractArray` -- the limit is the sky. # Features and Functionality From 69e71226c288fbc8203537a6563eb2ce649e37a8 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 21:10:17 +0200 Subject: [PATCH 06/25] Minor cosmetics --- paper/paper.md | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 6436a7ab..5df2723e 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -29,23 +29,25 @@ memory-efficient, and composes well with Julia's high-performance iteration, arr multi-threading interfaces. # Statement of need -The High-Energy Physics (HEP) community has been troubled by the two-language problem -for a long time. Often, physicists would start prototyping with a `Python` front-end -which glues to a `C++` back-end. Soon they will hit a task which is extremely hard to -express in columnar (i.e. "vectorized") style. This usually leads to either writing -`C++` kernel and interface it with `Python`, or, porting the prototype to `C++` and -start to maintain two code bases. Both options are engineering challenging for physicists -who are not also software engineering. +The High-Energy Physics (HEP) community has been troubled by the two-language +problem for a long time. Often, physicists would start prototyping with a +`Python` front-end which glues to a `C/C++/Fortran` back-end. Soon they will hit +a task which is extremely hard to express in columnar (i.e. "vectorized") style, +a type of problems which are normally tackled with libraries like `numpy` or +`pandas`. This usually leads to either writing `C++` kernels and interface it +with `Python`, or, porting the prototype to `C++` and start to maintain two code +bases including the wrapper code. Both options are engineering challenges for +physicists who usually have no or little background in software engineering. 
-Using `Python` front-end and dancing across language barriers also hinders the ability -to parallelize the tasks which are conceptually trivial most of the time. - -`UnROOT.jl` attempts to solve all of the above by choosing Julia, a high-performance -language with simple and expressive syntax. Users can freely escape to a `for-loop` -should vectorized-style shows inflexibility, without any performance degradation. -At the same time, `UnROOT.jl` transparently support multi-threading/multi-processing -by simply being a subtype of `AbstractArray` -- the limit is the sky. +Using a `Python` front-end and dancing across language barriers also hinders the ability +to parallelize tasks that are conceptually trivial most of the time. +`UnROOT.jl` attempts to solve all of the above by choosing Julia, a +high-performance language with simple and expressive syntax. Users can freely +escape to a `for-loop` whenever vectorized-style processing is not flexible +enough, without any performance degradation. At the same time, `UnROOT.jl` +transparently supports multi-threading and multi-processing by simply being a +subtype of `AbstractArray` -- the limit is the sky. # Features and Functionality From f8694eb1e9119ae40993cd988d4d4351cdf56d6f Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 21:22:28 +0200 Subject: [PATCH 07/25] Add comparison intro and uproot --- paper/paper.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 5df2723e..85141cc4 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -54,11 +54,16 @@ subtype of `AbstractArray` -- the limit is the sky. # Comparison with existing software -Julia and other languages... 
+This section focusses on the comparison with other existing ROOT I/O solutions +in the Julia universe, however, one honorable mention is `uproot`, which is a +purely Python-based ROOT I/O library and played (plays) an important role for the +development of `UnROOT.jl` as it is by the time of writing the most complete and +best documented ROOT I/O implementation. + +`UpROOT.jl` is a wrapper for `uproot` and uses `PyCall` as a bridge. (TODO: +problems of Julia->PyWrapper->Awkward) -- UpROOT.jl - ROOT.jl -- uproot - ... # Conclusion From fed04a49581f1d376e14a3d3045be24d710d50b8 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Sat, 9 Oct 2021 21:24:31 +0200 Subject: [PATCH 08/25] Add uproot reference --- paper/paper.bib | 32 ++++++++++++++++++++++++++++++++ paper/paper.md | 9 +++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index e69de29b..a5887faa 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -0,0 +1,32 @@ +@software{jim_pivarski_2021_5539722, + author = {Jim Pivarski and + Henry Schreiner and + Nicholas Smith and + Chris Burr and + Dmitry Kalinkin and + Giordon Stark and + Nikolai Hartmann and + Doug Davis and + Ryunosuke O'Neil and + Andrzej Novak and + Ben Greiner and + Beojan Stanislaus and + ChristopheRappold and + Cosmin Deaconu and + Daniel Cervenkov and + Jonas Rübenach and + Josh Bendavid and + Kilian Lieret and + Michele Peresano and + Raymond Ehlers and + Ruggero Turra and + Tamas Gal and + Alexander Held}, + title = {scikit-hep/uproot4: 4.1.3}, + month = sep, + year = 2021, + publisher = {Zenodo}, + version = {4.1.3}, + doi = {10.5281/zenodo.5539722}, + url = {https://doi.org/10.5281/zenodo.5539722} +} diff --git a/paper/paper.md b/paper/paper.md index 85141cc4..982fa008 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -55,10 +55,11 @@ subtype of `AbstractArray` -- the limit is the sky. 
# Comparison with existing software This section focusses on the comparison with other existing ROOT I/O solutions -in the Julia universe, however, one honorable mention is `uproot`, which is a -purely Python-based ROOT I/O library and played (plays) an important role for the -development of `UnROOT.jl` as it is by the time of writing the most complete and -best documented ROOT I/O implementation. +in the Julia universe, however, one honorable mention is +`uproot` [@jim_pivarski_2021_5539722], which is a purely Python-based ROOT I/O +library and played (plays) an important role for the development of `UnROOT.jl` +as it is by the time of writing the most complete and best documented ROOT I/O +implementation. `UpROOT.jl` is a wrapper for `uproot` and uses `PyCall` as a bridge. (TODO: problems of Julia->PyWrapper->Awkward) From b458cfad81e2aaa3c3d932a4c01c00a1ef6c2805 Mon Sep 17 00:00:00 2001 From: Moelf Date: Mon, 18 Oct 2021 08:54:42 -0400 Subject: [PATCH 09/25] add functionality --- paper/paper.bib | 13 +++++++++++++ paper/paper.md | 46 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index a5887faa..8c83b9b9 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,3 +1,16 @@ +@article{Julia, + author = {Bezanson, Jeff. and Edelman, Alan. and Karpinski, Stefan. 
and Shah, Viral B.}, + title = {Julia: A Fresh Approach to Numerical Computing}, + journal = {SIAM Review}, + volume = {59}, + number = {1}, + pages = {65-98}, + year = {2017}, + doi = {10.1137/141000671}, + URL = { https://doi.org/10.1137/141000671}, + eprint = {https://doi.org/10.1137/141000671} +} + @software{jim_pivarski_2021_5539722, author = {Jim Pivarski and Henry Schreiner and diff --git a/paper/paper.md b/paper/paper.md index 982fa008..7ef2f28a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -43,7 +43,7 @@ Using a `Python` front-end and dancing across language barriers also hinders the to parallelize tasks that are conceptually trivial most of the time. `UnROOT.jl` attempts to solve all of the above by choosing Julia, a -high-performance language with simple and expressive syntax. Users can freely +high-performance language with simple and expressive syntax [@Julia]. Users can freely escape to a `for-loop` whenever vectorized-style processing is not flexible enough, without any performance degradation. At the same time, `UnROOT.jl` transparently supports multi-threading and multi-processing by simply being a @@ -51,6 +51,48 @@ subtype of `AbstractArray` -- the limit is the sky. # Features and Functionality +Opening and loading a "tree" lazily is simple: +```julia +julia> using UnROOT + +julia> f = ROOTFile("test/samples/NanoAODv5_sample.root") +ROOTFile with 2 entries and 21 streamers. 
+test/samples/NanoAODv5_sample.root +└─ Events + ├─ "run" + ├─ "luminosityBlock" + ├─ "event" + ├─ "HTXS_Higgs_pt" + ├─ "HTXS_Higgs_y" + └─ "⋮" + +julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$"]) + Row │ Electron_dxy nMuon Muon_eta Muon_pt + │ Vector{Float32} UInt32 Vector{Float32} Vector{Float32} +─────┼─────────────────────────────────────────────────────────── + 1 │ [0.000371] 0 [] [] + 2 │ [-0.00982] 2 [0.53, 0.229] [19.9, 15.3] + 3 │ [] 0 [] [] + 4 │ [-0.00157] 0 [] [] + ⋮ │ ⋮ ⋮ ⋮ ⋮ +``` + +Then, the `LazyTree` object acts as a table: you can iterate it sequentially or in parallel, +select entries based on range or masks etc: +```julia +for event in mytree + # ... Operate on event +end + +Threads.@threads for event in mytree # multi-threading + # ... Operate on event +end +``` + +The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with +the rest of Julia ecosystem. For example, syntactic loop fusion [^1] "just works", +and it works with Query-style tabular manipulation provided by packages like `Query.jl` +without any additional code support. 
# Comparison with existing software @@ -71,3 +113,5 @@ problems of Julia->PyWrapper->Awkward) # References + +[^1]: https://julialang.org/blog/2017/01/moredots/ From 40ef615576a6a7ed67d92f65b276a5170b57d3f3 Mon Sep 17 00:00:00 2001 From: Moelf Date: Mon, 18 Oct 2021 09:00:03 -0400 Subject: [PATCH 10/25] add performance citation --- paper/paper.bib | 11 +++++++++++ paper/paper.md | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index 8c83b9b9..b48325d6 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -11,6 +11,17 @@ @article{Julia eprint = {https://doi.org/10.1137/141000671} } +@article{JuliaPerformance, + title={Performance of julia for high energy physics analyses}, + author={Stanitzki, Marcel and Strube, Jan}, + journal={Computing and Software for Big Science}, + volume={5}, + number={1}, + pages={1--11}, + year={2021}, + publisher={Springer} +} + @software{jim_pivarski_2021_5539722, author = {Jim Pivarski and Henry Schreiner and diff --git a/paper/paper.md b/paper/paper.md index 7ef2f28a..a20fe3e2 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -43,7 +43,9 @@ Using a `Python` front-end and dancing across language barriers also hinders the to parallelize tasks that are conceptually trivial most of the time. `UnROOT.jl` attempts to solve all of the above by choosing Julia, a -high-performance language with simple and expressive syntax [@Julia]. Users can freely +high-performance language with simple and expressive syntax [@Julia]. Julia is designed +to solve the two-language problem in general. This has been studied for HEP specifically +as well[@JuliaPerformance]. Analysis software written in Julia can freely escape to a `for-loop` whenever vectorized-style processing is not flexible enough, without any performance degradation. 
At the same time, `UnROOT.jl` transparently supports multi-threading and multi-processing by simply being a From 018c046f322a59265d5d7992ff855a1117725def Mon Sep 17 00:00:00 2001 From: Nick Amin Date: Tue, 19 Oct 2021 19:21:54 -0700 Subject: [PATCH 11/25] small additions --- paper/paper.md | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index a20fe3e2..62195afd 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -11,7 +11,7 @@ authors: orcid: 0000-0002-3359-0380 affiliation: "3" - name: Nick Amin - orcid: 0000- + orcid: 0000-0003-2560-0013 affiliation: "4" affiliations: - name: Erlangen Centre for Astroparticle Physics @@ -20,6 +20,8 @@ affiliations: index: 2 - name: Harvard University index: 3 + - name: University of California, Santa Barbara + index: 4 date: 08 October 2021 bibliography: paper.bib --- @@ -53,7 +55,12 @@ subtype of `AbstractArray` -- the limit is the sky. # Features and Functionality -Opening and loading a "tree" lazily is simple: +`UnROOT.jl` can deserialize instances of the commonly used `TH1`, `TH2`, +`TDirectory`, and `TTree` ROOT classes. All basic C++ types for `TTree` +branches are supported, including their nested variants. Additionally, a +`UnROOT.jl` provides a way to hook into deserialization for custom types. +Opening and loading a `TTree` lazily is simple: + ```julia julia> using UnROOT @@ -79,8 +86,9 @@ julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$ ⋮ │ ⋮ ⋮ ⋮ ⋮ ``` -Then, the `LazyTree` object acts as a table: you can iterate it sequentially or in parallel, -select entries based on range or masks etc: +Then, the `LazyTree` object acts as a table: you can iterate through it sequentially or in parallel, +select entries based on range or masks, and operate on whole columns: + ```julia for event in mytree # ... Operate on event @@ -89,10 +97,12 @@ end Threads.@threads for event in mytree # multi-threading # ... 
Operate on event end + +mytree.Muon_pt # whole column as a lazy vector of vectors ``` The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with -the rest of Julia ecosystem. For example, syntactic loop fusion [^1] "just works", +the rest of the Julia ecosystem. For example, syntactic loop fusion [^1] "just works", and it works with Query-style tabular manipulation provided by packages like `Query.jl` without any additional code support. @@ -111,6 +121,12 @@ problems of Julia->PyWrapper->Awkward) - ROOT.jl - ... + + +_ (?) Generic statement like: `UpROOT.jl` has demonstrated tree processing +speeds at the same level as the `C++` `ROOT` framework as well as the +Python-based `uproot` library _ + # Conclusion # References From 08033f26c71363c87539e043bcb26d8089de1739 Mon Sep 17 00:00:00 2001 From: Nick Amin Date: Tue, 19 Oct 2021 19:34:14 -0700 Subject: [PATCH 12/25] for ci --- paper/paper.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 62195afd..9c00af5f 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -67,23 +67,23 @@ julia> using UnROOT julia> f = ROOTFile("test/samples/NanoAODv5_sample.root") ROOTFile with 2 entries and 21 streamers. test/samples/NanoAODv5_sample.root -└─ Events - ├─ "run" - ├─ "luminosityBlock" - ├─ "event" - ├─ "HTXS_Higgs_pt" - ├─ "HTXS_Higgs_y" - └─ "⋮" + Events + "run" + "luminosityBlock" + "event" + "HTXS_Higgs_pt" + "HTXS_Higgs_y" + ... 
julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$"]) - Row │ Electron_dxy nMuon Muon_eta Muon_pt - │ Vector{Float32} UInt32 Vector{Float32} Vector{Float32} -─────┼─────────────────────────────────────────────────────────── - 1 │ [0.000371] 0 [] [] - 2 │ [-0.00982] 2 [0.53, 0.229] [19.9, 15.3] - 3 │ [] 0 [] [] - 4 │ [-0.00157] 0 [] [] - ⋮ │ ⋮ ⋮ ⋮ ⋮ + Row Electron_dxy nMuon Muon_eta Muon_pt + Vector{Float32} UInt32 Vector{Float32} Vector{Float32} + + 1 [0.000371] 0 [] [] + 2 [-0.00982] 2 [0.53, 0.229] [19.9, 15.3] + 3 [] 0 [] [] + 4 [-0.00157] 0 [] [] + ... ``` Then, the `LazyTree` object acts as a table: you can iterate through it sequentially or in parallel, From 1713245116b02c087deb15cb52b29d03d933a240 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Wed, 20 Oct 2021 09:21:14 +0200 Subject: [PATCH 13/25] Cite pandas and numpy --- paper/paper.bib | 32 ++++++++++++++++++++++++++++++++ paper/paper.md | 11 ++++++----- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index b48325d6..fd6a6b71 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -54,3 +54,35 @@ @software{jim_pivarski_2021_5539722 doi = {10.5281/zenodo.5539722}, url = {https://doi.org/10.5281/zenodo.5539722} } +@Article{harris2020array, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. 
Oliphant}, + year = {2020}, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, + publisher = {Springer Science and Business Media {LLC}}, + url = {https://doi.org/10.1038/s41586-020-2649-2} +} +@software{reback2020pandas, + author = {The pandas development team}, + title = {pandas-dev/pandas: Pandas}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {latest}, + doi = {10.5281/zenodo.3509134}, + url = {https://doi.org/10.5281/zenodo.3509134} +} diff --git a/paper/paper.md b/paper/paper.md index 9c00af5f..9b75be62 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -35,11 +35,12 @@ The High-Energy Physics (HEP) community has been troubled by the two-language problem for a long time. Often, physicists would start prototyping with a `Python` front-end which glues to a `C/C++/Fortran` back-end. Soon they will hit a task which is extremely hard to express in columnar (i.e. "vectorized") style, -a type of problems which are normally tackled with libraries like `numpy` or -`pandas`. This usually leads to either writing `C++` kernels and interface it -with `Python`, or, porting the prototype to `C++` and start to maintain two code -bases including the wrapper code. Both options are engineering challenges for -physicists who usually have no or little background in software engineering. +a type of problems which are normally tackled with libraries like +`numpy`[@harris2020array] or `pandas`[@reback2020pandas]. This usually leads to +either writing `C++` kernels and interface it with `Python`, or, porting the +prototype to `C++` and start to maintain two code bases including the wrapper +code. Both options are engineering challenges for physicists who usually have no +or little background in software engineering. Using a `Python` front-end and dancing across language barriers also hinders the ability to parallelize tasks that are conceptually trivial most of the time. 
From 0eb18be1475a854c653e5498ba8548f228335187 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Wed, 20 Oct 2021 09:26:24 +0200 Subject: [PATCH 14/25] Add ROOT citation --- paper/paper.bib | 10 ++++++++++ paper/paper.md | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index fd6a6b71..a746cb58 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -86,3 +86,13 @@ @software{reback2020pandas doi = {10.5281/zenodo.3509134}, url = {https://doi.org/10.5281/zenodo.3509134} } +@article{Brun:1997pa, + author = "Brun, R. and Rademakers, F.", + editor = "Werlen, M. and Perret-Gallix, D.", + title = "{ROOT: An object oriented data analysis framework}", + doi = "10.1016/S0168-9002(97)00048-X", + journal = "Nucl. Instrum. Meth. A", + volume = "389", + pages = "81--86", + year = "1997" +} diff --git a/paper/paper.md b/paper/paper.md index 9b75be62..9e59fa3c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -26,9 +26,9 @@ date: 08 October 2021 bibliography: paper.bib --- # Summary -`UnROOT.jl` is a pure Julia implementation of CERN ROOT files I/O (`.root`) that is fast, -memory-efficient, and composes well with Julia's high-performance iteration, array, and -multi-threading interfaces. +`UnROOT.jl` is a pure Julia implementation of CERN ROOT[@Brun:1997pa] files I/O +(`.root`) that is fast, memory-efficient, and composes well with Julia's +high-performance iteration, array, and multi-threading interfaces. 
# Statement of need The High-Energy Physics (HEP) community has been troubled by the two-language From 0b008a79570f6d9dd07cdfcdd19237492fe137c8 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Wed, 25 May 2022 09:34:17 +0200 Subject: [PATCH 15/25] Cleanup and some additions --- paper/paper.md | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 9e59fa3c..8e3e4fea 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -37,11 +37,13 @@ problem for a long time. Often, physicists would start prototyping with a a task which is extremely hard to express in columnar (i.e. "vectorized") style, a type of problems which are normally tackled with libraries like `numpy`[@harris2020array] or `pandas`[@reback2020pandas]. This usually leads to -either writing `C++` kernels and interface it with `Python`, or, porting the +either writing `C++` kernels and interface them with `Python`, or porting the prototype to `C++` and start to maintain two code bases including the wrapper code. Both options are engineering challenges for physicists who usually have no -or little background in software engineering. - +or little background in software engineering. Many steps of this process are +critical, like identifying bottlenecks, creating an architecture which is +both performant and maintainable at the same time while still being user-friendly +and logically structured. Using a `Python` front-end and dancing across language barriers also hinders the ability to parallelize tasks that are conceptually trivial most of the time. @@ -51,14 +53,16 @@ to solve the two-language problem in general. This has been studied for HEP spec as well[@JuliaPerformance]. Analysis software written in Julia can freely escape to a `for-loop` whenever vectorized-style processing is not flexible enough, without any performance degradation. 
At the same time, `UnROOT.jl` -transparently supports multi-threading and multi-processing by simply being a -subtype of `AbstractArray` -- the limit is the sky. +transparently supports multi-threading and multi-processing by simply providing +data structures which are a subtype of `AbstractArray`, the built-in abstract type +for array-like objects, which allows to interface with array-routines from other +packages easily, thanks to multiple dispatch, one of the main features of Julia. # Features and Functionality `UnROOT.jl` can deserialize instances of the commonly used `TH1`, `TH2`, `TDirectory`, and `TTree` ROOT classes. All basic C++ types for `TTree` -branches are supported, including their nested variants. Additionally, a +branches are supported, including their nested variants. Additionally, `UnROOT.jl` provides a way to hook into deserialization for custom types. Opening and loading a `TTree` lazily is simple: @@ -87,8 +91,9 @@ julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$ ... ``` -Then, the `LazyTree` object acts as a table: you can iterate through it sequentially or in parallel, -select entries based on range or masks, and operate on whole columns: +Then, the `LazyTree` object acts as a table which supports sequential or parallel +iteration, selections and filtering based on ranges or masks, and operations +on whole columns: ```julia for event in mytree @@ -103,16 +108,16 @@ mytree.Muon_pt # whole column as a lazy vector of vectors ``` The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with -the rest of the Julia ecosystem.
For example, syntactic loop fusion [^1] or + Query-style tabular manipulations provided by packages like `Query.jl` +without any additional code support just work out-of-the-box. # Comparison with existing software This section focusses on the comparison with other existing ROOT I/O solutions in the Julia universe, however, one honorable mention is `uproot` [@jim_pivarski_2021_5539722], which is a purely Python-based ROOT I/O -library and played (plays) an important role for the development of `UnROOT.jl` +library and played (still plays) an important role for the development of `UnROOT.jl` as it is by the time of writing the most complete and best documented ROOT I/O implementation. From c5d20fe1695a4a6a891f4ef545db965c20decc69 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 00:23:22 +0200 Subject: [PATCH 16/25] Minor updates and additions --- paper/paper.md | 64 +++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 8e3e4fea..d5959e7d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -41,30 +41,40 @@ either writing `C++` kernels and interface them with `Python`, or porting the prototype to `C++` and start to maintain two code bases including the wrapper code. Both options are engineering challenges for physicists who usually have no or little background in software engineering. Many steps of this process are -critical, like identifying bottlenecks, creating an architecture which is -both performant and maintainable at the same time while still being user-friendly -and logically structured. -Using a `Python` front-end and dancing across language barriers also hinders the ability -to parallelize tasks that are conceptually trivial most of the time. +critical, like identifying bottlenecks, creating an architecture which is both +performant and maintainable at the same time while still being user-friendly and +logically structured. 
Using a `Python` front-end and dancing across language +barriers also hinders the ability to parallelize tasks that are conceptually +trivial most of the time. `UnROOT.jl` attempts to solve all of the above by choosing Julia, a -high-performance language with simple and expressive syntax [@Julia]. Julia is designed -to solve the two-language problem in general. This has been studied for HEP specifically -as well[@JuliaPerformance]. Analysis software written in Julia can freely -escape to a `for-loop` whenever vectorized-style processing is not flexible -enough, without any performance degradation. At the same time, `UnROOT.jl` -transparently supports multi-threading and multi-processing by simply providing -data structures which are a subtype of `AbstractArray`, the built-in abstract type -for array-like objects, which allows to interface with array-routines from other -packages easily, thanks to multiple dispatch, one of the main features of Julia. +high-performance language with simple and expressive syntax [@Julia]. Julia is +designed to solve the two-language problem in general. This has been studied for +HEP specifically as well[@JuliaPerformance]. Analysis software written in Julia +can freely escape to a `for-loop` whenever vectorized-style processing is not +flexible enough, without any performance degradation. At the same time, +`UnROOT.jl` transparently supports multi-threading and multi-processing by +simply providing data structures which are a subtype of `AbstractArray`, the +built-in abstract type for array-like objects, which allows interfacing with +array-routines from other packages easily, thanks to multiple dispatch, one of +the main features of Julia. # Features and Functionality -`UnROOT.jl` can deserialize instances of the commonly used `TH1`, `TH2`, -`TDirectory`, and `TTree` ROOT classes. All basic C++ types for `TTree` -branches are supported, including their nested variants.
Additionally, -`UnROOT.jl` provides a way to hook into deserialization for custom types. -Opening and loading a `TTree` lazily is simple: +The `ROOT` dataformat is flexible and mostly self-descriptive. Users can define +their own data structures (C++ classes) which derive from `ROOT` classes and +serialise them into directories, trees and branches. The information about the +deserialisation is written to the output file (therfore: self-descriptive) but +there are some basic structures and constants needed to bootstrap the parsing +process. + +`UnROOT.jl` understands the core structure of `ROOT` files and is able to +deserialize instances of the commonly used `TH1`, `TH2`, `TDirectory`, and +`TTree` ROOT classes. All basic C++ types for `TTree` branches are supported, +including their nested variants. Additionally, `UnROOT.jl` provides a way to +hook into the deserialisation of custom types, where the automatic parsing +fails. Opening and loading a `TTree` lazily -- i.e. without reading the whole +data into memory -- is simple: ```julia julia> using UnROOT @@ -107,18 +117,18 @@ end mytree.Muon_pt # whole column as a lazy vector of vectors ``` -The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with -the rest of the Julia ecosystem. For example, syntactic loop fusion [^1] or - Query-style tabular manipulations provided by packages like `Query.jl` -without any additional code support just work out-of-the-box. +The `LazyTree` is designed as `<: AbstractArray` which makes it compose well +with the rest of the Julia ecosystem. For example, syntactic loop fusion [^1] or +Query-style tabular manipulations provided by packages like `Query.jl` without +any additional code support just work out-of-the-box. 
# Comparison with existing software This section focusses on the comparison with other existing ROOT I/O solutions -in the Julia universe, however, one honorable mention is -`uproot` [@jim_pivarski_2021_5539722], which is a purely Python-based ROOT I/O -library and played (still plays) an important role for the development of `UnROOT.jl` -as it is by the time of writing the most complete and best documented ROOT I/O +in the Julia universe, however, one honorable mention is `uproot` +[@jim_pivarski_2021_5539722], which is a purely Python-based ROOT I/O library +and played (still plays) an important role for the development of `UnROOT.jl` as +it is by the time of writing the most complete and best documented ROOT I/O implementation. `UpROOT.jl` is a wrapper for `uproot` and uses `PyCall` as a bridge. (TODO: From ea8ceabee380bc28e2fcd92df110633ec38a76d1 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:26:23 +0200 Subject: [PATCH 17/25] Update references --- paper/paper.bib | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/paper/paper.bib b/paper/paper.bib index a746cb58..4980e354 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -54,6 +54,25 @@ @software{jim_pivarski_2021_5539722 doi = {10.5281/zenodo.5539722}, url = {https://doi.org/10.5281/zenodo.5539722} } +@software{pivarski_jim_2018_6522027, + author = {Pivarski, Jim and + Osborne, Ianna and + Ifrim, Ioana and + Schreiner, Henry and + Hollands, Angus and + Biswas, Anish and + Das, Pratyush and + Roy Choudhury, Santam and + Smith, Nicholas}, + title = {Awkward Array}, + month = oct, + year = 2018, + note = {If you use this software, please cite it as below.}, + publisher = {Zenodo}, + version = {1.9.0rc4}, + doi = {10.5281/zenodo.6522027}, + url = {https://doi.org/10.5281/zenodo.6522027} +} @Article{harris2020array, title = {Array programming with {NumPy}}, author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. 
@@ -96,3 +115,18 @@ @article{Brun:1997pa pages = "81--86", year = "1997" } + +@article{Adri_n_Mart_nez_2016, + doi = {10.1088/0954-3899/43/8/084001}, + url = {https://doi.org/10.1088%2F0954-3899%2F43%2F8%2F084001}, + year = 2016, + month = {jun}, + publisher = {{IOP} Publishing}, + volume = {43}, + number = {8}, + pages = {084001}, + author = {S Adri{\'{a} +}n-Mart{\'{\i}}nez and M Ageron and F Aharonian and S Aiello and A Albert and F Ameli and E Anassontzis and M Andre and G Androulakis and M Anghinolfi and G Anton and M Ardid and T Avgitas and G Barbarino and E Barbarito and B Baret and J Barrios-Mart{\'{\i}} and B Belhorma and A Belias and E Berbee and A van den Berg and V Bertin and S Beurthey and V van Beveren and N Beverini and S Biagi and A Biagioni and M Billault and M Bond{\`{\i}} and R Bormuth and B Bouhadef and G Bourlis and S Bourret and C Boutonnet and M Bouwhuis and C Bozza and R Bruijn and J Brunner and E Buis and J Busto and G Cacopardo and L Caillat and M Calamai and D Calvo and A Capone and L Caramete and S Cecchini and S Celli and C Champion and R Cherkaoui El Moursli and S Cherubini and T Chiarusi and M Circella and L Classen and R Cocimano and J A B Coelho and A Coleiro and S Colonges and R Coniglione and M Cordelli and A Cosquer and P Coyle and A Creusot and G Cuttone and A D'Amico and G De Bonis and G De Rosa and C De Sio and F Di Capua and I Di Palma and A F D{\'{\i}}az Garc{\'{\i}}a and C Distefano and C Donzaud and D Dornic and Q Dorosti-Hasankiadeh and E Drakopoulou and D Drouhin and L Drury and M Durocher and T Eberl and S Eichie and D van Eijk and I El Bojaddaini and N El Khayati and D Elsaesser and A Enzenhöfer and F Fassi and P Favali and P Fermani and G Ferrara and C Filippidis and G Frascadore and L A Fusco and T Gal and S Galat{\`{a}} and F Garufi and P Gay and M Gebyehu and V Giordano and N Gizani and R Gracia and K Graf and T Gr{\'{e}}goire and G Grella and R Habel and S Hallmann and H van Haren and S Harissopulos and T Heid and 
A Heijboer and E Heine and S Henry and J J Hern{\'{a}}ndez-Rey and M Hevinga and J Hofestädt and C M F Hugon and G Illuminati and C W James and P Jansweijer and M Jongen and M de Jong and M Kadler and O Kalekin and A Kappes and U F Katz and P Keller and G Kieft and D Kie{\ss}ling and E N Koffeman and P Kooijman and A Kouchner and V Kulikovskiy and R Lahmann and P Lamare and A Leisos and E Leonora and M Lindsey Clark and A Liolios and C D Llorens Alvarez and D Lo Presti and H Löhner and A Lonardo and M Lotze and S Loucatos and E Maccioni and K Mannheim and A Margiotta and A Marinelli and O Mari{\c{s}} and C Markou and J A Mart{\'{\i}}nez-Mora and A Martini and R Mele and K W Melis and T Michael and P Migliozzi and E Migneco and P Mijakowski and A Miraglia and C M Mollo and M Mongelli and M Morganti and A Moussa and P Musico and M Musumeci and S Navas and C A Nicolau and I Olcina and C Olivetto and A Orlando and A Papaikonomou and R Papaleo and G E P{\u{a}}v{\u{a}}la{\c{s}} and H Peek and C Pellegrino and C Perrina and M Pfutzner and P Piattelli and K Pikounis and G E Poma and V Popa and T Pradier and F Pratolongo and G Pühlhofer and S Pulvirenti and L Quinn and C Racca and F Raffaelli and N Randazzo and P Rapidis and P Razis and D Real and L Resvanis and J Reubelt and G Riccobene and C Rossi and A Rovelli and M Salda{\~{n}}a and I Salvadori and D F E Samtleben and A S{\'{a}}nchez Garc{\'{\i}}a and A S{\'{a}}nchez Losa and M Sanguineti and A Santangelo and D Santonocito and P Sapienza and F Schimmel and J Schmelling and V Sciacca and M Sedita and T Seitz and I Sgura and F Simeone and I Siotis and V Sipala and B Spisso and M Spurio and G Stavropoulos and J Steijger and S M Stellacci and D Stransky and M Taiuti and Y Tayalati and D T{\'{e}}zier and S Theraube and L Thompson and P Timmer and C Tönnis and L Trasatti and A Trovato and A Tsirigotis and S Tzamarias and E Tzamariudaki and B Vallage and V Van Elewyck and J Vermeulen and P Vicini and S Viola and D Vivolo and M 
Volkert and G Voulgaris and L Wiggers and J Wilms and E de Wolf and K Zachariadou and J D Zornoza and J Z{\'{u}}{\~{n}}iga}, + title = {Letter of intent for {KM}3NeT 2.0}, + journal = {Journal of Physics G: Nuclear and Particle Physics} +} From 40eb64d6ac75ed1f313b8c8071e7cd4cd5797682 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:26:31 +0200 Subject: [PATCH 18/25] Cleanup and additions --- paper/paper.md | 66 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index d5959e7d..5f951a68 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -66,15 +66,23 @@ their own data structures (C++ classes) which derive from `ROOT` classes and serialise them into directories, trees and branches. The information about the deserialisation is written to the output file (therfore: self-descriptive) but there are some basic structures and constants needed to bootstrap the parsing -process. - -`UnROOT.jl` understands the core structure of `ROOT` files and is able to -deserialize instances of the commonly used `TH1`, `TH2`, `TDirectory`, and -`TTree` ROOT classes. All basic C++ types for `TTree` branches are supported, -including their nested variants. Additionally, `UnROOT.jl` provides a way to -hook into the deserialisation of custom types, where the automatic parsing -fails. Opening and loading a `TTree` lazily -- i.e. without reading the whole -data into memory -- is simple: +process. One of the biggest advantages of the `ROOT` data format is the ability +to store jagged structures like nested arrays of structs with different +sub-array lengths. In high-energy physics, such structures are preferred to +resemble e.g. particle interactions and detector responses as signals from +different hardware components, combined into a tree of events. 
+ +`UnROOT.jl` understands the core structure of `ROOT` files, and is able to +decompress and deserialize instances of the commonly used `TH1`, `TH2`, +`TDirectory`, `TTree` etc. ROOT classes. All basic C++ types for `TTree` +branches are supported as well, including their nested variants. Additionally, +`UnROOT.jl` provides a way to hook into the deserialisation process of custom +types where the automatic parsing fails. By the time of writing, `UnROOT` is +already used successfully in the data analysis of the KM3NeT neutrino +telescope[@Adri_n_Mart_nez_2016] and the CMS detector. + +Opening and loading a `TTree` lazily -- i.e. without reading the whole data into +memory -- is simple: ```julia julia> using UnROOT @@ -101,9 +109,10 @@ julia> mytree = LazyTree(f, "Events", ["Electron_dxy", "nMuon", r"Muon_(pt|eta)$ ... ``` -Then, the `LazyTree` object acts as a table which suports sequential or parallel -iteration, selectections and filtering based on ranges or masks, and operations -on whole columns: +As seen in the above example, the entries in the columns are multi-dimensional +and jagged. The `LazyTree` object acts as a table which supports sequential +or parallel iteration, selections and filtering based on ranges or masks, and +operations on whole columns: ```julia for event in mytree @@ -119,7 +128,7 @@ mytree.Muon_pt # whole column as a lazy vector of vectors The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with the rest of the Julia ecosystem. For example, syntactic loop fusion [^1] or -Query-style tabular manipulations provided by packages like `Query.jl` without +Query-style tabular manipulations provided by packages like `Query.jl`[^2] without any additional code support just work out-of-the-box. # Comparison with existing software @@ -131,21 +140,32 @@ and played (still plays) an important role for the development of `UnROOT.jl` as it is by the time of writing the most complete and best documented ROOT I/O implementation.
-`UpROOT.jl` is a wrapper for `uproot` and uses `PyCall` as a bridge. (TODO: -problems of Julia->PyWrapper->Awkward) - -- ROOT.jl -- ... - +- `UpROOT.jl` is a wrapper for the aforementioned `uproot` Python package and + uses `PyCall.jl`[^3] as a bridge, which means that it relies on `Python` as a + glue language. In addition to that, `uproot` itself utilises the C++ library + `AwkwardArray`[@pivarski_jim_2018_6522027] to efficiently deal with jagged + data in `ROOT` files. Most of the features of `uproot` are available in the + Julia context, but there are intrinsic performance and usability drawbacks due + to the three language architecture. - -_ (?) Generic statement like: `UpROOT.jl` has demonstrated tree processing -speeds at the same level as the `C++` `ROOT` framework as well as the -Python-based `uproot` library _ +- `ROOT.jl`[^4] is one of the oldest Julia `ROOT` packages. It uses C++ bindings to + directly wrap the `ROOT` framework and therefore is not limited ot I/O. + Unfortunately, the `Cxx.jl`[^5] package which is used to generate the C++ glue + code does not support Julia 1.4 or later. The multi-threaded features are also + limited. # Conclusion +`UnROOT.jl` is an important package in high-energy physics and related +scientific fields where the `ROOT` dataformat is established, since the ability +to read and parse scientific data is certainly the first mandatory step to open +the window to a programming language and its package ecosystem. 
+ # References [^1]: https://julialang.org/blog/2017/01/moredots/ +[^2]: https://github.com/queryverse/Query.jl +[^3]: https://github.com/JuliaPy/PyCall.jl +[^4]: https://github.com/JuliaHEP/ROOT.jl +[^5]: https://github.com/JuliaInterop/Cxx.jl From 4c4b23519a8ff79c7e870b974698b275c7109992 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:30:33 +0200 Subject: [PATCH 19/25] Update conclusions with Jerry's comments --- paper/paper.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 5f951a68..9f07e69c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -159,7 +159,10 @@ implementation. `UnROOT.jl` is an important package in high-energy physics and related scientific fields where the `ROOT` dataformat is established, since the ability to read and parse scientific data is certainly the first mandatory step to open -the window to a programming language and its package ecosystem. +the window to a programming language and its package ecosystem. `UnROOT.jl` has +demonstrated tree processing speeds at the same level as the `C++` `ROOT` +framework in per-event iteration as well as the Python-based `uproot` library in +chunked iteration. # References From ce82cac8821d5b5795035e5917c1bea959620ce4 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:31:11 +0200 Subject: [PATCH 20/25] Fix typo --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 9f07e69c..8d6d5f80 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -123,7 +123,7 @@ Threads.@threads for event in mytree # multi-threading # ... 
Operate on event end -mytree.Muon_pt # whole column as a lazy vector of vectors +mytree.Muon_pt # a column as a lazy vector of vectors ``` The `LazyTree` is designed as `<: AbstractArray` which makes it compose well From 0bc724bde2330ee557f5fa5985efb993bd381c83 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:35:02 +0200 Subject: [PATCH 21/25] Add NanoAOD reference --- paper/paper.bib | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paper/paper.bib b/paper/paper.bib index 4980e354..781be1f0 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -130,3 +130,14 @@ @article{Adri_n_Mart_nez_2016 title = {Letter of intent for {KM}3NeT 2.0}, journal = {Journal of Physics G: Nuclear and Particle Physics} } +@article{Ehataht:2020ebp, + author = {Ehat\"aht, Karl}, + editor = "Doglioni, C. and Kim, D. and Stewart, G. A. and Silvestris, L. and Jackson, P. and Kamleh, W.", + collaboration = "CMS", + title = "{NANOAOD: a new compact event data format in CMS}", + doi = "10.1051/epjconf/202024506002", + journal = "EPJ Web Conf.", + volume = "245", + pages = "06002", + year = "2020" +} From d3949aeb6a9ca5ba6dd0c8c7280552cb94be1732 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:35:54 +0200 Subject: [PATCH 22/25] Add reference to CMS/NanoAOD --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 8d6d5f80..87df2889 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -79,7 +79,7 @@ branches are supported as well, including their nested variants. Additionally, `UnROOT.jl` provides a way to hook into the deserialisation process of custom types where the automatic parsing fails. By the time of writing, `UnROOT` is already used successfully in the data analysis of the KM3NeT neutrino -telescope[@Adri_n_Mart_nez_2016] and the CMS detector. +telescope[@Adri_n_Mart_nez_2016] and the CMS detector[@Ehataht:2020ebp]. Opening and loading a `TTree` lazily -- i.e.
without reading the whole data into memory -- is simple: From c4814647f56a86691910d6c79a0d3f05cc9d3133 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:44:47 +0200 Subject: [PATCH 23/25] Fix pandas reference --- paper/paper.bib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 781be1f0..b9eb5265 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -96,8 +96,8 @@ @Article{harris2020array url = {https://doi.org/10.1038/s41586-020-2649-2} } @software{reback2020pandas, - author = {The pandas development team}, - title = {pandas-dev/pandas: Pandas}, + author = {{The pandas development team}}, + title = {Pandas}, month = feb, year = 2020, publisher = {Zenodo}, From 75821bd0b95ae6d8aa2c27a06a31f7209e6b5f1f Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 01:46:30 +0200 Subject: [PATCH 24/25] Add spaces between words and references --- paper/paper.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 87df2889..07a31942 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -26,7 +26,7 @@ date: 08 October 2021 bibliography: paper.bib --- # Summary -`UnROOT.jl` is a pure Julia implementation of CERN ROOT[@Brun:1997pa] files I/O +`UnROOT.jl` is a pure Julia implementation of CERN ROOT [@Brun:1997pa] files I/O (`.root`) that is fast, memory-efficient, and composes well with Julia's high-performance iteration, array, and multi-threading interfaces. @@ -36,7 +36,7 @@ problem for a long time. Often, physicists would start prototyping with a `Python` front-end which glues to a `C/C++/Fortran` back-end. Soon they will hit a task which is extremely hard to express in columnar (i.e. "vectorized") style, a type of problems which are normally tackled with libraries like -`numpy`[@harris2020array] or `pandas`[@reback2020pandas]. This usually leads to +`numpy` [@harris2020array] or `pandas` [@reback2020pandas]. 
This usually leads to either writing `C++` kernels and interface them with `Python`, or porting the prototype to `C++` and start to maintain two code bases including the wrapper code. Both options are engineering challenges for physicists who usually have no @@ -50,7 +50,7 @@ trivial most of the time. `UnROOT.jl` attempts to solve all of the above by choosing Julia, a high-performance language with simple and expressive syntax [@Julia]. Julia is designed to solve the two-language problem in general. This has been studied for -HEP specifically as well[@JuliaPerformance]. Analysis software written in Julia +HEP specifically as well [@JuliaPerformance]. Analysis software written in Julia can freely escape to a `for-loop` whenever vectorized-style processing is not flexible enough, without any performance degradation. At the same time, `UnROOT.jl` transparently supports multi-threading and multi-processing by @@ -79,7 +79,7 @@ branches are supported as well, including their nested variants. Additionally, `UnROOT.jl` provides a way to hook into the deserialisation process of custom types where the automatic parsing fails. By the time of writing, `UnROOT` is already used successfully in the data analysis of the KM3NeT neutrino -telescope[@Adri_n_Mart_nez_2016] and the CMS detector[@Ehataht:2020ebp]. +telescope [@Adri_n_Mart_nez_2016] and the CMS detector [@Ehataht:2020ebp]. Opening and loading a `TTree` lazily -- i.e. without reading the whole data into memory -- is simple: @@ -128,7 +128,7 @@ mytree.Muon_pt # a column as a lazy vector of vectors The `LazyTree` is designed as `<: AbstractArray` which makes it compose well with the rest of the Julia ecosystem. For example, syntactic loop fusion [^1] or -Query-style tabular manipulations provided by packages like `Query.jl`[^2] without +Query-style tabular manipulations provided by packages like `Query.jl` [^2] without any additional code support just work out-of-the-box. 
# Comparison with existing software @@ -141,16 +141,16 @@ it is by the time of writing the most complete and best documented ROOT I/O implementation. - `UpROOT.jl` is a wrapper for the aforementioned `uproot` Python package and - uses `PyCall.jl`[^3] as a bridge, which means that it relies on `Python` as a + uses `PyCall.jl` [^3] as a bridge, which means that it relies on `Python` as a glue language. In addition to that, `uproot` itself utilises the C++ library - `AwkwardArray`[@pivarski_jim_2018_6522027] to efficiently deal with jagged + `AwkwardArray` [@pivarski_jim_2018_6522027] to efficiently deal with jagged data in `ROOT` files. Most of the features of `uproot` are available in the Julia context, but there are intrinsic performance and usability drawbacks due to the three language architecture. -- `ROOT.jl`[^4] is one of the oldest Julia `ROOT` packages. It uses C++ bindings to +- `ROOT.jl` [^4] is one of the oldest Julia `ROOT` packages. It uses C++ bindings to directly wrap the `ROOT` framework and therefore is not limited ot I/O. - Unfortunately, the `Cxx.jl`[^5] package which is used to generate the C++ glue + Unfortunately, the `Cxx.jl` [^5] package which is used to generate the C++ glue code does not support Julia 1.4 or later. The multi-threaded features are also limited. From d4400d04e36a2806e537be0d08d0c1714c98f780 Mon Sep 17 00:00:00 2001 From: Tamas Gal Date: Thu, 2 Jun 2022 10:25:49 +0200 Subject: [PATCH 25/25] Fix typo --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 07a31942..d9b5ede5 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -64,7 +64,7 @@ the main features of Julia. The `ROOT` dataformat is flexible and mostly self-descriptive. Users can define their own data structures (C++ classes) which derive from `ROOT` classes and serialise them into directories, trees and branches. 
The information about the -deserialisation is written to the output file (therfore: self-descriptive) but +deserialisation is written to the output file (therefore: self-descriptive) but there are some basic structures and constants needed to bootstrap the parsing process. One of the biggest advantages of the `ROOT` data format is the ability to store jagged structures like nested arrays of structs with different