diff --git a/README.md b/README.md index 01a32e6..4561572 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,16 @@ Pkg.add("SASLib") I did benchmarking mostly on my Macbook Pro laptop. In general, the Julia implementation is somewhere between 10-100x faster than the Python Pandas. Test results are documented in the `test/perf_results_` folders. +Latest performance [test results for v1.0.0](test/perf_results_1.0.0) are as follows: + +Test|Result| +----|------| +py\_jl\_homimp\_50.md |Julia is ~27.9x faster than Python/Pandas| +py\_jl\_numeric\_1000000\_2\_100.md |Julia is ~10.2x faster than Python/Pandas| +py\_jl\_productsales\_100.md |Julia is ~46.9x faster than Python/Pandas| +py\_jl\_test1\_100.md |Julia is ~118.8x faster than Python/Pandas| +py\_jl\_topical\_30.md |Julia is ~27.3x faster than Python/Pandas| + ## User Guide ``` diff --git a/test/perf_results_1.0.0/py_jl_homimp_50.md b/test/perf_results_1.0.0/py_jl_homimp_50.md new file mode 100644 index 0000000..0bf31c0 --- /dev/null +++ b/test/perf_results_1.0.0/py_jl_homimp_50.md @@ -0,0 +1,73 @@ +# Julia/Python Performance Test Result + +## Summary + +Julia is ~27.9x faster than Python/Pandas + +## Test File + +Iterations: 50 + +Filename|Size|Rows|Columns|Numeric Columns|String Columns +--------|----|----|-------|---------------|-------------- +homimp.sas7bdat|1.2 MB|46641|6|1|5 + +## Python +``` +$ python -V +Python 3.7.1 +$ python perf_test1.py data_AHS2013/homimp.sas7bdat 50 +Minimum: 0.5793 seconds +``` + +## Julia (ObjectPool) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 20.20 MiB + allocs estimate: 494963 + -------------- + minimum time: 39.500 ms (0.00% GC) + median time: 44.556 ms (0.00% GC) + mean time: 44.054 ms (4.70% 
GC) + maximum time: 63.587 ms (7.46% GC) + -------------- + samples: 50 + evals/sample: 1 +``` + +## Julia (Regular String Array) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 18.02 MiB + allocs estimate: 428420 + -------------- + minimum time: 20.776 ms (0.00% GC) + median time: 25.170 ms (0.00% GC) + mean time: 29.005 ms (18.45% GC) + maximum time: 109.289 ms (73.77% GC) + -------------- + samples: 50 + evals/sample: 1 +``` diff --git a/test/perf_results_1.0.0/py_jl_numeric_1000000_2_100.md b/test/perf_results_1.0.0/py_jl_numeric_1000000_2_100.md new file mode 100644 index 0000000..876fcd4 --- /dev/null +++ b/test/perf_results_1.0.0/py_jl_numeric_1000000_2_100.md @@ -0,0 +1,47 @@ +# Julia/Python Performance Test Result + +## Summary + +Julia is ~10.2x faster than Python/Pandas + +## Test File + +Iterations: 100 + +Filename|Size|Rows|Columns|Numeric Columns|String Columns +--------|----|----|-------|---------------|-------------- +numeric_1000000_2.sas7bdat|16.3 MB|1000000|2|2|0 + +## Python +``` +$ python -V +Python 3.7.1 +$ python perf_test1.py data_misc/numeric_1000000_2.sas7bdat 100 +Minimum: 1.8784 seconds +``` + +## Julia +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 168.83 MiB + allocs estimate: 1004863 + -------------- + minimum time: 183.319 ms (6.02% GC) + median time: 208.804 ms (14.80% GC) + mean time: 235.003 ms (25.50% GC) + maximum time: 383.528 ms (54.19% GC) + -------------- + samples: 22 + 
evals/sample: 1 +``` diff --git a/test/perf_results_1.0.0/py_jl_productsales_100.md b/test/perf_results_1.0.0/py_jl_productsales_100.md new file mode 100644 index 0000000..93c7b3a --- /dev/null +++ b/test/perf_results_1.0.0/py_jl_productsales_100.md @@ -0,0 +1,73 @@ +# Julia/Python Performance Test Result + +## Summary + +Julia is ~46.9x faster than Python/Pandas + +## Test File + +Iterations: 100 + +Filename|Size|Rows|Columns|Numeric Columns|String Columns +--------|----|----|-------|---------------|-------------- +productsales.sas7bdat|148.5 kB|1440|10|4|6 + +## Python +``` +$ python -V +Python 3.7.1 +$ python perf_test1.py data_pandas/productsales.sas7bdat 100 +Minimum: 0.0505 seconds +``` + +## Julia (ObjectPool) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 1.17 MiB + allocs estimate: 14693 + -------------- + minimum time: 1.745 ms (0.00% GC) + median time: 2.431 ms (0.00% GC) + mean time: 2.679 ms (2.39% GC) + maximum time: 5.482 ms (60.67% GC) + -------------- + samples: 100 + evals/sample: 1 +``` + +## Julia (Regular String Array) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 1.15 MiB + allocs estimate: 14638 + -------------- + minimum time: 1.078 ms (0.00% GC) + median time: 3.277 ms (0.00% GC) + mean time: 6.618 ms (3.48% GC) + maximum time: 83.970 ms (0.00% GC) + -------------- + samples: 100 + evals/sample: 1 +``` diff --git a/test/perf_results_1.0.0/py_jl_test1_100.md 
b/test/perf_results_1.0.0/py_jl_test1_100.md new file mode 100644 index 0000000..1043623 --- /dev/null +++ b/test/perf_results_1.0.0/py_jl_test1_100.md @@ -0,0 +1,73 @@ +# Julia/Python Performance Test Result + +## Summary + +Julia is ~118.8x faster than Python/Pandas + +## Test File + +Iterations: 100 + +Filename|Size|Rows|Columns|Numeric Columns|String Columns +--------|----|----|-------|---------------|-------------- +test1.sas7bdat|131.1 kB|10|100|73|27 + +## Python +``` +$ python -V +Python 3.7.1 +$ python perf_test1.py data_pandas/test1.sas7bdat 100 +Minimum: 0.1036 seconds +``` + +## Julia (ObjectPool) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 1.00 MiB + allocs estimate: 7132 + -------------- + minimum time: 871.807 μs (0.00% GC) + median time: 1.254 ms (0.00% GC) + mean time: 1.470 ms (6.75% GC) + maximum time: 6.470 ms (78.01% GC) + -------------- + samples: 100 + evals/sample: 1 +``` + +## Julia (Regular String Array) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 990.86 KiB + allocs estimate: 6819 + -------------- + minimum time: 1.119 ms (0.00% GC) + median time: 2.666 ms (0.00% GC) + mean time: 9.009 ms (6.71% GC) + maximum time: 161.985 ms (0.00% GC) + -------------- + samples: 100 + evals/sample: 1 +``` diff --git a/test/perf_results_1.0.0/py_jl_topical_30.md b/test/perf_results_1.0.0/py_jl_topical_30.md new file mode 100644 index 0000000..56f3cc6 --- /dev/null +++ 
b/test/perf_results_1.0.0/py_jl_topical_30.md @@ -0,0 +1,73 @@ +# Julia/Python Performance Test Result + +## Summary + +Julia is ~27.3x faster than Python/Pandas + +## Test File + +Iterations: 30 + +Filename|Size|Rows|Columns|Numeric Columns|String Columns +--------|----|----|-------|---------------|-------------- +topical.sas7bdat|13.6 MB|84355|114|8|106 + +## Python +``` +$ python -V +Python 3.7.1 +$ python perf_test1.py data_AHS2013/topical.sas7bdat 30 +Minimum: 46.9720 seconds +``` + +## Julia (ObjectPool) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 685.66 MiB + allocs estimate: 19193161 + -------------- + minimum time: 1.720 s (6.37% GC) + median time: 1.806 s (11.83% GC) + mean time: 1.796 s (10.69% GC) + maximum time: 1.863 s (13.57% GC) + -------------- + samples: 3 + evals/sample: 1 +``` + +## Julia (Regular String Array) +``` +Julia Version 1.3.0 +Commit 46ce4d7933 (2019-11-26 06:09 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin19.0.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-6.0.1 (ORCJIT, haswell) +Environment: + JULIA_NUM_THREADS = 4 + +BenchmarkTools.Trial: + memory estimate: 648.04 MiB + allocs estimate: 19048983 + -------------- + minimum time: 1.994 s (46.01% GC) + median time: 2.559 s (51.16% GC) + mean time: 2.559 s (51.16% GC) + maximum time: 3.123 s (54.45% GC) + -------------- + samples: 2 + evals/sample: 1 +``` diff --git a/test/perf_results_1.0.0/saslib_vs_readstat.md b/test/perf_results_1.0.0/saslib_vs_readstat.md new file mode 100644 index 0000000..a8971b7 --- /dev/null +++ b/test/perf_results_1.0.0/saslib_vs_readstat.md @@ -0,0 +1,40 @@ +# SASLib vs ReadStat test + +Key | Description | 
+--------|-------------------------| +F64 | number of Float64 columns| +STR | number of String columns| +DT | number of date/time columns| +COMP | compression method| +S/R | SASLib time divided by ReadStat time| +SA/R | SASLib time (regular string arrays) divided by ReadStat time| +SASLibA | SASLib (regular string arrays)| + +``` +Filename : ReadStat SASLib S/R SASLibA SA/R F64 STR DT COMP +data_misc/numeric_1000000_2.sas7bdat : 367.403 ms 164.249 ms ( 45%) 165.407 ms ( 45%) 2 0 0 none +data_misc/types.sas7bdat : 0.067 ms 0.132 ms (196%) 0.132 ms (196%) 5 1 0 none +data_AHS2013/homimp.sas7bdat : 54.358 ms 39.673 ms ( 73%) 21.815 ms ( 40%) 1 5 0 none +data_AHS2013/omov.sas7bdat : 3.644 ms 6.631 ms (182%) 5.451 ms (150%) 3 5 0 none +data_AHS2013/owner.sas7bdat : 18.117 ms 13.985 ms ( 77%) 8.112 ms ( 45%) 0 3 0 none +data_AHS2013/ratiov.sas7bdat : 6.723 ms 6.038 ms ( 90%) 3.197 ms ( 48%) 0 9 0 none +data_AHS2013/rmov.sas7bdat : 72.551 ms 90.487 ms (125%) 63.868 ms ( 88%) 2 21 0 none +data_AHS2013/topical.sas7bdat : 3394.267 ms 1755.026 ms ( 52%) 1153.360 ms ( 34%) 8 106 0 none +data_pandas/airline.sas7bdat : 0.093 ms 0.114 ms (122%) 0.117 ms (125%) 6 0 0 none +data_pandas/datetime.sas7bdat : 0.061 ms 0.133 ms (219%) 0.132 ms (217%) 1 1 2 none +data_pandas/productsales.sas7bdat : 2.812 ms 1.726 ms ( 61%) 1.075 ms ( 38%) 4 5 1 none +data_pandas/test1.sas7bdat : 0.606 ms 0.900 ms (148%) 0.836 ms (138%) 73 25 2 none +data_pandas/test2.sas7bdat : 0.624 ms 0.693 ms (111%) 0.690 ms (111%) 73 25 2 RLE +data_pandas/test4.sas7bdat : 0.607 ms 0.885 ms (146%) 0.849 ms (140%) 73 25 2 none +data_pandas/test5.sas7bdat : 0.625 ms 0.721 ms (115%) 0.693 ms (111%) 73 25 2 RLE +data_pandas/test7.sas7bdat : 0.606 ms 0.912 ms (151%) 0.855 ms (141%) 73 25 2 none +data_pandas/test9.sas7bdat : 0.622 ms 0.701 ms (113%) 0.705 ms (113%) 73 25 2 RLE +data_pandas/test10.sas7bdat : 0.606 ms 0.955 ms (158%) 0.844 ms (139%) 73 25 2 none +data_pandas/test12.sas7bdat : 0.625 ms 0.702 ms (112%) 0.683 ms 
(109%) 73 25 2 RLE +data_pandas/test13.sas7bdat : 0.606 ms 0.924 ms (152%) 0.860 ms (142%) 73 25 2 none +data_pandas/test15.sas7bdat : 0.623 ms 0.725 ms (116%) 0.698 ms (112%) 73 25 2 RLE +data_pandas/test16.sas7bdat : 0.614 ms 1.572 ms (256%) 1.626 ms (265%) 73 25 2 none +data_reikoch/barrows.sas7bdat : 11.242 ms 6.438 ms ( 57%) 6.513 ms ( 58%) 72 0 0 RLE +data_reikoch/extr.sas7bdat : 0.077 ms 0.310 ms (400%) 0.303 ms (391%) 0 1 0 none +data_reikoch/ietest2.sas7bdat : 0.048 ms 0.106 ms (221%) 0.106 ms (221%) 0 1 0 RLE +``` diff --git a/test/perf_test_readstat.jl b/test/perf_test_readstat.jl index 195b485..4ff2980 100644 --- a/test/perf_test_readstat.jl +++ b/test/perf_test_readstat.jl @@ -1,6 +1,7 @@ using BenchmarkTools using SASLib using ReadStat +using Printf if length(ARGS) != 1 println("Usage: julia ", PROGRAM_FILE, " ") @@ -17,19 +18,19 @@ function performtest(io, f, samples, seconds) println(io, "\n\n================ $f =================") mime = MIME("text/plain") try - info("testing $f with ReadStat") + @info("testing $f with ReadStat") println(io, "ReadStat:") b1 = @benchmark read_sas7bdat($f) samples=samples seconds=seconds show(io, mime, b1) println(io) - info("testing $f with SASLib") + @info("testing $f with SASLib") println(io, "SASLib:") b2 = @benchmark readsas($f, verbose_level=0) samples=samples seconds=seconds show(io, mime, b2) println(io) - info("testing $f with SASLib regular string") + @info("testing $f with SASLib regular string") println(io, "SASLib (regular string):") b3 = @benchmark readsas($f, string_array_fn=Dict(:_all_ => REGULAR_STR_ARRAY), verbose_level=0) samples=samples seconds=seconds show(io, mime, b3) @@ -47,7 +48,7 @@ function performtest(io, f, samples, seconds) p2 = round(Int, t2/t1*100) p3 = round(Int, t3/t1*100) comp = md.compression - info("Results: ", join(string.([f,t1,t2,p2,t3,p3,nd,ns,nt,comp]), ",")) + @info("Results: ", join(string.([f,t1,t2,p2,t3,p3,nd,ns,nt,comp]), ",")) @printf io "%-40s: %8.3f ms %8.3f ms 
(%3d%%) %8.3f ms (%3d%%) %4d %4d %4d %4s\n" f t1 t2 p2 t3 p3 nd ns nt comp catch ex println(ex) diff --git a/test/py_jl_test.jl b/test/py_jl_test.jl index 49a857d..7977b9c 100644 --- a/test/py_jl_test.jl +++ b/test/py_jl_test.jl @@ -1,28 +1,31 @@ # Compare loading one file in Python vs Julia +using Dates +using InteractiveUtils: versioninfo if length(ARGS) != 3 println("Usage: $PROGRAM_FILE ") exit() end -using SASLib, BenchmarkTools, Humanize +using SASLib, BenchmarkTools +using Humanize: datasize file = ARGS[1] cnt = ARGS[2] dir = ARGS[3] basename = split(file, "/")[end] -shortname = replace(basename, r"\.sas7bdat", "") +shortname = replace(basename, r"\.sas7bdat" => "") output = "$dir/py_jl_$(shortname)_$(cnt).md" -prt(msg...) = info(now(), " ", msg...) +prt(msg...) = println(now(), " ", msg...) # run python part # result is minimum time in seconds prt("Running python test for file ", file, " ", cnt, " times") pyver_cmd = `python -V` pyres_cmd = `python perf_test1.py $file $cnt` -pyver = readstring(pipeline(pyver_cmd, stderr=pipeline(`cat`))) -pyres = readstring(pipeline(pyres_cmd)) +pyver = readlines(open(pyver_cmd))[1] +pyres = readlines(open(pyres_cmd))[1] # first line of result is Minimum py = parse(Float64, match(r"[0-9]*\.[0-9]*", pyres).match) # read metadata @@ -53,7 +56,7 @@ end # analysis direction = jl < py ? "faster" : "slower" -ratio = round(direction == "faster" ? py/jl : jl/py, 1) +ratio = round(direction == "faster" ? py/jl : jl/py, digits = 1) io = nothing try @@ -98,6 +101,6 @@ try catch err println(err) finally - io != nothing && try close(io) catch e println(e) end + io !== nothing && try close(io) catch e println(e) end end prt("Written file $output")