lz4 and zstd compression via fst #111
base: master

Conversation
Suggested by @MarcusKlik. This PR implements optional lz4 and zstd compression via the fst package: on `$set()`, the object is serialized to a raw vector and compressed with `fst::compress_fst()` before being written to disk. When we read the data back in, we call `fst::decompress_fst()` and then unserialize the result. We can select the compression method through the `compress` argument of `storr_rds()`. I also updated the tests to check back-compatibility.
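For concreteness, here is a minimal sketch of the round trip under this scheme (the helper names and file handling are illustrative, not storr's internals):

```r
library(fst)

# Serialize a value to a raw vector, compress it with fst, write it to disk.
save_compressed <- function(value, path, compressor = "ZSTD", compression = 0) {
  bytes <- serialize(value, connection = NULL)
  con <- file(path, open = "wb")
  on.exit(close(con))
  writeBin(compress_fst(bytes, compressor, compression), con)
}

# Read the compressed bytes back, decompress, and unserialize.
read_compressed <- function(path) {
  bytes <- readBin(path, what = raw(), n = file.size(path))
  unserialize(decompress_fst(bytes))
}

path <- tempfile()
save_compressed(mtcars, path)
identical(read_compressed(path), mtcars)  # TRUE
```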
Codecov Report

```
@@            Coverage Diff             @@
##           master     #111      +/-  ##
==========================================
+ Coverage   99.91%   99.91%    +<.01%
==========================================
  Files          16       16
  Lines        1203     1224       +21
==========================================
+ Hits         1202     1223       +21
  Misses          1        1
```

Continue to review full report at Codecov.
Speeds are comparable for small data. Large-data benchmarks are forthcoming.

```r
library(storr)
library(microbenchmark)
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
microbenchmark(
none = st_none$set(key = "x", value = runif(1)),
gzfile = st_gzfile$set(key = "x", value = runif(1)),
fst = st_fst$set(key = "x", value = runif(1))
)
#> Unit: milliseconds
#>    expr      min       lq     mean   median       uq       max neval
#>    none 5.863562 6.246155 6.503483 6.442023 6.751117  7.786466   100
#>  gzfile 4.528847 4.978007 5.209034 5.138960 5.376522  6.907493   100
#>     fst 5.827846 6.205787 6.563974 6.392180 6.712995 13.425614   100
```

Created on 2019-06-18 by the reprex package (v0.3.0)
Benchmarks on 800 MB of data, where the choice of compression method starts to matter:

```r
library(storr)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
data <- runif(1e8)
object_size(data)
#> 800 MB
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
microbenchmark(
none = st_none$set(key = "x", value = data),
gzfile = st_gzfile$set(key = "x", value = data),
fst = st_fst$set(key = "x", value = data),
times = 1
)
#> Unit: seconds
#>    expr       min        lq      mean    median        uq       max neval
#>    none  6.375112  6.375112  6.375112  6.375112  6.375112  6.375112     1
#>  gzfile 89.511704 89.511704 89.511704 89.511704 89.511704 89.511704     1
#>     fst  6.636744  6.636744  6.636744  6.636744  6.636744  6.636744     1
```

and for data large enough to trigger a repack:

```r
library(storr)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
data <- runif(3e8)
object_size(data)
#> 2.4 GB
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
microbenchmark(
none = st_none$set(key = "x", value = data, use_cache = FALSE),
gzfile = st_gzfile$set(key = "x", value = data, use_cache = FALSE),
fst = st_fst$set(key = "x", value = data, use_cache = FALSE),
times = 1
)
#> Repacking large object
#> Repacking large object
#> Unit: seconds
#>    expr       min        lq      mean    median        uq       max neval
#>    none   27.21743  27.21743  27.21743  27.21743  27.21743  27.21743     1
#>  gzfile 276.25813 276.25813 276.25813 276.25813 276.25813 276.25813     1
#>     fst   36.22005  36.22005  36.22005  36.22005  36.22005  36.22005     1
```

Created on 2019-06-18 by the reprex package (v0.3.0)
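Worth noting for these benchmarks: `use_cache = FALSE` makes storr bypass its in-memory environment cache, so `$get()` really reads and deserializes the data from disk instead of returning a cached copy of the value. A minimal illustration:

```r
library(storr)
st <- storr_rds(tempfile(), compress = "fst")
st$set(key = "x", value = runif(10), use_cache = FALSE)
v1 <- st$get(key = "x", use_cache = FALSE)  # reads from disk every call
v2 <- st$get(key = "x")                     # may come from the environment cache
identical(v1, v2)  # TRUE; caching affects speed, not the value
```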
A minor snag: we need an extra deserialization step if we compress with `fst`.

```r
library(storr)
library(microbenchmark)
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
st_none$set(key = "x", value = 1, use_cache = FALSE)
st_gzfile$set(key = "x", value = 1, use_cache = FALSE)
st_fst$set(key = "x", value = 1, use_cache = FALSE)
microbenchmark(
none = st_none$get(key = "x", use_cache = FALSE),
gzfile = st_gzfile$get(key = "x", use_cache = FALSE),
fst = st_fst$get(key = "x", use_cache = FALSE)
)
#> Unit: microseconds
#>    expr     min       lq     mean   median       uq     max neval
#>    none 205.729 207.9270 219.5831 211.3970 224.3855 415.171   100
#>  gzfile 207.871 210.9915 225.9019 217.7565 233.5205 274.534   100
#>     fst 227.325 229.5640 240.8948 232.3175 244.8615 374.784   100
```

Created on 2019-06-18 by the reprex package (v0.3.0)
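The snag in concrete form: `readRDS()` reads and deserializes in a single call, while the fst route needs an explicit `unserialize()` after decompression (a sketch, not storr's internals):

```r
library(fst)
data <- runif(10)
rds_path <- tempfile()
fst_path <- tempfile()

saveRDS(data, rds_path)
writeBin(compress_fst(serialize(data, NULL), "ZSTD"), fst_path)

value1 <- readRDS(rds_path)  # one step: read + deserialize

# fst route: read the compressed bytes, decompress, then deserialize.
bytes  <- readBin(fst_path, what = raw(), n = file.size(fst_path))
value2 <- unserialize(decompress_fst(bytes))
identical(value1, value2)  # TRUE
```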
For 800 MB, reading the data back is slower than with no compression, but still faster than with the default `gzfile` compression.

```r
library(storr)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
data <- runif(1e8)
object_size(data)
#> 800 MB
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
st_none$set(key = "x", value = data, use_cache = FALSE)
st_gzfile$set(key = "x", value = data, use_cache = FALSE)
st_fst$set(key = "x", value = data, use_cache = FALSE)
microbenchmark(
none = st_none$get(key = "x", use_cache = FALSE),
gzfile = st_gzfile$get(key = "x", use_cache = FALSE),
fst = st_fst$get(key = "x", use_cache = FALSE),
times = 1
)
#> Unit: seconds
#>    expr      min       lq     mean   median       uq      max neval
#>    none 2.425930 2.425930 2.425930 2.425930 2.425930 2.425930     1
#>  gzfile 4.908687 4.908687 4.908687 4.908687 4.908687 4.908687     1
#>     fst 3.112166 3.112166 3.112166 3.112166 3.112166 3.112166     1
```

Created on 2019-06-18 by the reprex package (v0.3.0)

For larger data:

```r
library(storr)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
data <- runif(3e8)
object_size(data)
#> 2.4 GB
st_none <- storr_rds(tempfile(), compress = "none")
st_gzfile <- storr_rds(tempfile(), compress = "gzfile")
st_fst <- storr_rds(tempfile(), compress = "fst")
st_none$set(key = "x", value = data, use_cache = FALSE)
#> Repacking large object
st_gzfile$set(key = "x", value = data, use_cache = FALSE)
#> Repacking large object
st_fst$set(key = "x", value = data, use_cache = FALSE)
microbenchmark(
none = st_none$get(key = "x", use_cache = FALSE),
gzfile = st_gzfile$get(key = "x", use_cache = FALSE),
fst = st_fst$get(key = "x", use_cache = FALSE),
times = 1
)
#> Unit: seconds
#>    expr       min        lq      mean    median        uq       max neval
#>    none  7.194649  7.194649  7.194649  7.194649  7.194649  7.194649     1
#>  gzfile 14.287884 14.287884 14.287884 14.287884 14.287884 14.287884     1
#>     fst  8.197173  8.197173  8.197173  8.197173  8.197173  8.197173     1
```
We might still want to keep #109, either instead of or in addition to #111: #109 (comment).
Hi @wlandau, I see you have added a `compress = "fst"` option. Under the hood, `fst` can compress with either LZ4 or ZSTD. Would it be useful to distinguish between the two (very different) compressors?
Yes, I agree. In 486b1ea, I exposed both the algorithm and the compression factor to `storr_rds()`.
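Based on the benchmark code further down, the resulting interface looks like this (positional arguments: path, algorithm, compression level):

```r
library(storr)
st_lz4  <- storr_rds(tempfile(), "lz4")       # LZ4 at the default level
st_zstd <- storr_rds(tempfile(), "zstd", 50)  # ZSTD at compression level 50
```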
Benchmarks across algorithms, assuming a compression factor of 0:

```r
library(gt)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
library(storr)
library(tidyverse)
none <- storr_rds(tempfile(), "none")
gzip <- storr_rds(tempfile(), "gzip")
lz4 <- storr_rds(tempfile(), "lz4")
zstd <- storr_rds(tempfile("zstd"))
results <- NULL
for (scale in seq_len(8)) {
data <- runif(10 ^ scale)
set <- microbenchmark(
none = none$set(key = "x", value = data, use_cache = FALSE),
gzip = gzip$set(key = "x", value = data, use_cache = FALSE),
lz4 = lz4$set(key = "x", value = data, use_cache = FALSE),
zstd = zstd$set(key = "x", value = data, use_cache = FALSE),
times = floor(100 ^ (1 / scale))
) %>%
mutate(op = "set")
get <- microbenchmark(
none = none$get(key = "x", use_cache = FALSE),
gzip = gzip$get(key = "x", use_cache = FALSE),
lz4 = lz4$get(key = "x", use_cache = FALSE),
zstd = zstd$get(key = "x", use_cache = FALSE),
times = floor(100 ^ (1 / scale))
) %>%
mutate(op = "get")
new_results <- bind_rows(get, set) %>%
mutate(size = as.numeric(object_size(data)))
results <- bind_rows(new_results, results)
}
results <- as_tibble(results) %>%
mutate(
algo = as.character(expr),
time = time / 1e9
) %>%
select(-expr) %>%
group_by(algo, op, size) %>%
summarize(time = mean(time), reps = n())
ggplot(results) +
geom_line(aes(x = size, y = time, group = algo, color = algo)) +
facet_grid(op ~ ., scales = "free_y") +
xlab("Bytes") +
ylab("Mean seconds") +
labs(color = "storr operation") +
theme_gray(16)
```

```r
ggplot(results) +
geom_line(aes(x = size, y = time, group = algo, color = algo)) +
facet_grid(op ~ ., scales = "free_y") +
xlab("Log bytes") +
ylab("Log mean seconds") +
scale_x_log10() +
scale_y_log10() +
labs(color = "storr operation") +
theme_gray(16)
```

```r
results %>%
spread("algo", "time") %>%
arrange(op, size) %>%
select(reps, size, none, gzip, lz4, zstd) %>%
gt() %>%
fmt_number(columns = vars(none, gzip, lz4, zstd), decimals = 3) %>%
fmt_scientific(columns = vars(size))
#> Adding missing grouping variables: `op`
```

Created on 2019-06-20 by the reprex package (v0.3.0)
One problem with this approach is that we have three copies of the data in memory at the same time: the original object, the serialized raw vector, and the compressed raw vector.

@MarcusKlik, does there exist a way to compress and write to a connection simultaneously, the way `saveRDS()` streams through a compressed connection?
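To make the memory profile concrete, a sketch of where the three copies come from under the serialize-then-compress approach:

```r
library(fst)

data  <- runif(1e7)                          # copy 1: the original object (80 MB)
bytes <- serialize(data, connection = NULL)  # copy 2: the serialized raw vector
comp  <- compress_fst(bytes, "ZSTD", 0)      # copy 3: the compressed raw vector

con <- file(tempfile(), open = "wb")
writeBin(comp, con)
close(con)
```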
Hi @wlandau, in general, if you want to compress and store in a single step, you will need some existing function that uses R's C API to accomplish that, or you will have to write one yourself. Any step that calls two distinct methods from R will materialize an intermediate copy in memory. `compress_fst()` compresses its input in blocks, so in principle the compressed blocks could be streamed to a connection as they are produced instead of being assembled into a single raw vector first.
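A rough illustration of the streaming idea (illustrative only: real support would have to live at the C level inside fst, and the block framing here is ad hoc):

```r
library(fst)

# Compress a raw vector chunk by chunk and write each compressed block as soon
# as it is produced, so the temporary allocation is bounded by one chunk.
write_chunked <- function(bytes, con, chunk_size = 2^20) {
  n <- length(bytes)
  for (start in seq.int(1L, n, by = chunk_size)) {
    end   <- min(start + chunk_size - 1L, n)
    block <- compress_fst(bytes[start:end], "ZSTD", 0)
    writeBin(length(block), con)  # ad hoc framing: block length, then payload
    writeBin(block, con)
  }
}

con <- file(tempfile(), open = "wb")
write_chunked(serialize(runif(1e6), connection = NULL), con)
close(con)
```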
This is exactly what I had in mind. How much of a difference in memory usage do you think it will make? How much does the compression level affect block size? It looks like blocks can be as large as 0.8 * INT_MAX.
Hi @wlandau, yes indeed, the block size is set using this code and will be between 16 kB and 0.8 * INT_MAX. The latter only occurs if the original vector is very large. The thing is that because each block is compressed in memory before it is written, very large blocks mean very large temporary allocations.

To avoid large allocations, perhaps yet another argument would be useful to allow the user to set the maximum block size, e.g. `max_buffer = 1e8`. Or maybe a factor of the original vector length: `max_buffer_ratio = 0.5`. With the two arguments combined (`connection` and `max_buffer_ratio`) you would be able to make the algorithm use very little RAM overhead. What do you think?
I think I should document this tradeoff. Commits forthcoming.
I love that idea. I think it would require some work to understand the tradeoff between block size, speed, and memory usage.
Ah, lucky guy :-). So, it might be a good idea to significantly reduce the block sizes used in `fst` compression. Thanks for discussing your use case!
Excellent! Thank you for being open to these improvements.
When I discussed this with @richfitz earlier today, he suggested an abstraction layer for compression methods (see his comment below).
Thanks @wlandau - I'll take a shot at an abstraction layer for this once I see what the connection interface to the fst compression looks like. I think once that's done, we can come up with something here that gets the performance improvements you're wanting/seeing without creating an interface that's too unwieldy.
Awesome! Thank you for the support.
Just realized I had a typo in the code for the `zstd` benchmark above: `storr_rds(tempfile("zstd"))` should have been `storr_rds(tempfile(), "zstd")`. Here are corrected benchmarks, this time across compression levels as well:

```r
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#>    method      from
#>    print.bytes Rcpp
library(storr)
library(tidyverse)
none_0 <- storr_rds(tempfile())
lz4_0 <- storr_rds(tempfile(), "lz4", 0)
lz4_25 <- storr_rds(tempfile(), "lz4", 25)
lz4_50 <- storr_rds(tempfile(), "lz4", 50)
lz4_75 <- storr_rds(tempfile(), "lz4", 75)
lz4_100 <- storr_rds(tempfile(), "lz4", 100)
zstd_0 <- storr_rds(tempfile(), "zstd", 0)
zstd_25 <- storr_rds(tempfile(), "zstd", 25)
zstd_50 <- storr_rds(tempfile(), "zstd", 50)
zstd_75 <- storr_rds(tempfile(), "zstd", 75)
zstd_100 <- storr_rds(tempfile(), "zstd", 100)
results <- NULL
for (scale in seq_len(8)) {
data <- runif(10 ^ scale)
set <- microbenchmark(
none_0 = none_0$set(key = "x", value = data, use_cache = FALSE),
lz4_0 = lz4_0$set(key = "x", value = data, use_cache = FALSE),
lz4_25 = lz4_25$set(key = "x", value = data, use_cache = FALSE),
lz4_50 = lz4_50$set(key = "x", value = data, use_cache = FALSE),
lz4_75 = lz4_75$set(key = "x", value = data, use_cache = FALSE),
lz4_100 = lz4_100$set(key = "x", value = data, use_cache = FALSE),
zstd_0 = zstd_0$set(key = "x", value = data, use_cache = FALSE),
zstd_25 = zstd_25$set(key = "x", value = data, use_cache = FALSE),
zstd_50 = zstd_50$set(key = "x", value = data, use_cache = FALSE),
zstd_75 = zstd_75$set(key = "x", value = data, use_cache = FALSE),
zstd_100 = zstd_100$set(key = "x", value = data, use_cache = FALSE),
times = floor(100 ^ (1 / scale))
) %>%
mutate(op = "set")
get <- microbenchmark(
none_0 = none_0$get(key = "x", use_cache = FALSE),
lz4_0 = lz4_0$get(key = "x", use_cache = FALSE),
lz4_25 = lz4_25$get(key = "x", use_cache = FALSE),
lz4_50 = lz4_50$get(key = "x", use_cache = FALSE),
lz4_75 = lz4_75$get(key = "x", use_cache = FALSE),
lz4_100 = lz4_100$get(key = "x", use_cache = FALSE),
zstd_0 = zstd_0$get(key = "x", use_cache = FALSE),
zstd_25 = zstd_25$get(key = "x", use_cache = FALSE),
zstd_50 = zstd_50$get(key = "x", use_cache = FALSE),
zstd_75 = zstd_75$get(key = "x", use_cache = FALSE),
zstd_100 = zstd_100$get(key = "x", use_cache = FALSE),
times = floor(100 ^ (1 / scale))
) %>%
mutate(op = "get")
new_results <- bind_rows(get, set) %>%
mutate(size = as.numeric(object_size(data)))
results <- bind_rows(new_results, results)
}
results <- as_tibble(results) %>%
mutate(
algo = ordered(
gsub("_.*$", "", as.character(expr)),
levels = c("lz4", "zstd", "none")
),
compression = ordered(
as.integer(gsub("^.*_", "", as.character(expr)))
),
time = time / 1e9
) %>%
select(-expr) %>%
group_by(algo, compression, op, size) %>%
summarize(time = mean(time), reps = n())
ggplot(results) +
geom_line(
aes(
x = size,
y = time,
group = compression,
linetype = compression,
color = compression
)
) +
facet_grid(op ~ algo, scales = "free_y", labeller = label_both) +
xlab("Bytes") +
ylab("Mean seconds") +
theme_gray(16) +
scale_color_brewer(palette = "Set1") Created on 2019-08-03 by the reprex package (v0.3.0) |
From ropensci/drake#907 (comment), this PR does not quite achieve the efficiency of the approach benchmarked there.