ARROW-18062: [R] error in CI jobs for R 3.5 and 3.6 when R package be…

…ing installed (#14424) Not sure why this behavior would differ between R versions, but it makes sense to move the UDF registration to after the binding registry is created and only do it if slice_sample() is called. Authored-by: Neal Richardson <neal.p.richardson@gmail.com> Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
apache · Oct 15, 2022 · f0cf5c2 · f0cf5c2
1 parent 99b4092
commit f0cf5c2
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 14 deletions.
diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R
@@ -139,7 +139,6 @@ call_binding_agg <- function(fun_name, ...) {
   agg_funcs[[fun_name]](...)
 }
 
-#' @importFrom stats runif
 create_binding_cache <- function() {
   # Called in .onLoad()
   .cache$docs <- list()
@@ -164,17 +163,6 @@ create_binding_cache <- function() {
   register_bindings_type()
   register_bindings_augmented()
 
-  # HACK because random() doesn't work (ARROW-17974)
-  register_scalar_function(
-    "_random_along",
-    function(context, x) {
-      Array$create(runif(length(x)))
-    },
-    in_type = schema(x = boolean()),
-    out_type = float64(),
-    auto_convert = FALSE
-  )
-
   # We only create the cache for nse_funcs and not agg_funcs
   .cache$functions <- c(as.list(nse_funcs), arrow_funcs)
 }

diff --git a/r/R/dplyr-slice.R b/r/R/dplyr-slice.R
@@ -86,6 +86,7 @@ slice_max.arrow_dplyr_query <- function(.data, order_by, ..., n, prop, with_ties
 }
 slice_max.Dataset <- slice_max.ArrowTabular <- slice_max.RecordBatchReader <- slice_max.arrow_dplyr_query
 
+#' @importFrom stats runif
 slice_sample.arrow_dplyr_query <- function(.data,
                                            ...,
                                            n,
@@ -116,10 +117,21 @@ slice_sample.arrow_dplyr_query <- function(.data,
   if (prop < 1) {
     .data <- as_adq(.data)
     # TODO(ARROW-17974): use Expression$create("random") instead of UDF hack
-    # HACK: use our UDF to generate random. It needs an input column because
+    # HACK: use a UDF to generate random. It needs an input column because
     # nullary functions don't work, and that column has to be typed. We've
     # chosen boolean() type because it's compact and can always be created:
     # pick any column and do is.na, that will be boolean.
+    if (is.null(.cache$functions[["_random_along"]])) {
+      register_scalar_function(
+        "_random_along",
+        function(context, x) {
+          Array$create(runif(length(x)))
+        },
+        in_type = schema(x = boolean()),
+        out_type = float64(),
+        auto_convert = FALSE
+      )
+    }
     # TODO: get an actual FieldRef because the first col could be derived
     ref <- Expression$create("is_null", .data$selected_columns[[1]])
     expr <- Expression$create("_random_along", ref) < prop

diff --git a/r/tests/testthat/test-dplyr-slice.R b/r/tests/testthat/test-dplyr-slice.R
@@ -99,8 +99,10 @@ test_that("slice_sample, ungrouped", {
     "weight_by"
   )
 
+  # Let's not take any chances on random failures
+  skip_on_cran()
   # Because this is random (and we only have 10 rows), try several times
-  for (i in 1:10) {
+  for (i in 1:50) {
     sampled_prop <- tab %>%
       slice_sample(prop = .2) %>%
       collect() %>%