Skip to content

Commit

Permalink
add cudaDeviceSynchronize calls to pinpoint error, ci trial #2
Browse files Browse the repository at this point in the history
  • Loading branch information
lijinf2 committed Feb 13, 2024
1 parent fa57a91 commit 0d25832
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
1 change: 1 addition & 0 deletions ci/test_python_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pytest \
--cov=cuml_dask \
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cuml-dask-coverage.xml" \
--cov-report=term \
-s \
test_dask_logistic_regression.py::test_standardization_sparse
# .

Expand Down
17 changes: 17 additions & 0 deletions cpp/src/glm/qn/mg/standardization.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
#include <raft/core/device_mdspan.hpp>
#include <raft/matrix/init.cuh>

#include <raft/core/device_mdarray.hpp>

#include <raft/util/cudart_utils.hpp>

namespace ML {
namespace GLM {
namespace opg {
Expand Down Expand Up @@ -78,6 +82,7 @@ void mean_stddev(const raft::handle_t& handle,
T* mean_vector,
T* stddev_vector)
{
cudaDeviceSynchronize();
int D = X.n;
int num_rows = X.m;
auto stream = handle.get_stream();
Expand All @@ -88,6 +93,7 @@ void mean_stddev(const raft::handle_t& handle,
rmm::device_uvector<T> ones(num_rows, stream);
auto ones_view = raft::make_device_vector_view(ones.data(), num_rows);
raft::matrix::fill(handle, ones_view, T(1.0));
cudaDeviceSynchronize();

SimpleDenseMat<T> ones_mat(ones.data(), 1, num_rows);
X.gemmb(handle, 1., ones_mat, false, false, 0., mean_mat, stream);
Expand All @@ -96,17 +102,28 @@ void mean_stddev(const raft::handle_t& handle,
raft::linalg::multiplyScalar(mean_vector, mean_vector, weight, D, stream);
comm.allreduce(mean_vector, mean_vector, D, raft::comms::op_t::SUM, stream);
comm.sync_stream(stream);
cudaDeviceSynchronize();

// calculate stdev.S
SimpleDenseMat<T> stddev_mat(stddev_vector, 1, D);

ML::Logger::get().setLevel(6);
rmm::device_uvector<T> values_copy(X.nnz, stream);
CUML_LOG_DEBUG("sparkdebug X.nnz: %d", X.nnz);

auto log_X_values = raft::arr2Str(X.values, X.nnz, "", stream);
CUML_LOG_DEBUG("sparkdebug log_X_values: %s", log_X_values.c_str());

raft::copy(values_copy.data(), X.values, X.nnz, stream);
CUML_LOG_DEBUG("sparkdebug finished copying X.values with X.nnz: %d", X.nnz);
cudaDeviceSynchronize();

auto square_op = [] __device__(const T a) { return a * a; };
raft::linalg::unaryOp(X.values, X.values, X.nnz, square_op, stream);
X.gemmb(handle, 1., ones_mat, false, false, 0., stddev_mat, stream);
raft::copy(X.values, values_copy.data(), X.nnz, stream);
CUML_LOG_DEBUG("sparkdebug finished copying back to X.values with X.nnz: %d", X.nnz);
cudaDeviceSynchronize();

weight = n_samples < 1 ? T(0) : T(1) / T(n_samples - 1);
raft::linalg::multiplyScalar(stddev_vector, stddev_vector, weight, D, stream);
Expand Down

0 comments on commit 0d25832

Please sign in to comment.