From 1291502170f3cc515960dfd084029eafadd54529 Mon Sep 17 00:00:00 2001
From: Louis Sugy <lsugy@nvidia.com>
Date: Thu, 8 Apr 2021 23:40:21 +0200
Subject: [PATCH] Fix NaN errors observed with ARIMA in CUDA 11.2 builds
 (#3730)

Closes #3649

The error was in `batched_kalman_loop_kernel`: computing `Y = a M X + b Y` with `b = 0` can still result in `NaN` if `Y` is uninitialized, because under IEEE 754 arithmetic `0 * NaN` (and `0 * Inf`) is `NaN` rather than `0`. One possible fix would be to zero-initialize the component that was left uninitialized. The fix I chose instead is to remove the unnecessary read of `Y`, so that no uninitialized value is accessed and the redundant operations are saved.

Authors:
  - Louis Sugy (https://github.com/Nyrio)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/3730
---
 cpp/src/arima/batched_kalman.cu | 7 +++----
 python/cuml/test/test_arima.py  | 6 ------
 2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu
index a255a64f4d..d8ee3cf64a 100644
--- a/cpp/src/arima/batched_kalman.cu
+++ b/cpp/src/arima/batched_kalman.cu
@@ -49,14 +49,13 @@ DI void Mv_l(const double* A, const double* v, double* out) {
 }
 
 template <int n>
-DI void Mv_l(double alpha, const double* A, const double* v, double beta,
-             double* out) {
+DI void Mv_l(double alpha, const double* A, const double* v, double* out) {
   for (int i = 0; i < n; i++) {
     double sum = 0.0;
     for (int j = 0; j < n; j++) {
       sum += A[i + j * n] * v[j];
     }
-    out[i] = alpha * sum + beta * out[i];
+    out[i] = alpha * sum;
   }
 }
 
@@ -179,7 +178,7 @@ __global__ void batched_kalman_loop_kernel(
           l_K[i] = _1_Fs * l_TP[i];
         }
       } else
-        Mv_l<rd>(_1_Fs, l_TP, l_Z, 0.0, l_K);
+        Mv_l<rd>(_1_Fs, l_TP, l_Z, l_K);
 
       // 4. alpha = T*alpha + K*vs[it] + c
       // tmp = T*alpha
diff --git a/python/cuml/test/test_arima.py b/python/cuml/test/test_arima.py
index 260edeabd8..95380266c1 100644
--- a/python/cuml/test/test_arima.py
+++ b/python/cuml/test/test_arima.py
@@ -37,7 +37,6 @@
 from collections import namedtuple
 import numpy as np
 import os
-import rmm
 import warnings
 
 import pandas as pd
@@ -270,11 +269,6 @@ def _statsmodels_to_cuml(ref_fits, cuml_model, order, seasonal_order,
         in statsmodels and cuML models (it depends on the order).
 
     """
-
-    if rmm._cuda.gpu.runtimeGetVersion() >= 11020:
-        pytest.skip("CUDA 11.2 nan failure, see "
-                    "https://github.com/rapidsai/cuml/issues/3649")
-
     nb = cuml_model.batch_size
     N = cuml_model.complexity
     x = np.zeros(nb * N, dtype=np.float64)