diff --git a/README.md b/README.md
index 7d597108..a279eb02 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
-# Welcome to PyPOTS
+
+## Welcome to PyPOTS
**A Python Toolbox for Data Mining on Partially-Observed Time Series**
diff --git a/pypots/__version__.py b/pypots/__version__.py
index 0323fd65..ae44ed76 100644
--- a/pypots/__version__.py
+++ b/pypots/__version__.py
@@ -21,4 +21,4 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-version = '0.0.7'
+version = "0.0.7"
diff --git a/pypots/base.py b/pypots/base.py
index 8d8e3acf..cdded5d5 100644
--- a/pypots/base.py
+++ b/pypots/base.py
@@ -12,8 +12,7 @@
class BaseModel(ABC):
- """ Base class for all models.
- """
+ """Base class for all models."""
def __init__(self, device):
self.logger = {}
@@ -21,13 +20,17 @@ def __init__(self, device):
if device is None:
self.device = torch.device(
- "cuda:0" if torch.cuda.is_available() and torch.cuda.device_count() > 0 else "cpu"
+ "cuda:0"
+ if torch.cuda.is_available() and torch.cuda.device_count() > 0
+ else "cpu"
)
else:
self.device = device
- def check_input(self, expected_n_steps, expected_n_features, X, y=None, out_dtype='tensor'):
- """ Check value type and shape of input X and y
+ def check_input(
+ self, expected_n_steps, expected_n_features, X, y=None, out_dtype="tensor"
+ ):
+ """Check value type and shape of input X and y
Parameters
----------
@@ -54,15 +57,20 @@ def check_input(self, expected_n_steps, expected_n_features, X, y=None, out_dtyp
y : tensor
"""
- assert out_dtype in ['tensor', 'ndarray'], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}'
+ assert out_dtype in [
+ "tensor",
+ "ndarray",
+ ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}'
is_list = isinstance(X, list)
is_array = isinstance(X, np.ndarray)
is_tensor = isinstance(X, torch.Tensor)
- assert is_tensor or is_array or is_list, TypeError('X should be an instance of list/np.ndarray/torch.Tensor, '
- f'but got {type(X)}')
+ assert is_tensor or is_array or is_list, TypeError(
+ "X should be an instance of list/np.ndarray/torch.Tensor, "
+ f"but got {type(X)}"
+ )
# convert the data type if needed
- if out_dtype == 'tensor':
+ if out_dtype == "tensor":
if is_list:
X = torch.tensor(X).to(self.device)
elif is_array:
@@ -80,29 +88,42 @@ def check_input(self, expected_n_steps, expected_n_features, X, y=None, out_dtyp
# check the shape of X here
X_shape = X.shape
- assert len(X_shape) == 3, f'input should have 3 dimensions [n_samples, seq_len, n_features],' \
- f'but got shape={X.shape}'
- assert X_shape[1] == expected_n_steps, f'expect X.shape[1] to be {expected_n_steps}, but got {X_shape[1]}'
- assert X_shape[2] == expected_n_features, f'expect X.shape[2] to be {expected_n_features}, but got {X_shape[2]}'
+ assert len(X_shape) == 3, (
+ f"input should have 3 dimensions [n_samples, seq_len, n_features],"
+ f"but got shape={X.shape}"
+ )
+ assert (
+ X_shape[1] == expected_n_steps
+ ), f"expect X.shape[1] to be {expected_n_steps}, but got {X_shape[1]}"
+ assert (
+ X_shape[2] == expected_n_features
+ ), f"expect X.shape[2] to be {expected_n_features}, but got {X_shape[2]}"
if y is not None:
- assert len(X) == len(y), f'lengths of X and y must match, ' \
- f'but got f{len(X)} and {len(y)}'
+ assert len(X) == len(y), (
+ f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}"
+ )
if isinstance(y, torch.Tensor):
- y = y.to(self.device) if out_dtype == 'tensor' else y.numpy()
+ y = y.to(self.device) if out_dtype == "tensor" else y.numpy()
elif isinstance(y, list):
- y = torch.tensor(y).to(self.device) if out_dtype == 'tensor' else np.asarray(y)
+ y = (
+ torch.tensor(y).to(self.device)
+ if out_dtype == "tensor"
+ else np.asarray(y)
+ )
elif isinstance(y, np.ndarray):
- y = torch.from_numpy(y).to(self.device) if out_dtype == 'tensor' else y
+ y = torch.from_numpy(y).to(self.device) if out_dtype == "tensor" else y
else:
- raise TypeError('y should be an instance of list/np.ndarray/torch.Tensor, '
- f'but got {type(y)}')
+ raise TypeError(
+ "y should be an instance of list/np.ndarray/torch.Tensor, "
+ f"but got {type(y)}"
+ )
return X, y
else:
return X
def save_logs_to_tensorboard(self, saving_path):
- """ Save logs (self.logger) into a tensorboard file.
+ """Save logs (self.logger) into a tensorboard file.
Parameters
----------
@@ -110,14 +131,14 @@ def save_logs_to_tensorboard(self, saving_path):
Local disk path to save the tensorboard file.
"""
# TODO: find a solution for log saving
- raise IOError('This function is not ready for users.')
+ raise IOError("This function is not ready for users.")
# tb_summary_writer = SummaryWriter(saving_path)
# tb_summary_writer.add_custom_scalars(self.logger)
# tb_summary_writer.close()
# print(f'Log saved successfully to {saving_path}.')
def save_model(self, saving_path):
- """ Save the model to a disk file.
+ """Save the model to a disk file.
Parameters
----------
@@ -128,10 +149,10 @@ def save_model(self, saving_path):
torch.save(self.model, saving_path)
except Exception as e:
print(e)
- print(f'Saved successfully to {saving_path}.')
+ print(f"Saved successfully to {saving_path}.")
def load_model(self, model_path):
- """ Load the saved model from a disk file.
+ """Load the saved model from a disk file.
Parameters
----------
@@ -152,14 +173,15 @@ def load_model(self, model_path):
self.model = loaded_model.model
except Exception as e:
raise e
- print(f'Model loaded successfully from {model_path}.')
+ print(f"Model loaded successfully from {model_path}.")
class BaseNNModel(BaseModel):
- """ Abstract class for all neural-network models.
- """
+ """Abstract class for all neural-network models."""
- def __init__(self, learning_rate, epochs, patience, batch_size, weight_decay, device):
+ def __init__(
+ self, learning_rate, epochs, patience, batch_size, weight_decay, device
+ ):
super().__init__(device)
# training hyper-parameters
@@ -173,14 +195,12 @@ def __init__(self, learning_rate, epochs, patience, batch_size, weight_decay, de
self.model = None
self.optimizer = None
self.best_model_dict = None
- self.best_loss = float('inf')
- self.logger = {
- 'training_loss': [],
- 'validating_loss': []
- }
+ self.best_loss = float("inf")
+ self.logger = {"training_loss": [], "validating_loss": []}
def _print_model_size(self):
- """ Print the number of trainable parameters in the initialized NN model.
- """
+ """Print the number of trainable parameters in the initialized NN model."""
num_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
- print(f'Model initialized successfully. Number of the trainable parameters: {num_params}')
+ print(
+ f"Model initialized successfully. Number of the trainable parameters: {num_params}"
+ )
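Aside, not part of the diff: a minimal sketch of the input contract that `check_input` enforces, with toy shapes chosen purely for illustration.

```python
# X must be a list / np.ndarray / torch.Tensor shaped
# [n_samples, n_steps, n_features]; out_dtype="tensor" converts it to torch.
import numpy as np
import torch

n_samples, n_steps, n_features = 8, 24, 5           # toy sizes, not from the PR
X = np.random.randn(n_samples, n_steps, n_features)

assert X.ndim == 3            # mirrors the len(X_shape) == 3 check
assert X.shape[1] == n_steps  # mirrors the expected_n_steps check
assert X.shape[2] == n_features  # mirrors the expected_n_features check

X_tensor = torch.from_numpy(X)   # what out_dtype="tensor" yields
print(X_tensor.shape)            # torch.Size([8, 24, 5])
```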
diff --git a/pypots/classification/__init__.py b/pypots/classification/__init__.py
index 80d42c90..56a7a03c 100644
--- a/pypots/classification/__init__.py
+++ b/pypots/classification/__init__.py
@@ -10,8 +10,7 @@
from pypots.classification.raindrop import Raindrop
__all__ = [
- 'BRITS',
- 'GRUD',
- 'Raindrop',
-
+ "BRITS",
+ "GRUD",
+ "Raindrop",
]
diff --git a/pypots/classification/base.py b/pypots/classification/base.py
index c1b706a7..4ee02af0 100644
--- a/pypots/classification/base.py
+++ b/pypots/classification/base.py
@@ -15,15 +15,14 @@
class BaseClassifier(BaseModel):
- """ Abstract class for all classification models.
- """
+ """Abstract class for all classification models."""
def __init__(self, device):
super().__init__(device)
@abstractmethod
def fit(self, train_X, train_y, val_X=None, val_y=None):
- """ Train the classifier.
+ """Train the classifier.
Parameters
----------
@@ -45,7 +44,7 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
@abstractmethod
def classify(self, X):
- """ Classify the input with the trained model.
+ """Classify the input with the trained model.
Parameters
----------
@@ -61,9 +60,19 @@ def classify(self, X):
class BaseNNClassifier(BaseNNModel, BaseClassifier):
- def __init__(self, n_classes, learning_rate, epochs, patience, batch_size, weight_decay,
- device):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_classes,
+ learning_rate,
+ epochs,
+ patience,
+ batch_size,
+ weight_decay,
+ device,
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_classes = n_classes
@abstractmethod
@@ -71,12 +80,12 @@ def assemble_input_data(self, data):
pass
def _train_model(self, training_loader, val_loader=None):
- self.optimizer = torch.optim.Adam(self.model.parameters(),
- lr=self.lr,
- weight_decay=self.weight_decay)
+ self.optimizer = torch.optim.Adam(
+ self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
+ )
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
try:
@@ -87,12 +96,14 @@ def _train_model(self, training_loader, val_loader=None):
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
- epoch_train_loss_collector.append(results['loss'].item())
+ epoch_train_loss_collector.append(results["loss"].item())
- mean_train_loss = np.mean(epoch_train_loss_collector) # mean training loss of the current epoch
- self.logger['training_loss'].append(mean_train_loss)
+ mean_train_loss = np.mean(
+ epoch_train_loss_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss"].append(mean_train_loss)
if val_loader is not None:
self.model.eval()
@@ -101,14 +112,16 @@ def _train_model(self, training_loader, val_loader=None):
for idx, data in enumerate(val_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs)
- epoch_val_loss_collector.append(results['loss'].item())
+ epoch_val_loss_collector.append(results["loss"].item())
mean_val_loss = np.mean(epoch_val_loss_collector)
- self.logger['validating_loss'].append(mean_val_loss)
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}')
+ self.logger["validating_loss"].append(mean_val_loss)
+ print(
+ f"epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}"
+ )
mean_loss = mean_val_loss
else:
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}')
+ print(f"epoch {epoch}: training loss {mean_train_loss:.4f}")
mean_loss = mean_train_loss
if mean_loss < self.best_loss:
@@ -118,18 +131,24 @@ def _train_model(self, training_loader, val_loader=None):
else:
self.patience -= 1
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+ "Training got interrupted. Model was not get trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+ RuntimeWarning(
+ "Training got interrupted. "
+ "Model will load the best parameters so far for testing. "
+ "If you don't want it, please try fit() again."
+ )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+ if np.equal(self.best_loss, float("inf")):
+ raise ValueError("Something is wrong. best_loss is Nan after training.")
- print('Finished training.')
+ print("Finished training.")
diff --git a/pypots/classification/brits.py b/pypots/classification/brits.py
index 10618dd3..f73dbcf5 100644
--- a/pypots/classification/brits.py
+++ b/pypots/classification/brits.py
@@ -12,10 +12,7 @@
from pypots.classification.base import BaseNNClassifier
from pypots.data import DatasetForBRITS
-from pypots.imputation.brits import (
- RITS as imputation_RITS,
- _BRITS as imputation_BRITS
-)
+from pypots.imputation.brits import RITS as imputation_RITS, _BRITS as imputation_BRITS
class RITS(imputation_RITS):
@@ -24,16 +21,24 @@ def __init__(self, n_steps, n_features, rnn_hidden_size, n_classes, device=None)
self.dropout = nn.Dropout(p=0.25)
self.classifier = nn.Linear(self.rnn_hidden_size, n_classes)
- def forward(self, inputs, direction='forward'):
+ def forward(self, inputs, direction="forward"):
ret_dict = super().forward(inputs, direction)
- logits = self.classifier(ret_dict['final_hidden_state'])
- ret_dict['prediction'] = torch.softmax(logits, dim=1)
+ logits = self.classifier(ret_dict["final_hidden_state"])
+ ret_dict["prediction"] = torch.softmax(logits, dim=1)
return ret_dict
class _BRITS(imputation_BRITS, nn.Module):
- def __init__(self, n_steps, n_features, rnn_hidden_size, n_classes,
- classification_weight, reconstruction_weight, device=None):
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ rnn_hidden_size,
+ n_classes,
+ classification_weight,
+ reconstruction_weight,
+ device=None,
+ ):
super().__init__(n_steps, n_features, rnn_hidden_size)
self.n_steps = n_steps
self.n_features = n_features
@@ -46,7 +51,7 @@ def __init__(self, n_steps, n_features, rnn_hidden_size, n_classes,
self.reconstruction_weight = reconstruction_weight
def merge_ret(self, ret_f, ret_b):
- """ Merge (average) results from two RITS models into one.
+ """Merge (average) results from two RITS models into one.
Parameters
----------
@@ -61,19 +66,19 @@ def merge_ret(self, ret_f, ret_b):
Merged results in a dictionary.
"""
results = {
- 'imputed_data': (ret_f['imputed_data'] + ret_b['imputed_data']) / 2,
- 'prediction': (ret_f['prediction'] + ret_b['prediction']) / 2
+ "imputed_data": (ret_f["imputed_data"] + ret_b["imputed_data"]) / 2,
+ "prediction": (ret_f["prediction"] + ret_b["prediction"]) / 2,
}
return results
def classify(self, inputs):
- ret_f = self.rits_f(inputs, 'forward')
- ret_b = self.reverse(self.rits_b(inputs, 'backward'))
+ ret_f = self.rits_f(inputs, "forward")
+ ret_b = self.reverse(self.rits_b(inputs, "backward"))
merged_ret = self.merge_ret(ret_f, ret_b)
return merged_ret, ret_f, ret_b
def forward(self, inputs):
- """ Forward processing of BRITS.
+ """Forward processing of BRITS.
Parameters
----------
@@ -85,21 +90,32 @@ def forward(self, inputs):
dict, A dictionary includes all results.
"""
merged_ret, ret_f, ret_b = self.classify(inputs)
- ret_f['classification_loss'] = F.nll_loss(torch.log(ret_f['prediction']), inputs['label'])
- ret_b['classification_loss'] = F.nll_loss(torch.log(ret_b['prediction']), inputs['label'])
- consistency_loss = self.get_consistency_loss(ret_f['imputed_data'], ret_b['imputed_data'])
- classification_loss = (ret_f['classification_loss'] + ret_b['classification_loss']) / 2
- merged_ret['consistency_loss'] = consistency_loss
- merged_ret['classification_loss'] = classification_loss
- merged_ret['loss'] = \
- consistency_loss + \
- (ret_f['reconstruction_loss'] + ret_b['reconstruction_loss']) * self.reconstruction_weight + \
- (ret_f['classification_loss'] + ret_b['classification_loss']) * self.classification_weight
+ ret_f["classification_loss"] = F.nll_loss(
+ torch.log(ret_f["prediction"]), inputs["label"]
+ )
+ ret_b["classification_loss"] = F.nll_loss(
+ torch.log(ret_b["prediction"]), inputs["label"]
+ )
+ consistency_loss = self.get_consistency_loss(
+ ret_f["imputed_data"], ret_b["imputed_data"]
+ )
+ classification_loss = (
+ ret_f["classification_loss"] + ret_b["classification_loss"]
+ ) / 2
+ merged_ret["consistency_loss"] = consistency_loss
+ merged_ret["classification_loss"] = classification_loss
+ merged_ret["loss"] = (
+ consistency_loss
+ + (ret_f["reconstruction_loss"] + ret_b["reconstruction_loss"])
+ * self.reconstruction_weight
+ + (ret_f["classification_loss"] + ret_b["classification_loss"])
+ * self.classification_weight
+ )
return merged_ret
class BRITS(BaseNNClassifier):
- """ BRITS implementation of BaseClassifier.
+ """BRITS implementation of BaseClassifier.
Attributes
----------
@@ -128,20 +144,24 @@ class BRITS(BaseNNClassifier):
Run the model on which device.
"""
- def __init__(self,
- n_steps,
- n_features,
- rnn_hidden_size,
- n_classes,
- classification_weight=1,
- reconstruction_weight=1,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(n_classes, learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ rnn_hidden_size,
+ n_classes,
+ classification_weight=1,
+ reconstruction_weight=1,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ n_classes, learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_steps = n_steps
self.n_features = n_features
@@ -149,13 +169,20 @@ def __init__(self,
self.classification_weight = classification_weight
self.reconstruction_weight = reconstruction_weight
- self.model = _BRITS(self.n_steps, self.n_features, self.rnn_hidden_size, self.n_classes,
- self.classification_weight, self.reconstruction_weight, self.device)
+ self.model = _BRITS(
+ self.n_steps,
+ self.n_features,
+ self.rnn_hidden_size,
+ self.n_classes,
+ self.classification_weight,
+ self.reconstruction_weight,
+ self.device,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
def fit(self, train_X, train_y, val_X=None, val_y=None):
- """ Fit the model on the given training data.
+ """Fit the model on the given training data.
Parameters
----------
@@ -169,11 +196,17 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
self : object,
Trained model.
"""
- train_X, train_y = self.check_input(self.n_steps, self.n_features, train_X, train_y)
+ train_X, train_y = self.check_input(
+ self.n_steps, self.n_features, train_X, train_y
+ )
val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y)
- training_set = DatasetForBRITS(train_X, train_y) # time_gaps is necessary for BRITS
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_set = DatasetForBRITS(
+ train_X, train_y
+ ) # time_gaps is necessary for BRITS
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
@@ -187,7 +220,7 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -200,22 +233,27 @@ def assemble_input_data(self, data):
A dictionary with data assembled.
"""
# fetch data
- indices, X, missing_mask, deltas, back_X, back_missing_mask, back_deltas, label = data
+ (
+ indices,
+ X,
+ missing_mask,
+ deltas,
+ back_X,
+ back_missing_mask,
+ back_deltas,
+ label,
+ ) = data
# assemble input data
inputs = {
- 'indices': indices,
- 'label': label,
- 'forward': {
- 'X': X,
- 'missing_mask': missing_mask,
- 'deltas': deltas
+ "indices": indices,
+ "label": label,
+ "forward": {"X": X, "missing_mask": missing_mask, "deltas": deltas},
+ "backward": {
+ "X": back_X,
+ "missing_mask": back_missing_mask,
+ "deltas": back_deltas,
},
- 'backward': {
- 'X': back_X,
- 'missing_mask': back_missing_mask,
- 'deltas': back_deltas
- }
}
return inputs
@@ -229,24 +267,28 @@ def classify(self, X):
with torch.no_grad():
for idx, data in enumerate(test_loader):
# cannot use assemble_input_data() here, because these data have no labels
- indices, X, missing_mask, deltas, back_X, back_missing_mask, back_deltas = data
+ (
+ indices,
+ X,
+ missing_mask,
+ deltas,
+ back_X,
+ back_missing_mask,
+ back_deltas,
+ ) = data
# assemble input data
inputs = {
- 'indices': indices,
- 'forward': {
- 'X': X,
- 'missing_mask': missing_mask,
- 'deltas': deltas
+ "indices": indices,
+ "forward": {"X": X, "missing_mask": missing_mask, "deltas": deltas},
+ "backward": {
+ "X": back_X,
+ "missing_mask": back_missing_mask,
+ "deltas": back_deltas,
},
- 'backward': {
- 'X': back_X,
- 'missing_mask': back_missing_mask,
- 'deltas': back_deltas
- }
}
results, _, _ = self.model.classify(inputs)
- prediction_collector.append(results['prediction'])
+ prediction_collector.append(results["prediction"])
predictions = torch.cat(prediction_collector)
return predictions.cpu().detach().numpy()
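A hypothetical quick-start for the reformatted classifier, assuming NaN marks missing values; the shapes and hyper-parameters below are illustrative, not from this PR:

```python
import numpy as np
from pypots.classification import BRITS

n_samples, n_steps, n_features = 64, 24, 5
X = np.random.randn(n_samples, n_steps, n_features)
X[np.random.rand(*X.shape) < 0.1] = np.nan  # simulate 10% missingness
y = np.random.randint(0, 2, n_samples)      # binary labels

model = BRITS(
    n_steps=n_steps,
    n_features=n_features,
    rnn_hidden_size=32,
    n_classes=2,
    epochs=5,  # small value just for a smoke test
)
model.fit(X, y)
proba = model.classify(X)  # [n_samples, n_classes] class probabilities
```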
diff --git a/pypots/classification/grud.py b/pypots/classification/grud.py
index da23e88d..7b313eb0 100644
--- a/pypots/classification/grud.py
+++ b/pypots/classification/grud.py
@@ -24,19 +24,27 @@ def __init__(self, n_steps, n_features, rnn_hidden_size, n_classes, device=None)
self.device = device
# create models
- self.rnn_cell = nn.GRUCell(self.n_features * 2 + self.rnn_hidden_size, self.rnn_hidden_size)
- self.temp_decay_h = TemporalDecay(input_size=self.n_features, output_size=self.rnn_hidden_size, diag=False)
- self.temp_decay_x = TemporalDecay(input_size=self.n_features, output_size=self.n_features, diag=True)
+ self.rnn_cell = nn.GRUCell(
+ self.n_features * 2 + self.rnn_hidden_size, self.rnn_hidden_size
+ )
+ self.temp_decay_h = TemporalDecay(
+ input_size=self.n_features, output_size=self.rnn_hidden_size, diag=False
+ )
+ self.temp_decay_x = TemporalDecay(
+ input_size=self.n_features, output_size=self.n_features, diag=True
+ )
self.classifier = nn.Linear(self.rnn_hidden_size, self.n_classes)
def classify(self, inputs):
- values = inputs['X']
- masks = inputs['missing_mask']
- deltas = inputs['deltas']
- empirical_mean = inputs['empirical_mean']
- X_filledLOCF = inputs['X_filledLOCF']
+ values = inputs["X"]
+ masks = inputs["missing_mask"]
+ deltas = inputs["deltas"]
+ empirical_mean = inputs["empirical_mean"]
+ X_filledLOCF = inputs["X_filledLOCF"]
- hidden_state = torch.zeros((values.size()[0], self.rnn_hidden_size), device=self.device)
+ hidden_state = torch.zeros(
+ (values.size()[0], self.rnn_hidden_size), device=self.device
+ )
for t in range(self.n_steps):
# for data, [batch, time, features]
@@ -59,7 +67,7 @@ def classify(self, inputs):
return prediction
def forward(self, inputs):
- """ Forward processing of GRU-D.
+ """Forward processing of GRU-D.
Parameters
----------
@@ -72,16 +80,13 @@ def forward(self, inputs):
A dictionary includes all results.
"""
prediction = self.classify(inputs)
- classification_loss = F.nll_loss(torch.log(prediction), inputs['label'])
- results = {
- 'prediction': prediction,
- 'loss': classification_loss
- }
+ classification_loss = F.nll_loss(torch.log(prediction), inputs["label"])
+ results = {"prediction": prediction, "loss": classification_loss}
return results
class GRUD(BaseNNClassifier):
- """ GRU-D implementation of BaseClassifier.
+ """GRU-D implementation of BaseClassifier.
Attributes
----------
@@ -110,28 +115,38 @@ class GRUD(BaseNNClassifier):
Run the model on which device.
"""
- def __init__(self,
- n_steps,
- n_features,
- rnn_hidden_size,
- n_classes,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(n_classes, learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ rnn_hidden_size,
+ n_classes,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ n_classes, learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_steps = n_steps
self.n_features = n_features
self.rnn_hidden_size = rnn_hidden_size
- self.model = _GRUD(self.n_steps, self.n_features, self.rnn_hidden_size, self.n_classes, self.device)
+ self.model = _GRUD(
+ self.n_steps,
+ self.n_features,
+ self.rnn_hidden_size,
+ self.n_classes,
+ self.device,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
def fit(self, train_X, train_y, val_X=None, val_y=None):
- """ Fit the model on the given training data.
+ """Fit the model on the given training data.
Parameters
----------
@@ -145,11 +160,15 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
self : object,
Trained model.
"""
- train_X, train_y = self.check_input(self.n_steps, self.n_features, train_X, train_y)
+ train_X, train_y = self.check_input(
+ self.n_steps, self.n_features, train_X, train_y
+ )
val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y)
training_set = DatasetForGRUD(train_X, train_y)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
@@ -163,7 +182,7 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -180,13 +199,13 @@ def assemble_input_data(self, data):
# assemble input data
inputs = {
- 'indices': indices,
- 'X': X,
- 'X_filledLOCF': X_filledLOCF,
- 'missing_mask': missing_mask,
- 'deltas': deltas,
- 'empirical_mean': empirical_mean,
- 'label': label,
+ "indices": indices,
+ "X": X,
+ "X_filledLOCF": X_filledLOCF,
+ "missing_mask": missing_mask,
+ "deltas": deltas,
+ "empirical_mean": empirical_mean,
+ "label": label,
}
return inputs
@@ -203,12 +222,12 @@ def classify(self, X):
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean = data
# assemble input data
inputs = {
- 'indices': indices,
- 'X': X,
- 'X_filledLOCF': X_filledLOCF,
- 'missing_mask': missing_mask,
- 'deltas': deltas,
- 'empirical_mean': empirical_mean,
+ "indices": indices,
+ "X": X,
+ "X_filledLOCF": X_filledLOCF,
+ "missing_mask": missing_mask,
+ "deltas": deltas,
+ "empirical_mean": empirical_mean,
}
prediction = self.model.classify(inputs)
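Context on the `temp_decay_h` / `temp_decay_x` modules wired up above: GRU-D attenuates stale information with a decay factor gamma = exp(-ReLU(W * delta + b)), where delta is the time since each feature was last observed. A toy re-implementation (not the pypots `TemporalDecay` class itself) to show the mechanism:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyTemporalDecay(nn.Module):
    """gamma = exp(-ReLU(W @ delta + b)); each gamma lies in (0, 1]."""
    def __init__(self, input_size, output_size):
        super().__init__()
        self.lin = nn.Linear(input_size, output_size)

    def forward(self, delta):
        return torch.exp(-F.relu(self.lin(delta)))

decay = ToyTemporalDecay(input_size=5, output_size=5)
delta = torch.tensor([[0.0, 1.0, 2.0, 4.0, 8.0]])  # time gaps per feature
print(decay(delta))                                # entries in (0, 1]
```

The larger the positive pre-activation, the closer gamma gets to 0, which pulls the hidden state and the LOCF-filled inputs back toward their defaults (e.g. the empirical mean).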
diff --git a/pypots/classification/raindrop.py b/pypots/classification/raindrop.py
index 29fdc9e4..9d2907d3 100644
--- a/pypots/classification/raindrop.py
+++ b/pypots/classification/raindrop.py
@@ -38,17 +38,18 @@
class PositionalEncodingTF(nn.Module):
- """ Generate positional encoding according to time information.
- """
+ """Generate positional encoding according to time information."""
def __init__(self, d_pe, max_len=500):
super().__init__()
- assert d_pe % 2 == 0, 'd_pe should be even, otherwise the output dims will be not equal to d_pe'
+ assert (
+ d_pe % 2 == 0
+ ), "d_pe should be even, otherwise the output dims will be not equal to d_pe"
self.max_len = max_len
self._num_timescales = d_pe // 2
def forward(self, time_vectors):
- """ Generate positional encoding.
+ """Generate positional encoding.
Parameters
----------
@@ -64,7 +65,9 @@ def forward(self, time_vectors):
times = time_vectors.unsqueeze(2)
scaled_time = times / torch.Tensor(timescales[None, None, :])
- pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], axis=-1) # T x B x d_model
+ pe = torch.cat(
+ [torch.sin(scaled_time), torch.cos(scaled_time)], axis=-1
+ ) # T x B x d_model
pe = pe.type(torch.FloatTensor)
return pe
@@ -72,11 +75,22 @@ def forward(self, time_vectors):
class ObservationPropagation(MessagePassing):
_alpha: OptTensor
- def __init__(self, in_channels: Union[int, Tuple[int, int]], out_channels: int,
- n_nodes: int, ob_dim: int, heads: int = 1, concat: bool = True,
- beta: bool = False, dropout: float = 0., edge_dim: Optional[int] = None,
- bias: bool = True, root_weight: bool = True, **kwargs):
- kwargs.setdefault('aggr', 'add')
+ def __init__(
+ self,
+ in_channels: Union[int, Tuple[int, int]],
+ out_channels: int,
+ n_nodes: int,
+ ob_dim: int,
+ heads: int = 1,
+ concat: bool = True,
+ beta: bool = False,
+ dropout: float = 0.0,
+ edge_dim: Optional[int] = None,
+ bias: bool = True,
+ root_weight: bool = True,
+ **kwargs
+ ):
+ kwargs.setdefault("aggr", "add")
super().__init__(node_dim=0, **kwargs)
self.in_channels = in_channels
@@ -97,27 +111,28 @@ def __init__(self, in_channels: Union[int, Tuple[int, int]], out_channels: int,
if edge_dim is not None:
self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False)
else:
- self.lin_edge = self.register_parameter('lin_edge', None)
+ self.lin_edge = self.register_parameter("lin_edge", None)
if concat:
- self.lin_skip = Linear(in_channels[1], heads * out_channels,
- bias=bias)
+ self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias)
if self.beta:
self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False)
else:
- self.lin_beta = self.register_parameter('lin_beta', None)
+ self.lin_beta = self.register_parameter("lin_beta", None)
else:
self.lin_skip = Linear(in_channels[1], out_channels, bias=bias)
if self.beta:
self.lin_beta = Linear(3 * out_channels, 1, bias=False)
else:
- self.lin_beta = self.register_parameter('lin_beta', None)
+ self.lin_beta = self.register_parameter("lin_beta", None)
self.weight = Parameter(torch.Tensor(in_channels[1], heads * out_channels))
self.bias = Parameter(torch.Tensor(heads * out_channels))
self.n_nodes = n_nodes
- self.nodewise_weights = Parameter(torch.Tensor(self.n_nodes, heads * out_channels))
+ self.nodewise_weights = Parameter(
+ torch.Tensor(self.n_nodes, heads * out_channels)
+ )
self.increase_dim = Linear(in_channels[1], heads * out_channels * 8)
self.map_weights = Parameter(torch.Tensor(self.n_nodes, heads * 16))
@@ -145,8 +160,16 @@ def reset_parameters(self):
glorot(self.map_weights)
self.increase_dim.reset_parameters()
- def forward(self, x: Union[Tensor, PairTensor], p_t: Tensor, edge_index: Adj, edge_weights=None, use_beta=False,
- edge_attr: OptTensor = None, return_attention_weights=None):
+ def forward(
+ self,
+ x: Union[Tensor, PairTensor],
+ p_t: Tensor,
+ edge_index: Adj,
+ edge_weights=None,
+ use_beta=False,
+ edge_attr: OptTensor = None,
+ return_attention_weights=None,
+ ):
r"""
Args:
@@ -165,7 +188,9 @@ def forward(self, x: Union[Tensor, PairTensor], p_t: Tensor, edge_index: Adj, ed
if isinstance(x, Tensor):
x: PairTensor = (x, x)
- out = self.propagate(edge_index, x=x, edge_weights=edge_weights, edge_attr=edge_attr, size=None)
+ out = self.propagate(
+ edge_index, x=x, edge_weights=edge_weights, edge_attr=edge_attr, size=None
+ )
alpha = self._alpha
self._alpha = None
@@ -181,13 +206,20 @@ def forward(self, x: Union[Tensor, PairTensor], p_t: Tensor, edge_index: Adj, ed
if isinstance(edge_index, Tensor):
return out, (edge_index, alpha)
elif isinstance(edge_index, SparseTensor):
- return out, edge_index.set_value(alpha, layout='coo')
+ return out, edge_index.set_value(alpha, layout="coo")
else:
return out
- def message_selfattention(self, x_i: Tensor, x_j: Tensor, edge_weights: Tensor, edge_attr: OptTensor,
- index: Tensor, ptr: OptTensor,
- size_i: Optional[int]) -> Tensor:
+ def message_selfattention(
+ self,
+ x_i: Tensor,
+ x_j: Tensor,
+ edge_weights: Tensor,
+ edge_attr: OptTensor,
+ index: Tensor,
+ ptr: OptTensor,
+ size_i: Optional[int],
+ ) -> Tensor:
query = self.lin_query(x_i).view(-1, self.heads, self.out_channels)
key = self.lin_key(x_j).view(-1, self.heads, self.out_channels)
@@ -208,9 +240,16 @@ def message_selfattention(self, x_i: Tensor, x_j: Tensor, edge_weights: Tensor,
out *= alpha.view(-1, self.heads, 1)
return out
- def message(self, x_i: Tensor, x_j: Tensor, edge_weights: Tensor, edge_attr: OptTensor,
- index: Tensor, ptr: OptTensor,
- size_i: Optional[int]) -> Tensor:
+ def message(
+ self,
+ x_i: Tensor,
+ x_j: Tensor,
+ edge_weights: Tensor,
+ edge_attr: OptTensor,
+ index: Tensor,
+ ptr: OptTensor,
+ size_i: Optional[int],
+ ) -> Tensor:
use_beta = self.use_beta
if use_beta:
n_step = self.p_t.shape[0]
@@ -221,7 +260,17 @@ def message(self, x_i: Tensor, x_j: Tensor, edge_weights: Tensor, edge_attr: Opt
p_emb = self.p_t.unsqueeze(0)
- aa = torch.cat([w_v.repeat(1, n_step, 1, ), p_emb.repeat(n_edges, 1, 1)], dim=-1)
+ aa = torch.cat([w_v.repeat(1, n_step, 1), p_emb.repeat(n_edges, 1, 1)], dim=-1)
beta = torch.mean(h_W * aa, dim=-1)
if edge_weights is not None:
@@ -257,16 +306,22 @@ def message(self, x_i: Tensor, x_j: Tensor, edge_weights: Tensor, edge_attr: Opt
target_nodes = self.edge_index[1]
w1 = self.nodewise_weights[source_nodes].unsqueeze(-1)
w2 = self.nodewise_weights[target_nodes].unsqueeze(1)
- out = torch.bmm(x_i.view(-1, self.heads, self.out_channels), torch.bmm(w1, w2))
+ out = torch.bmm(
+ x_i.view(-1, self.heads, self.out_channels), torch.bmm(w1, w2)
+ )
if use_beta:
out = out * gamma.view(-1, self.heads, out.shape[-1])
else:
out = out * gamma.view(-1, self.heads, 1)
return out
- def aggregate(self, inputs: Tensor, index: Tensor,
- ptr: Optional[Tensor] = None,
- dim_size: Optional[int] = None) -> Tensor:
+ def aggregate(
+ self,
+ inputs: Tensor,
+ index: Tensor,
+ ptr: Optional[Tensor] = None,
+ dim_size: Optional[int] = None,
+ ) -> Tensor:
r"""Aggregates messages from neighbors as
:math:`\square_{j \in \mathcal{N}(i)}`.
@@ -278,19 +333,33 @@ def aggregate(self, inputs: Tensor, index: Tensor,
:meth:`__init__` by the :obj:`aggr` argument.
"""
index = self.index
- return scatter(inputs, index, dim=self.node_dim, dim_size=dim_size,
- reduce=self.aggr)
+ return scatter(
+ inputs, index, dim=self.node_dim, dim_size=dim_size, reduce=self.aggr
+ )
def __repr__(self):
- return '{}({}, {}, heads={})'.format(self.__class__.__name__,
- self.in_channels,
- self.out_channels,
- self.heads)
+ return "{}({}, {}, heads={})".format(
+ self.__class__.__name__, self.in_channels, self.out_channels, self.heads
+ )
class _Raindrop(nn.Module):
- def __init__(self, n_layers, n_features, d_model, d_inner, n_heads, n_classes, dropout=0.3, max_len=215, d_static=9,
- aggregation='mean', sensor_wise_mask=False, static=False, device=None):
+ def __init__(
+ self,
+ n_layers,
+ n_features,
+ d_model,
+ d_inner,
+ n_heads,
+ n_classes,
+ dropout=0.3,
+ max_len=215,
+ d_static=9,
+ aggregation="mean",
+ sensor_wise_mask=False,
+ static=False,
+ device=None,
+ ):
super().__init__()
self.n_layers = n_layers
self.n_features = n_features
@@ -310,30 +379,43 @@ def __init__(self, n_layers, n_features, d_model, d_inner, n_heads, n_classes, d
self.global_structure = torch.ones(n_features, n_features, device=self.device)
if self.static:
self.emb = nn.Linear(d_static, n_features)
- assert d_model % n_features == 0, 'd_model must be divisible by n_features'
+ assert d_model % n_features == 0, "d_model must be divisible by n_features"
self.d_ob = int(d_model / n_features)
self.encoder = nn.Linear(n_features * self.d_ob, n_features * self.d_ob)
d_pe = 16
self.pos_encoder = PositionalEncodingTF(d_pe, max_len)
if self.sensor_wise_mask:
dim_check = n_features * (self.d_ob + d_pe)
- assert dim_check % n_heads == 0, 'dim_check must be divisible by n_heads'
- encoder_layers = TransformerEncoderLayer(n_features * (self.d_ob + d_pe), n_heads, d_inner, dropout)
+ assert dim_check % n_heads == 0, "dim_check must be divisible by n_heads"
+ encoder_layers = TransformerEncoderLayer(
+ n_features * (self.d_ob + d_pe), n_heads, d_inner, dropout
+ )
else:
dim_check = d_model + d_pe
- assert dim_check % n_heads == 0, 'dim_check must be divisible by n_heads'
- encoder_layers = TransformerEncoderLayer(d_model + d_pe, n_heads, d_inner, dropout)
+ assert dim_check % n_heads == 0, "dim_check must be divisible by n_heads"
+ encoder_layers = TransformerEncoderLayer(
+ d_model + d_pe, n_heads, d_inner, dropout
+ )
self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
self.adj = torch.ones([self.n_features, self.n_features], device=self.device)
self.R_u = Parameter(torch.Tensor(1, self.n_features * self.d_ob))
- self.ob_propagation = ObservationPropagation(in_channels=max_len * self.d_ob, out_channels=max_len * self.d_ob,
- heads=1, n_nodes=n_features, ob_dim=self.d_ob)
- self.ob_propagation_layer2 = ObservationPropagation(in_channels=max_len * self.d_ob,
- out_channels=max_len * self.d_ob, heads=1,
- n_nodes=n_features, ob_dim=self.d_ob)
+ self.ob_propagation = ObservationPropagation(
+ in_channels=max_len * self.d_ob,
+ out_channels=max_len * self.d_ob,
+ heads=1,
+ n_nodes=n_features,
+ ob_dim=self.d_ob,
+ )
+ self.ob_propagation_layer2 = ObservationPropagation(
+ in_channels=max_len * self.d_ob,
+ out_channels=max_len * self.d_ob,
+ heads=1,
+ n_nodes=n_features,
+ ob_dim=self.d_ob,
+ )
if static:
d_final = d_model + d_pe + n_features
else:
@@ -356,7 +438,7 @@ def init_weights(self):
glorot(self.R_u)
def classify(self, inputs):
- """ Forward processing of BRITS.
+ """Forward processing of BRITS.
Parameters
----------
@@ -376,36 +458,38 @@ def classify(self, inputs):
Number of nonzero recordings.
missing_mask : array, shape of [n_steps, n_samples, n_features]
"""
- src = inputs['X']
- static = inputs['static']
- times = inputs['timestamps']
- lengths = inputs['lengths']
- missing_mask = inputs['missing_mask']
+ src = inputs["X"]
+ static = inputs["static"]
+ times = inputs["timestamps"]
+ lengths = inputs["lengths"]
+ missing_mask = inputs["missing_mask"]
max_len, batch_size = src.shape[0], src.shape[1]
src = torch.repeat_interleave(src, self.d_ob, dim=-1)
h = F.relu(src * self.R_u)
- pe = self.pos_encoder(times)
+ pe = self.pos_encoder(times).to(self.device)
if static is not None:
emb = self.emb(static)
h = self.dropout(h)
mask = torch.arange(max_len)[None, :] >= (lengths.cpu()[:, None])
- mask = mask.squeeze(1)
+ mask = mask.squeeze(1).to(self.device)
x = h
adj = self.global_structure
- adj[torch.eye(self.n_features).byte()] = 1
+ adj[torch.eye(self.n_features, dtype=torch.bool)] = 1
edge_index = torch.nonzero(adj).T
edge_weights = adj[edge_index[0], edge_index[1]]
batch_size = src.shape[1]
n_step = src.shape[0]
- output = torch.zeros([n_step, batch_size, self.n_features * self.d_ob], device=self.device)
+ output = torch.zeros(
+ [n_step, batch_size, self.n_features * self.d_ob], device=self.device
+ )
alpha_all = torch.zeros([edge_index.shape[1], batch_size], device=self.device)
@@ -414,21 +498,33 @@ def classify(self, inputs):
step_data = x[:, unit, :]
p_t = pe[:, unit, :]
- step_data = step_data.reshape([n_step, self.n_features, self.d_ob]).permute(1, 0, 2)
+ step_data = step_data.reshape([n_step, self.n_features, self.d_ob]).permute(
+ 1, 0, 2
+ )
step_data = step_data.reshape(self.n_features, n_step * self.d_ob)
- step_data, attention_weights = self.ob_propagation(step_data, p_t=p_t, edge_index=edge_index,
- edge_weights=edge_weights,
- use_beta=False, edge_attr=None,
- return_attention_weights=True)
+ step_data, attention_weights = self.ob_propagation(
+ step_data,
+ p_t=p_t,
+ edge_index=edge_index,
+ edge_weights=edge_weights,
+ use_beta=False,
+ edge_attr=None,
+ return_attention_weights=True,
+ )
edge_index_layer2 = attention_weights[0]
edge_weights_layer2 = attention_weights[1].squeeze(-1)
- step_data, attention_weights = self.ob_propagation_layer2(step_data, p_t=p_t, edge_index=edge_index_layer2,
- edge_weights=edge_weights_layer2,
- use_beta=False, edge_attr=None,
- return_attention_weights=True)
+ step_data, attention_weights = self.ob_propagation_layer2(
+ step_data,
+ p_t=p_t,
+ edge_index=edge_index_layer2,
+ edge_weights=edge_weights_layer2,
+ use_beta=False,
+ edge_attr=None,
+ return_attention_weights=True,
+ )
step_data = step_data.view([self.n_features, n_step, self.d_ob])
step_data = step_data.permute([1, 0, 2]) # [n_step, n_features, d_ob]
@@ -452,19 +548,25 @@ def classify(self, inputs):
sensor_wise_mask = self.sensor_wise_mask
- lengths2 = lengths.unsqueeze(1)
+ lengths2 = lengths.unsqueeze(1).to(self.device)
mask2 = mask.permute(1, 0).unsqueeze(2).long()
if sensor_wise_mask:
- output = torch.zeros([batch_size, self.n_features, self.d_ob + 16], device=self.device)
+ output = torch.zeros(
+ [batch_size, self.n_features, self.d_ob + 16], device=self.device
+ )
extended_missing_mask = missing_mask.view(-1, batch_size, self.n_features)
for se in range(self.n_features):
r_out = r_out.view(-1, batch_size, self.n_features, (self.d_ob + 16))
out = r_out[:, :, se, :]
- l_ = torch.sum(extended_missing_mask[:, :, se], dim=0).unsqueeze(1) # length
- out_sensor = torch.sum(out * (1 - extended_missing_mask[:, :, se].unsqueeze(-1)), dim=0) / (l_ + 1)
+ l_ = torch.sum(extended_missing_mask[:, :, se], dim=0).unsqueeze(
+ 1
+ ) # length
+ out_sensor = torch.sum(
+ out * (1 - extended_missing_mask[:, :, se].unsqueeze(-1)), dim=0
+ ) / (l_ + 1)
output[:, se, :] = out_sensor
output = output.view([-1, self.n_features * (self.d_ob + 16)])
- elif self.aggregation == 'mean':
+ elif self.aggregation == "mean":
output = torch.sum(r_out * (1 - mask2), dim=0) / (lengths2 + 1)
else:
raise RuntimeError
@@ -479,11 +581,11 @@ def classify(self, inputs):
def forward(self, inputs):
prediction = self.classify(inputs)
- classification_loss = F.nll_loss(torch.log(prediction), inputs['label'])
+ classification_loss = F.nll_loss(torch.log(prediction), inputs["label"])
results = {
- 'prediction': prediction,
- 'loss': classification_loss
+ "prediction": prediction,
+ "loss": classification_loss
# 'distance': distance,
}
@@ -510,37 +612,53 @@ class Raindrop(BaseNNClassifier):
Run the model on which device.
"""
- def __init__(self,
- n_features,
- n_layers,
- d_model,
- d_inner,
- n_heads,
- n_classes,
- dropout,
- max_len,
- d_static,
- aggregation,
- sensor_wise_mask,
- static,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(n_classes, learning_rate, epochs, patience, batch_size,
- weight_decay, device)
+ def __init__(
+ self,
+ n_features,
+ n_layers,
+ d_model,
+ d_inner,
+ n_heads,
+ n_classes,
+ dropout,
+ max_len,
+ d_static,
+ aggregation,
+ sensor_wise_mask,
+ static,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ n_classes, learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_features = n_features
self.n_steps = max_len
- self.model = _Raindrop(n_layers, n_features, d_model, d_inner, n_heads, n_classes, dropout, max_len, d_static,
- aggregation, sensor_wise_mask, static=static, device=self.device)
+ self.model = _Raindrop(
+ n_layers,
+ n_features,
+ d_model,
+ d_inner,
+ n_heads,
+ n_classes,
+ dropout,
+ max_len,
+ d_static,
+ aggregation,
+ sensor_wise_mask,
+ static=static,
+ device=self.device,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
def fit(self, train_X, train_y, val_X=None, val_y=None):
- """ Fit the model on the given training data.
+ """Fit the model on the given training data.
Parameters
----------
@@ -554,11 +672,15 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
self : object,
Trained model.
"""
- train_X, train_y = self.check_input(self.n_steps, self.n_features, train_X, train_y)
+ train_X, train_y = self.check_input(
+ self.n_steps, self.n_features, train_X, train_y
+ )
val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y)
training_set = DatasetForGRUD(train_X, train_y)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
@@ -572,7 +694,7 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -596,12 +718,12 @@ def assemble_input_data(self, data):
times = times.permute(1, 0)
inputs = {
- 'X': X,
- 'static': None,
- 'timestamps': times,
- 'lengths': lengths,
- 'missing_mask': missing_mask,
- 'label': label
+ "X": X,
+ "static": None,
+ "timestamps": times,
+ "lengths": lengths,
+ "missing_mask": missing_mask,
+ "label": label,
}
return inputs
@@ -627,11 +749,11 @@ def classify(self, X):
times = times.permute(1, 0)
inputs = {
- 'X': X,
- 'static': None,
- 'timestamps': times,
- 'lengths': lengths,
- 'missing_mask': missing_mask,
+ "X": X,
+ "static": None,
+ "timestamps": times,
+ "lengths": lengths,
+ "missing_mask": missing_mask,
}
prediction = self.model.classify(inputs)
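On `PositionalEncodingTF` reformatted above: it embeds raw timestamps with sin/cos pairs over `_num_timescales = d_pe // 2` timescales, so irregular sampling is encoded directly. A toy sketch assuming geometrically spaced timescales up to `max_len` (the exact spacing is defined outside these hunks), written batch-first unlike the time-first tensors in the diff:

```python
import numpy as np
import torch

def toy_time_encoding(times: torch.Tensor, d_pe: int, max_len: int = 500):
    assert d_pe % 2 == 0, "d_pe should be even"
    timescales = max_len ** np.linspace(0, 1, d_pe // 2)  # assumed geometric spacing
    scaled = times.unsqueeze(-1) / torch.tensor(timescales, dtype=torch.float32)
    return torch.cat([torch.sin(scaled), torch.cos(scaled)], dim=-1)

times = torch.tensor([[0.0, 1.5, 3.0, 7.0]])  # irregular timestamps, [batch, steps]
pe = toy_time_encoding(times, d_pe=16)
print(pe.shape)                               # torch.Size([1, 4, 16])
```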
diff --git a/pypots/clustering/__init__.py b/pypots/clustering/__init__.py
index 5ed2a560..b8fb3b8e 100644
--- a/pypots/clustering/__init__.py
+++ b/pypots/clustering/__init__.py
@@ -8,7 +8,4 @@
from pypots.clustering.crli import CRLI
from pypots.clustering.vader import VaDER
-__all__ = [
- 'CRLI',
- 'VaDER'
-]
+__all__ = ["CRLI", "VaDER"]
diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py
index 236f3c67..e7420521 100644
--- a/pypots/clustering/base.py
+++ b/pypots/clustering/base.py
@@ -15,15 +15,14 @@
class BaseClusterer(BaseModel):
- """ Abstract class for all clustering models.
- """
+ """Abstract class for all clustering models."""
def __init__(self, device):
super().__init__(device)
@abstractmethod
def fit(self, train_X):
- """ Train the cluster.
+ """Train the cluster.
Parameters
----------
@@ -39,7 +38,7 @@ def fit(self, train_X):
@abstractmethod
def cluster(self, X):
- """ Cluster the input with the trained model.
+ """Cluster the input with the trained model.
Parameters
----------
@@ -55,8 +54,19 @@ def cluster(self, X):
class BaseNNClusterer(BaseNNModel, BaseClusterer):
- def __init__(self, n_clusters, learning_rate, epochs, patience, batch_size, weight_decay, device):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_clusters,
+ learning_rate,
+ epochs,
+ patience,
+ batch_size,
+ weight_decay,
+ device,
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_clusters = n_clusters
@abstractmethod
@@ -64,12 +74,12 @@ def assemble_input_data(self, data):
pass
def _train_model(self, training_loader, val_loader=None):
- self.optimizer = torch.optim.Adam(self.model.parameters(),
- lr=self.lr,
- weight_decay=self.weight_decay)
+ self.optimizer = torch.optim.Adam(
+ self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
+ )
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
try:
@@ -80,12 +90,14 @@ def _train_model(self, training_loader, val_loader=None):
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
- epoch_train_loss_collector.append(results['loss'].item())
+ epoch_train_loss_collector.append(results["loss"].item())
- mean_train_loss = np.mean(epoch_train_loss_collector) # mean training loss of the current epoch
- self.logger['training_loss'].append(mean_train_loss)
+ mean_train_loss = np.mean(
+ epoch_train_loss_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss"].append(mean_train_loss)
if val_loader is not None:
self.model.eval()
@@ -94,14 +106,16 @@ def _train_model(self, training_loader, val_loader=None):
for idx, data in enumerate(val_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs)
- epoch_val_loss_collector.append(results['loss'].item())
+ epoch_val_loss_collector.append(results["loss"].item())
mean_val_loss = np.mean(epoch_val_loss_collector)
- self.logger['validating_loss'].append(mean_val_loss)
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}')
+ self.logger["validating_loss"].append(mean_val_loss)
+ print(
+ f"epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}"
+ )
mean_loss = mean_val_loss
else:
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}')
+ print(f"epoch {epoch}: training loss {mean_train_loss:.4f}")
mean_loss = mean_train_loss
if mean_loss < self.best_loss:
@@ -111,18 +125,24 @@ def _train_model(self, training_loader, val_loader=None):
else:
self.patience -= 1
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+ "Training got interrupted. Model was not get trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+ RuntimeWarning(
+ "Training got interrupted. "
+ "Model will load the best parameters so far for testing. "
+ "If you don't want it, please try fit() again."
+ )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+ if np.equal(self.best_loss, float("inf")):
+ raise ValueError("Something is wrong. best_loss is Nan after training.")
- print('Finished training.')
+ print("Finished training.")
diff --git a/pypots/clustering/crli.py b/pypots/clustering/crli.py
index 81b70d43..007a00ff 100644
--- a/pypots/clustering/crli.py
+++ b/pypots/clustering/crli.py
@@ -19,8 +19,8 @@
from pypots.utils.metrics import cal_mse
RNN_CELL = {
- 'LSTM': nn.LSTMCell,
- 'GRU': nn.GRUCell,
+ "LSTM": nn.LSTMCell,
+ "GRU": nn.GRUCell,
}
@@ -28,7 +28,9 @@ def reverse_tensor(tensor_):
if tensor_.dim() <= 1:
return tensor_
indices = range(tensor_.size()[1])[::-1]
- indices = torch.tensor(indices, dtype=torch.long, device=tensor_.device, requires_grad=False)
+ indices = torch.tensor(
+ indices, dtype=torch.long, device=tensor_.device, requires_grad=False
+ )
return tensor_.index_select(1, indices)
@@ -42,7 +44,7 @@ def __init__(self, cell_type, n_layer, d_input, d_hidden, device):
self.device = device
self.model = nn.ModuleList()
- if cell_type in ['LSTM', 'GRU']:
+ if cell_type in ["LSTM", "GRU"]:
for i in range(n_layer):
if i == 0:
self.model.append(RNN_CELL[cell_type](d_input, d_hidden))
@@ -52,32 +54,42 @@ def __init__(self, cell_type, n_layer, d_input, d_hidden, device):
self.output_layer = nn.Linear(d_hidden, d_input)
def forward(self, inputs):
- X, missing_mask = inputs['X'], inputs['missing_mask']
+ X, missing_mask = inputs["X"], inputs["missing_mask"]
bz, n_steps, _ = X.shape
hidden_state = torch.zeros((bz, self.d_hidden), device=self.device)
- hidden_state_collector = torch.empty((bz, n_steps, self.d_hidden), device=self.device)
+ hidden_state_collector = torch.empty(
+ (bz, n_steps, self.d_hidden), device=self.device
+ )
output_collector = torch.empty((bz, n_steps, self.d_input), device=self.device)
- if self.cell_type == 'LSTM':
+ if self.cell_type == "LSTM":
# TODO: cell states should have different shapes
cell_states = torch.zeros((self.d_input, self.d_hidden), device=self.device)
for step in range(n_steps):
x = X[:, step, :]
estimation = self.output_layer(hidden_state)
output_collector[:, step] = estimation
- imputed_x = missing_mask[:, step] * x + (1 - missing_mask[:, step]) * estimation
+ imputed_x = (
+ missing_mask[:, step] * x + (1 - missing_mask[:, step]) * estimation
+ )
for i in range(self.n_layer):
if i == 0:
- hidden_state, cell_states = self.model[i](imputed_x, (hidden_state, cell_states))
+ hidden_state, cell_states = self.model[i](
+ imputed_x, (hidden_state, cell_states)
+ )
else:
- hidden_state, cell_states = self.model[i](hidden_state, (hidden_state, cell_states))
+ hidden_state, cell_states = self.model[i](
+ hidden_state, (hidden_state, cell_states)
+ )
hidden_state_collector[:, step, :] = hidden_state
- elif self.cell_type == 'GRU':
+ elif self.cell_type == "GRU":
for step in range(n_steps):
x = X[:, step, :]
estimation = self.output_layer(hidden_state)
output_collector[:, step] = estimation
- imputed_x = missing_mask[:, step] * x + (1 - missing_mask[:, step]) * estimation
+ imputed_x = (
+ missing_mask[:, step] * x + (1 - missing_mask[:, step]) * estimation
+ )
for i in range(self.n_layer):
if i == 0:
hidden_state = self.model[i](imputed_x, hidden_state)
@@ -103,50 +115,60 @@ def forward(self, inputs):
b_outputs, b_final_hidden_state = self.b_rnn(inputs)
b_outputs = reverse_tensor(b_outputs) # reverse the output of the backward rnn
imputation = (f_outputs + b_outputs) / 2
- imputed_X = inputs['X'] * inputs['missing_mask'] + imputation * (1 - inputs['missing_mask'])
- fb_final_hidden_states = torch.concat([f_final_hidden_state, b_final_hidden_state], dim=-1)
+ imputed_X = inputs["X"] * inputs["missing_mask"] + imputation * (
+ 1 - inputs["missing_mask"]
+ )
+ fb_final_hidden_states = torch.concat(
+ [f_final_hidden_state, b_final_hidden_state], dim=-1
+ )
return imputation, imputed_X, fb_final_hidden_states
class Discriminator(nn.Module):
- def __init__(self, cell_type, d_input, device='cpu'):
+ def __init__(self, cell_type, d_input, device="cpu"):
super().__init__()
self.cell_type = cell_type
self.device = device
# this setting is the same as the official implementation
- self.rnn_cell_module_list = nn.ModuleList([
- RNN_CELL[cell_type](d_input, 32),
- RNN_CELL[cell_type](32, 16),
- RNN_CELL[cell_type](16, 8),
- RNN_CELL[cell_type](8, 16),
- RNN_CELL[cell_type](16, 32),
- ])
+ self.rnn_cell_module_list = nn.ModuleList(
+ [
+ RNN_CELL[cell_type](d_input, 32),
+ RNN_CELL[cell_type](32, 16),
+ RNN_CELL[cell_type](16, 8),
+ RNN_CELL[cell_type](8, 16),
+ RNN_CELL[cell_type](16, 32),
+ ]
+ )
self.output_layer = nn.Linear(32, d_input)
def forward(self, inputs):
- imputed_X = inputs['imputed_X']
+ imputed_X = inputs["imputed_X"]
bz, n_steps, _ = imputed_X.shape
hidden_states = [
torch.zeros((bz, 32), device=self.device),
torch.zeros((bz, 16), device=self.device),
torch.zeros((bz, 8), device=self.device),
torch.zeros((bz, 16), device=self.device),
- torch.zeros((bz, 32), device=self.device)
+ torch.zeros((bz, 32), device=self.device),
]
hidden_state_collector = torch.empty((bz, n_steps, 32), device=self.device)
- if self.cell_type == 'LSTM':
+ if self.cell_type == "LSTM":
cell_states = torch.zeros((self.d_input, self.d_hidden), device=self.device)
for step in range(n_steps):
x = imputed_X[:, step, :]
for i, rnn_cell in enumerate(self.rnn_cell_module_list):
if i == 0:
- hidden_state, cell_states = rnn_cell(x, (hidden_states[i], cell_states))
+ hidden_state, cell_states = rnn_cell(
+ x, (hidden_states[i], cell_states)
+ )
else:
- hidden_state, cell_states = rnn_cell(hidden_states[i - 1], (hidden_states[i], cell_states))
+ hidden_state, cell_states = rnn_cell(
+ hidden_states[i - 1], (hidden_states[i], cell_states)
+ )
hidden_states[i] = hidden_state
hidden_state_collector[:, step, :] = hidden_state
- elif self.cell_type == 'GRU':
+ elif self.cell_type == "GRU":
for step in range(n_steps):
x = imputed_X[:, step, :]
for i, rnn_cell in enumerate(self.rnn_cell_module_list):
@@ -162,7 +184,9 @@ def forward(self, inputs):
class Decoder(nn.Module):
- def __init__(self, n_steps, d_input, d_output, fcn_output_dims: list = None, device='cpu'):
+ def __init__(
+ self, n_steps, d_input, d_output, fcn_output_dims: list = None, device="cpu"
+ ):
super().__init__()
self.n_steps = n_steps
self.d_output = d_output
@@ -181,13 +205,15 @@ def __init__(self, n_steps, d_input, d_output, fcn_output_dims: list = None, dev
self.output_layer = nn.Linear(fcn_output_dims[-1], d_output)
def forward(self, inputs):
- generator_fb_hidden_states = inputs['generator_fb_hidden_states']
+ generator_fb_hidden_states = inputs["generator_fb_hidden_states"]
bz, _ = generator_fb_hidden_states.shape
fcn_latent = generator_fb_hidden_states
for layer in self.fcn:
fcn_latent = layer(fcn_latent)
hidden_state = fcn_latent
- hidden_state_collector = torch.empty((bz, self.n_steps, self.fcn_output_dims[-1]), device=self.device)
+ hidden_state_collector = torch.empty(
+ (bz, self.n_steps, self.fcn_output_dims[-1]), device=self.device
+ )
for i in range(self.n_steps):
hidden_state = self.rnn_cell(hidden_state, hidden_state)
hidden_state_collector[:, i, :] = hidden_state
@@ -196,112 +222,148 @@ def forward(self, inputs):
class _CRLI(nn.Module):
- def __init__(self, n_steps, n_features, n_clusters, n_generator_layers, rnn_hidden_size, decoder_fcn_output_dims,
- lambda_kmeans, rnn_cell_type='GRU', device='cpu'):
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ n_clusters,
+ n_generator_layers,
+ rnn_hidden_size,
+ decoder_fcn_output_dims,
+ lambda_kmeans,
+ rnn_cell_type="GRU",
+ device="cpu",
+ ):
super().__init__()
- self.generator = Generator(n_generator_layers, n_features, rnn_hidden_size, rnn_cell_type, device)
+ self.generator = Generator(
+ n_generator_layers, n_features, rnn_hidden_size, rnn_cell_type, device
+ )
self.discriminator = Discriminator(rnn_cell_type, n_features, device)
self.decoder = Decoder(
n_steps, rnn_hidden_size * 2, n_features, decoder_fcn_output_dims, device
) # fully connected network is included in Decoder
- self.kmeans = KMeans(n_clusters=n_clusters) # TODO: implement KMean with torch for gpu acceleration
+ self.kmeans = KMeans(
+ n_clusters=n_clusters
+ ) # TODO: implement KMeans with torch for GPU acceleration
self.n_clusters = n_clusters
self.lambda_kmeans = lambda_kmeans
self.device = device
- def cluster(self, inputs, training_object='generator'):
+ def cluster(self, inputs, training_object="generator"):
# concatenate the generator's final states and feed them to the decoder as its initial state
imputation, imputed_X, generator_fb_hidden_states = self.generator(inputs)
- inputs['imputation'] = imputation
- inputs['imputed_X'] = imputed_X
- inputs['generator_fb_hidden_states'] = generator_fb_hidden_states
- if training_object == 'discriminator':
+ inputs["imputation"] = imputation
+ inputs["imputed_X"] = imputed_X
+ inputs["generator_fb_hidden_states"] = generator_fb_hidden_states
+ if training_object == "discriminator":
discrimination = self.discriminator(inputs)
- inputs['discrimination'] = discrimination
+ inputs["discrimination"] = discrimination
return inputs # when training only the discriminator, there is no need to run the decoder
reconstruction, fcn_latent = self.decoder(inputs)
- inputs['reconstruction'] = reconstruction
- inputs['fcn_latent'] = fcn_latent
+ inputs["reconstruction"] = reconstruction
+ inputs["fcn_latent"] = fcn_latent
return inputs
- def forward(self, inputs, training_object='generator'):
- assert training_object in ['generator', 'discriminator'], \
- 'training_object should be "generator" or "discriminator"'
+ def forward(self, inputs, training_object="generator"):
+ assert training_object in [
+ "generator",
+ "discriminator",
+ ], 'training_object should be "generator" or "discriminator"'
- X = inputs['X']
- missing_mask = inputs['missing_mask']
+ X = inputs["X"]
+ missing_mask = inputs["missing_mask"]
batch_size, n_steps, n_features = X.shape
losses = {}
inputs = self.cluster(inputs, training_object)
- if training_object == 'discriminator':
- l_D = F.binary_cross_entropy_with_logits(inputs['discrimination'], missing_mask)
- losses['l_disc'] = l_D
+ if training_object == "discriminator":
+ l_D = F.binary_cross_entropy_with_logits(
+ inputs["discrimination"], missing_mask
+ )
+ losses["l_disc"] = l_D
else:
- inputs['discrimination'] = inputs['discrimination'].detach()
- l_G = F.binary_cross_entropy_with_logits(inputs['discrimination'], 1 - missing_mask,
- weight=1 - missing_mask)
- l_pre = cal_mse(inputs['imputation'], X, missing_mask)
- l_rec = cal_mse(inputs['reconstruction'], X, missing_mask)
- HTH = torch.matmul(inputs['fcn_latent'], inputs['fcn_latent'].permute(1, 0))
+ inputs["discrimination"] = inputs["discrimination"].detach()
+ l_G = F.binary_cross_entropy_with_logits(
+ inputs["discrimination"], 1 - missing_mask, weight=1 - missing_mask
+ )
+ l_pre = cal_mse(inputs["imputation"], X, missing_mask)
+ l_rec = cal_mse(inputs["reconstruction"], X, missing_mask)
+ HTH = torch.matmul(inputs["fcn_latent"], inputs["fcn_latent"].permute(1, 0))
term_F = torch.nn.init.orthogonal_(
- torch.randn(batch_size, self.n_clusters, device=self.device),
- gain=1
+ torch.randn(batch_size, self.n_clusters, device=self.device), gain=1
)
FTHTHF = torch.matmul(torch.matmul(term_F.permute(1, 0), HTH), term_F)
l_kmeans = torch.trace(HTH) - torch.trace(FTHTHF) # k-means loss
loss_gene = l_G + l_pre + l_rec + l_kmeans * self.lambda_kmeans
- losses['l_gene'] = loss_gene
+ losses["l_gene"] = loss_gene
return losses
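For reference, a standalone sketch of the k-means regularizer computed in _CRLI.forward above: with H standing in for inputs["fcn_latent"] ([batch, latent_dim]) and F a random orthogonal [batch, n_clusters] matrix, the loss is trace(H H^T) - trace(F^T H H^T F). The shapes below are illustrative only:

import torch

H = torch.randn(8, 16)  # stands in for inputs["fcn_latent"]: [batch, latent_dim]
HTH = H @ H.T  # [batch, batch] Gram matrix, as in _CRLI.forward
F_mat = torch.nn.init.orthogonal_(torch.randn(8, 3), gain=1)  # [batch, n_clusters]
l_kmeans = torch.trace(HTH) - torch.trace(F_mat.T @ HTH @ F_mat)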
class CRLI(BaseNNClusterer):
- def __init__(self,
- n_steps,
- n_features,
- n_clusters,
- n_generator_layers,
- rnn_hidden_size,
- decoder_fcn_output_dims=None,
- lambda_kmeans=1,
- rnn_cell_type='GRU',
- G_steps=1,
- D_steps=1,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(n_clusters, learning_rate, epochs, patience, batch_size, weight_decay, device)
- assert G_steps > 0 and D_steps > 0, 'G_steps and D_steps should both >0'
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ n_clusters,
+ n_generator_layers,
+ rnn_hidden_size,
+ decoder_fcn_output_dims=None,
+ lambda_kmeans=1,
+ rnn_cell_type="GRU",
+ G_steps=1,
+ D_steps=1,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ n_clusters,
+ learning_rate,
+ epochs,
+ patience,
+ batch_size,
+ weight_decay,
+ device,
+ )
+ assert G_steps > 0 and D_steps > 0, "G_steps and D_steps should both be greater than 0"
self.n_steps = n_steps
self.n_features = n_features
self.G_steps = G_steps
self.D_steps = D_steps
- self.model = _CRLI(n_steps, n_features, n_clusters, n_generator_layers, rnn_hidden_size,
- decoder_fcn_output_dims, lambda_kmeans, rnn_cell_type, device)
+ self.model = _CRLI(
+ n_steps,
+ n_features,
+ n_clusters,
+ n_generator_layers,
+ rnn_hidden_size,
+ decoder_fcn_output_dims,
+ lambda_kmeans,
+ rnn_cell_type,
+ device,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
- self.logger = {
- 'training_loss_generator': [],
- 'training_loss_discriminator': []
- }
+ self.logger = {"training_loss_generator": [], "training_loss_discriminator": []}
def fit(self, train_X):
train_X = self.check_input(self.n_steps, self.n_features, train_X)
training_set = DatasetForGRUD(train_X)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
self._train_model(training_loader)
self.model.load_state_dict(self.best_model_dict)
self.model.eval() # set the model as eval status to freeze it.
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -317,24 +379,28 @@ def assemble_input_data(self, data):
indices, X, _, missing_mask, _, _ = data
inputs = {
- 'X': X,
- 'missing_mask': missing_mask,
+ "X": X,
+ "missing_mask": missing_mask,
}
return inputs
def _train_model(self, training_loader, val_loader=None):
self.G_optimizer = torch.optim.Adam(
[
- {'params': self.model.generator.parameters()},
- {'params': self.model.decoder.parameters()}
+ {"params": self.model.generator.parameters()},
+ {"params": self.model.decoder.parameters()},
],
- lr=self.lr, weight_decay=self.weight_decay
+ lr=self.lr,
+ weight_decay=self.weight_decay,
+ )
+ self.D_optimizer = torch.optim.Adam(
+ self.model.discriminator.parameters(),
+ lr=self.lr,
+ weight_decay=self.weight_decay,
)
- self.D_optimizer = torch.optim.Adam(self.model.discriminator.parameters(), lr=self.lr,
- weight_decay=self.weight_decay)
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
try:
@@ -347,25 +413,35 @@ def _train_model(self, training_loader, val_loader=None):
for _ in range(self.D_steps):
self.D_optimizer.zero_grad()
- results = self.model.forward(inputs, training_object='discriminator')
- results['l_disc'].backward(retain_graph=True)
+ results = self.model.forward(
+ inputs, training_object="discriminator"
+ )
+ results["l_disc"].backward(retain_graph=True)
self.D_optimizer.step()
- epoch_train_loss_D_collector.append(results['l_disc'].item())
+ epoch_train_loss_D_collector.append(results["l_disc"].item())
for _ in range(self.G_steps):
self.G_optimizer.zero_grad()
- results = self.model.forward(inputs, training_object='generator')
- results['l_gene'].backward()
+ results = self.model.forward(
+ inputs, training_object="generator"
+ )
+ results["l_gene"].backward()
self.G_optimizer.step()
- epoch_train_loss_G_collector.append(results['l_gene'].item())
-
- mean_train_G_loss = np.mean(epoch_train_loss_G_collector) # mean training loss of the current epoch
- mean_train_D_loss = np.mean(epoch_train_loss_D_collector) # mean training loss of the current epoch
- self.logger['training_loss_generator'].append(mean_train_G_loss)
- self.logger['training_loss_discriminator'].append(mean_train_D_loss)
- print(f'epoch {epoch}: '
- f'training loss_generator {mean_train_G_loss:.4f}, '
- f'train loss_discriminator {mean_train_D_loss:.4f}')
+ epoch_train_loss_G_collector.append(results["l_gene"].item())
+
+ mean_train_G_loss = np.mean(
+ epoch_train_loss_G_collector
+ ) # mean training loss of the current epoch
+ mean_train_D_loss = np.mean(
+ epoch_train_loss_D_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss_generator"].append(mean_train_G_loss)
+ self.logger["training_loss_discriminator"].append(mean_train_D_loss)
+ print(
+ f"epoch {epoch}: "
+ f"training loss_generator {mean_train_G_loss:.4f}, "
+ f"train loss_discriminator {mean_train_D_loss:.4f}"
+ )
mean_loss = mean_train_G_loss
if mean_loss < self.best_loss:
@@ -375,21 +451,27 @@ def _train_model(self, training_loader, val_loader=None):
else:
self.patience -= 1
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+ "Training got interrupted. Model was not get trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+ warnings.warn(  # a bare RuntimeWarning(...) is a silent no-op; assumes 'import warnings' at the module top
+ "Training got interrupted. "
+ "Model will load the best parameters so far for testing. "
+ "If you don't want it, please try fit() again."
+ )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+ if np.equal(self.best_loss, float("inf")):
+ raise ValueError("Something is wrong. best_loss is Nan after training.")
- print('Finished training.')
+ print("Finished training.")
def cluster(self, X):
X = self.check_input(self.n_steps, self.n_features, X)
@@ -402,7 +484,7 @@ def cluster(self, X):
for idx, data in enumerate(test_loader):
inputs = self.assemble_input_data(data)
inputs = self.model.cluster(inputs)
- latent_collector.append(inputs['fcn_latent'])
+ latent_collector.append(inputs["fcn_latent"])
latent_collector = torch.cat(latent_collector).cpu().detach().numpy()
clustering = self.model.kmeans.fit_predict(latent_collector)
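For context, a minimal usage sketch of the CRLI clusterer refactored above, assuming CRLI is exported from pypots.clustering; input shapes follow check_input ([n_samples, n_steps, n_features]) with NaN marking missing values, and every hyperparameter here is illustrative only:

import numpy as np
from pypots.clustering import CRLI
from pypots.data import generate_random_walk

X = generate_random_walk(n_samples=200, n_steps=24, n_features=10)
X[X < -1.5] = np.nan  # simulate partial observation
model = CRLI(n_steps=24, n_features=10, n_clusters=3, n_generator_layers=2,
             rnn_hidden_size=64, epochs=20)
model.fit(X)
labels = model.cluster(X)  # k-means on the collected fcn_latent representations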
diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py
index 1114c964..b0a1dff8 100644
--- a/pypots/clustering/vader.py
+++ b/pypots/clustering/vader.py
@@ -49,13 +49,15 @@ def __init__(self, input_size, hidden_size, bias=True):
if bias:
self.bias_ch = Parameter(torch.Tensor(3 * hidden_size))
else:
- self.register_parameter('bias_ch', None)
- self.register_buffer('wc_blank', torch.zeros(hidden_size))
+ self.register_parameter("bias_ch", None)
+ self.register_buffer("wc_blank", torch.zeros(hidden_size))
self.reset_parameters()
def forward(self, input, hx=None):
if hx is None:
- zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+ zeros = torch.zeros(
+ input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+ )
hx = (zeros, zeros)
h, c = hx
@@ -64,21 +66,23 @@ def forward(self, input, hx=None):
wh = F.linear(h, self.weight_hh, self.bias_hh)
wc = F.linear(c, self.weight_ch, self.bias_ch)
- wxhc = wx + \
- wh + \
- torch.cat(
- (
- wc[:, :2 * self.hidden_size],
- Variable(self.wc_blank).expand_as(h),
- wc[:, 2 * self.hidden_size:]
- ),
- dim=1
- )
-
- i = torch.sigmoid(wxhc[:, :self.hidden_size])
- f = torch.sigmoid(wxhc[:, self.hidden_size:2 * self.hidden_size])
- g = torch.tanh(wxhc[:, 2 * self.hidden_size:3 * self.hidden_size])
- o = torch.sigmoid(wxhc[:, 3 * self.hidden_size:])
+ wxhc = (
+ wx
+ + wh
+ + torch.cat(
+ (
+ wc[:, : 2 * self.hidden_size],
+ Variable(self.wc_blank).expand_as(h),
+ wc[:, 2 * self.hidden_size :],
+ ),
+ dim=1,
+ )
+ )
+
+ i = torch.sigmoid(wxhc[:, : self.hidden_size])
+ f = torch.sigmoid(wxhc[:, self.hidden_size : 2 * self.hidden_size])
+ g = torch.tanh(wxhc[:, 2 * self.hidden_size : 3 * self.hidden_size])
+ o = torch.sigmoid(wxhc[:, 3 * self.hidden_size :])
c = f * c + i * g
h = o * torch.tanh(c)
@@ -128,7 +132,16 @@ class _VaDER(nn.Module):
"""
- def __init__(self, n_steps, d_input, n_clusters, d_rnn_hidden, d_mu_stddev, eps=1e-9, alpha=1.0):
+ def __init__(
+ self,
+ n_steps,
+ d_input,
+ n_clusters,
+ d_rnn_hidden,
+ d_mu_stddev,
+ eps=1e-9,
+ alpha=1.0,
+ ):
super().__init__()
self.n_steps = n_steps
self.d_input = d_input
@@ -143,15 +156,15 @@ def __init__(self, n_steps, d_input, n_clusters, d_rnn_hidden, d_mu_stddev, eps=
self.encoder = PeepholeLSTMCell(d_input, d_rnn_hidden)
self.decoder = PeepholeLSTMCell(d_input, d_rnn_hidden)
self.ae_encode_layers = nn.Sequential(
- nn.Linear(d_rnn_hidden, d_rnn_hidden),
- nn.Softplus()
+ nn.Linear(d_rnn_hidden, d_rnn_hidden), nn.Softplus()
)
self.ae_decode_layers = nn.Sequential(
- nn.Linear(d_mu_stddev, d_rnn_hidden),
- nn.Softplus()
+ nn.Linear(d_mu_stddev, d_rnn_hidden), nn.Softplus()
)
self.mu_layer = nn.Linear(d_rnn_hidden, d_mu_stddev) # layer for mean
- self.stddev_layer = nn.Linear(d_rnn_hidden, d_mu_stddev) # layer for standard variance
+ self.stddev_layer = nn.Linear(
+ d_rnn_hidden, d_mu_stddev
+ ) # layer for standard deviation
self.rnn_transform_layer = nn.Linear(d_rnn_hidden, d_input)
self.gmm_layer = GMMLayer(d_mu_stddev, n_clusters)
@@ -165,8 +178,12 @@ def encode(self, X, missing_mask):
X_imputed = self.implicit_imputation_layer(X, missing_mask)
- hidden_state = torch.zeros((batch_size, self.d_rnn_hidden), dtype=X.dtype, device=X.device)
- cell_state = torch.zeros((batch_size, self.d_rnn_hidden), dtype=X.dtype, device=X.device)
+ hidden_state = torch.zeros(
+ (batch_size, self.d_rnn_hidden), dtype=X.dtype, device=X.device
+ )
+ cell_state = torch.zeros(
+ (batch_size, self.d_rnn_hidden), dtype=X.dtype, device=X.device
+ )
# cell_state_collector = torch.empty((batch_size, self.n_steps, self.d_rnn_hidden),
# dtype=X.dtype, device=X.device)
for i in range(self.n_steps):
@@ -185,10 +202,13 @@ def decode(self, z):
hidden_state = self.ae_decode_layers(hidden_state)
cell_state = torch.zeros(hidden_state.size(), dtype=z.dtype, device=z.device)
- inputs = torch.zeros((z.size(0), self.n_steps, self.d_input), dtype=z.dtype, device=z.device)
+ inputs = torch.zeros(
+ (z.size(0), self.n_steps, self.d_input), dtype=z.dtype, device=z.device
+ )
- hidden_state_collector = torch.empty((z.size(0), self.n_steps, self.d_rnn_hidden),
- dtype=z.dtype, device=z.device)
+ hidden_state_collector = torch.empty(
+ (z.size(0), self.n_steps, self.d_rnn_hidden), dtype=z.dtype, device=z.device
+ )
for i in range(self.n_steps):
x = inputs[:, i, :]
hidden_state, cell_state = self.decoder(x, (hidden_state, cell_state))
@@ -204,34 +224,59 @@ def get_results(self, X, missing_mask):
return X_reconstructed, mu_c, var_c, phi_c, z, mu_tilde, stddev_tilde
def cluster(self, inputs):
- X, missing_mask = inputs['X'], inputs['missing_mask']
- X_reconstructed, mu_c, var_c, phi_c, z, mu_tilde, stddev_tilde = self.get_results(X, missing_mask)
+ X, missing_mask = inputs["X"], inputs["missing_mask"]
+ (
+ X_reconstructed,
+ mu_c,
+ var_c,
+ phi_c,
+ z,
+ mu_tilde,
+ stddev_tilde,
+ ) = self.get_results(X, missing_mask)
def func_to_apply(mu_t_, mu_, stddev_, phi_):
# the covariance matrix is diagonal, so we can just take the product
- return np.log(self.eps + phi_) + \
- np.log(self.eps + multivariate_normal.pdf(mu_t_, mean=mu_, cov=np.diag(stddev_)))
+ return np.log(self.eps + phi_) + np.log(
+ self.eps
+ + multivariate_normal.pdf(mu_t_, mean=mu_, cov=np.diag(stddev_))
+ )
mu_tilde = mu_tilde.detach().cpu().numpy()
mu = mu_c.detach().cpu().numpy()
var = var_c.detach().cpu().numpy()
phi = phi_c.detach().cpu().numpy()
- p = np.array([func_to_apply(mu_tilde, mu[i], var[i], phi[i]) for i in np.arange(mu.shape[0])])
+ p = np.array(
+ [
+ func_to_apply(mu_tilde, mu[i], var[i], phi[i])
+ for i in np.arange(mu.shape[0])
+ ]
+ )
clustering_results = np.argmax(p, axis=0)
return clustering_results
def forward(self, inputs, pretrain=False):
- X, missing_mask = inputs['X'], inputs['missing_mask']
- X_reconstructed, mu_c, var_c, phi_c, z, mu_tilde, stddev_tilde = self.get_results(X, missing_mask)
+ X, missing_mask = inputs["X"], inputs["missing_mask"]
+ (
+ X_reconstructed,
+ mu_c,
+ var_c,
+ phi_c,
+ z,
+ mu_tilde,
+ stddev_tilde,
+ ) = self.get_results(X, missing_mask)
# calculate the reconstruction loss
unscaled_reconstruction_loss = cal_mse(X_reconstructed, X, missing_mask)
- reconstruction_loss = unscaled_reconstruction_loss * self.n_steps * self.d_input / missing_mask.sum()
+ reconstruction_loss = (
+ unscaled_reconstruction_loss
+ * self.n_steps
+ * self.d_input
+ / missing_mask.sum()
+ )
if pretrain:
- results = {
- 'loss': reconstruction_loss,
- 'z': z
- }
+ results = {"loss": reconstruction_loss, "z": z}
return results
# calculate the latent loss
@@ -244,7 +289,7 @@ def forward(self, inputs, pretrain=False):
ii, jj = torch.meshgrid(
torch.arange(self.n_clusters, dtype=torch.int64, device=X.device),
- torch.arange(batch_size, dtype=torch.int64, device=X.device)
+ torch.arange(batch_size, dtype=torch.int64, device=X.device),
)
ii = ii.flatten()
jj = jj.flatten()
@@ -253,7 +298,7 @@ def forward(self, inputs, pretrain=False):
mc_b = mu_c.index_select(dim=0, index=ii)
sc_b = var_c.index_select(dim=0, index=ii)
z_b = z.index_select(dim=0, index=jj)
- log_pdf_z = - 0.5 * (lsc_b + log_2pi + torch.square(z_b - mc_b) / sc_b)
+ log_pdf_z = -0.5 * (lsc_b + log_2pi + torch.square(z_b - mc_b) / sc_b)
log_pdf_z = log_pdf_z.reshape([batch_size, self.n_clusters, self.d_mu_stddev])
log_p = log_phi_c + log_pdf_z.sum(dim=2)
@@ -264,28 +309,28 @@ def forward(self, inputs, pretrain=False):
term1 = torch.log(var_c + self.eps)
st_b = var_tilde.index_select(dim=0, index=jj)
sc_b = var_c.index_select(dim=0, index=ii)
- term2 = torch.reshape(st_b / (sc_b + self.eps), [batch_size, self.n_clusters, self.d_mu_stddev])
+ term2 = torch.reshape(
+ st_b / (sc_b + self.eps), [batch_size, self.n_clusters, self.d_mu_stddev]
+ )
mt_b = mu_tilde.index_select(dim=0, index=jj)
mc_b = mu_c.index_select(dim=0, index=ii)
term3 = torch.reshape(
torch.square(mt_b - mc_b) / (sc_b + self.eps),
- [batch_size, self.n_clusters, self.d_mu_stddev]
+ [batch_size, self.n_clusters, self.d_mu_stddev],
)
- latent_loss1 = 0.5 * torch.sum(gamma_c * torch.sum(term1 + term2 + term3, dim=2), dim=1)
- latent_loss2 = - torch.sum(gamma_c * (log_phi_c - log_gamma_c), dim=1)
- latent_loss3 = - 0.5 * torch.sum(1 + stddev_tilde, dim=1)
+ latent_loss1 = 0.5 * torch.sum(
+ gamma_c * torch.sum(term1 + term2 + term3, dim=2), dim=1
+ )
+ latent_loss2 = -torch.sum(gamma_c * (log_phi_c - log_gamma_c), dim=1)
+ latent_loss3 = -0.5 * torch.sum(1 + stddev_tilde, dim=1)
latent_loss1 = latent_loss1.mean()
latent_loss2 = latent_loss2.mean()
latent_loss3 = latent_loss3.mean()
latent_loss = latent_loss1 + latent_loss2 + latent_loss3
- results = {
- 'loss': reconstruction_loss + self.alpha * latent_loss,
- 'z': z
-
- }
+ results = {"loss": reconstruction_loss + self.alpha * latent_loss, "z": z}
return results
@@ -297,38 +342,52 @@ def inverse_softplus(x):
class VaDER(BaseNNClusterer):
- def __init__(self,
- n_steps,
- n_features,
- n_clusters,
- rnn_hidden_size,
- d_mu_stddev,
- learning_rate=1e-3,
- pretrain_epochs=10,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(n_clusters, learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ n_clusters,
+ rnn_hidden_size,
+ d_mu_stddev,
+ learning_rate=1e-3,
+ pretrain_epochs=10,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ n_clusters,
+ learning_rate,
+ epochs,
+ patience,
+ batch_size,
+ weight_decay,
+ device,
+ )
self.n_steps = n_steps
self.n_features = n_features
self.pretrain_epochs = pretrain_epochs
- self.model = _VaDER(n_steps, n_features, n_clusters, rnn_hidden_size, d_mu_stddev)
+ self.model = _VaDER(
+ n_steps, n_features, n_clusters, rnn_hidden_size, d_mu_stddev
+ )
self.model = self.model.to(self.device)
self._print_model_size()
def fit(self, train_X):
train_X = self.check_input(self.n_steps, self.n_features, train_X)
training_set = DatasetForGRUD(train_X)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
self._train_model(training_loader)
self.model.load_state_dict(self.best_model_dict)
self.model.eval() # set the model as eval status to freeze it.
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -344,18 +403,18 @@ def assemble_input_data(self, data):
indices, X, _, missing_mask, _, _ = data
inputs = {
- 'X': X,
- 'missing_mask': missing_mask,
+ "X": X,
+ "missing_mask": missing_mask,
}
return inputs
def _train_model(self, training_loader, val_loader=None):
- self.optimizer = torch.optim.Adam(self.model.parameters(),
- lr=self.lr,
- weight_decay=self.weight_decay)
+ self.optimizer = torch.optim.Adam(
+ self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
+ )
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
# pretrain to initialize parameters of GMM layer
@@ -365,7 +424,7 @@ def _train_model(self, training_loader, val_loader=None):
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs, pretrain=True)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
with torch.no_grad():
sample_collector = []
@@ -373,9 +432,11 @@ def _train_model(self, training_loader, val_loader=None):
for idx, data in enumerate(training_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs, pretrain=True)
- sample_collector.append(results['z'])
+ sample_collector.append(results["z"])
samples = torch.cat(sample_collector).cpu().detach().numpy()
- gmm = GaussianMixture(n_components=self.n_clusters, covariance_type="diag", reg_covar=1e-04)
+ gmm = GaussianMixture(
+ n_components=self.n_clusters, covariance_type="diag", reg_covar=1e-04
+ )
gmm.fit(samples)
# get GMM parameters
phi = np.log(gmm.weights_ + 1e-9) # inverse softmax
@@ -395,12 +456,14 @@ def _train_model(self, training_loader, val_loader=None):
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
- epoch_train_loss_collector.append(results['loss'].item())
+ epoch_train_loss_collector.append(results["loss"].item())
- mean_train_loss = np.mean(epoch_train_loss_collector) # mean training loss of the current epoch
- self.logger['training_loss'].append(mean_train_loss)
+ mean_train_loss = np.mean(
+ epoch_train_loss_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss"].append(mean_train_loss)
if val_loader is not None:
self.model.eval()
@@ -409,14 +472,16 @@ def _train_model(self, training_loader, val_loader=None):
for idx, data in enumerate(val_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs)
- epoch_val_loss_collector.append(results['loss'].item())
+ epoch_val_loss_collector.append(results["loss"].item())
mean_val_loss = np.mean(epoch_val_loss_collector)
- self.logger['validating_loss'].append(mean_val_loss)
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}')
+ self.logger["validating_loss"].append(mean_val_loss)
+ print(
+ f"epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}"
+ )
mean_loss = mean_val_loss
else:
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}')
+ print(f"epoch {epoch}: training loss {mean_train_loss:.4f}")
mean_loss = mean_train_loss
if mean_loss < self.best_loss:
@@ -426,21 +491,27 @@ def _train_model(self, training_loader, val_loader=None):
else:
self.patience -= 1
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+ "Training got interrupted. Model was not get trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+ warnings.warn(  # a bare RuntimeWarning(...) is a silent no-op; assumes 'import warnings' at the module top
+ "Training got interrupted. "
+ "Model will load the best parameters so far for testing. "
+ "If you don't want it, please try fit() again."
+ )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+ if np.equal(self.best_loss, float("inf")):
+ raise ValueError("Something is wrong. best_loss is Nan after training.")
- print('Finished training.')
+ print("Finished training.")
def cluster(self, X):
X = self.check_input(self.n_steps, self.n_features, X)
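For reference, the cluster-assignment rule implemented in _VaDER.cluster() above, reduced to a standalone NumPy/SciPy sketch: each latent mean mu_tilde is assigned to the mixture component k maximizing log(phi_k) + log N(mu_tilde | mu_k, diag(var_k)). All arrays below are dummy values:

import numpy as np
from scipy.stats import multivariate_normal

eps = 1e-9
mu_tilde = np.random.randn(5, 2)            # [n_samples, d_mu_stddev]
mu = np.random.randn(3, 2)                  # [n_clusters, d_mu_stddev]
var = np.abs(np.random.randn(3, 2)) + 0.1   # diagonal covariances per component
phi = np.array([0.5, 0.3, 0.2])             # mixture weights

log_p = np.array([
    np.log(eps + phi[k])
    + np.log(eps + multivariate_normal.pdf(mu_tilde, mean=mu[k], cov=np.diag(var[k])))
    for k in range(3)
])                                          # [n_clusters, n_samples]
labels = np.argmax(log_p, axis=0)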
diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
index a847af8f..3e3cefc2 100644
--- a/pypots/data/__init__.py
+++ b/pypots/data/__init__.py
@@ -9,7 +9,10 @@
from pypots.data.dataset_for_brits import DatasetForBRITS
from pypots.data.dataset_for_grud import DatasetForGRUD
from pypots.data.dataset_for_mit import DatasetForMIT
-from pypots.data.generating import generate_random_walk, generate_random_walk_for_classification
+from pypots.data.generating import (
+ generate_random_walk,
+ generate_random_walk_for_classification,
+)
from pypots.data.integration import (
masked_fill,
mcar,
diff --git a/pypots/data/base.py b/pypots/data/base.py
index cc7a5988..827b5d93 100644
--- a/pypots/data/base.py
+++ b/pypots/data/base.py
@@ -10,7 +10,7 @@
class BaseDataset(Dataset):
- """ Base dataset class in PyPOTS.
+ """Base dataset class in PyPOTS.
Parameters
----------
@@ -34,7 +34,7 @@ def __len__(self):
return len(self.X)
def __getitem__(self, idx):
- """ Fetch data according to index.
+ """Fetch data according to index.
Parameters
----------
@@ -52,8 +52,6 @@ def __getitem__(self, idx):
]
if self.y is not None:
- sample.append(
- self.y[idx].to(torch.long)
- )
+ sample.append(self.y[idx].to(torch.long))
return sample
diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py
index 3b505f6c..0f3ee6a7 100644
--- a/pypots/data/dataset_for_brits.py
+++ b/pypots/data/dataset_for_brits.py
@@ -11,7 +11,7 @@
def parse_delta(missing_mask):
- """ Generate time-gap (delta) matrix from missing masks.
+ """Generate time-gap (delta) matrix from missing masks.
Parameters
----------
@@ -34,7 +34,10 @@ def parse_delta(missing_mask):
if step == 0:
delta.append(torch.zeros(1, n_features, device=device))
else:
- delta.append(torch.ones(1, n_features, device=device) + (1 - m_mask[step]) * delta[-1])
+ delta.append(
+ torch.ones(1, n_features, device=device)
+ + (1 - m_mask[step]) * delta[-1]
+ )
delta = torch.concat(delta, dim=0)
delta_collector.append(delta.unsqueeze(0))
delta = torch.concat(delta_collector, dim=0)
@@ -42,7 +45,7 @@ def parse_delta(missing_mask):
class DatasetForBRITS(BaseDataset):
- """ Dataset class for BRITS.
+ """Dataset class for BRITS.
Parameters
----------
@@ -66,20 +69,20 @@ def __init__(self, X, y=None):
backward_delta = parse_delta(backward_missing_mask)
self.data = {
- 'forward': {
- 'X': forward_X,
- 'missing_mask': forward_missing_mask,
- 'delta': forward_delta
+ "forward": {
+ "X": forward_X,
+ "missing_mask": forward_missing_mask,
+ "delta": forward_delta,
},
- 'backward': {
- 'X': backward_X,
- 'missing_mask': backward_missing_mask,
- 'delta': backward_delta
+ "backward": {
+ "X": backward_X,
+ "missing_mask": backward_missing_mask,
+ "delta": backward_delta,
},
}
def __getitem__(self, idx):
- """ Fetch data according to index.
+ """Fetch data according to index.
Parameters
----------
@@ -109,18 +112,16 @@ def __getitem__(self, idx):
sample = [
torch.tensor(idx),
# for forward
- self.data['forward']['X'][idx].to(torch.float32),
- self.data['forward']['missing_mask'][idx].to(torch.float32),
- self.data['forward']['delta'][idx].to(torch.float32),
+ self.data["forward"]["X"][idx].to(torch.float32),
+ self.data["forward"]["missing_mask"][idx].to(torch.float32),
+ self.data["forward"]["delta"][idx].to(torch.float32),
# for backward
- self.data['backward']['X'][idx].to(torch.float32),
- self.data['backward']['missing_mask'][idx].to(torch.float32),
- self.data['backward']['delta'][idx].to(torch.float32),
+ self.data["backward"]["X"][idx].to(torch.float32),
+ self.data["backward"]["missing_mask"][idx].to(torch.float32),
+ self.data["backward"]["delta"][idx].to(torch.float32),
]
if self.y is not None:
- sample.append(
- self.y[idx].to(torch.long)
- )
+ sample.append(self.y[idx].to(torch.long))
return sample
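A tiny worked example of the parse_delta() recurrence above: for a single feature with missing mask m = [1, 0, 0, 1], the time gap grows while values are missing and drops back to 1 once a value is observed again:

import torch

m_mask = torch.tensor([[[1.0], [0.0], [0.0], [1.0]]])  # [1 sample, 4 steps, 1 feature]
delta = [torch.zeros(1, 1)]
for step in range(1, 4):
    delta.append(torch.ones(1, 1) + (1 - m_mask[0, step]) * delta[-1])
print(torch.cat(delta).squeeze())  # tensor([0., 1., 2., 1.])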
diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py
index 0e504906..f3dd1d80 100644
--- a/pypots/data/dataset_for_grud.py
+++ b/pypots/data/dataset_for_grud.py
@@ -14,7 +14,7 @@
class DatasetForGRUD(BaseDataset):
- """ Dataset class for model GRUD.
+ """Dataset class for model GRUD.
Parameters
----------
@@ -33,11 +33,12 @@ def __init__(self, X, y=None):
self.X = torch.nan_to_num(X)
self.deltas = parse_delta(self.missing_mask)
self.X_filledLOCF = self.locf.locf_torch(X)
- self.empirical_mean = \
- torch.sum(self.missing_mask * self.X, dim=[0, 1]) / torch.sum(self.missing_mask, dim=[0, 1])
+ self.empirical_mean = torch.sum(
+ self.missing_mask * self.X, dim=[0, 1]
+ ) / torch.sum(self.missing_mask, dim=[0, 1])
def __getitem__(self, idx):
- """ Fetch data according to index.
+ """Fetch data according to index.
Parameters
----------
@@ -77,8 +78,6 @@ def __getitem__(self, idx):
]
if self.y is not None:
- sample.append(
- self.y[idx].to(torch.long)
- )
+ sample.append(self.y[idx].to(torch.long))
return sample
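An isolated sketch of the masked empirical mean computed in DatasetForGRUD.__init__ above: observed values are averaged per feature, with the missing mask excluding NaN entries from both numerator and denominator:

import torch

X = torch.tensor([[[1.0, 4.0], [float("nan"), 6.0]]])  # [1 sample, 2 steps, 2 features]
missing_mask = (~torch.isnan(X)).to(torch.float32)
X = torch.nan_to_num(X)
empirical_mean = torch.sum(missing_mask * X, dim=[0, 1]) / torch.sum(missing_mask, dim=[0, 1])
print(empirical_mean)  # tensor([1., 5.])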
diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py
index 07676ca1..b24e3f75 100644
--- a/pypots/data/dataset_for_mit.py
+++ b/pypots/data/dataset_for_mit.py
@@ -12,7 +12,7 @@
class DatasetForMIT(BaseDataset):
- """ Dataset for models that need MIT (masked imputation task) in their training, such as SAITS.
+ """Dataset for models that need MIT (masked imputation task) in their training, such as SAITS.
For more information about MIT, please refer to :cite:`du2022SAITS`.
@@ -41,7 +41,7 @@ def __init__(self, X, y=None, rate=0.2):
self.rate = rate
def __getitem__(self, idx):
- """ Fetch data according to index.
+ """Fetch data according to index.
Parameters
----------
@@ -80,8 +80,6 @@ def __getitem__(self, idx):
]
if self.y is not None:
- sample.append(
- self.y[idx].to(torch.long)
- )
+ sample.append(self.y[idx].to(torch.long))
return sample
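For context, the masked imputation task (MIT) hold-out idea behind DatasetForMIT above, written out manually; in PyPOTS this is what pypots.data's mcar/masked_fill helpers provide, so the sketch below is illustrative only:

import numpy as np

rng = np.random.default_rng(0)
X_intact = rng.standard_normal((4, 6))         # a fully observed toy slice
artificial = rng.random(X_intact.shape) < 0.2  # artificially mask ~20% of observed values
X = X_intact.copy()
X[artificial] = np.nan                         # the model sees X ...
indicating_mask = artificial.astype(float)     # ... and is scored against X_intact here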
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
index f67e1129..664bec00 100644
--- a/pypots/data/generating.py
+++ b/pypots/data/generating.py
@@ -11,8 +11,10 @@
from sklearn.utils import check_random_state
-def generate_random_walk(n_samples=1000, n_steps=24, n_features=10, mu=0., std=1., random_state=None):
- """ Generate random walk time-series data.
+def generate_random_walk(
+ n_samples=1000, n_steps=24, n_features=10, mu=0.0, std=1.0, random_state=None
+):
+ """Generate random walk time-series data.
Parameters
----------
@@ -44,9 +46,15 @@ def generate_random_walk(n_samples=1000, n_steps=24, n_features=10, mu=0., std=1
return ts_samples
-def generate_random_walk_for_classification(n_classes=2, n_samples_each_class=500, n_steps=24, n_features=10,
- shuffle=True, random_state=None):
- """ Generate random walk time-series data for the classification task.
+def generate_random_walk_for_classification(
+ n_classes=2,
+ n_samples_each_class=500,
+ n_steps=24,
+ n_features=10,
+ shuffle=True,
+ random_state=None,
+):
+ """Generate random walk time-series data for the classification task.
Parameters
----------
@@ -82,7 +90,9 @@ def generate_random_walk_for_classification(n_classes=2, n_samples_each_class=50
std = 1
for c_ in range(n_classes):
- ts_samples = generate_random_walk(n_samples_each_class, n_steps, n_features, mu, std, random_state)
+ ts_samples = generate_random_walk(
+ n_samples_each_class, n_steps, n_features, mu, std, random_state
+ )
label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
ts_collector.extend(ts_samples)
label_collector.extend(label_samples)
@@ -101,10 +111,18 @@ def generate_random_walk_for_classification(n_classes=2, n_samples_each_class=50
return X, y
-def generate_random_walk_for_anomaly_detection(n_samples=1000, n_steps=24, n_features=10, mu=0., std=1.,
- anomaly_proportion=0.1, anomaly_fraction=0.02, anomaly_scale_factor=2.0,
- random_state=None):
- """ Generate random walk time-series data for the anomaly-detection task.
+def generate_random_walk_for_anomaly_detection(
+ n_samples=1000,
+ n_steps=24,
+ n_features=10,
+ mu=0.0,
+ std=1.0,
+ anomaly_proportion=0.1,
+ anomaly_fraction=0.02,
+ anomaly_scale_factor=2.0,
+ random_state=None,
+):
+ """Generate random walk time-series data for the anomaly-detection task.
Parameters
----------
@@ -134,8 +152,12 @@ def generate_random_walk_for_anomaly_detection(n_samples=1000, n_steps=24, n_fea
y : array, shape of [n_samples]
Labels indicating whether time-series samples are anomalies.
"""
- assert 0 < anomaly_proportion < 1, f'anomaly_proportion should be >0 and <1, but got {anomaly_proportion}'
- assert 0 < anomaly_fraction < 1, f'anomaly_fraction should be >0 and <1, but got {anomaly_fraction}'
+ assert (
+ 0 < anomaly_proportion < 1
+ ), f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
+ assert (
+ 0 < anomaly_fraction < 1
+ ), f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
seed = check_random_state(random_state)
X = seed.randn(n_samples, n_steps, n_features) * std + mu
n_anomaly = math.floor(n_samples * anomaly_proportion)
@@ -148,10 +170,14 @@ def generate_random_walk_for_anomaly_detection(n_samples=1000, n_steps=24, n_fea
max_difference = min_val - max_val
n_points = n_steps * n_features
n_anomaly_points = int(n_points * anomaly_fraction)
- point_indices = np.random.choice(a=n_points, size=n_anomaly_points, replace=False)
+ point_indices = np.random.choice(
+ a=n_points, size=n_anomaly_points, replace=False
+ )
for p_i in point_indices:
- anomaly_sample[p_i] = mu + np.random.uniform(low=min_val - anomaly_scale_factor * max_difference,
- high=max_val + anomaly_scale_factor * max_difference)
+ anomaly_sample[p_i] = mu + np.random.uniform(
+ low=min_val - anomaly_scale_factor * max_difference,
+ high=max_val + anomaly_scale_factor * max_difference,
+ )
X[a_i] = anomaly_sample.reshape(n_steps, n_features)
# create labels
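The core recurrence behind generate_random_walk() above, written out for one feature: each step adds Gaussian noise drawn from N(mu, std^2) to the previous value, so the series is a cumulative sum of the increments:

import numpy as np

rng = np.random.default_rng(42)
n_steps, mu, std = 24, 0.0, 1.0
increments = rng.normal(mu, std, size=n_steps)
walk = np.cumsum(increments)  # walk[t] = walk[t - 1] + N(mu, std^2)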
diff --git a/pypots/data/load_specific_datasets.py b/pypots/data/load_specific_datasets.py
index 98adcb82..7c5352e3 100644
--- a/pypots/data/load_specific_datasets.py
+++ b/pypots/data/load_specific_datasets.py
@@ -9,7 +9,7 @@
import tsdb
SUPPORTED_DATASETS = [
- 'physionet_2012',
+ "physionet_2012",
]
@@ -38,33 +38,28 @@ def preprocess_physionet2012(data):
A dict containing processed data.
"""
- X = data['X'].drop(data['static_features'], axis=1)
+ X = data["X"].drop(data["static_features"], axis=1)
def apply_func(df_temp): # pad and truncate so every sample has exactly 48 steps
- missing = list(set(range(0, 48)).difference(set(df_temp['Time'])))
- missing_part = pd.DataFrame({'Time': missing})
+ missing = list(set(range(0, 48)).difference(set(df_temp["Time"])))
+ missing_part = pd.DataFrame({"Time": missing})
df_temp = df_temp.append(missing_part, ignore_index=False, sort=False) # pad
- df_temp = df_temp.set_index('Time').sort_index().reset_index()
+ df_temp = df_temp.set_index("Time").sort_index().reset_index()
df_temp = df_temp.iloc[:48] # truncate
return df_temp
- X = X.groupby('RecordID').apply(apply_func)
- X = X.drop('RecordID', axis=1) #
+ X = X.groupby("RecordID").apply(apply_func)
+ X = X.drop("RecordID", axis=1) #
X = X.reset_index()
- X = X.drop(['level_1', 'Time'], axis=1)
- return {
- 'X': X,
- 'y': data['y']
- }
+ X = X.drop(["level_1", "Time"], axis=1)
+ return {"X": X, "y": data["y"]}
-PREPROCESSING = {
- 'physionet_2012': preprocess_physionet2012
-}
+PREPROCESSING = {"physionet_2012": preprocess_physionet2012}
def load_specific_dataset(dataset_name, use_cache=True):
- """ Load specific datasets supported by PyPOTS.
+ """Load specific datasets supported by PyPOTS.
Different from tsdb.load_dataset(), which returns only the raw data,
load_specific_dataset here performs some preprocessing operations,
like truncating time series to generate samples of equal length.
@@ -85,12 +80,16 @@ def load_specific_dataset(dataset_name, use_cache=True):
e.g. standardizing and splitting.
"""
- print(f'Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Database)...')
- assert dataset_name in SUPPORTED_DATASETS, f'Dataset {dataset_name} is not supported. ' \
- f'If you believe this dataset is valuable to be supported by PyPOTS,' \
- f'please create an issue on GitHub ' \
- f'https://github.com/WenjieDu/PyPOTS/issues'
- print(f'Starting preprocessing {dataset_name}...')
+ print(
+ f"Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Database)..."
+ )
+ assert dataset_name in SUPPORTED_DATASETS, (
+ f"Dataset {dataset_name} is not supported. "
+ f"If you believe this dataset is valuable to be supported by PyPOTS,"
+ f"please create an issue on GitHub "
+ f"https://github.com/WenjieDu/PyPOTS/issues"
+ )
+ print(f"Starting preprocessing {dataset_name}...")
data = tsdb.load_dataset(dataset_name, use_cache)
data = PREPROCESSING[dataset_name](data)
return data
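A minimal usage sketch of load_specific_dataset() above; the dict layout of the return value follows preprocess_physionet2012, and everything else (standardizing, splitting) is left to the caller:

from pypots.data.load_specific_datasets import load_specific_dataset

data = load_specific_dataset("physionet_2012")  # fetched via TSDB on first use
X, y = data["X"], data["y"]  # X: 48-step frames per record, y: labels from the raw data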
diff --git a/pypots/forecasting/__init__.py b/pypots/forecasting/__init__.py
index b88497a2..c28fe0fd 100644
--- a/pypots/forecasting/__init__.py
+++ b/pypots/forecasting/__init__.py
@@ -7,7 +7,4 @@
from pypots.forecasting.bttf import BTTF
-__all__ = [
- 'BTTF'
-
-]
+__all__ = ["BTTF"]
diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py
index 2942f86c..ce930fed 100644
--- a/pypots/forecasting/base.py
+++ b/pypots/forecasting/base.py
@@ -15,15 +15,14 @@
class BaseForecaster(BaseModel):
- """ Abstract class for all forecasting models.
- """
+ """Abstract class for all forecasting models."""
def __init__(self, device):
super().__init__(device)
@abstractmethod
def fit(self, train_X):
- """ Train the cluster.
+ """Train the cluster.
Parameters
----------
@@ -39,7 +38,7 @@ def fit(self, train_X):
@abstractmethod
def forecast(self, X):
- """ Forecast the future the input with the trained model.
+ """Forecast the future the input with the trained model.
Parameters
----------
@@ -55,20 +54,24 @@ def forecast(self, X):
class BaseNNForecaster(BaseNNModel, BaseForecaster):
- def __init__(self, learning_rate, epochs, patience, batch_size, weight_decay, device):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self, learning_rate, epochs, patience, batch_size, weight_decay, device
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
@abstractmethod
def assemble_input_data(self, data):
pass
def _train_model(self, training_loader, val_loader=None):
- self.optimizer = torch.optim.Adam(self.model.parameters(),
- lr=self.lr,
- weight_decay=self.weight_decay)
+ self.optimizer = torch.optim.Adam(
+ self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
+ )
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
try:
@@ -79,12 +82,14 @@ def _train_model(self, training_loader, val_loader=None):
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
- epoch_train_loss_collector.append(results['loss'].item())
+ epoch_train_loss_collector.append(results["loss"].item())
- mean_train_loss = np.mean(epoch_train_loss_collector) # mean training loss of the current epoch
- self.logger['training_loss'].append(mean_train_loss)
+ mean_train_loss = np.mean(
+ epoch_train_loss_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss"].append(mean_train_loss)
if val_loader is not None:
self.model.eval()
@@ -93,14 +98,16 @@ def _train_model(self, training_loader, val_loader=None):
for idx, data in enumerate(val_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs)
- epoch_val_loss_collector.append(results['loss'].item())
+ epoch_val_loss_collector.append(results["loss"].item())
mean_val_loss = np.mean(epoch_val_loss_collector)
- self.logger['validating_loss'].append(mean_val_loss)
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}')
+ self.logger["validating_loss"].append(mean_val_loss)
+ print(
+ f"epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}"
+ )
mean_loss = mean_val_loss
else:
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}')
+ print(f"epoch {epoch}: training loss {mean_train_loss:.4f}")
mean_loss = mean_train_loss
if mean_loss < self.best_loss:
@@ -110,18 +117,24 @@ def _train_model(self, training_loader, val_loader=None):
else:
self.patience -= 1
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+ "Training got interrupted. Model was not get trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+ warnings.warn(  # a bare RuntimeWarning(...) is a silent no-op; assumes 'import warnings' at the module top
+ "Training got interrupted. "
+ "Model will load the best parameters so far for testing. "
+ "If you don't want it, please try fit() again."
+ )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+ if np.equal(self.best_loss, float("inf")):
+ raise ValueError("Something is wrong. best_loss is Nan after training.")
- print('Finished training.')
+ print("Finished training.")
diff --git a/pypots/forecasting/bttf.py b/pypots/forecasting/bttf.py
index 9967300e..e119783c 100644
--- a/pypots/forecasting/bttf.py
+++ b/pypots/forecasting/bttf.py
@@ -26,8 +26,16 @@
def mvnrnd_pre(mu, Lambda):
src = normrnd(size=(mu.shape[0],))
- return solve_ut(cholesky_upper(Lambda, overwrite_a=True, check_finite=False),
- src, lower=False, check_finite=False, overwrite_b=True) + mu
+ return (
+ solve_ut(
+ cholesky_upper(Lambda, overwrite_a=True, check_finite=False),
+ src,
+ lower=False,
+ check_finite=False,
+ overwrite_b=True,
+ )
+ + mu
+ )
def cov_mat(mat, mat_bar):
@@ -36,25 +44,31 @@ def cov_mat(mat, mat_bar):
def ten2mat(tensor, mode):
- return np.reshape(np.moveaxis(tensor, mode, 0), (tensor.shape[mode], -1), order='F')
+ return np.reshape(np.moveaxis(tensor, mode, 0), (tensor.shape[mode], -1), order="F")
def sample_factor_u(tau_sparse_tensor, tau_ind, U, V, X, beta0=1):
- """Sampling M-by-R factor matrix U and its hyper-parameters (mu_u, Lambda_u).
- """
+ """Sampling M-by-R factor matrix U and its hyper-parameters (mu_u, Lambda_u)."""
dim1, rank = U.shape
U_bar = np.mean(U, axis=0)
temp = dim1 / (dim1 + beta0)
var_mu_hyper = temp * U_bar
- var_U_hyper = inv(np.eye(rank) + cov_mat(U, U_bar) + temp * beta0 * np.outer(U_bar, U_bar))
+ var_U_hyper = inv(
+ np.eye(rank) + cov_mat(U, U_bar) + temp * beta0 * np.outer(U_bar, U_bar)
+ )
var_Lambda_hyper = wishart.rvs(df=dim1 + rank, scale=var_U_hyper)
var_mu_hyper = mvnrnd_pre(var_mu_hyper, (dim1 + beta0) * var_Lambda_hyper)
var1 = kr_prod(X, V).T
var2 = kr_prod(var1, var1)
- var3 = (var2 @ ten2mat(tau_ind, 0).T).reshape([rank, rank, dim1]) + var_Lambda_hyper[:, :, None]
- var4 = var1 @ ten2mat(tau_sparse_tensor, 0).T + (var_Lambda_hyper @ var_mu_hyper)[:, None]
+ var3 = (var2 @ ten2mat(tau_ind, 0).T).reshape(
+ [rank, rank, dim1]
+ ) + var_Lambda_hyper[:, :, None]
+ var4 = (
+ var1 @ ten2mat(tau_sparse_tensor, 0).T
+ + (var_Lambda_hyper @ var_mu_hyper)[:, None]
+ )
for i in range(dim1):
U[i, :] = mvnrnd_pre(solve(var3[:, :, i], var4[:, i]), var3[:, :, i])
@@ -62,21 +76,27 @@ def sample_factor_u(tau_sparse_tensor, tau_ind, U, V, X, beta0=1):
def sample_factor_v(tau_sparse_tensor, tau_ind, U, V, X, beta0=1):
- """Sampling N-by-R factor matrix V and its hyper-parameters (mu_v, Lambda_v).
- """
+ """Sampling N-by-R factor matrix V and its hyper-parameters (mu_v, Lambda_v)."""
dim2, rank = V.shape
V_bar = np.mean(V, axis=0)
temp = dim2 / (dim2 + beta0)
var_mu_hyper = temp * V_bar
- var_V_hyper = inv(np.eye(rank) + cov_mat(V, V_bar) + temp * beta0 * np.outer(V_bar, V_bar))
+ var_V_hyper = inv(
+ np.eye(rank) + cov_mat(V, V_bar) + temp * beta0 * np.outer(V_bar, V_bar)
+ )
var_Lambda_hyper = wishart.rvs(df=dim2 + rank, scale=var_V_hyper)
var_mu_hyper = mvnrnd_pre(var_mu_hyper, (dim2 + beta0) * var_Lambda_hyper)
var1 = kr_prod(X, U).T
var2 = kr_prod(var1, var1)
- var3 = (var2 @ ten2mat(tau_ind, 1).T).reshape([rank, rank, dim2]) + var_Lambda_hyper[:, :, None]
- var4 = var1 @ ten2mat(tau_sparse_tensor, 1).T + (var_Lambda_hyper @ var_mu_hyper)[:, None]
+ var3 = (var2 @ ten2mat(tau_ind, 1).T).reshape(
+ [rank, rank, dim2]
+ ) + var_Lambda_hyper[:, :, None]
+ var4 = (
+ var1 @ ten2mat(tau_sparse_tensor, 1).T
+ + (var_Lambda_hyper @ var_mu_hyper)[:, None]
+ )
for j in range(dim2):
V[j, :] = mvnrnd_pre(solve(var3[:, :, j], var4[:, j]), var3[:, :, j])
@@ -101,10 +121,12 @@ def sample_var_coefficient(X, time_lags):
d = time_lags.shape[0]
tmax = np.max(time_lags)
- Z_mat = X[tmax: dim, :]
+ Z_mat = X[tmax:dim, :]
Q_mat = np.zeros((dim - tmax, rank * d))
for k in range(d):
- Q_mat[:, k * rank: (k + 1) * rank] = X[tmax - time_lags[k]: dim - time_lags[k], :]
+ Q_mat[:, k * rank : (k + 1) * rank] = X[
+ tmax - time_lags[k] : dim - time_lags[k], :
+ ]
var_Psi0 = np.eye(rank * d) + Q_mat.T @ Q_mat
var_Psi = inv(var_Psi0)
var_M = var_Psi @ Q_mat.T @ Z_mat
@@ -123,14 +145,16 @@ def sample_factor_x(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, Lambda_x)
d = time_lags.shape[0]
A0 = np.dstack([A] * d)
for k in range(d):
- A0[k * rank: (k + 1) * rank, :, k] = 0
+ A0[k * rank : (k + 1) * rank, :, k] = 0
mat0 = Lambda_x @ A.T
- mat1 = np.einsum('kij, jt -> kit', A.reshape([d, rank, rank]), Lambda_x)
- mat2 = np.einsum('kit, kjt -> ij', mat1, A.reshape([d, rank, rank]))
+ mat1 = np.einsum("kij, jt -> kit", A.reshape([d, rank, rank]), Lambda_x)
+ mat2 = np.einsum("kit, kjt -> ij", mat1, A.reshape([d, rank, rank]))
var1 = kr_prod(V, U).T
var2 = kr_prod(var1, var1)
- var3 = (var2 @ ten2mat(tau_ind, 2).T).reshape([rank, rank, dim3]) + Lambda_x[:, :, None]
+ var3 = (var2 @ ten2mat(tau_ind, 2).T).reshape([rank, rank, dim3]) + Lambda_x[
+ :, :, None
+ ]
var4 = var1 @ ten2mat(tau_sparse_tensor, 2).T
for t in range(dim3):
Mt = np.zeros((rank, rank))
@@ -149,8 +173,10 @@ def sample_factor_x(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, Lambda_x)
for k in index:
temp[:, n] = X[t + time_lags[k] - time_lags, :].reshape(rank * d)
n += 1
- temp0 = X[t + time_lags[index], :].T - np.einsum('ijk, ik -> jk', A0[:, :, index], temp)
- Nt = np.einsum('kij, jk -> i', mat1[index, :, :], temp0)
+ temp0 = X[t + time_lags[index], :].T - np.einsum(
+ "ijk, ik -> jk", A0[:, :, index], temp
+ )
+ Nt = np.einsum("kij, jk -> i", mat1[index, :, :], temp0)
var3[:, :, t] = var3[:, :, t] + Mt
if t < tmax:
@@ -178,7 +204,16 @@ def ar4cast(A, X, Sigma, time_lags, multi_step):
return X_new
-def _BTTF(dense_tensor, sparse_tensor, init, rank, time_lags, burn_iter, gibbs_iter, multi_step=1):
+def _BTTF(
+ dense_tensor,
+ sparse_tensor,
+ init,
+ rank,
+ time_lags,
+ burn_iter,
+ gibbs_iter,
+ multi_step=1,
+):
"""Bayesian Temporal Tensor Factorization, BTTF."""
dim1, dim2, dim3 = sparse_tensor.shape
@@ -213,10 +248,14 @@ def _BTTF(dense_tensor, sparse_tensor, init, rank, time_lags, burn_iter, gibbs_i
U = sample_factor_u(tau_sparse_tensor, tau_ind, U, V, X)
V = sample_factor_v(tau_sparse_tensor, tau_ind, U, V, X)
A, Sigma = sample_var_coefficient(X, time_lags)
- X = sample_factor_x(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, inv(Sigma))
- tensor_hat = np.einsum('is, js, ts -> ijt', U, V, X)
- tau = np.random.gamma(1e-6 + 0.5 * np.sum(ind),
- 1 / (1e-6 + 0.5 * np.sum(((sparse_tensor - tensor_hat) ** 2) * ind)))
+ X = sample_factor_x(
+ tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, inv(Sigma)
+ )
+ tensor_hat = np.einsum("is, js, ts -> ijt", U, V, X)
+ tau = np.random.gamma(
+ 1e-6 + 0.5 * np.sum(ind),
+ 1 / (1e-6 + 0.5 * np.sum(((sparse_tensor - tensor_hat) ** 2) * ind)),
+ )
temp_hat += tensor_hat[pos_test]
if (it + 1) % show_iter == 0 and it < burn_iter:
# temp_hat = temp_hat / show_iter
@@ -233,7 +272,7 @@ def _BTTF(dense_tensor, sparse_tensor, init, rank, time_lags, burn_iter, gibbs_i
tensor_hat_plus += tensor_hat
X0 = ar4cast(A, X, Sigma, time_lags, multi_step)
X_plus[:, :, it - burn_iter] = X0
- tensor_new_plus += np.einsum('is, js, ts -> ijt', U, V, X0[- multi_step:, :])
+ tensor_new_plus += np.einsum("is, js, ts -> ijt", U, V, X0[-multi_step:, :])
tensor_hat = tensor_hat_plus / gibbs_iter
# print('Imputation MAPE: {:.6}'.format(compute_mape(dense_test, tensor_hat[:, :, : dim3][pos_test])))
# print('Imputation RMSE: {:.6}'.format(compute_rmse(dense_test, tensor_hat[:, :, : dim3][pos_test])))
@@ -243,7 +282,9 @@ def _BTTF(dense_tensor, sparse_tensor, init, rank, time_lags, burn_iter, gibbs_i
return tensor_hat, U_plus, V_plus, X_plus, A_plus, Sigma_plus, tau_plus
-def sample_factor_x_partial(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, Lambda_x, back_step):
+def sample_factor_x_partial(
+ tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, Lambda_x, back_step
+):
"""Sampling T-by-R factor matrix X."""
dim3, rank = X.shape
@@ -252,15 +293,17 @@ def sample_factor_x_partial(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, L
d = time_lags.shape[0]
A0 = np.dstack([A] * d)
for k in range(d):
- A0[k * rank: (k + 1) * rank, :, k] = 0
+ A0[k * rank : (k + 1) * rank, :, k] = 0
mat0 = Lambda_x @ A.T
- mat1 = np.einsum('kij, jt -> kit', A.reshape([d, rank, rank]), Lambda_x)
- mat2 = np.einsum('kit, kjt -> ij', mat1, A.reshape([d, rank, rank]))
+ mat1 = np.einsum("kij, jt -> kit", A.reshape([d, rank, rank]), Lambda_x)
+ mat2 = np.einsum("kit, kjt -> ij", mat1, A.reshape([d, rank, rank]))
var1 = kr_prod(V, U).T
var2 = kr_prod(var1, var1)
- var3 = (var2 @ ten2mat(tau_ind[:, :, - back_step:], 2).T).reshape([rank, rank, back_step]) + Lambda_x[:, :, None]
- var4 = var1 @ ten2mat(tau_sparse_tensor[:, :, - back_step:], 2).T
+ var3 = (var2 @ ten2mat(tau_ind[:, :, -back_step:], 2).T).reshape(
+ [rank, rank, back_step]
+ ) + Lambda_x[:, :, None]
+ var4 = var1 @ ten2mat(tau_sparse_tensor[:, :, -back_step:], 2).T
for t in range(dim3 - back_step, dim3):
Mt = np.zeros((rank, rank))
Nt = np.zeros(rank)
@@ -275,15 +318,24 @@ def sample_factor_x_partial(tau_sparse_tensor, tau_ind, time_lags, U, V, X, A, L
for k in index:
temp[:, n] = X[t + time_lags[k] - time_lags, :].reshape(rank * d)
n += 1
- temp0 = X[t + time_lags[index], :].T - np.einsum('ijk, ik -> jk', A0[:, :, index], temp)
- Nt = np.einsum('kij, jk -> i', mat1[index, :, :], temp0)
+ temp0 = X[t + time_lags[index], :].T - np.einsum(
+ "ijk, ik -> jk", A0[:, :, index], temp
+ )
+ Nt = np.einsum("kij, jk -> i", mat1[index, :, :], temp0)
var3[:, :, t + back_step - dim3] = var3[:, :, t + back_step - dim3] + Mt
- X[t, :] = mvnrnd_pre(solve(var3[:, :, t + back_step - dim3],
- var4[:, t + back_step - dim3] + Nt + Qt), var3[:, :, t + back_step - dim3])
+ X[t, :] = mvnrnd_pre(
+ solve(
+ var3[:, :, t + back_step - dim3],
+ var4[:, t + back_step - dim3] + Nt + Qt,
+ ),
+ var3[:, :, t + back_step - dim3],
+ )
return X
-def _BTTF_partial(sparse_tensor, init, rank, time_lags, gibbs_iter, multi_step=1, gamma=10):
+def _BTTF_partial(
+ sparse_tensor, init, rank, time_lags, gibbs_iter, multi_step=1, gamma=10
+):
"""Bayesian Temporal Tensor Factorization, BTTF."""
dim1, dim2, dim3 = sparse_tensor.shape
@@ -304,50 +356,97 @@ def _BTTF_partial(sparse_tensor, init, rank, time_lags, gibbs_iter, multi_step=1
for it in range(gibbs_iter):
tau_ind = tau_plus[it] * ind
tau_sparse_tensor = tau_plus[it] * sparse_tensor
- X = sample_factor_x_partial(tau_sparse_tensor, tau_ind, time_lags, U_plus[:, :, it], V_plus[:, :, it],
- X_plus[:, :, it], A_plus[:, :, it], inv(Sigma_plus[:, :, it]), back_step)
+ X = sample_factor_x_partial(
+ tau_sparse_tensor,
+ tau_ind,
+ time_lags,
+ U_plus[:, :, it],
+ V_plus[:, :, it],
+ X_plus[:, :, it],
+ A_plus[:, :, it],
+ inv(Sigma_plus[:, :, it]),
+ back_step,
+ )
X0 = ar4cast(A_plus[:, :, it], X, Sigma_plus[:, :, it], time_lags, multi_step)
X_new_plus[:, :, it] = X0
- tensor_new_plus += np.einsum('is, js, ts -> ijt', U_plus[:, :, it], V_plus[:, :, it], X0[- multi_step:, :])
+ tensor_new_plus += np.einsum(
+ "is, js, ts -> ijt", U_plus[:, :, it], V_plus[:, :, it], X0[-multi_step:, :]
+ )
tensor_hat = tensor_new_plus / gibbs_iter
tensor_hat[tensor_hat < 0] = 0
return tensor_hat, U_plus, V_plus, X_new_plus, A_plus, Sigma_plus, tau_plus
-def BTTF_forecast(dense_tensor, sparse_tensor, pred_step, multi_step, rank, time_lags, burn_iter, gibbs_iter, gamma=10):
+def BTTF_forecast(
+ dense_tensor,
+ sparse_tensor,
+ pred_step,
+ multi_step,
+ rank,
+ time_lags,
+ burn_iter,
+ gibbs_iter,
+ gamma=10,
+):
dim1, dim2, T = dense_tensor.shape
start_time = T - pred_step
max_count = int(np.ceil(pred_step / multi_step))
tensor_hat = np.zeros((dim1, dim2, max_count * multi_step))
for t in range(max_count):
if t == 0:
- init = {"U": 0.1 * np.random.randn(dim1, rank),
- "V": 0.1 * np.random.randn(dim2, rank),
- "X": 0.1 * np.random.randn(start_time, rank)}
- tensor, U, V, X_new, A, Sigma, tau = _BTTF(dense_tensor[:, :, : start_time],
- sparse_tensor[:, :, : start_time],
- init,
- rank,
- time_lags,
- burn_iter,
- gibbs_iter,
- multi_step)
+ init = {
+ "U": 0.1 * np.random.randn(dim1, rank),
+ "V": 0.1 * np.random.randn(dim2, rank),
+ "X": 0.1 * np.random.randn(start_time, rank),
+ }
+ tensor, U, V, X_new, A, Sigma, tau = _BTTF(
+ dense_tensor[:, :, :start_time],
+ sparse_tensor[:, :, :start_time],
+ init,
+ rank,
+ time_lags,
+ burn_iter,
+ gibbs_iter,
+ multi_step,
+ )
else:
- init = {"U_plus": U, "V_plus": V, "X_plus": X_new, "A_plus": A, "Sigma_plus": Sigma, "tau_plus": tau}
- tensor, U, V, X_new, A, Sigma, tau = _BTTF_partial(sparse_tensor[:, :, : start_time + t * multi_step],
- init,
- rank,
- time_lags,
- gibbs_iter,
- multi_step,
- gamma)
- tensor_hat[:, :, t * multi_step: (t + 1) * multi_step] = tensor[:, :, - multi_step:]
+ init = {
+ "U_plus": U,
+ "V_plus": V,
+ "X_plus": X_new,
+ "A_plus": A,
+ "Sigma_plus": Sigma,
+ "tau_plus": tau,
+ }
+ tensor, U, V, X_new, A, Sigma, tau = _BTTF_partial(
+ sparse_tensor[:, :, : start_time + t * multi_step],
+ init,
+ rank,
+ time_lags,
+ gibbs_iter,
+ multi_step,
+ gamma,
+ )
+ tensor_hat[:, :, t * multi_step : (t + 1) * multi_step] = tensor[
+ :, :, -multi_step:
+ ]
return tensor_hat
class BTTF(BaseForecaster):
- def __init__(self, n_steps, n_features, pred_step, multi_step, rank, time_lags, burn_iter, gibbs_iter, device=None):
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ pred_step,
+ multi_step,
+ rank,
+ time_lags,
+ burn_iter,
+ gibbs_iter,
+ device=None,
+ ):
super().__init__(device)
self.n_steps = n_steps
self.n_features = n_features
@@ -359,15 +458,21 @@ def __init__(self, n_steps, n_features, pred_step, multi_step, rank, time_lags,
self.gibbs_iter = gibbs_iter
def fit(self, train_X):
- warnings.warn(
- 'Please run func forecast(X) directly.'
- )
+        warnings.warn(
+            "BTTF has no parameter to train. Please run func forecast(X) directly."
+        )
def forecast(self, X):
- self.check_input(self.n_steps, self.n_features, X, out_dtype='ndarray')
+ self.check_input(self.n_steps, self.n_features, X, out_dtype="ndarray")
X = X.transpose((0, 2, 1))
- pred = BTTF_forecast(X, X.copy(), self.pred_step, self.multi_step,
- self.rank, self.time_lags, self.burn_iter, self.gibbs_iter)
+ pred = BTTF_forecast(
+ X,
+ X.copy(),
+ self.pred_step,
+ self.multi_step,
+ self.rank,
+ self.time_lags,
+ self.burn_iter,
+ self.gibbs_iter,
+ )
pred = pred.transpose((0, 2, 1))
return pred
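For orientation before the diff moves on to the imputation package, here is a minimal sketch of how the reshaped BTTF class is driven end to end. It mirrors the call pattern of TestBTTF in pypots/tests/test_forecasting.py further down; the pypots.forecasting import path is an assumption based on that test module.

import numpy as np
from pypots.forecasting import BTTF  # assumed export, matching the test suite

n_steps, n_features, pred_step, multi_step = 100, 10, 20, 2
X = np.random.randn(8, n_steps, n_features)  # (n_samples, n_steps, n_features)

bttf = BTTF(
    n_steps,
    n_features,
    pred_step,
    multi_step,
    rank=10,
    time_lags=np.asarray([1, 2, 3, 10, 11, 12, 20, 21, 22]),
    burn_iter=5,
    gibbs_iter=5,
)
# forecast() runs ceil(pred_step / multi_step) Gibbs-sampled rounds, each
# appending one multi_step-wide block, so the horizon below is pred_step long.
pred = bttf.forecast(X)  # -> (n_samples, pred_step, n_features)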
diff --git a/pypots/imputation/__init__.py b/pypots/imputation/__init__.py
index 3b274edb..bc176266 100644
--- a/pypots/imputation/__init__.py
+++ b/pypots/imputation/__init__.py
@@ -11,9 +11,8 @@
from pypots.imputation.transformer import Transformer
__all__ = [
- 'BRITS',
- 'Transformer',
- 'SAITS',
- 'LOCF',
-
+ "BRITS",
+ "Transformer",
+ "SAITS",
+ "LOCF",
]
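After this cleanup, the package's public imputation API is a single import away:

from pypots.imputation import BRITS, LOCF, SAITS, Transformer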
diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py
index c3aee779..c40c09d0 100644
--- a/pypots/imputation/base.py
+++ b/pypots/imputation/base.py
@@ -21,15 +21,14 @@
class BaseImputer(BaseModel):
- """ Abstract class for all imputation models.
- """
+ """Abstract class for all imputation models."""
def __init__(self, device):
super().__init__(device)
@abstractmethod
def fit(self, train_X, val_X=None):
- """ Train the imputer.
+ """Train the imputer.
Parameters
----------
@@ -47,7 +46,7 @@ def fit(self, train_X, val_X=None):
@abstractmethod
def impute(self, X):
- """ Impute missing data with the trained model.
+ """Impute missing data with the trained model.
Parameters
----------
@@ -63,20 +62,30 @@ def impute(self, X):
class BaseNNImputer(BaseNNModel, BaseImputer):
- def __init__(self, learning_rate, epochs, patience, batch_size, weight_decay, device):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self, learning_rate, epochs, patience, batch_size, weight_decay, device
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
@abstractmethod
def assemble_input_data(self, data):
pass
- def _train_model(self, training_loader, val_loader=None, val_X_intact=None, val_indicating_mask=None):
- self.optimizer = torch.optim.Adam(self.model.parameters(),
- lr=self.lr,
- weight_decay=self.weight_decay)
+ def _train_model(
+ self,
+ training_loader,
+ val_loader=None,
+ val_X_intact=None,
+ val_indicating_mask=None,
+ ):
+ self.optimizer = torch.optim.Adam(
+ self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
+ )
# each training starts from the very beginning, so reset the loss and model dict here
- self.best_loss = float('inf')
+ self.best_loss = float("inf")
self.best_model_dict = None
try:
@@ -87,12 +96,14 @@ def _train_model(self, training_loader, val_loader=None, val_X_intact=None, val_
inputs = self.assemble_input_data(data)
self.optimizer.zero_grad()
results = self.model.forward(inputs)
- results['loss'].backward()
+ results["loss"].backward()
self.optimizer.step()
- epoch_train_loss_collector.append(results['loss'].item())
+ epoch_train_loss_collector.append(results["loss"].item())
- mean_train_loss = np.mean(epoch_train_loss_collector) # mean training loss of the current epoch
- self.logger['training_loss'].append(mean_train_loss)
+ mean_train_loss = np.mean(
+ epoch_train_loss_collector
+ ) # mean training loss of the current epoch
+ self.logger["training_loss"].append(mean_train_loss)
if val_loader is not None:
self.model.eval()
@@ -101,17 +112,21 @@ def _train_model(self, training_loader, val_loader=None, val_X_intact=None, val_
for idx, data in enumerate(val_loader):
inputs = self.assemble_input_data(data)
results = self.model.forward(inputs)
- imputation_collector.append(results['imputed_data'])
+ imputation_collector.append(results["imputed_data"])
imputation_collector = torch.cat(imputation_collector)
-            imputation_collector = imputation_collector
- mean_val_loss = cal_mae(imputation_collector, val_X_intact, val_indicating_mask)
- self.logger['validating_loss'].append(mean_val_loss)
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}')
+ mean_val_loss = cal_mae(
+ imputation_collector, val_X_intact, val_indicating_mask
+ )
+ self.logger["validating_loss"].append(mean_val_loss)
+ print(
+ f"epoch {epoch}: training loss {mean_train_loss:.4f}, validating loss {mean_val_loss:.4f}"
+ )
mean_loss = mean_val_loss
else:
- print(f'epoch {epoch}: training loss {mean_train_loss:.4f}')
+ print(f"epoch {epoch}: training loss {mean_train_loss:.4f}")
mean_loss = mean_train_loss
if mean_loss < self.best_loss:
@@ -121,25 +136,31 @@ def _train_model(self, training_loader, val_loader=None, val_X_intact=None, val_
else:
self.patience -= 1
- if os.getenv('enable_nni', False):
+ if os.getenv("enable_nni", False):
nni.report_intermediate_result(mean_loss)
if epoch == self.epochs - 1 or self.patience == 0:
nni.report_final_result(self.best_loss)
if self.patience == 0:
- print('Exceeded the training patience. Terminating the training procedure...')
+ print(
+ "Exceeded the training patience. Terminating the training procedure..."
+ )
break
except Exception as e:
- print(f'Exception: {e}')
+ print(f"Exception: {e}")
if self.best_model_dict is None:
- raise RuntimeError('Training got interrupted. Model was not get trained. Please try fit() again.')
+ raise RuntimeError(
+                    "Training got interrupted. Model was not trained. Please try fit() again."
+ )
else:
- RuntimeWarning('Training got interrupted. '
- 'Model will load the best parameters so far for testing. '
- "If you don't want it, please try fit() again.")
+                warnings.warn(  # requires the module-level "import warnings"
+                    "Training got interrupted. "
+                    "Model will load the best parameters so far for testing. "
+                    "If you don't want it, please try fit() again.",
+                    RuntimeWarning,
+                )
- if np.equal(self.best_loss, float('inf')):
- raise ValueError('Something is wrong. best_loss is Nan after training.')
+        if np.isinf(float(self.best_loss)):  # best_loss may be a float or a 0-dim tensor
+            raise ValueError("Something is wrong. best_loss is still inf after training.")
- print('Finished training.')
+ print("Finished training.")
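The patience bookkeeping inside _train_model is easy to lose in diff form. Reduced to a standalone sketch (the improvement branch, elided at the hunk boundary above, also snapshots the best model state):

best_loss, patience = float("inf"), 2
for mean_loss in [0.9, 0.7, 0.8, 0.8]:  # stand-in per-epoch losses
    if mean_loss < best_loss:
        best_loss = mean_loss  # the elided branch also saves model.state_dict()
    else:
        patience -= 1
    if patience == 0:
        break  # "Exceeded the training patience. Terminating..."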
diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py
index 8f9317fb..46587d81 100644
--- a/pypots/imputation/brits.py
+++ b/pypots/imputation/brits.py
@@ -21,7 +21,7 @@
class FeatureRegression(nn.Module):
- """ The module used to capture the correlation between features for imputation.
+ """The module used to capture the correlation between features for imputation.
Attributes
----------
@@ -45,18 +45,18 @@ def __init__(self, input_size):
self.b = Parameter(torch.Tensor(input_size))
m = torch.ones(input_size, input_size) - torch.eye(input_size, input_size)
- self.register_buffer('m', m)
+ self.register_buffer("m", m)
self.reset_parameters()
def reset_parameters(self):
- stdv = 1. / math.sqrt(self.W.size(0))
+ stdv = 1.0 / math.sqrt(self.W.size(0))
self.W.data.uniform_(-stdv, stdv)
if self.b is not None:
self.b.data.uniform_(-stdv, stdv)
def forward(self, x):
- """ Forward processing of the NN module.
+ """Forward processing of the NN module.
Parameters
----------
@@ -74,7 +74,7 @@ def forward(self, x):
class TemporalDecay(nn.Module):
- """ The module used to generate the temporal decay factor gamma in the original paper.
+ """The module used to generate the temporal decay factor gamma in the original paper.
Attributes
----------
@@ -100,20 +100,20 @@ def __init__(self, input_size, output_size, diag=False):
self.b = Parameter(torch.Tensor(output_size))
if self.diag:
- assert (input_size == output_size)
+ assert input_size == output_size
m = torch.eye(input_size, input_size)
- self.register_buffer('m', m)
+ self.register_buffer("m", m)
self.reset_parameters()
def reset_parameters(self):
- stdv = 1. / math.sqrt(self.W.size(0))
+ stdv = 1.0 / math.sqrt(self.W.size(0))
self.W.data.uniform_(-stdv, stdv)
if self.b is not None:
self.b.data.uniform_(-stdv, stdv)
def forward(self, delta):
- """ Forward processing of the NN module.
+ """Forward processing of the NN module.
Parameters
----------
@@ -134,7 +134,7 @@ def forward(self, delta):
class RITS(nn.Module):
- """ model RITS: Recurrent Imputation for Time Series
+ """model RITS: Recurrent Imputation for Time Series
Attributes
----------
@@ -179,14 +179,18 @@ def __init__(self, n_steps, n_features, rnn_hidden_size, device=None):
self.device = device
self.rnn_cell = nn.LSTMCell(self.n_features * 2, self.rnn_hidden_size)
- self.temp_decay_h = TemporalDecay(input_size=self.n_features, output_size=self.rnn_hidden_size, diag=False)
- self.temp_decay_x = TemporalDecay(input_size=self.n_features, output_size=self.n_features, diag=True)
+ self.temp_decay_h = TemporalDecay(
+ input_size=self.n_features, output_size=self.rnn_hidden_size, diag=False
+ )
+ self.temp_decay_x = TemporalDecay(
+ input_size=self.n_features, output_size=self.n_features, diag=True
+ )
self.hist_reg = nn.Linear(self.rnn_hidden_size, self.n_features)
self.feat_reg = FeatureRegression(self.n_features)
self.combining_weight = nn.Linear(self.n_features * 2, self.n_features)
def impute(self, inputs, direction):
- """ The imputation function.
+ """The imputation function.
Parameters
----------
inputs : dict,
@@ -203,13 +207,17 @@ def impute(self, inputs, direction):
reconstruction_loss : float tensor,
reconstruction loss
"""
- values = inputs[direction]['X'] # feature values
- masks = inputs[direction]['missing_mask'] # missing masks
- deltas = inputs[direction]['deltas'] # time-gap values
+ values = inputs[direction]["X"] # feature values
+ masks = inputs[direction]["missing_mask"] # missing masks
+ deltas = inputs[direction]["deltas"] # time-gap values
# create hidden states and cell states for the lstm cell
- hidden_states = torch.zeros((values.size()[0], self.rnn_hidden_size), device=self.device)
- cell_states = torch.zeros((values.size()[0], self.rnn_hidden_size), device=self.device)
+ hidden_states = torch.zeros(
+ (values.size()[0], self.rnn_hidden_size), device=self.device
+ )
+ cell_states = torch.zeros(
+ (values.size()[0], self.rnn_hidden_size), device=self.device
+ )
estimations = []
reconstruction_loss = 0.0
@@ -242,14 +250,16 @@ def impute(self, inputs, direction):
estimations.append(c_h.unsqueeze(dim=1))
inputs = torch.cat([c_c, m], dim=1)
- hidden_states, cell_states = self.rnn_cell(inputs, (hidden_states, cell_states))
+ hidden_states, cell_states = self.rnn_cell(
+ inputs, (hidden_states, cell_states)
+ )
estimations = torch.cat(estimations, dim=1)
imputed_data = masks * values + (1 - masks) * estimations
return imputed_data, hidden_states, reconstruction_loss
- def forward(self, inputs, direction='forward'):
- """ Forward processing of the NN module.
+ def forward(self, inputs, direction="forward"):
+ """Forward processing of the NN module.
Parameters
----------
inputs : dict,
@@ -265,19 +275,21 @@ def forward(self, inputs, direction='forward'):
"""
imputed_data, hidden_state, reconstruction_loss = self.impute(inputs, direction)
        # reconstruction_loss accumulates 3 terms per time step, hence the divisor below
- reconstruction_loss /= (self.n_steps * 3)
+ reconstruction_loss /= self.n_steps * 3
ret_dict = {
- 'consistency_loss': torch.tensor(0.0, device=self.device), # single direction, has no consistency loss
- 'reconstruction_loss': reconstruction_loss,
- 'imputed_data': imputed_data,
- 'final_hidden_state': hidden_state
+ "consistency_loss": torch.tensor(
+ 0.0, device=self.device
+ ), # single direction, has no consistency loss
+ "reconstruction_loss": reconstruction_loss,
+ "imputed_data": imputed_data,
+ "final_hidden_state": hidden_state,
}
return ret_dict
class _BRITS(nn.Module):
- """ model BRITS: Bidirectional RITS
+ """model BRITS: Bidirectional RITS
BRITS consists of two RITS, which take time-series data from two directions (forward/backward) respectively.
Attributes
@@ -306,7 +318,7 @@ def __init__(self, n_steps, n_features, rnn_hidden_size, device=None):
self.rits_b = RITS(n_steps, n_features, rnn_hidden_size, device)
def impute(self, inputs):
- """ Impute the missing data. Only impute, this is for test stage.
+ """Impute the missing data. Only impute, this is for test stage.
Parameters
----------
@@ -319,16 +331,16 @@ def impute(self, inputs):
The feature vectors with missing part imputed.
"""
- imputed_data_f, _, _ = self.rits_f.impute(inputs, 'forward')
- imputed_data_b, _, _ = self.rits_b.impute(inputs, 'backward')
- imputed_data_b = {'imputed_data_b': imputed_data_b}
- imputed_data_b = self.reverse(imputed_data_b)['imputed_data_b']
+ imputed_data_f, _, _ = self.rits_f.impute(inputs, "forward")
+ imputed_data_b, _, _ = self.rits_b.impute(inputs, "backward")
+ imputed_data_b = {"imputed_data_b": imputed_data_b}
+ imputed_data_b = self.reverse(imputed_data_b)["imputed_data_b"]
imputed_data = (imputed_data_f + imputed_data_b) / 2
return imputed_data
@staticmethod
def get_consistency_loss(pred_f, pred_b):
- """ Calculate the consistency loss between the imputation from two RITS models.
+ """Calculate the consistency loss between the imputation from two RITS models.
Parameters
----------
@@ -347,7 +359,7 @@ def get_consistency_loss(pred_f, pred_b):
@staticmethod
def reverse(ret):
- """ Reverse the array values on the time dimension in the given dictionary.
+ """Reverse the array values on the time dimension in the given dictionary.
Parameters
----------
@@ -363,7 +375,9 @@ def reverse_tensor(tensor_):
if tensor_.dim() <= 1:
return tensor_
indices = range(tensor_.size()[1])[::-1]
- indices = torch.tensor(indices, dtype=torch.long, device=tensor_.device, requires_grad=False)
+ indices = torch.tensor(
+ indices, dtype=torch.long, device=tensor_.device, requires_grad=False
+ )
return tensor_.index_select(1, indices)
for key in ret:
@@ -372,7 +386,7 @@ def reverse_tensor(tensor_):
return ret
def merge_ret(self, ret_f, ret_b):
- """ Merge (average) results from two RITS models into one.
+ """Merge (average) results from two RITS models into one.
Parameters
----------
@@ -386,17 +400,21 @@ def merge_ret(self, ret_f, ret_b):
dict,
Merged results in a dictionary.
"""
- consistency_loss = self.get_consistency_loss(ret_f['imputed_data'], ret_b['imputed_data'])
- ret_f['imputed_data'] = (ret_f['imputed_data'] + ret_b['imputed_data']) / 2
- ret_f['consistency_loss'] = consistency_loss
- ret_f['loss'] = consistency_loss + \
- ret_f['reconstruction_loss'] + \
- ret_b['reconstruction_loss']
+ consistency_loss = self.get_consistency_loss(
+ ret_f["imputed_data"], ret_b["imputed_data"]
+ )
+ ret_f["imputed_data"] = (ret_f["imputed_data"] + ret_b["imputed_data"]) / 2
+ ret_f["consistency_loss"] = consistency_loss
+ ret_f["loss"] = (
+ consistency_loss
+ + ret_f["reconstruction_loss"]
+ + ret_b["reconstruction_loss"]
+ )
return ret_f
def forward(self, inputs):
- """ Forward processing of BRITS.
+ """Forward processing of BRITS.
Parameters
----------
@@ -407,14 +425,14 @@ def forward(self, inputs):
-------
dict, A dictionary includes all results.
"""
- ret_f = self.rits_f(inputs, 'forward')
- ret_b = self.reverse(self.rits_b(inputs, 'backward'))
+ ret_f = self.rits_f(inputs, "forward")
+ ret_b = self.reverse(self.rits_b(inputs, "backward"))
ret = self.merge_ret(ret_f, ret_b)
return ret
class BRITS(BaseNNImputer):
- """ BRITS implementation
+ """BRITS implementation
Attributes
----------
@@ -451,28 +469,34 @@ class BRITS(BaseNNImputer):
Run the model on which device.
"""
- def __init__(self,
- n_steps,
- n_features,
- rnn_hidden_size,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ rnn_hidden_size,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_steps = n_steps
self.n_features = n_features
self.rnn_hidden_size = rnn_hidden_size
- self.model = _BRITS(self.n_steps, self.n_features, self.rnn_hidden_size, self.device)
+ self.model = _BRITS(
+ self.n_steps, self.n_features, self.rnn_hidden_size, self.device
+ )
self.model = self.model.to(self.device)
self._print_model_size()
def fit(self, train_X, val_X=None):
- """ Fit the model on the given training data.
+ """Fit the model on the given training data.
Parameters
----------
@@ -492,23 +516,29 @@ def fit(self, train_X, val_X=None):
val_X = self.check_input(self.n_steps, self.n_features, val_X)
training_set = DatasetForBRITS(train_X) # time_gaps is necessary for BRITS
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
else:
- val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(val_X, 0.2)
+ val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(
+ val_X, 0.2
+ )
val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan)
val_set = DatasetForBRITS(val_X)
val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False)
- self._train_model(training_loader, val_loader, val_X_intact, val_X_indicating_mask)
+ self._train_model(
+ training_loader, val_loader, val_X_intact, val_X_indicating_mask
+ )
self.model.load_state_dict(self.best_model_dict)
self.model.eval() # set the model as eval status to freeze it.
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -524,17 +554,13 @@ def assemble_input_data(self, data):
indices, X, missing_mask, deltas, back_X, back_missing_mask, back_deltas = data
# assemble input data
inputs = {
- 'indices': indices,
- 'forward': {
- 'X': X,
- 'missing_mask': missing_mask,
- 'deltas': deltas
+ "indices": indices,
+ "forward": {"X": X, "missing_mask": missing_mask, "deltas": deltas},
+ "backward": {
+ "X": back_X,
+ "missing_mask": back_missing_mask,
+ "deltas": back_deltas,
},
- 'backward': {
- 'X': back_X,
- 'missing_mask': back_missing_mask,
- 'deltas': back_deltas
- }
}
return inputs
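As a usage sketch of the reshaped BRITS imputer (the call pattern matches TestBRITS in pypots/tests/test_imputation.py later in this diff; the arrays here are random placeholders):

import numpy as np
from pypots.imputation import BRITS

train_X = np.random.randn(64, 24, 10)  # (n_samples, n_steps, n_features); NaNs mark missing values
brits = BRITS(n_steps=24, n_features=10, rnn_hidden_size=256, epochs=10)
brits.fit(train_X)  # pass val_X too to enable the validation-MAE branch of _train_model
imputed_X = brits.impute(train_X)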
diff --git a/pypots/imputation/locf.py b/pypots/imputation/locf.py
index fbc1073b..2d391bb9 100644
--- a/pypots/imputation/locf.py
+++ b/pypots/imputation/locf.py
@@ -14,7 +14,7 @@
class LOCF(BaseImputer):
- """ LOCF (Last Observed Carried Forward) imputation method.
+    """LOCF (Last Observation Carried Forward) imputation method.
Attributes
----------
@@ -23,17 +23,17 @@ class LOCF(BaseImputer):
"""
def __init__(self, nan=0):
- super().__init__('cpu')
+ super().__init__("cpu")
self.nan = nan
def fit(self, train_X, val_X=None):
warnings.warn(
- 'LOCF (Last Observed Carried Forward) imputation class has no parameter to train. '
- 'Please run func impute(X) directly.'
+            "LOCF (Last Observation Carried Forward) imputation class has no parameter to train. "
+ "Please run func impute(X) directly."
)
def locf_numpy(self, X):
- """ Numpy implementation of LOCF.
+ """Numpy implementation of LOCF.
Parameters
----------
@@ -71,7 +71,7 @@ def locf_numpy(self, X):
return X_imputed
def locf_torch(self, X):
- """ Torch implementation of LOCF.
+ """Torch implementation of LOCF.
Parameters
----------
@@ -86,7 +86,7 @@ def locf_torch(self, X):
trans_X = X.permute((0, 2, 1))
mask = torch.isnan(trans_X)
-        n_samples, n_steps, n_features = mask.shape
-        idx = torch.where(~mask, torch.arange(n_features), 0)
-        idx = torch.cummax(idx, dim=2)
+        # after the permute above, mask has shape (n_samples, n_features, n_steps)
+        n_samples, n_features, n_steps = mask.shape
+        idx = torch.where(~mask, torch.arange(n_steps, device=mask.device), 0)
+        idx = torch.cummax(idx, dim=2).values  # cummax returns a (values, indices) pair
collector = []
@@ -104,7 +104,7 @@ def locf_torch(self, X):
return X_imputed
def impute(self, X):
- """ Impute missing values
+ """Impute missing values
Parameters
----------
@@ -116,8 +116,10 @@ def impute(self, X):
array-like,
Imputed time series.
"""
- assert len(X.shape) == 3, f'Input X should have 3 dimensions [n_samples, n_steps, n_features], ' \
- f'but the actual shape of X: {X.shape}'
+ assert len(X.shape) == 3, (
+ f"Input X should have 3 dimensions [n_samples, n_steps, n_features], "
+ f"but the actual shape of X: {X.shape}"
+ )
if isinstance(X, list):
X = np.asarray(X)
@@ -126,6 +128,7 @@ def impute(self, X):
elif isinstance(X, torch.Tensor):
X_imputed = self.locf_torch(X).detach().cpu().numpy()
else:
- raise TypeError('X must be type of list/np.ndarray/torch.Tensor, '
- f'but got {type(X)}')
+ raise TypeError(
+ "X must be type of list/np.ndarray/torch.Tensor, " f"but got {type(X)}"
+ )
return X_imputed
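The body of locf_numpy is elided by the hunks above; for intuition, here is a minimal NumPy forward-fill in the same spirit for a single 2-D series. This is a sketch, not the library's exact code; leading NaNs survive the fill, and those are what self.nan later replaces.

import numpy as np

def locf_2d(x):  # x: (n_steps, n_features)
    mask = np.isnan(x)
    # row index of the most recent observed value in each feature column
    idx = np.where(~mask, np.arange(x.shape[0])[:, None], 0)
    idx = np.maximum.accumulate(idx, axis=0)
    return x[idx, np.arange(x.shape[1])]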
diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py
index b521aa0c..3badbbbe 100644
--- a/pypots/imputation/saits.py
+++ b/pypots/imputation/saits.py
@@ -20,24 +20,61 @@
class _SAITS(nn.Module):
- def __init__(self, n_layers, d_time, d_feature, d_model, d_inner, n_head, d_k, d_v, dropout,
- diagonal_attention_mask=True, ORT_weight=1, MIT_weight=1):
+ def __init__(
+ self,
+ n_layers,
+ d_time,
+ d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ diagonal_attention_mask=True,
+ ORT_weight=1,
+ MIT_weight=1,
+ ):
super().__init__()
self.n_layers = n_layers
actual_d_feature = d_feature * 2
self.ORT_weight = ORT_weight
self.MIT_weight = MIT_weight
- self.layer_stack_for_first_block = nn.ModuleList([
- EncoderLayer(d_time, actual_d_feature, d_model, d_inner, n_head, d_k, d_v, dropout, 0,
- diagonal_attention_mask)
- for _ in range(n_layers)
- ])
- self.layer_stack_for_second_block = nn.ModuleList([
- EncoderLayer(d_time, actual_d_feature, d_model, d_inner, n_head, d_k, d_v, dropout, 0,
- diagonal_attention_mask)
- for _ in range(n_layers)
- ])
+ self.layer_stack_for_first_block = nn.ModuleList(
+ [
+ EncoderLayer(
+ d_time,
+ actual_d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ 0,
+ diagonal_attention_mask,
+ )
+ for _ in range(n_layers)
+ ]
+ )
+ self.layer_stack_for_second_block = nn.ModuleList(
+ [
+ EncoderLayer(
+ d_time,
+ actual_d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ 0,
+ diagonal_attention_mask,
+ )
+ for _ in range(n_layers)
+ ]
+ )
self.dropout = nn.Dropout(p=dropout)
self.position_enc = PositionalEncoding(d_model, n_position=d_time)
@@ -52,11 +89,13 @@ def __init__(self, n_layers, d_time, d_feature, d_model, d_inner, n_head, d_k, d
self.weight_combine = nn.Linear(d_feature + d_time, d_feature)
def impute(self, inputs):
- X, masks = inputs['X'], inputs['missing_mask']
+ X, masks = inputs["X"], inputs["missing_mask"]
# first DMSA block
input_X_for_first = torch.cat([X, masks], dim=2)
input_X_for_first = self.embedding_1(input_X_for_first)
- enc_output = self.dropout(self.position_enc(input_X_for_first)) # namely, term e in the math equation
+ enc_output = self.dropout(
+ self.position_enc(input_X_for_first)
+ ) # namely, term e in the math equation
for encoder_layer in self.layer_stack_for_first_block:
enc_output, _ = encoder_layer(enc_output)
@@ -66,7 +105,9 @@ def impute(self, inputs):
# second DMSA block
input_X_for_second = torch.cat([X_prime, masks], dim=2)
input_X_for_second = self.embedding_2(input_X_for_second)
- enc_output = self.position_enc(input_X_for_second) # namely term alpha in math algo
+ enc_output = self.position_enc(
+ input_X_for_second
+ ) # namely term alpha in math algo
for encoder_layer in self.layer_stack_for_second_block:
enc_output, attn_weights = encoder_layer(enc_output)
@@ -85,11 +126,13 @@ def impute(self, inputs):
) # namely term eta
# combine X_tilde_1 and X_tilde_2
X_tilde_3 = (1 - combining_weights) * X_tilde_2 + combining_weights * X_tilde_1
- X_c = masks * X + (1 - masks) * X_tilde_3 # replace non-missing part with original data
+ X_c = (
+ masks * X + (1 - masks) * X_tilde_3
+ ) # replace non-missing part with original data
return X_c, [X_tilde_1, X_tilde_2, X_tilde_3]
def forward(self, inputs):
- X, masks = inputs['X'], inputs['missing_mask']
+ X, masks = inputs["X"], inputs["missing_mask"]
reconstruction_loss = 0
imputed_data, [X_tilde_1, X_tilde_2, X_tilde_3] = self.impute(inputs)
@@ -100,38 +143,45 @@ def forward(self, inputs):
reconstruction_loss /= 3
        # have to cal imputation loss in the val stage; no need to cal imputation loss here in the test stage
- imputation_loss = cal_mae(X_tilde_3, inputs['X_intact'], inputs['indicating_mask'])
+ imputation_loss = cal_mae(
+ X_tilde_3, inputs["X_intact"], inputs["indicating_mask"]
+ )
loss = self.ORT_weight * reconstruction_loss + self.MIT_weight * imputation_loss
return {
- 'imputed_data': imputed_data,
- 'reconstruction_loss': reconstruction_loss, 'imputation_loss': imputation_loss,
- 'loss': loss
+ "imputed_data": imputed_data,
+ "reconstruction_loss": reconstruction_loss,
+ "imputation_loss": imputation_loss,
+ "loss": loss,
}
class SAITS(BaseNNImputer):
- def __init__(self,
- n_steps,
- n_features,
- n_layers,
- d_model,
- d_inner,
- n_head,
- d_k,
- d_v,
- dropout,
- diagonal_attention_mask=True,
- ORT_weight=1,
- MIT_weight=1,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ n_layers,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ diagonal_attention_mask=True,
+ ORT_weight=1,
+ MIT_weight=1,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_steps = n_steps
self.n_features = n_features
@@ -147,9 +197,20 @@ def __init__(self,
self.ORT_weight = ORT_weight
self.MIT_weight = MIT_weight
- self.model = _SAITS(self.n_layers, self.n_steps, self.n_features, self.d_model, self.d_inner, self.n_head,
- self.d_k, self.d_v, self.dropout, self.diagonal_attention_mask,
- self.ORT_weight, self.MIT_weight)
+ self.model = _SAITS(
+ self.n_layers,
+ self.n_steps,
+ self.n_features,
+ self.d_model,
+ self.d_inner,
+ self.n_head,
+ self.d_k,
+ self.d_v,
+ self.dropout,
+ self.diagonal_attention_mask,
+ self.ORT_weight,
+ self.MIT_weight,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
@@ -159,21 +220,27 @@ def fit(self, train_X, val_X=None):
val_X = self.check_input(self.n_steps, self.n_features, val_X)
training_set = DatasetForMIT(train_X)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
else:
- val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(val_X, 0.2)
+ val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(
+ val_X, 0.2
+ )
val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan)
val_set = DatasetForMIT(val_X)
val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False)
- self._train_model(training_loader, val_loader, val_X_intact, val_X_indicating_mask)
+ self._train_model(
+ training_loader, val_loader, val_X_intact, val_X_indicating_mask
+ )
self.model.load_state_dict(self.best_model_dict)
        self.model.eval()  # set the model as eval status to freeze it.
+        return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -188,10 +255,10 @@ def assemble_input_data(self, data):
indices, X_intact, X, missing_mask, indicating_mask = data
inputs = {
- 'X': X,
- 'X_intact': X_intact,
- 'missing_mask': missing_mask,
- 'indicating_mask': indicating_mask
+ "X": X,
+ "X_intact": X_intact,
+ "missing_mask": missing_mask,
+ "indicating_mask": indicating_mask,
}
return inputs
@@ -205,7 +272,7 @@ def impute(self, X):
with torch.no_grad():
for idx, data in enumerate(test_loader):
- inputs = {'X': data[1], 'missing_mask': data[2]}
+ inputs = {"X": data[1], "missing_mask": data[2]}
imputed_data, _ = self.model.impute(inputs)
imputation_collector.append(imputed_data)
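And the matching usage sketch for the reshaped SAITS constructor (hyper-parameters copied from TestSAITS in the test diff below; the data array is a random placeholder):

import numpy as np
from pypots.imputation import SAITS

train_X = np.random.randn(64, 24, 10)
saits = SAITS(
    n_steps=24,
    n_features=10,
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_head=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    epochs=10,
)
saits.fit(train_X)
imputed_X = saits.impute(train_X)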
diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py
index 5f68fcdd..8146a266 100644
--- a/pypots/imputation/transformer.py
+++ b/pypots/imputation/transformer.py
@@ -50,7 +50,7 @@ def __init__(self, n_head, d_model, d_k, d_v, attn_dropout):
self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
- self.attention = ScaledDotProductAttention(d_k ** 0.5, attn_dropout)
+ self.attention = ScaledDotProductAttention(d_k**0.5, attn_dropout)
self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
def forward(self, q, k, v, attn_mask=None):
@@ -68,7 +68,9 @@ def forward(self, q, k, v, attn_mask=None):
if attn_mask is not None:
            # this is the imputation mask; it is not generated per batch, so it needs broadcasting over the batch dim
- attn_mask = attn_mask.unsqueeze(0).unsqueeze(1) # For batch and head axis broadcasting.
+ attn_mask = attn_mask.unsqueeze(0).unsqueeze(
+ 1
+ ) # For batch and head axis broadcasting.
v, attn_weights = self.attention(q, k, v, attn_mask)
@@ -97,8 +99,19 @@ def forward(self, x):
class EncoderLayer(nn.Module):
- def __init__(self, d_time, d_feature, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, attn_dropout=0.1,
- diagonal_attention_mask=False):
+ def __init__(
+ self,
+ d_time,
+ d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout=0.1,
+ attn_dropout=0.1,
+ diagonal_attention_mask=False,
+ ):
super().__init__()
self.diagonal_attention_mask = diagonal_attention_mask
@@ -119,7 +132,9 @@ def forward(self, enc_input):
residual = enc_input
        # apply LayerNorm before the attention calculation (Pre-LN); see https://arxiv.org/abs/2002.04745
enc_input = self.layer_norm(enc_input)
- enc_output, attn_weights = self.slf_attn(enc_input, enc_input, enc_input, attn_mask=mask_time)
+ enc_output, attn_weights = self.slf_attn(
+ enc_input, enc_input, enc_input, attn_mask=mask_time
+ )
enc_output = self.dropout(enc_output)
enc_output += residual
@@ -131,38 +146,69 @@ class PositionalEncoding(nn.Module):
def __init__(self, d_hid, n_position=200):
super().__init__()
# Not a parameter
- self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))
+ self.register_buffer(
+ "pos_table", self._get_sinusoid_encoding_table(n_position, d_hid)
+ )
@staticmethod
def _get_sinusoid_encoding_table(n_position, d_hid):
- """ Sinusoid position encoding table """
+ """Sinusoid position encoding table"""
def get_position_angle_vec(position):
- return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
-
- sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+ return [
+ position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+ for hid_j in range(d_hid)
+ ]
+
+ sinusoid_table = np.array(
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
+ )
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.FloatTensor(sinusoid_table).unsqueeze(0)
def forward(self, x):
- return x + self.pos_table[:, :x.size(1)].clone().detach()
+ return x + self.pos_table[:, : x.size(1)].clone().detach()
class _TransformerEncoder(nn.Module):
- def __init__(self, n_layers, d_time, d_feature, d_model, d_inner, n_head, d_k, d_v, dropout,
- ORT_weight=1, MIT_weight=1):
+ def __init__(
+ self,
+ n_layers,
+ d_time,
+ d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ ORT_weight=1,
+ MIT_weight=1,
+ ):
super().__init__()
self.n_layers = n_layers
actual_d_feature = d_feature * 2
self.ORT_weight = ORT_weight
self.MIT_weight = MIT_weight
- self.layer_stack = nn.ModuleList([
- EncoderLayer(d_time, actual_d_feature, d_model, d_inner, n_head, d_k, d_v, dropout, 0,
- False)
- for _ in range(n_layers)
- ])
+ self.layer_stack = nn.ModuleList(
+ [
+ EncoderLayer(
+ d_time,
+ actual_d_feature,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ 0,
+ False,
+ )
+ for _ in range(n_layers)
+ ]
+ )
self.embedding = nn.Linear(actual_d_feature, d_model)
self.position_enc = PositionalEncoding(d_model, n_position=d_time)
@@ -170,7 +216,7 @@ def __init__(self, n_layers, d_time, d_feature, d_model, d_inner, n_head, d_k, d
self.reduce_dim = nn.Linear(d_model, d_feature)
def impute(self, inputs):
- X, masks = inputs['X'], inputs['missing_mask']
+ X, masks = inputs["X"], inputs["missing_mask"]
input_X = torch.cat([X, masks], dim=2)
input_X = self.embedding(input_X)
enc_output = self.dropout(self.position_enc(input_X))
@@ -179,46 +225,55 @@ def impute(self, inputs):
enc_output, _ = encoder_layer(enc_output)
learned_presentation = self.reduce_dim(enc_output)
- imputed_data = masks * X + (1 - masks) * learned_presentation # replace non-missing part with original data
+ imputed_data = (
+ masks * X + (1 - masks) * learned_presentation
+ ) # replace non-missing part with original data
return imputed_data, learned_presentation
def forward(self, inputs):
- X, masks = inputs['X'], inputs['missing_mask']
+ X, masks = inputs["X"], inputs["missing_mask"]
imputed_data, learned_presentation = self.impute(inputs)
reconstruction_loss = cal_mae(learned_presentation, X, masks)
        # have to cal imputation loss in the val stage; no need to cal imputation loss here in the test stage
- imputation_loss = cal_mae(learned_presentation, inputs['X_intact'], inputs['indicating_mask'])
+ imputation_loss = cal_mae(
+ learned_presentation, inputs["X_intact"], inputs["indicating_mask"]
+ )
loss = self.ORT_weight * reconstruction_loss + self.MIT_weight * imputation_loss
return {
- 'imputed_data': imputed_data,
- 'reconstruction_loss': reconstruction_loss, 'imputation_loss': imputation_loss,
- 'loss': loss
+ "imputed_data": imputed_data,
+ "reconstruction_loss": reconstruction_loss,
+ "imputation_loss": imputation_loss,
+ "loss": loss,
}
class Transformer(BaseNNImputer):
- def __init__(self,
- n_steps,
- n_features,
- n_layers,
- d_model,
- d_inner,
- n_head,
- d_k,
- d_v,
- dropout,
- ORT_weight=1,
- MIT_weight=1,
- learning_rate=1e-3,
- epochs=100,
- patience=10,
- batch_size=32,
- weight_decay=1e-5,
- device=None):
- super().__init__(learning_rate, epochs, patience, batch_size, weight_decay, device)
+ def __init__(
+ self,
+ n_steps,
+ n_features,
+ n_layers,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout,
+ ORT_weight=1,
+ MIT_weight=1,
+ learning_rate=1e-3,
+ epochs=100,
+ patience=10,
+ batch_size=32,
+ weight_decay=1e-5,
+ device=None,
+ ):
+ super().__init__(
+ learning_rate, epochs, patience, batch_size, weight_decay, device
+ )
self.n_steps = n_steps
self.n_features = n_features
@@ -233,9 +288,19 @@ def __init__(self,
self.ORT_weight = ORT_weight
self.MIT_weight = MIT_weight
- self.model = _TransformerEncoder(self.n_layers, self.n_steps, self.n_features, self.d_model, self.d_inner,
- self.n_head, self.d_k, self.d_v, self.dropout,
- self.ORT_weight, self.MIT_weight)
+ self.model = _TransformerEncoder(
+ self.n_layers,
+ self.n_steps,
+ self.n_features,
+ self.d_model,
+ self.d_inner,
+ self.n_head,
+ self.d_k,
+ self.d_v,
+ self.dropout,
+ self.ORT_weight,
+ self.MIT_weight,
+ )
self.model = self.model.to(self.device)
self._print_model_size()
@@ -245,22 +310,28 @@ def fit(self, train_X, val_X=None):
val_X = self.check_input(self.n_steps, self.n_features, val_X)
training_set = DatasetForMIT(train_X)
- training_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True)
+ training_loader = DataLoader(
+ training_set, batch_size=self.batch_size, shuffle=True
+ )
if val_X is None:
self._train_model(training_loader)
else:
- val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(val_X, 0.2)
+ val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar(
+ val_X, 0.2
+ )
val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan)
val_set = DatasetForMIT(val_X)
val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False)
- self._train_model(training_loader, val_loader, val_X_intact, val_X_indicating_mask)
+ self._train_model(
+ training_loader, val_loader, val_X_intact, val_X_indicating_mask
+ )
self.model.load_state_dict(self.best_model_dict)
self.model.eval() # set the model as eval status to freeze it.
return self
def assemble_input_data(self, data):
- """ Assemble the input data into a dictionary.
+ """Assemble the input data into a dictionary.
Parameters
----------
@@ -276,10 +347,10 @@ def assemble_input_data(self, data):
indices, X_intact, X, missing_mask, indicating_mask = data
inputs = {
- 'X': X,
- 'X_intact': X_intact,
- 'missing_mask': missing_mask,
- 'indicating_mask': indicating_mask
+ "X": X,
+ "X_intact": X_intact,
+ "missing_mask": missing_mask,
+ "indicating_mask": indicating_mask,
}
return inputs
@@ -293,7 +364,7 @@ def impute(self, X):
with torch.no_grad():
for idx, data in enumerate(test_loader):
- inputs = {'X': data[1], 'missing_mask': data[2]}
+ inputs = {"X": data[1], "missing_mask": data[2]}
imputed_data, _ = self.model.impute(inputs)
imputation_collector.append(imputed_data)
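_SAITS.forward and _TransformerEncoder.forward above assemble the same ORT + MIT objective. A standalone sketch on toy tensors, where masked_mae stands in for pypots' cal_mae:

import torch

def masked_mae(pred, target, mask):
    # MAE restricted to positions where mask == 1, as cal_mae is used above
    return torch.sum(torch.abs(pred - target) * mask) / (torch.sum(mask) + 1e-12)

X = torch.randn(2, 5, 3)                                # input with missing cells zeroed
pred = torch.randn_like(X)                              # model reconstruction
masks = torch.randint(0, 2, X.shape).float()            # observed-value mask (ORT term)
indicating_mask = torch.randint(0, 2, X.shape).float()  # artificially-masked cells (MIT term)

ORT_weight = MIT_weight = 1
loss = ORT_weight * masked_mae(pred, X, masks) + MIT_weight * masked_mae(
    pred, X, indicating_mask  # the real code compares against X_intact here
)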
diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py
index bcda07df..9f283ab2 100644
--- a/pypots/tests/test_classification.py
+++ b/pypots/tests/test_classification.py
@@ -16,113 +16,145 @@
class TestBRITS(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.train_y = DATA['train_y']
- self.val_X = DATA['val_X']
- self.val_y = DATA['val_y']
- self.test_X = DATA['test_X']
- self.test_y = DATA['test_y']
- print('Running test cases for BRITS...')
- self.brits = BRITS(DATA['n_steps'], DATA['n_features'], 256,
- n_classes=DATA['n_classes'], epochs=EPOCHS)
+ self.train_X = DATA["train_X"]
+ self.train_y = DATA["train_y"]
+ self.val_X = DATA["val_X"]
+ self.val_y = DATA["val_y"]
+ self.test_X = DATA["test_X"]
+ self.test_y = DATA["test_y"]
+ print("Running test cases for BRITS...")
+ self.brits = BRITS(
+ DATA["n_steps"],
+ DATA["n_features"],
+ 256,
+ n_classes=DATA["n_classes"],
+ epochs=EPOCHS,
+ )
self.brits.fit(self.train_X, self.train_y, self.val_X, self.val_y)
def test_parameters(self):
- assert (hasattr(self.brits, 'model')
- and self.brits.model is not None)
+ assert hasattr(self.brits, "model") and self.brits.model is not None
- assert (hasattr(self.brits, 'optimizer')
- and self.brits.optimizer is not None)
+ assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None
- assert hasattr(self.brits, 'best_loss')
- self.assertNotEqual(self.brits.best_loss, float('inf'))
+ assert hasattr(self.brits, "best_loss")
+ self.assertNotEqual(self.brits.best_loss, float("inf"))
- assert (hasattr(self.brits, 'best_model_dict')
- and self.brits.best_model_dict is not None)
+ assert (
+ hasattr(self.brits, "best_model_dict")
+ and self.brits.best_model_dict is not None
+ )
def test_classify(self):
predictions = self.brits.classify(self.test_X)
metrics = cal_binary_classification_metrics(predictions, self.test_y)
- print(f'ROC_AUC: {metrics["roc_auc"]}, \n'
- f'PR_AUC: {metrics["pr_auc"]},\n'
- f'F1: {metrics["f1"]},\n'
- f'Precision: {metrics["precision"]},\n'
- f'Recall: {metrics["recall"]},\n')
- assert metrics['roc_auc'] >= 0.5, 'ROC-AUC < 0.5'
+ print(
+ f'ROC_AUC: {metrics["roc_auc"]}, \n'
+ f'PR_AUC: {metrics["pr_auc"]},\n'
+ f'F1: {metrics["f1"]},\n'
+ f'Precision: {metrics["precision"]},\n'
+ f'Recall: {metrics["recall"]},\n'
+ )
+ assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5"
class TestGRUD(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.train_y = DATA['train_y']
- self.val_X = DATA['val_X']
- self.val_y = DATA['val_y']
- self.test_X = DATA['test_X']
- self.test_y = DATA['test_y']
- print('Running test cases for GRUD...')
- self.grud = GRUD(DATA['n_steps'], DATA['n_features'], 256, n_classes=DATA['n_classes'], epochs=EPOCHS)
+ self.train_X = DATA["train_X"]
+ self.train_y = DATA["train_y"]
+ self.val_X = DATA["val_X"]
+ self.val_y = DATA["val_y"]
+ self.test_X = DATA["test_X"]
+ self.test_y = DATA["test_y"]
+ print("Running test cases for GRUD...")
+ self.grud = GRUD(
+ DATA["n_steps"],
+ DATA["n_features"],
+ 256,
+ n_classes=DATA["n_classes"],
+ epochs=EPOCHS,
+ )
self.grud.fit(self.train_X, self.train_y, self.val_X, self.val_y)
def test_parameters(self):
- assert (hasattr(self.grud, 'model')
- and self.grud.model is not None)
+ assert hasattr(self.grud, "model") and self.grud.model is not None
- assert (hasattr(self.grud, 'optimizer')
- and self.grud.optimizer is not None)
+ assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None
- assert hasattr(self.grud, 'best_loss')
- self.assertNotEqual(self.grud.best_loss, float('inf'))
+ assert hasattr(self.grud, "best_loss")
+ self.assertNotEqual(self.grud.best_loss, float("inf"))
- assert (hasattr(self.grud, 'best_model_dict')
- and self.grud.best_model_dict is not None)
+ assert (
+ hasattr(self.grud, "best_model_dict")
+ and self.grud.best_model_dict is not None
+ )
def test_classify(self):
predictions = self.grud.classify(self.test_X)
metrics = cal_binary_classification_metrics(predictions, self.test_y)
- print(f'ROC_AUC: {metrics["roc_auc"]}, \n'
- f'PR_AUC: {metrics["pr_auc"]},\n'
- f'F1: {metrics["f1"]},\n'
- f'Precision: {metrics["precision"]},\n'
- f'Recall: {metrics["recall"]},\n')
- assert metrics['roc_auc'] >= 0.5, 'ROC-AUC < 0.5'
+ print(
+ f'ROC_AUC: {metrics["roc_auc"]}, \n'
+ f'PR_AUC: {metrics["pr_auc"]},\n'
+ f'F1: {metrics["f1"]},\n'
+ f'Precision: {metrics["precision"]},\n'
+ f'Recall: {metrics["recall"]},\n'
+ )
+ assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5"
class TestRaindrop(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.train_y = DATA['train_y']
- self.val_X = DATA['val_X']
- self.val_y = DATA['val_y']
- self.test_X = DATA['test_X']
- self.test_y = DATA['test_y']
- print('Running test cases for Raindrop...')
- self.raindrop = Raindrop(DATA['n_features'], 2, DATA['n_features'] * 4, 256, 2, DATA['n_classes'], 0.3,
- DATA['n_steps'], 0, 'mean', False, False, epochs=EPOCHS)
+ self.train_X = DATA["train_X"]
+ self.train_y = DATA["train_y"]
+ self.val_X = DATA["val_X"]
+ self.val_y = DATA["val_y"]
+ self.test_X = DATA["test_X"]
+ self.test_y = DATA["test_y"]
+ print("Running test cases for Raindrop...")
+ self.raindrop = Raindrop(
+ DATA["n_features"],
+ 2,
+ DATA["n_features"] * 4,
+ 256,
+ 2,
+ DATA["n_classes"],
+ 0.3,
+ DATA["n_steps"],
+ 0,
+ "mean",
+ False,
+ False,
+ epochs=EPOCHS,
+ )
self.raindrop.fit(self.train_X, self.train_y, self.val_X, self.val_y)
def test_parameters(self):
- assert (hasattr(self.raindrop, 'model')
- and self.raindrop.model is not None)
+ assert hasattr(self.raindrop, "model") and self.raindrop.model is not None
- assert (hasattr(self.raindrop, 'optimizer')
- and self.raindrop.optimizer is not None)
+ assert (
+ hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None
+ )
- assert hasattr(self.raindrop, 'best_loss')
- self.assertNotEqual(self.raindrop.best_loss, float('inf'))
+ assert hasattr(self.raindrop, "best_loss")
+ self.assertNotEqual(self.raindrop.best_loss, float("inf"))
- assert (hasattr(self.raindrop, 'best_model_dict')
- and self.raindrop.best_model_dict is not None)
+ assert (
+ hasattr(self.raindrop, "best_model_dict")
+ and self.raindrop.best_model_dict is not None
+ )
def test_classify(self):
predictions = self.raindrop.classify(self.test_X)
metrics = cal_binary_classification_metrics(predictions, self.test_y)
- print(f'ROC_AUC: {metrics["roc_auc"]}, \n'
- f'PR_AUC: {metrics["pr_auc"]},\n'
- f'F1: {metrics["f1"]},\n'
- f'Precision: {metrics["precision"]},\n'
- f'Recall: {metrics["recall"]},\n')
- assert metrics['roc_auc'] >= 0.5, 'ROC-AUC < 0.5'
+ print(
+ f'ROC_AUC: {metrics["roc_auc"]}, \n'
+ f'PR_AUC: {metrics["pr_auc"]},\n'
+ f'F1: {metrics["f1"]},\n'
+ f'Precision: {metrics["precision"]},\n'
+ f'Recall: {metrics["recall"]},\n'
+ )
+ assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5"
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
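These suites run under the plain unittest runner; e.g., from the repository root (invocation assumed from the module paths in the diff headers):

# python -m unittest pypots.tests.test_classification -v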
diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py
index 7dbdc6d2..52584e35 100644
--- a/pypots/tests/test_clustering.py
+++ b/pypots/tests/test_clustering.py
@@ -17,63 +17,75 @@
class TestCRLI(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.train_y = DATA['train_y']
- print('Running test cases for CRLI...')
- self.crli = CRLI(n_steps=DATA['n_steps'], n_features=DATA['n_features'], n_clusters=DATA['n_classes'],
- n_generator_layers=2, rnn_hidden_size=128, epochs=EPOCHS)
+ self.train_X = DATA["train_X"]
+ self.train_y = DATA["train_y"]
+ print("Running test cases for CRLI...")
+ self.crli = CRLI(
+ n_steps=DATA["n_steps"],
+ n_features=DATA["n_features"],
+ n_clusters=DATA["n_classes"],
+ n_generator_layers=2,
+ rnn_hidden_size=128,
+ epochs=EPOCHS,
+ )
self.crli.fit(self.train_X)
def test_parameters(self):
- assert (hasattr(self.crli, 'model')
- and self.crli.model is not None)
+ assert hasattr(self.crli, "model") and self.crli.model is not None
- assert (hasattr(self.crli, 'G_optimizer')
- and self.crli.G_optimizer is not None)
- assert (hasattr(self.crli, 'D_optimizer')
- and self.crli.D_optimizer is not None)
+ assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None
+ assert hasattr(self.crli, "D_optimizer") and self.crli.D_optimizer is not None
- assert hasattr(self.crli, 'best_loss')
- self.assertNotEqual(self.crli.best_loss, float('inf'))
+ assert hasattr(self.crli, "best_loss")
+ self.assertNotEqual(self.crli.best_loss, float("inf"))
- assert (hasattr(self.crli, 'best_model_dict')
- and self.crli.best_model_dict is not None)
+ assert (
+ hasattr(self.crli, "best_model_dict")
+ and self.crli.best_model_dict is not None
+ )
def test_cluster(self):
clustering = self.crli.cluster(self.train_X)
RI = cal_rand_index(clustering, self.train_y)
CP = cal_cluster_purity(clustering, self.train_y)
- print(f'RI: {RI}\nCP: {CP}')
+ print(f"RI: {RI}\nCP: {CP}")
class TestVaDER(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.train_y = DATA['train_y']
- print('Running test cases for VaDER...')
- self.vader = VaDER(n_steps=DATA['n_steps'], n_features=DATA['n_features'], n_clusters=DATA['n_classes'],
- rnn_hidden_size=64, d_mu_stddev=5, pretrain_epochs=20, epochs=EPOCHS)
+ self.train_X = DATA["train_X"]
+ self.train_y = DATA["train_y"]
+ print("Running test cases for VaDER...")
+ self.vader = VaDER(
+ n_steps=DATA["n_steps"],
+ n_features=DATA["n_features"],
+ n_clusters=DATA["n_classes"],
+ rnn_hidden_size=64,
+ d_mu_stddev=5,
+ pretrain_epochs=20,
+ epochs=EPOCHS,
+ )
self.vader.fit(self.train_X)
def test_parameters(self):
- assert (hasattr(self.vader, 'model')
- and self.vader.model is not None)
+ assert hasattr(self.vader, "model") and self.vader.model is not None
- assert (hasattr(self.vader, 'optimizer')
- and self.vader.optimizer is not None)
+ assert hasattr(self.vader, "optimizer") and self.vader.optimizer is not None
- assert hasattr(self.vader, 'best_loss')
- self.assertNotEqual(self.vader.best_loss, float('inf'))
+ assert hasattr(self.vader, "best_loss")
+ self.assertNotEqual(self.vader.best_loss, float("inf"))
- assert (hasattr(self.vader, 'best_model_dict')
- and self.vader.best_model_dict is not None)
+ assert (
+ hasattr(self.vader, "best_model_dict")
+ and self.vader.best_model_dict is not None
+ )
def test_cluster(self):
clustering = self.vader.cluster(self.train_X)
RI = cal_rand_index(clustering, self.train_y)
CP = cal_cluster_purity(clustering, self.train_y)
- print(f'RI: {RI}\nCP: {CP}')
+ print(f"RI: {RI}\nCP: {CP}")
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
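cal_cluster_purity above computes the standard purity score; a textbook-definition sketch, not necessarily pypots' exact implementation:

import numpy as np

def cluster_purity(y_pred, y_true):
    # each predicted cluster is credited with its most frequent true label;
    # assumes integer class labels
    hits = 0
    for c in np.unique(y_pred):
        labels = y_true[y_pred == c]
        hits += np.bincount(labels).max()
    return hits / len(y_true)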
diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py
index 1956c1e8..74a9fb60 100644
--- a/pypots/tests/test_forecasting.py
+++ b/pypots/tests/test_forecasting.py
@@ -19,19 +19,25 @@
class TestBTTF(unittest.TestCase):
def setUp(self) -> None:
DATA = gene_random_walk_data(n_steps=120, n_features=10)
- self.test_X = DATA['test_X']
- self.test_X_intact = DATA['test_X_intact']
+ self.test_X = DATA["test_X"]
+ self.test_X_intact = DATA["test_X_intact"]
self.test_X_for_input = self.test_X[:, :100]
- print('Running test cases for BTTF...')
- self.bttf = BTTF(100, 10,
- 20, 2, 10,
- np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]),
- 5, 5)
+ print("Running test cases for BTTF...")
+ self.bttf = BTTF(
+ 100,
+ 10,
+ 20,
+ 2,
+ 10,
+ np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]),
+ 5,
+ 5,
+ )
def test_forecasting(self):
predictions = self.bttf.forecast(self.test_X_for_input)
mae = cal_mae(predictions, self.test_X_intact[:, 100:])
- print(f'prediction MAE: {mae}')
+ print(f"prediction MAE: {mae}")
-    if __name__ == '__main__':
-        unittest.main()
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py
index 160d5e92..4219aa4a 100644
--- a/pypots/tests/test_imputation.py
+++ b/pypots/tests/test_imputation.py
@@ -24,119 +24,151 @@
class TestSAITS(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.val_X = DATA['val_X']
- self.test_X = DATA['test_X']
- self.test_X_intact = DATA['test_X_intact']
- self.test_X_indicating_mask = DATA['test_X_indicating_mask']
- print('Running test cases for SAITS...')
- self.saits = SAITS(DATA['n_steps'], DATA['n_features'], n_layers=2, d_model=256, d_inner=128, n_head=4,
- d_k=64, d_v=64, dropout=0.1, epochs=EPOCH)
+ self.train_X = DATA["train_X"]
+ self.val_X = DATA["val_X"]
+ self.test_X = DATA["test_X"]
+ self.test_X_intact = DATA["test_X_intact"]
+ self.test_X_indicating_mask = DATA["test_X_indicating_mask"]
+ print("Running test cases for SAITS...")
+ self.saits = SAITS(
+ DATA["n_steps"],
+ DATA["n_features"],
+ n_layers=2,
+ d_model=256,
+ d_inner=128,
+ n_head=4,
+ d_k=64,
+ d_v=64,
+ dropout=0.1,
+ epochs=EPOCH,
+ )
self.saits.fit(self.train_X, self.val_X)
def test_parameters(self):
- assert (hasattr(self.saits, 'model')
- and self.saits.model is not None)
+ assert hasattr(self.saits, "model") and self.saits.model is not None
- assert (hasattr(self.saits, 'optimizer')
- and self.saits.optimizer is not None)
+ assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None
- assert hasattr(self.saits, 'best_loss')
- self.assertNotEqual(self.saits.best_loss, float('inf'))
+ assert hasattr(self.saits, "best_loss")
+ self.assertNotEqual(self.saits.best_loss, float("inf"))
- assert (hasattr(self.saits, 'best_model_dict')
- and self.saits.best_model_dict is not None)
+ assert (
+ hasattr(self.saits, "best_model_dict")
+ and self.saits.best_model_dict is not None
+ )
def test_impute(self):
imputed_X = self.saits.impute(self.test_X)
- assert not np.isnan(imputed_X).any(), 'Output still has missing values after running impute().'
+ assert not np.isnan(
+ imputed_X
+ ).any(), "Output still has missing values after running impute()."
test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask)
- print(f'SAITS test_MAE: {test_MAE}')
+ print(f"SAITS test_MAE: {test_MAE}")
class TestTransformer(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.val_X = DATA['val_X']
- self.test_X = DATA['test_X']
- self.test_X_intact = DATA['test_X_intact']
- self.test_X_indicating_mask = DATA['test_X_indicating_mask']
- print('Running test cases for Transformer...')
- self.transformer = Transformer(DATA['n_steps'], DATA['n_features'], n_layers=2, d_model=256, d_inner=128,
- n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=EPOCH)
+ self.train_X = DATA["train_X"]
+ self.val_X = DATA["val_X"]
+ self.test_X = DATA["test_X"]
+ self.test_X_intact = DATA["test_X_intact"]
+ self.test_X_indicating_mask = DATA["test_X_indicating_mask"]
+ print("Running test cases for Transformer...")
+ self.transformer = Transformer(
+ DATA["n_steps"],
+ DATA["n_features"],
+ n_layers=2,
+ d_model=256,
+ d_inner=128,
+ n_head=4,
+ d_k=64,
+ d_v=64,
+ dropout=0.1,
+ epochs=EPOCH,
+ )
self.transformer.fit(self.train_X, self.val_X)
def test_parameters(self):
- assert (hasattr(self.transformer, 'model')
- and self.transformer.model is not None)
+        assert (
+            hasattr(self.transformer, "model") and self.transformer.model is not None
+        )
- assert (hasattr(self.transformer, 'optimizer')
- and self.transformer.optimizer is not None)
+ assert (
+ hasattr(self.transformer, "optimizer")
+ and self.transformer.optimizer is not None
+ )
- assert hasattr(self.transformer, 'best_loss')
- self.assertNotEqual(self.transformer.best_loss, float('inf'))
+ assert hasattr(self.transformer, "best_loss")
+ self.assertNotEqual(self.transformer.best_loss, float("inf"))
- assert (hasattr(self.transformer, 'best_model_dict')
- and self.transformer.best_model_dict is not None)
+ assert (
+ hasattr(self.transformer, "best_model_dict")
+ and self.transformer.best_model_dict is not None
+ )
def test_impute(self):
imputed_X = self.transformer.impute(self.test_X)
- assert not np.isnan(imputed_X).any(), 'Output still has missing values after running impute().'
+ assert not np.isnan(
+ imputed_X
+ ).any(), "Output still has missing values after running impute()."
test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask)
- print(f'Transformer test_MAE: {test_MAE}')
+ print(f"Transformer test_MAE: {test_MAE}")
class TestBRITS(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.val_X = DATA['val_X']
- self.test_X = DATA['test_X']
- self.test_X_intact = DATA['test_X_intact']
- self.test_X_indicating_mask = DATA['test_X_indicating_mask']
- print('Running test cases for BRITS...')
- self.brits = BRITS(DATA['n_steps'], DATA['n_features'], 256, epochs=EPOCH)
+ self.train_X = DATA["train_X"]
+ self.val_X = DATA["val_X"]
+ self.test_X = DATA["test_X"]
+ self.test_X_intact = DATA["test_X_intact"]
+ self.test_X_indicating_mask = DATA["test_X_indicating_mask"]
+ print("Running test cases for BRITS...")
+ self.brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH)
self.brits.fit(self.train_X, self.val_X)
def test_parameters(self):
- assert (hasattr(self.brits, 'model')
- and self.brits.model is not None)
+ assert hasattr(self.brits, "model") and self.brits.model is not None
- assert (hasattr(self.brits, 'optimizer')
- and self.brits.optimizer is not None)
+ assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None
- assert hasattr(self.brits, 'best_loss')
- self.assertNotEqual(self.brits.best_loss, float('inf'))
+ assert hasattr(self.brits, "best_loss")
+ self.assertNotEqual(self.brits.best_loss, float("inf"))
- assert (hasattr(self.brits, 'best_model_dict')
- and self.brits.best_model_dict is not None)
+ assert (
+ hasattr(self.brits, "best_model_dict")
+ and self.brits.best_model_dict is not None
+ )
def test_impute(self):
imputed_X = self.brits.impute(self.test_X)
- assert not np.isnan(imputed_X).any(), 'Output still has missing values after running impute().'
+ assert not np.isnan(
+ imputed_X
+ ).any(), "Output still has missing values after running impute()."
test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask)
- print(f'BRITS test_MAE: {test_MAE}')
+ print(f"BRITS test_MAE: {test_MAE}")
class TestLOCF(unittest.TestCase):
def setUp(self) -> None:
- self.train_X = DATA['train_X']
- self.val_X = DATA['val_X']
- self.test_X = DATA['test_X']
- self.test_X_intact = DATA['test_X_intact']
- self.test_X_indicating_mask = DATA['test_X_indicating_mask']
- print('Running test cases for LOCF...')
+ self.train_X = DATA["train_X"]
+ self.val_X = DATA["val_X"]
+ self.test_X = DATA["test_X"]
+ self.test_X_intact = DATA["test_X_intact"]
+ self.test_X_indicating_mask = DATA["test_X_indicating_mask"]
+ print("Running test cases for LOCF...")
self.locf = LOCF(nan=0)
def test_parameters(self):
- assert (hasattr(self.locf, 'nan')
- and self.locf.nan is not None)
+ assert hasattr(self.locf, "nan") and self.locf.nan is not None
def test_impute(self):
test_X_imputed = self.locf.impute(self.test_X)
- assert not np.isnan(test_X_imputed).any(), 'Output still has missing values after running impute().'
- test_MAE = cal_mae(test_X_imputed, self.test_X_intact, self.test_X_indicating_mask)
- print(f'LOCF test_MAE: {test_MAE}')
+ assert not np.isnan(
+ test_X_imputed
+ ).any(), "Output still has missing values after running impute()."
+ test_MAE = cal_mae(
+ test_X_imputed, self.test_X_intact, self.test_X_indicating_mask
+ )
+ print(f"LOCF test_MAE: {test_MAE}")
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
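Every fit(train_X, val_X) above, and the data generator in the next file, lean on the same MCAR hold-out trick; a minimal sketch (the pypots.data import path for mcar/masked_fill is an assumption mirroring this repo's usage):

import torch
from pypots.data import mcar, masked_fill  # assumed import path

val_X = torch.randn(4, 24, 10)
# hide 20% of the observed cells, keeping the originals as ground truth
val_X_intact, val_X, missing_mask, indicating_mask = mcar(val_X, 0.2)
val_X = masked_fill(val_X, 1 - missing_mask, torch.nan)
# indicating_mask flags exactly the hidden cells; MAE on them is the validation error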
diff --git a/pypots/tests/unified_data_for_test.py b/pypots/tests/unified_data_for_test.py
index c90c45bb..2bdf89fc 100644
--- a/pypots/tests/unified_data_for_test.py
+++ b/pypots/tests/unified_data_for_test.py
@@ -14,14 +14,17 @@
from pypots.data import load_specific_dataset

-def gene_random_walk_data(n_steps=24, n_features=10, n_classes=2, n_samples_each_class=1000):
- """ Generate a random-walk dataset.
- """
+def gene_random_walk_data(
+ n_steps=24, n_features=10, n_classes=2, n_samples_each_class=1000
+):
+ """Generate a random-walk dataset."""
# generate samples
- X, y = generate_random_walk_for_classification(n_classes=n_classes,
- n_samples_each_class=n_samples_each_class,
- n_steps=n_steps,
- n_features=n_features)
+ X, y = generate_random_walk_for_classification(
+ n_classes=n_classes,
+ n_samples_each_class=n_samples_each_class,
+ n_steps=n_steps,
+ n_features=n_features,
+ )
# split into train/val/test sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
@@ -46,54 +49,62 @@ def gene_random_walk_data(n_steps=24, n_features=10, n_classes=2, n_samples_each
test_X = test_X.reshape(-1, n_steps, n_features)
# mask values in the test set as ground truth
- test_X_intact, test_X, test_X_missing_mask, test_X_indicating_mask = mcar(test_X, 0.3)
+ test_X_intact, test_X, test_X_missing_mask, test_X_indicating_mask = mcar(
+ test_X, 0.3
+ )
test_X = masked_fill(test_X, 1 - test_X_missing_mask, torch.nan)
data = {
- 'n_classes': n_classes,
- 'n_steps': n_steps,
- 'n_features': n_features,
- 'train_X': train_X, 'train_y': train_y,
- 'val_X': val_X, 'val_y': val_y,
- 'test_X': test_X, 'test_y': test_y,
- 'test_X_intact': test_X_intact,
- 'test_X_indicating_mask': test_X_indicating_mask
+ "n_classes": n_classes,
+ "n_steps": n_steps,
+ "n_features": n_features,
+ "train_X": train_X,
+ "train_y": train_y,
+ "val_X": val_X,
+ "val_y": val_y,
+ "test_X": test_X,
+ "test_y": test_y,
+ "test_X_intact": test_X_intact,
+ "test_X_indicating_mask": test_X_indicating_mask,
}
return data

def gene_physionet2012():
- """ Generate PhysioNet2012.
- """
+ """Generate PhysioNet2012."""
# generate samples
- df = load_specific_dataset('physionet_2012')
- X = df['X']
- X = X.drop(df['static_features'], axis=1)
+ df = load_specific_dataset("physionet_2012")
+ X = df["X"]
+ X = X.drop(df["static_features"], axis=1)

def apply_func(df_temp):
- missing = list(set(range(0, 48)).difference(set(df_temp['Time'])))
- missing_part = pd.DataFrame({'Time': missing})
+ missing = list(set(range(0, 48)).difference(set(df_temp["Time"])))
+ missing_part = pd.DataFrame({"Time": missing})
df_temp = df_temp.append(missing_part, ignore_index=False, sort=False)
- df_temp = df_temp.set_index('Time').sort_index().reset_index()
+ df_temp = df_temp.set_index("Time").sort_index().reset_index()
df_temp = df_temp.iloc[:48]
return df_temp
- X = X.groupby('RecordID').apply(apply_func)
- X = X.drop('RecordID', axis=1)
+ X = X.groupby("RecordID").apply(apply_func)
+ X = X.drop("RecordID", axis=1)
X = X.reset_index()
- X = X.drop(['level_1', 'Time'], axis=1)
+ X = X.drop(["level_1", "Time"], axis=1)
- y = df['y']
- all_recordID = X['RecordID'].unique()
+ y = df["y"]
+ all_recordID = X["RecordID"].unique()
train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
- train_set = X[X['RecordID'].isin(train_set_ids)]
- val_set = X[X['RecordID'].isin(val_set_ids)]
- test_set = X[X['RecordID'].isin(test_set_ids)]
- train_set = train_set.drop('RecordID', axis=1)
- val_set = val_set.drop('RecordID', axis=1)
- test_set = test_set.drop('RecordID', axis=1)
- train_X, val_X, test_X = train_set.to_numpy(), val_set.to_numpy(), test_set.to_numpy()
+ train_set = X[X["RecordID"].isin(train_set_ids)]
+ val_set = X[X["RecordID"].isin(val_set_ids)]
+ test_set = X[X["RecordID"].isin(test_set_ids)]
+ train_set = train_set.drop("RecordID", axis=1)
+ val_set = val_set.drop("RecordID", axis=1)
+ test_set = test_set.drop("RecordID", axis=1)
+ train_X, val_X, test_X = (
+ train_set.to_numpy(),
+ val_set.to_numpy(),
+ test_set.to_numpy(),
+ )
# normalization
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
@@ -109,18 +120,23 @@ def apply_func(df_temp):
test_y = y[y.index.isin(test_set_ids)]
train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
- test_X_intact, test_X, test_X_missing_mask, test_X_indicating_mask = mcar(test_X, 0.1)
+ test_X_intact, test_X, test_X_missing_mask, test_X_indicating_mask = mcar(
+ test_X, 0.1
+ )
test_X = masked_fill(test_X, 1 - test_X_missing_mask, torch.nan)
data = {
- 'n_classes': 2,
- 'n_steps': 48,
- 'n_features': train_X.shape[-1],
- 'train_X': train_X, 'train_y': train_y.flatten(),
- 'val_X': val_X, 'val_y': val_y.flatten(),
- 'test_X': test_X, 'test_y': test_y.flatten(),
- 'test_X_intact': test_X_intact,
- 'test_X_indicating_mask': test_X_indicating_mask
+ "n_classes": 2,
+ "n_steps": 48,
+ "n_features": train_X.shape[-1],
+ "train_X": train_X,
+ "train_y": train_y.flatten(),
+ "val_X": val_X,
+ "val_y": val_y.flatten(),
+ "test_X": test_X,
+ "test_y": test_y.flatten(),
+ "test_X_intact": test_X_intact,
+ "test_X_indicating_mask": test_X_indicating_mask,
}
return data
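
Both generators return the DATA dict that the test file above indexes into. A sketch of the intended consumption, mirroring the BRITS test; the keys, the generator signature, and the BRITS call are taken from this patch, while EPOCH's value and the import paths are assumptions:

    from pypots.imputation import BRITS          # assumed import path
    from pypots.utils.metrics import cal_mae     # assumed import path
    from pypots.tests.unified_data_for_test import gene_random_walk_data

    EPOCH = 5  # assumed small training budget for test runs
    DATA = gene_random_walk_data(n_steps=24, n_features=10)

    brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH)
    brits.fit(DATA["train_X"], DATA["val_X"])
    imputed = brits.impute(DATA["test_X"])
    mae = cal_mae(imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"])
    print(f"BRITS test_MAE: {mae}")

The random-walk generator is the cheap path for CI-style runs; gene_physionet2012 builds the same dict layout from the real dataset, so the two are interchangeable from the tests' point of view.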
diff --git a/pypots/utils/__init__.py b/pypots/utils/__init__.py
index 193e2be2..2d6f3394 100644
--- a/pypots/utils/__init__.py
+++ b/pypots/utils/__init__.py
@@ -1,5 +1,6 @@
"""
"""
+
# Created by Wenjie Du