Nixtla · jasminerienecker · Jul 9, 2024 · Jul 9, 2024 · Jul 14, 2024
diff --git a/nbs/common.base_windows.ipynb b/nbs/common.base_windows.ipynb
@@ -103,6 +103,7 @@
     "                 windows_batch_size,\n",
     "                 inference_windows_batch_size,\n",
     "                 start_padding_enabled,\n",
+    "                 data_availability_threshold=0.0,\n",
     "                 step_size=1,\n",
     "                 num_lr_decays=0,\n",
     "                 early_stop_patience_steps=-1,\n",
@@ -146,6 +147,7 @@
     "            self.padder_train = nn.ConstantPad1d(padding=(self.input_size-1, self.h), value=0)\n",
     "        else:\n",
     "            self.padder_train = nn.ConstantPad1d(padding=(0, self.h), value=0)\n",
+    "        self.data_availability_threshold = data_availability_threshold\n",
     "\n",
     "        # Batch sizes\n",
     "        self.batch_size = batch_size\n",
@@ -221,11 +223,11 @@
     "            available_idx = temporal_cols.get_loc('available_mask')\n",
     "            available_condition = windows[:, :self.input_size, available_idx]\n",
     "            available_condition = torch.sum(available_condition, axis=1)\n",
-    "            final_condition = (available_condition > 0)\n",
+    "            final_condition = (available_condition > self.data_availability_threshold * self.input_size)\n",
     "            if self.h > 0:\n",
     "                sample_condition = windows[:, self.input_size:, available_idx]\n",
     "                sample_condition = torch.sum(sample_condition, axis=1)\n",
-    "                final_condition = (sample_condition > 0) & (available_condition > 0)\n",
+    "                final_condition = (sample_condition > self.data_availability_threshold * self.h) & (available_condition > self.data_availability_threshold * self.input_size)\n",
     "            windows = windows[final_condition]\n",
     "\n",
     "            # Parse Static data to match windows\n",
@@ -880,7 +882,39 @@
    "id": "bf493ff9",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "#| hide\n",
+    "# Test that data_availability_threshold filters windows with low data availability\n",
+    "\n",
+    "# Mark every odd-indexed data point as unavailable, so no window can have\n",
+    "# more than half of its points available.\n",
+    "AirPassengersDF['available_mask'] = [1 if i % 2 == 0 else 0 for i in range(len(AirPassengersDF))]\n",
+    "dataset, indices, dates, ds = TimeSeriesDataset.from_df(df=AirPassengersDF)\n",
+    "data = TimeSeriesDataModule(dataset=dataset, batch_size=1, drop_last=True)\n",
+    "\n",
+    "train_loader =  data.train_dataloader()\n",
+    "batch = next(iter(train_loader))\n",
+    "\n",
+    "# A 0.8 threshold is above the ~50% availability of the data, so every window must be rejected.\n",
+    "basewindows = BaseWindows(h=12,\n",
+    "                            input_size=24,\n",
+    "                            hist_exog_list=['x', 'x2'],\n",
+    "                            futr_exog_list=['x'],\n",
+    "                            data_availability_threshold=0.8,\n",
+    "                            loss=MAE(),\n",
+    "                            valid_loss=MAE(),\n",
+    "                            learning_rate=0.001,\n",
+    "                            max_steps=1,\n",
+    "                            val_check_steps=0,\n",
+    "                            batch_size=1,\n",
+    "                            valid_batch_size=1,\n",
+    "                            windows_batch_size=10,\n",
+    "                            inference_windows_batch_size=2, \n",
+    "                            start_padding_enabled=False)\n",
+    "\n",
+    "try:\n",
+    "    basewindows._create_windows(batch, step='train')\n",
+    "except Exception as e:\n",
+    "    assert str(e) == \"No windows available for training\"\n",
+    "else:\n",
+    "    # Without this branch the test would pass silently when nothing is filtered.\n",
+    "    raise AssertionError(\"_create_windows should have raised 'No windows available for training'\")"
+   ]
   }
  ],
  "metadata": {

diff --git a/nbs/models.autoformer.ipynb b/nbs/models.autoformer.ipynb
@@ -483,6 +483,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
     "    `num_workers_loader`: int=os.cpu_count(), workers to be used by `TimeSeriesDataLoader`.<br>\n",
@@ -532,6 +533,7 @@
     "                 windows_batch_size = 1024,\n",
     "                 inference_windows_batch_size = 1024,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'identity',\n",
     "                 random_seed: int = 1,\n",
@@ -560,6 +562,7 @@
     "                                       valid_batch_size=valid_batch_size,\n",
     "                                       inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                       start_padding_enabled = start_padding_enabled,\n",
+    "                                       data_availability_threshold = data_availability_threshold,\n",
     "                                       step_size=step_size,\n",
     "                                       scaler_type=scaler_type,\n",
     "                                       num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.bitcn.ipynb b/nbs/models.bitcn.ipynb
@@ -166,6 +166,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `step_size`: int=1, step size between each window of temporal data.<br>\n",
     "    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
@@ -206,6 +207,7 @@
     "                 windows_batch_size = 1024,\n",
     "                 inference_windows_batch_size = 1024,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'identity',\n",
     "                 random_seed: int = 1,\n",
@@ -234,6 +236,7 @@
     "            valid_batch_size=valid_batch_size,\n",
     "            windows_batch_size=windows_batch_size,\n",
     "            inference_windows_batch_size=inference_windows_batch_size,\n",
+    "            data_availability_threshold=data_availability_threshold,\n",
     "            start_padding_enabled=start_padding_enabled,\n",
     "            step_size=step_size,\n",
     "            scaler_type=scaler_type,\n",

diff --git a/nbs/models.deepar.ipynb b/nbs/models.deepar.ipynb
@@ -177,6 +177,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `step_size`: int=1, step size between each window of temporal data.<br>\n",
     "    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int, random_seed for pytorch initializer and numpy generators.<br>\n",

diff --git a/nbs/models.deepnpts.ipynb b/nbs/models.deepnpts.ipynb
@@ -122,6 +122,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `step_size`: int=1, step size between each window of temporal data.<br>\n",
     "    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int, random_seed for pytorch initializer and numpy generators.<br>\n",
@@ -167,6 +168,7 @@
     "                 windows_batch_size: int = 1024,\n",
     "                 inference_windows_batch_size: int = 1024,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold: float = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'standard',\n",
     "                 random_seed: int = 1,\n",
@@ -206,6 +208,7 @@
     "                                    valid_batch_size=valid_batch_size,\n",
     "                                    inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                    start_padding_enabled=start_padding_enabled,\n",
+    "                                    data_availability_threshold=data_availability_threshold,\n",
     "                                    step_size=step_size,\n",
     "                                    scaler_type=scaler_type,\n",
     "                                    num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.dlinear.ipynb b/nbs/models.dlinear.ipynb
@@ -157,6 +157,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
     "    `num_workers_loader`: int=os.cpu_count(), workers to be used by `TimeSeriesDataLoader`.<br>\n",
@@ -197,6 +198,7 @@
     "                 windows_batch_size = 1024,\n",
     "                 inference_windows_batch_size = 1024,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'identity',\n",
     "                 random_seed: int = 1,\n",
@@ -225,6 +227,7 @@
     "                                       valid_batch_size=valid_batch_size,\n",
     "                                       inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                       start_padding_enabled = start_padding_enabled,\n",
+    "                                       data_availability_threshold = data_availability_threshold,\n",
     "                                       step_size=step_size,\n",
     "                                       scaler_type=scaler_type,\n",
     "                                       num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.fedformer.ipynb b/nbs/models.fedformer.ipynb
@@ -472,6 +472,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
     "    `num_workers_loader`: int=os.cpu_count(), workers to be used by `TimeSeriesDataLoader`.<br>\n",
@@ -515,6 +516,7 @@
     "                 num_lr_decays: int = -1,\n",
     "                 early_stop_patience_steps: int =-1,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 val_check_steps: int = 100,\n",
     "                 batch_size: int = 32,\n",
     "                 valid_batch_size: Optional[int] = None,\n",
@@ -547,6 +549,7 @@
     "                                       valid_batch_size=valid_batch_size,\n",
     "                                       inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                       start_padding_enabled=start_padding_enabled,\n",
+    "                                       data_availability_threshold=data_availability_threshold,\n",
     "                                       step_size=step_size,\n",
     "                                       scaler_type=scaler_type,\n",
     "                                       num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.informer.ipynb b/nbs/models.informer.ipynb
@@ -292,6 +292,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=1024, number of windows to sample in each inference batch.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
     "    `num_workers_loader`: int=os.cpu_count(), workers to be used by `TimeSeriesDataLoader`.<br>\n",
@@ -341,6 +342,7 @@
     "                 windows_batch_size = 1024,\n",
     "                 inference_windows_batch_size = 1024,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'identity',\n",
     "                 random_seed: int = 1,\n",
@@ -369,6 +371,7 @@
     "                                       windows_batch_size=windows_batch_size,\n",
     "                                       inference_windows_batch_size = inference_windows_batch_size,\n",
     "                                       start_padding_enabled=start_padding_enabled,\n",
+    "                                       data_availability_threshold=data_availability_threshold,\n",
     "                                       step_size=step_size,\n",
     "                                       scaler_type=scaler_type,\n",
     "                                       num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.mlp.ipynb b/nbs/models.mlp.ipynb
@@ -108,6 +108,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `step_size`: int=1, step size between each window of temporal data.<br>\n",
     "    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>\n",
@@ -147,6 +148,7 @@
     "                 windows_batch_size = 1024,\n",
     "                 inference_windows_batch_size = -1,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str = 'identity',\n",
     "                 random_seed: int = 1,\n",
@@ -177,6 +179,7 @@
     "                                  windows_batch_size=windows_batch_size,\n",
     "                                  inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                  start_padding_enabled=start_padding_enabled,\n",
+    "                                  data_availability_threshold=data_availability_threshold,\n",
     "                                  step_size=step_size,\n",
     "                                  scaler_type=scaler_type,\n",
     "                                  num_workers_loader=num_workers_loader,\n",

diff --git a/nbs/models.nbeats.ipynb b/nbs/models.nbeats.ipynb
@@ -264,6 +264,7 @@
     "    `windows_batch_size`: int=1024, number of windows to sample in each training batch, default uses all.<br>\n",
     "    `inference_windows_batch_size`: int=-1, number of windows to sample in each inference batch, -1 uses all.<br>\n",
     "    `start_padding_enabled`: bool=False, if True, the model will pad the time series with zeros at the beginning, by input size.<br>\n",
+    "    `data_availability_threshold`: float=0.0, fraction between 0 and 1; a window is dropped unless more than this fraction of its data points is available, checked separately on the input and forecast-horizon segments.<br>\n",
     "    `step_size`: int=1, step size between each window of temporal data.<br>\n",
     "    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>\n",
     "    `random_seed`: int, random_seed for pytorch initializer and numpy generators.<br>\n",
@@ -309,6 +310,7 @@
     "                 windows_batch_size: int = 1024,\n",
     "                 inference_windows_batch_size: int = -1,\n",
     "                 start_padding_enabled = False,\n",
+    "                 data_availability_threshold = 0.0,\n",
     "                 step_size: int = 1,\n",
     "                 scaler_type: str ='identity',\n",
     "                 random_seed: int = 1,\n",
@@ -341,6 +343,7 @@
     "                                     valid_batch_size=valid_batch_size,\n",
     "                                     inference_windows_batch_size=inference_windows_batch_size,\n",
     "                                     start_padding_enabled=start_padding_enabled,\n",
+    "                                     data_availability_threshold=data_availability_threshold,\n",
     "                                     step_size=step_size,\n",
     "                                     scaler_type=scaler_type,\n",
     "                                     num_workers_loader=num_workers_loader,\n",