diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py index f50ab8a18b..e6d94ea1ca 100644 --- a/darts/models/forecasting/forecasting_model.py +++ b/darts/models/forecasting/forecasting_model.py @@ -525,7 +525,7 @@ def historical_forecasts( pd.RangeIndex( start=last_points_times[0], stop=last_points_times[-1] + 1, - step=1, + step=series.freq * stride, ), np.array(last_points_values), columns=series.columns, diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 7429c61fcb..ca1ed3d39b 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -636,22 +636,24 @@ def predict( f"but it ranges only from {cov.start_time()} until {cov.end_time()}.", ) - if cov.has_datetime_index: - covariate_matrices[cov_type].append( - cov[first_req_ts:last_req_ts].values() - ) - else: - # include last_req_ts when slicing series with integer indices - covariate_matrices[cov_type].append( - cov[first_req_ts : last_req_ts + 1].values() - ) + # Note: we use slice() rather than the [] operator because + # for integer-indexed series [] does not act on the time index. 
+ last_req_ts = ( + # For range indexes, we need to make the end timestamp inclusive here + last_req_ts + ts.freq + if ts.has_range_index + else last_req_ts + ) + covariate_matrices[cov_type].append( + cov.slice(first_req_ts, last_req_ts).values(copy=False) + ) covariate_matrices[cov_type] = np.stack(covariate_matrices[cov_type]) series_matrix = None if "target" in self.lags: series_matrix = np.stack( - [ts[self.lags["target"][0] :].values() for ts in series] + [ts[self.lags["target"][0] :].values(copy=False) for ts in series] ) # repeat series_matrix to shape (num_samples * num_series, n_lags, n_components) diff --git a/darts/tests/models/forecasting/test_covariate_index_generators.py b/darts/tests/models/forecasting/test_covariate_index_generators.py index 399d741bdb..196249e975 100644 --- a/darts/tests/models/forecasting/test_covariate_index_generators.py +++ b/darts/tests/models/forecasting/test_covariate_index_generators.py @@ -34,7 +34,7 @@ class CovariateIndexGeneratorTestCase(DartsBaseTestClass): # pd.DatetimeIndex # target covariate for inference dataset for n <= output_chunk_length cov_time_inf_short = TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=target_time.start_time(), length=n_target + n_short, freq=target_time.freq, @@ -43,7 +43,7 @@ class CovariateIndexGeneratorTestCase(DartsBaseTestClass): ) # target covariate for inference dataset for n > output_chunk_length cov_time_inf_long = TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=target_time.start_time(), length=n_target + n_long, freq=target_time.freq, @@ -54,7 +54,7 @@ class CovariateIndexGeneratorTestCase(DartsBaseTestClass): # integer index # target covariate for inference dataset for n <= output_chunk_length cov_int_inf_short = TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=target_int.start_time(), length=n_target + n_short, freq=target_int.freq, @@ -63,7 +63,7 @@ class 
CovariateIndexGeneratorTestCase(DartsBaseTestClass): ) # target covariate for inference dataset for n > output_chunk_length cov_int_inf_long = TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=target_int.start_time(), length=n_target + n_long, freq=target_int.freq, diff --git a/darts/tests/models/forecasting/test_encoders.py b/darts/tests/models/forecasting/test_encoders.py index 5a91fe497a..44d0bcabe1 100644 --- a/darts/tests/models/forecasting/test_encoders.py +++ b/darts/tests/models/forecasting/test_encoders.py @@ -56,7 +56,7 @@ class EncoderTestCase(DartsBaseTestClass): # multi-TS at prediction should be as follows inf_ts_short_future = [ TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=ts.end_time() + (1 - 12) * ts.freq, length=12 + 6, freq=ts.freq ), np.arange(12 + 6), @@ -66,7 +66,7 @@ class EncoderTestCase(DartsBaseTestClass): inf_ts_long_future = [ TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=ts.end_time() + (1 - 12) * ts.freq, length=12 + 8, freq=ts.freq ), np.arange(12 + 8), @@ -76,7 +76,7 @@ class EncoderTestCase(DartsBaseTestClass): inf_ts_short_past = [ TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=ts.end_time() + (1 - 12) * ts.freq, length=12, freq=ts.freq ), np.arange(12), @@ -86,7 +86,7 @@ class EncoderTestCase(DartsBaseTestClass): inf_ts_long_past = [ TimeSeries.from_times_and_values( - tg._generate_index( + tg.generate_index( start=ts.end_time() + (1 - 12) * ts.freq, length=12 + (8 - 6), freq=ts.freq, @@ -298,7 +298,7 @@ def test_cyclic_encoder(self): attribute = "month" month_series = TimeSeries.from_times_and_values( - times=tg._generate_index( + times=tg.generate_index( start=pd.to_datetime("2000-01-01"), length=24, freq="MS" ), values=np.arange(24), @@ -338,7 +338,7 @@ def test_datetime_attribute_encoder(self): attribute = "month" month_series = TimeSeries.from_times_and_values( - 
times=tg._generate_index( + times=tg.generate_index( start=pd.to_datetime("2000-01-01"), length=24, freq="MS" ), values=np.arange(24), diff --git a/darts/tests/models/forecasting/test_local_forecasting_models.py b/darts/tests/models/forecasting/test_local_forecasting_models.py index 6e005c2511..36cf8cca24 100644 --- a/darts/tests/models/forecasting/test_local_forecasting_models.py +++ b/darts/tests/models/forecasting/test_local_forecasting_models.py @@ -214,11 +214,11 @@ def test_exogenous_variables_support(self): # test case with numerical pd.RangeIndex target_num_idx = TimeSeries.from_times_and_values( - times=tg._generate_index(start=0, length=len(self.ts_gaussian)), + times=tg.generate_index(start=0, length=len(self.ts_gaussian)), values=self.ts_gaussian.all_values(copy=False), ) fc_num_idx = TimeSeries.from_times_and_values( - times=tg._generate_index(start=0, length=len(self.ts_gaussian_long)), + times=tg.generate_index(start=0, length=len(self.ts_gaussian_long)), values=self.ts_gaussian_long.all_values(copy=False), ) @@ -270,6 +270,29 @@ def test_dummy_series(self): with self.assertRaises(ValueError): autoarima.fit(series=ts) + def test_forecast_time_index(self): + # the forecast time index should follow that of the train series + + # integer-index, with step>1 + values = np.random.rand(20) + idx = pd.RangeIndex(start=10, stop=50, step=2) + ts = TimeSeries.from_times_and_values(idx, values) + + model = NaiveSeasonal(K=1) + model.fit(ts) + pred = model.predict(n=5) + self.assertTrue( + all(pred.time_index == pd.RangeIndex(start=50, stop=60, step=2)) + ) + + # datetime-index + ts = tg.constant_timeseries(start=pd.Timestamp("20130101"), length=20, value=1) + model = NaiveSeasonal(K=1) + model.fit(ts) + pred = model.predict(n=5) + self.assertEqual(pred.start_time(), pd.Timestamp("20130121")) + self.assertEqual(pred.end_time(), pd.Timestamp("20130125")) + def test_statsmodels_dual_models(self): # same tests, but VARIMA requires to work on a multivariate target 
series diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index b8fdc71e02..0a3bc0d778 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -992,6 +992,40 @@ def test_gradient_boosted_model_with_eval_set(self, lgb_fit_patch): assert lgb_fit_patch.call_args[1]["eval_set"] is not None assert lgb_fit_patch.call_args[1]["early_stopping_rounds"] == 2 + def test_integer_indexed_series(self): + values_target = np.random.rand(30) + values_past_cov = np.random.rand(30) + values_future_cov = np.random.rand(30) + + idx1 = pd.RangeIndex(start=0, stop=30, step=1) + idx2 = pd.RangeIndex(start=10, stop=70, step=2) + + preds = [] + + for idx in [idx1, idx2]: + target = TimeSeries.from_times_and_values(idx, values_target) + past_cov = TimeSeries.from_times_and_values(idx, values_past_cov) + future_cov = TimeSeries.from_times_and_values(idx, values_future_cov) + + train, _ = target[:20], target[20:] + + model = LinearRegressionModel( + lags=[-2, -1], lags_past_covariates=[-2, -1], lags_future_covariates=[0] + ) + model.fit( + series=train, past_covariates=past_cov, future_covariates=future_cov + ) + + preds.append(model.predict(n=10)) + + # the predicted values should not depend on the time axis + np.testing.assert_equal(preds[0].values(), preds[1].values()) + + # the time axis returned by the second model should be as expected + self.assertTrue( + all(preds[1].time_index == pd.RangeIndex(start=50, stop=70, step=2)) + ) + def test_encoders(self): max_past_lag = -4 max_future_lag = 4 diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index c6ccdcf5d4..c363e1b6d8 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -105,6 +105,36 @@ def test_integer_indexing(self): list(indexed_ts.time_index) == list(pd.RangeIndex(2, 7, step=1)) ) + # check integer indexing features when 
series index does not start at 0 + values = np.random.random(100) + times = pd.RangeIndex(10, 110) + series: TimeSeries = TimeSeries.from_times_and_values(times, values) + + # getting index for idx should return i s.t., series[i].time == idx + self.assertEqual(series.get_index_at_point(101), 91) + + # check integer indexing features when series index starts at 0 with a step > 1 + values = np.random.random(100) + times = pd.RangeIndex(0, 200, step=2) + series: TimeSeries = TimeSeries.from_times_and_values(times, values) + + # getting index for idx should return i s.t., series[i].time == idx + self.assertEqual(series.get_index_at_point(100), 50) + + # slicing should act the same irrespective of the initial time stamp + np.testing.assert_equal(series[10:20].values().flatten(), values[10:20]) + + # drop_after should act on the timestamp + np.testing.assert_equal(series.drop_after(20).values().flatten(), values[:10]) + + # test get_index_at_point on series which does not start at 0 and with a step > 1 + values = np.random.random(10) + times = pd.RangeIndex(10, 30, step=2) + series: TimeSeries = TimeSeries.from_times_and_values(times, values) + + # getting index for idx should return i s.t., series[i].time == idx + self.assertEqual(series.get_index_at_point(16), 3) + def test_univariate_component(self): series = TimeSeries.from_values(np.array([10, 20, 30])).with_columns_renamed( "0", "component" @@ -233,6 +263,36 @@ def helper_test_slice(test_case, test_series: TimeSeries): test_case.assertEqual(seriesC.start_time(), pd.Timestamp("20130108")) test_case.assertEqual(seriesC.end_time(), pd.Timestamp("20130110")) + # integer-indexed series, starting at 0 + values = np.random.rand(30) + idx = pd.RangeIndex(start=0, stop=30, step=1) + ts = TimeSeries.from_times_and_values(idx, values) + slice_vals = ts.slice(10, 20).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[10:20]) + + # integer-indexed series, not starting at 0 + values = np.random.rand(30) + 
idx = pd.RangeIndex(start=5, stop=35, step=1) + ts = TimeSeries.from_times_and_values(idx, values) + slice_vals = ts.slice(10, 20).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[5:15]) + + # integer-indexed series, starting at 0, with step > 1 + values = np.random.rand(30) + idx = pd.RangeIndex(start=0, stop=60, step=2) + ts = TimeSeries.from_times_and_values(idx, values) + slice_vals = ts.slice(10, 20).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[5:10]) + + # integer-indexed series, not starting at 0, with step > 1 + values = np.random.rand(30) + idx = pd.RangeIndex(start=5, stop=65, step=2) + ts = TimeSeries.from_times_and_values(idx, values) + slice_vals = ts.slice(11, 21).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[3:8]) + + # test cases where start and/or stop are not in the series + # n points, base case seriesD = test_series.slice_n_points_after(pd.Timestamp("20130102"), n=3) test_case.assertEqual(seriesD.start_time(), pd.Timestamp("20130102")) @@ -256,6 +316,29 @@ def helper_test_slice(test_case, test_series: TimeSeries): test_case.assertEqual(seriesG.start_time(), pd.Timestamp("20130101")) test_case.assertEqual(seriesG.end_time(), pd.Timestamp("20130107")) + # integer indexed series, step = 1, timestamps not in series + values = np.random.rand(30) + idx = pd.RangeIndex(start=0, stop=30, step=1) + ts = TimeSeries.from_times_and_values(idx, values) + # end timestamp further off, slice should be inclusive of last timestamp: + slice_vals = ts.slice(10, 30).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[10:]) + slice_vals = ts.slice(10, 32).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[10:]) + + # end timestamp within the series make it exclusive: + slice_vals = ts.slice(10, 29).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[10:29]) + + # integer indexed series, step > 1, timestamps not in series 
+ idx = pd.RangeIndex(start=0, stop=60, step=2) + ts = TimeSeries.from_times_and_values(idx, values) + slice_vals = ts.slice(11, 31).values(copy=False).flatten() + np.testing.assert_equal(slice_vals, values[5:15]) + + slice_ts = ts.slice(40, 60) + test_case.assertEqual(ts.end_time(), slice_ts.end_time()) + @staticmethod def helper_test_split(test_case, test_series: TimeSeries): seriesA, seriesB = test_series.split_after(pd.Timestamp("20130104")) @@ -1029,6 +1112,12 @@ def test_gaps(self): ).all() ) + # test gaps detection on integer-indexed series + values = np.array([1, 2, np.nan, np.nan, 3, 4, np.nan, 6]) + times = pd.RangeIndex(8) + ts = TimeSeries.from_times_and_values(times, values) + np.testing.assert_equal(ts.gaps().values, np.array([[2, 3, 2], [6, 6, 1]])) + def test_longest_contiguous_slice(self): times = pd.date_range("20130101", "20130111") pd_series1 = pd.Series( diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 0791ecc327..8478824505 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -11,7 +11,7 @@ from darts.dataprocessing.transformers import BoxCox, Scaler from darts.tests.base_test_class import DartsBaseTestClass from darts.timeseries import DEFAULT_GLOBAL_STATIC_COV_NAME, STATIC_COV_TAG -from darts.utils.timeseries_generation import _generate_index, linear_timeseries +from darts.utils.timeseries_generation import generate_index, linear_timeseries class TimeSeriesStaticCovariateTestCase(DartsBaseTestClass): @@ -25,7 +25,7 @@ def setUpClass(cls): pd.concat( [ pd.DataFrame( - _generate_index(start=pd.Timestamp(2010, 1, 1), length=len_ts) + generate_index(start=pd.Timestamp(2010, 1, 1), length=len_ts) ) ] * n_groups, diff --git a/darts/tests/utils/test_timeseries_generation.py b/darts/tests/utils/test_timeseries_generation.py index 7dd8914b1a..1a7457cc22 100644 --- a/darts/tests/utils/test_timeseries_generation.py 
+++ b/darts/tests/utils/test_timeseries_generation.py @@ -5,10 +5,10 @@ from darts.tests.base_test_class import DartsBaseTestClass from darts.utils.timeseries_generation import ( - _generate_index, autoregressive_timeseries, constant_timeseries, gaussian_timeseries, + generate_index, holidays_timeseries, linear_timeseries, random_walk_timeseries, @@ -34,7 +34,7 @@ def test_routine(start, end=None, length=None): test_routine(start=0, length=length_assert) test_routine(start=0, end=length_assert - 1) test_routine(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_routine(start=pd.Timestamp("2000-01-01"), end=end_date) @@ -66,7 +66,7 @@ def test_routine(start, end=None, length=None): test_routine(start=0, length=length_assert) test_routine(start=0, end=length_assert - 1) test_routine(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_routine(start=pd.Timestamp("2000-01-01"), end=end_date) @@ -94,7 +94,7 @@ def test_routine(start, end=None, length=None): test_routine(start=0, length=length_assert) test_routine(start=0, end=length_assert - 1) test_routine(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_routine(start=pd.Timestamp("2000-01-01"), end=end_date) @@ -110,7 +110,7 @@ def test_routine(start, end=None, length=None): test_routine(start=0, length=length_assert) test_routine(start=0, end=length_assert - 1) test_routine(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_routine(start=pd.Timestamp("2000-01-01"), end=end_date) @@ -126,7 +126,7 @@ def 
test_routine(start, end=None, length=None): test_routine(start=0, length=length_assert) test_routine(start=0, end=length_assert - 1) test_routine(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_routine(start=pd.Timestamp("2000-01-01"), end=end_date) @@ -179,55 +179,113 @@ def test_routine( holidays_timeseries(time_index_3, "US", until=163) def test_generate_index(self): - def test_routine(start, end=None, length=None, freq="D"): - # testing length, correct start and if sorted (monotonic increasing) - index = _generate_index(start=start, end=end, length=length, freq=freq) - self.assertEqual(len(index), length_assert) - self.assertTrue(index.is_monotonic_increasing) - self.assertTrue(index[0] == start_assert) - self.assertTrue(index[-1] == end_assert) + def test_routine( + expected_length, + expected_start, + expected_end, + start, + end=None, + length=None, + freq=None, + ): + index = generate_index(start=start, end=end, length=length, freq=freq) + self.assertEqual(len(index), expected_length) + self.assertEqual(index[0], expected_start) + self.assertEqual(index[-1], expected_end) + + for length in [1, 2, 5, 50]: + for start in [0, 1, 9]: + + # test pd.RangeIndex with varying step sizes + for step in [1, 2, 4]: + expected_start = start + expected_end = start + (length - 1) * step + freq = None if step == 1 else step + test_routine( + expected_length=length, + expected_start=expected_start, + expected_end=expected_end, + start=start, + length=length, + freq=freq, + ) - for length_assert in [1, 2, 5, 10, 100]: - for start_pos in [0, 1]: - # pandas.RangeIndex - start_assert, end_assert = start_pos, start_pos + length_assert - 1 - test_routine(start=start_assert, length=length_assert, freq="") - test_routine(start=start_assert, length=length_assert, freq="D") - test_routine(start=start_assert, end=end_assert) - 
test_routine(start=start_assert, end=end_assert, freq="D") - test_routine( - start=None, end=end_assert, length=length_assert, freq="BH" - ) - # pandas.DatetimeIndex - start_date = pd.DatetimeIndex(["2000-01-01"], freq="D") - start_date += start_date.freq * start_pos - # dates = _generate_index(start=start_date[0], length=length_assert) - dates = _generate_index(start=start_date[0], length=length_assert) - start_assert, end_assert = dates[0], dates[-1] - test_routine(start=start_assert, length=length_assert) - test_routine(start=start_assert, end=end_assert) - test_routine(start=None, end=end_assert, length=length_assert, freq="D") + test_routine( + expected_length=length, + expected_start=expected_start, + expected_end=expected_end, + start=start, + end=expected_end, + freq=step, + ) + + test_routine( + expected_length=length, + expected_start=expected_start, + expected_end=expected_end, + start=None, + end=expected_end, + length=length, + freq=step, + ) + + if start == 0: + continue + + # test pd.DatetimeIndex with a start date within 01 and 09 + start_date = pd.Timestamp(f"2000-01-0{start}") + dates = generate_index( + start=start_date, + length=length, + freq="D" if step == 1 else f"{step}D", + ) + start_assert, end_assert = dates[0], dates[-1] + test_routine( + expected_length=length, + expected_start=start_assert, + expected_end=end_assert, + start=start_assert, + length=length, + freq="D" if step == 1 else f"{step}D", + ) + test_routine( + expected_length=length, + expected_start=start_assert, + expected_end=end_assert, + start=start_assert, + end=end_assert, + freq="D" if step == 1 else f"{step}D", + ) + test_routine( + expected_length=length, + expected_start=start_assert, + expected_end=end_assert, + start=None, + end=end_assert, + length=length, + freq="D" if step == 1 else f"{step}D", + ) # `start`, `end` and `length` cannot both be set simultaneously with self.assertRaises(ValueError): - _generate_index(start=0, end=9, length=10) + 
generate_index(start=0, end=9, length=10) # same as above but `start` defaults to timestamp '2000-01-01' in all timeseries generation functions with self.assertRaises(ValueError): linear_timeseries(end=9, length=10) # exactly two of [`start`, `end`, `length`] must be set with self.assertRaises(ValueError): - test_routine(start=0) + generate_index(start=0) with self.assertRaises(ValueError): - test_routine(start=None, end=1) + generate_index(start=None, end=1) with self.assertRaises(ValueError): - test_routine(start=None, end=None, length=10) + generate_index(start=None, end=None, length=10) # `start` and `end` must have same type with self.assertRaises(ValueError): - test_routine(start=0, end=pd.Timestamp("2000-01-01")) + generate_index(start=0, end=pd.Timestamp("2000-01-01")) with self.assertRaises(ValueError): - test_routine(start=pd.Timestamp("2000-01-01"), end=10) + generate_index(start=pd.Timestamp("2000-01-01"), end=10) def test_autoregressive_timeseries(self): # testing for correct length @@ -254,7 +312,7 @@ def test_calculation(coef): test_length(start=0, length=length_assert) test_length(start=0, end=length_assert - 1) test_length(start=pd.Timestamp("2000-01-01"), length=length_assert) - end_date = _generate_index( + end_date = generate_index( start=pd.Timestamp("2000-01-01"), length=length_assert )[-1] test_length(start=pd.Timestamp("2000-01-01"), end=end_date) diff --git a/darts/timeseries.py b/darts/timeseries.py index 381d830e2b..d73c3a1b52 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -139,8 +139,6 @@ def __init__(self, xa: xr.DataArray): # As of xarray 0.18.2, this sorting discards the freq of the index for some reason # https://github.com/pydata/xarray/issues/5466 # We sort only if the time axis is not already sorted (monotically increasing). 
- - # TODO also avoid sorting if index is RangeIndex (already sorted by definition) self._xa = ( xa.copy() if xa.get_index(self._time_dim).is_monotonic_increasing @@ -205,7 +203,7 @@ def __init__(self, xa: xr.DataArray): logger, ) else: - self._freq = 1 + self._freq = self._time_index.step self._freq_str = None # check static covariates @@ -438,7 +436,8 @@ def from_csv( filepath_or_buffer The path to the CSV file, or the file object; consistent with the argument of `pandas.read_csv` function time_col - The time column name. If set, the column will be cast to a pandas DatetimeIndex. + The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains + timestamps) or a RangeIndex with step size of 1 (if it contains integers). If not set, the pandas RangeIndex will be used. value_cols A string or list of strings representing the value column(s) to be extracted from the CSV file. If set to @@ -528,7 +527,8 @@ def from_dataframe( df The DataFrame time_col - The time column name. If set, the column will be cast to a pandas DatetimeIndex. + The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains + timestamps) or a RangeIndex with step size of 1 (if it contains integers). If not set, the DataFrame index will be used. In this case the DataFrame must contain an index that is either a pandas DatetimeIndex or a pandas RangeIndex. If a DatetimeIndex is used, it is better if it has no holes; alternatively setting `fill_missing_dates` can in some casees solve @@ -701,7 +701,8 @@ def from_group_dataframe( A string or list of strings representing the columns from the DataFrame by which to extract the individual TimeSeries groups. time_col - The time column name. If set, the column will be cast to a pandas DatetimeIndex. + The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains + timestamps) or a RangeIndex with step size of 1 (if it contains integers). 
If not set, the DataFrame index will be used. In this case the DataFrame must contain an index that is either a pandas DatetimeIndex or a pandas RangeIndex. If a DatetimeIndex is used, it is better if it has no holes; alternatively setting `fill_missing_dates` can in some casees solve @@ -1890,7 +1891,7 @@ def intvl(start, end): if self._has_datetime_index: return pd.date_range(start=start, end=end, freq=self._freq).size else: - return start - end + return int((end - start) / self._freq) + 1 gap_df["gap_size"] = gap_df.apply( lambda row: intvl(start=row.gap_start, end=row.gap_end), axis=1 @@ -1915,7 +1916,7 @@ def get_index_at_point( self, point: Union[pd.Timestamp, float, int], after=True ) -> int: """ - Converts a point along the time axis into an integer index. + Converts a point along the time axis index into an integer index ranging in (0, len(series)-1). Parameters ---------- @@ -1931,8 +1932,9 @@ def get_index_at_point( In case of a ``float``, the parameter will be treated as the proportion of the time series that should lie before the point. - In the case of ``int``, the parameter will returned as such, provided that it is in the series. Otherwise - it will raise a ValueError. + If an ``int`` and series is datetime-indexed, the value of `point` is returned. + If an ``int`` and series is integer-indexed, the index position of `point` in the RangeIndex is returned + (accounting for steps). after If the provided pandas Timestamp is not in the time series index, whether to return the index of the next timestamp or the index of the previous one. 
@@ -1947,12 +1949,20 @@ def get_index_at_point( ) point_index = int((len(self) - 1) * point) elif isinstance(point, (int, np.int64)): - raise_if( - point not in range(len(self)), + if self.has_datetime_index or (self.start_time() == 0 and self.freq == 1): + point_index = point + else: + point_index_float = (point - self.start_time()) / self.freq + point_index = int(point_index_float) + raise_if( + point_index != point_index_float, + "The provided point is not a valid index for this series.", + ) + raise_if_not( + 0 <= point_index < len(self), "point (int) should be a valid index in series", logger, ) - point_index = point elif isinstance(point, pd.Timestamp): raise_if_not( self._has_datetime_index, @@ -2091,8 +2101,11 @@ def slice( self, start_ts: Union[pd.Timestamp, int], end_ts: Union[pd.Timestamp, int] ): """ - Return a new TimeSeries, starting later than `start_ts` and ending before `end_ts`, inclusive on both ends. - The timestamps don't have to be in the series. + Return a new TimeSeries, starting later than `start_ts` and ending before `end_ts`. + For series having DatetimeIndex, this is inclusive on both ends. For series having a RangeIndex, + `end_ts` is exclusive. + + `start_ts` and `end_ts` don't have to be in the series. 
Parameters ---------- @@ -2118,9 +2131,15 @@ def slice( "indexed using an integer-based RangeIndex.", logger, ) - idx = pd.DatetimeIndex( - filter(lambda t: start_ts <= t <= end_ts, self._time_index) - ) + if start_ts in self._time_index and end_ts in self._time_index: + return self[ + start_ts:end_ts + ] # we assume this is faster than the filtering below + else: + idx = pd.DatetimeIndex( + filter(lambda t: start_ts <= t <= end_ts, self._time_index) + ) + return self[idx] else: raise_if( self._has_datetime_index, @@ -2128,8 +2147,23 @@ def slice( "the series is indexed with a DatetimeIndex.", logger, ) - idx = pd.RangeIndex(start_ts, end_ts, step=1) - return self[idx] + # get closest timestamps if either start or end are not in the index + effective_start_ts = ( + min(self._time_index, key=lambda t: abs(t - start_ts)) + if start_ts not in self._time_index + else start_ts + ) + effective_end_ts = ( + min(self._time_index, key=lambda t: abs(t - end_ts)) + if end_ts not in self._time_index + else end_ts + ) + if end_ts >= effective_end_ts + self.freq: + # if the requested end_ts is further off from the end of the time series, + # we have to increase effective_end_ts to make the last timestamp inclusive. + effective_end_ts += self.freq + idx = pd.RangeIndex(effective_start_ts, effective_end_ts, step=self.freq) + return self[idx] def slice_n_points_after( self, start_ts: Union[pd.Timestamp, int], n: int @@ -2509,7 +2543,9 @@ def append_values(self, values: np.ndarray) -> "TimeSeries": freq=self._freq, ) else: - idx = pd.RangeIndex(len(self), len(self) + len(values), 1) + idx = pd.RangeIndex( + self.end_time() + self.freq, self.end_time() + self.freq * (len(values) + 1), step=self.freq + ) return self.append( self.__class__.from_times_and_values( @@ -3992,6 +4028,15 @@ def __getitem__( .. warning:: slices use pandas convention of including both ends of the slice. + + Notes + ----- + For integer-indexed series, integers or slices of integer will return the result + of ``isel()``. 
That is, if integer ``i`` is provided, it returns the ``i``-th value + along the series, which is not necessarily the value where the time index is equal to ``i`` + (e.g., if the time index does not start at 0). In contrast, calling this method with a + ``pd.RangeIndex`` returns the result of ``sel()`` - i.e., the values where the time + index matches the provided range index. """ def _check_dt(): @@ -4096,7 +4141,11 @@ def _set_freq_in_xa(xa_: xr.DataArray): time_idx = xa_.get_index(self._time_dim) if time_idx.is_integer() and not isinstance(time_idx, pd.RangeIndex): xa_ = xa_.assign_coords( - {self._time_dim: pd.RangeIndex(start=key, stop=key + 1)} + { + self._time_dim: pd.RangeIndex( + start=time_idx[0], stop=time_idx[0] + self.freq, step=self.freq + ) + } ) _set_freq_in_xa(xa_) # indexing may discard the freq so we restore it... @@ -4326,9 +4375,9 @@ def concatenate( "of the first series.", ) - from darts.utils.timeseries_generation import _generate_index + from darts.utils.timeseries_generation import generate_index - tindex = _generate_index( + tindex = generate_index( start=series[0].start_time(), - freq=series[0].freq_str, + freq=series[0].freq, length=da_concat.shape[0], diff --git a/darts/utils/data/encoder_base.py b/darts/utils/data/encoder_base.py index 386ca68784..ccfc93cc94 100644 --- a/darts/utils/data/encoder_base.py +++ b/darts/utils/data/encoder_base.py @@ -13,7 +13,7 @@ from darts import TimeSeries from darts.dataprocessing.transformers import FittableDataTransformer from darts.logging import get_logger -from darts.utils.timeseries_generation import _generate_index +from darts.utils.timeseries_generation import generate_index SupportedIndex = Union[pd.DatetimeIndex, pd.RangeIndex] EncoderOutputType = Optional[Union[Sequence[TimeSeries], List[TimeSeries]]] @@ -118,7 +118,7 @@ def generate_inference_series( if covariate is not None: return covariate.time_index else: - return _generate_index( + return generate_index( start=target.end_time() - target.freq * (self.input_chunk_length - 1), 
length=self.input_chunk_length + max(0, n - self.output_chunk_length), freq=target.freq, @@ -162,7 +162,7 @@ def generate_inference_series( if covariate is not None: return covariate.time_index else: - return _generate_index( + return generate_index( start=target.end_time() - target.freq * (self.input_chunk_length - 1), length=self.input_chunk_length + max(n, self.output_chunk_length), freq=target.freq, diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py index a9d8bacc4b..3a7f3c40d7 100644 --- a/darts/utils/timeseries_generation.py +++ b/darts/utils/timeseries_generation.py @@ -16,11 +16,11 @@ logger = get_logger(__name__) -def _generate_index( +def generate_index( start: Optional[Union[pd.Timestamp, int]] = None, end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, name: str = None, ) -> Union[pd.DatetimeIndex, pd.RangeIndex]: """Returns an index with a given start point and length. Either a pandas DatetimeIndex with given frequency @@ -38,10 +38,12 @@ def _generate_index( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned index. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. - The freq is optional for generating an integer index. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). 
""" constructors = [ arg_name @@ -63,13 +65,18 @@ def _generate_index( if isinstance(start, pd.Timestamp) or isinstance(end, pd.Timestamp): index = pd.date_range( - start=start, end=end, periods=length, freq=freq, name=name + start=start, + end=end, + periods=length, + freq="D" if freq is None else freq, + name=name, ) else: # int + step = 1 if freq is None else freq index = pd.RangeIndex( - start=start if start is not None else end - length + 1, - stop=end + 1 if end is not None else start + length, - step=1, + start=start if start is not None else end - step * length + step, + stop=end + step if end is not None else start + step * length, + step=step, name=name, ) return index @@ -80,7 +87,7 @@ def constant_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "constant", dtype: np.dtype = np.float64, ) -> TimeSeries: @@ -101,9 +108,12 @@ def constant_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). column_name Optionally, the name of the value column for the returned TimeSeries dtype @@ -115,7 +125,7 @@ def constant_timeseries( A constant TimeSeries with value 'value'. 
""" - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.full(len(index), value, dtype=dtype) return TimeSeries.from_times_and_values( @@ -129,7 +139,7 @@ def linear_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "linear", dtype: np.dtype = np.float64, ) -> TimeSeries: @@ -155,9 +165,12 @@ def linear_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). column_name Optionally, the name of the value column for the returned TimeSeries dtype @@ -169,7 +182,7 @@ def linear_timeseries( A linear TimeSeries created as indicated above. 
""" - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.linspace(start_value, end_value, len(index), dtype=dtype) return TimeSeries.from_times_and_values( index, values, freq=freq, columns=pd.Index([column_name]) @@ -184,7 +197,7 @@ def sine_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "sine", dtype: np.dtype = np.float64, ) -> TimeSeries: @@ -212,9 +225,12 @@ def sine_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). column_name Optionally, the name of the value column for the returned TimeSeries dtype @@ -226,7 +242,7 @@ def sine_timeseries( A sinusoidal TimeSeries parametrized as indicated above. 
""" - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.array(range(len(index)), dtype=dtype) f = np.vectorize( lambda x: value_amplitude @@ -246,7 +262,7 @@ def gaussian_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "gaussian", dtype: np.dtype = np.float64, ) -> TimeSeries: @@ -276,9 +292,12 @@ def gaussian_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). 
column_name Optionally, the name of the value column for the returned TimeSeries dtype @@ -305,7 +324,7 @@ def gaussian_timeseries( logger, ) - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.random.normal(mean, std, size=len(index)).astype(dtype) return TimeSeries.from_times_and_values( @@ -319,7 +338,7 @@ def random_walk_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "random_walk", dtype: np.dtype = np.float64, ) -> TimeSeries: @@ -343,9 +362,12 @@ def random_walk_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). column_name Optionally, the name of the value column for the returned TimeSeries dtype @@ -357,7 +379,7 @@ def random_walk_timeseries( A random walk TimeSeries created as indicated above. 
""" - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.cumsum(np.random.normal(mean, std, size=len(index)), dtype=dtype) return TimeSeries.from_times_and_values( @@ -371,7 +393,7 @@ def autoregressive_timeseries( start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"), end: Optional[Union[pd.Timestamp, int]] = None, length: Optional[int] = None, - freq: str = "D", + freq: str = None, column_name: Optional[str] = "autoregressive", ) -> TimeSeries: """ @@ -396,9 +418,12 @@ def autoregressive_timeseries( length Optionally, the length of the returned index. Works only with either `start` or `end`. freq - The time difference between two adjacent entries in the returned TimeSeries. Only effective if `start` is a - pandas Timestamp. A DateOffset alias is expected; see + The time difference between two adjacent entries in the returned index. In case `start` is a timestamp, + a DateOffset alias is expected; see `docs `_. + By default, "D" (daily) is used. + If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex. + The freq is optional for generating an integer index (if not specified, 1 is used). 
column_name Optionally, the name of the value column for the returned TimeSeries @@ -417,7 +442,7 @@ def autoregressive_timeseries( "start_values must have same length as coef.", ) - index = _generate_index(start=start, end=end, freq=freq, length=length) + index = generate_index(start=start, end=end, freq=freq, length=length) values = np.empty(len(coef) + len(index)) values[: len(coef)] = start_values @@ -725,7 +750,7 @@ def _generate_new_dates( Generates `n` new dates after the end of the specified series """ last = input_series.end_time() - start = last + input_series.freq if input_series.has_datetime_index else last + 1 - return _generate_index( + start = last + input_series.freq + return generate_index( start=start, freq=input_series.freq, length=n, name=input_series.time_dim )