From 5231fa0a02a1cdd57102bc18c46a4f73abe8d309 Mon Sep 17 00:00:00 2001
From: Lorenzo Stella
Date: Mon, 20 Feb 2023 14:51:55 +0100
Subject: [PATCH] Backports v0.11.11 (#2675)

* Faster index building in PandasDataset (#2663)

* Speed up `PandasDataset.from_long_dataframe` (#2665)

* Fix `DateSplitter` when split date is before start (#2670)

* Remove creation of ragged sequences in MultivariateGrouper (#2671)

Co-authored-by: Abdul Fatir Ansari
Co-authored-by: Lorenzo Stella

---------

Co-authored-by: Huibin Shen
Co-authored-by: Gerald Woo
Co-authored-by: Abdul Fatir
Co-authored-by: Abdul Fatir Ansari
---
 src/gluonts/dataset/multivariate_grouper.py | 33 +++++++++++----------
 src/gluonts/dataset/pandas.py               | 13 ++++++--
 src/gluonts/dataset/split.py                |  2 ++
 test/dataset/test_split.py                  | 15 ++++++++++
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/src/gluonts/dataset/multivariate_grouper.py b/src/gluonts/dataset/multivariate_grouper.py
index ceded1fb4b..ddc4909f62 100644
--- a/src/gluonts/dataset/multivariate_grouper.py
+++ b/src/gluonts/dataset/multivariate_grouper.py
@@ -17,6 +17,7 @@
 import numpy as np
 import pandas as pd
 
+from gluonts.itertools import batcher
 from gluonts.core.component import validated
 from gluonts.dataset.common import DataEntry, Dataset, ListDataset
 from gluonts.dataset.field_names import FieldName
@@ -128,10 +129,14 @@ def _group_all(self, dataset: Dataset) -> Dataset:
     def _prepare_train_data(self, dataset: Dataset) -> Dataset:
         logging.info("group training time series to datasets")
 
+        # Creates a single multivariate time series from the
+        # univariate series in the dataset
         grouped_data = self._transform_target(self._align_data_entry, dataset)
-        for data in dataset:
-            fields = data.keys()
-            break
+        grouped_data[FieldName.TARGET] = np.vstack(
+            grouped_data[FieldName.TARGET]
+        )
+
+        fields = next(iter(dataset), {}).keys()
         if FieldName.FEAT_DYNAMIC_REAL in fields:
             grouped_data[FieldName.FEAT_DYNAMIC_REAL] = np.vstack(
                 [data[FieldName.FEAT_DYNAMIC_REAL] for data in dataset],
@@ -150,21 +155,19 @@ def _prepare_test_data(self, dataset: Dataset) -> Dataset:
         logging.info("group test time series to datasets")
 
         grouped_data = self._transform_target(self._left_pad_data, dataset)
-        # splits test dataset with rolling date into N R^d time series where
-        # N is the number of rolling evaluation dates
-        split_dataset = np.split(
-            grouped_data[FieldName.TARGET], self.num_test_dates
-        )
+        # Splits test dataset with rolling date into N R^d time series,
+        # where N is the number of rolling evaluation dates
+        assert len(grouped_data[FieldName.TARGET]) % self.num_test_dates == 0
+        split_size = len(grouped_data[FieldName.TARGET]) // self.num_test_dates
+        split_dataset = batcher(grouped_data[FieldName.TARGET], split_size)
+
+        fields = next(iter(dataset), {}).keys()
 
         all_entries = list()
         for dataset_at_test_date in split_dataset:
             grouped_data = dict()
-            grouped_data[FieldName.TARGET] = np.array(
-                list(dataset_at_test_date), dtype=np.float32
-            )
-            for data in dataset:
-                fields = data.keys()
-                break
+            grouped_data[FieldName.TARGET] = np.vstack(dataset_at_test_date)
+
             if FieldName.FEAT_DYNAMIC_REAL in fields:
                 grouped_data[FieldName.FEAT_DYNAMIC_REAL] = np.vstack(
                     [data[FieldName.FEAT_DYNAMIC_REAL] for data in dataset],
@@ -202,7 +205,7 @@ def _left_pad_data(self, data: DataEntry) -> np.ndarray:
 
     @staticmethod
     def _transform_target(funcs, dataset: Dataset) -> DataEntry:
-        return {FieldName.TARGET: np.array([funcs(data) for data in dataset])}
+        return {FieldName.TARGET: [funcs(data) for data in dataset]}
 
     def _restrict_max_dimensionality(self, data: DataEntry) -> DataEntry:
         """
diff --git a/src/gluonts/dataset/pandas.py b/src/gluonts/dataset/pandas.py
index a1486047d4..1687e985bf 100644
--- a/src/gluonts/dataset/pandas.py
+++ b/src/gluonts/dataset/pandas.py
@@ -138,7 +138,9 @@ def _pair_to_dataentry(
             df = df.to_frame(name=self.target)
 
         if self.timestamp:
-            df.index = pd.PeriodIndex(df[self.timestamp], freq=self.freq)
+            df.index = pd.DatetimeIndex(df[self.timestamp]).to_period(
+                freq=self.freq
+            )
 
         if not self.assume_sorted:
             df.sort_index(inplace=True)
@@ -187,7 +189,11 @@ def __str__(self) -> str:
 
     @classmethod
     def from_long_dataframe(
-        cls, dataframe: pd.DataFrame, item_id: str, **kwargs
+        cls,
+        dataframe: pd.DataFrame,
+        item_id: str,
+        timestamp: Optional[str] = None,
+        **kwargs,
     ) -> "PandasDataset":
         """
         Construct ``PandasDataset`` out of a long dataframe. A long dataframe
@@ -211,6 +217,9 @@ def from_long_dataframe(
         PandasDataset
             Gluonts dataset based on ``pandas.DataFrame``s.
         """
+        if timestamp is not None:
+            dataframe.index = pd.to_datetime(dataframe[timestamp])
+
         if not isinstance(dataframe.index, DatetimeIndexOpsMixin):
             dataframe.index = pd.to_datetime(dataframe.index)
         return cls(dataframes=dataframe.groupby(item_id), **kwargs)
diff --git a/src/gluonts/dataset/split.py b/src/gluonts/dataset/split.py
index c36b66fb37..fac3902ebc 100644
--- a/src/gluonts/dataset/split.py
+++ b/src/gluonts/dataset/split.py
@@ -101,6 +101,8 @@ def periods_between(
     >>> periods_between(start, end)
     9
     """
+    if start > end:
+        return 0
     return ((end - start).n // start.freq.n) + 1
 
 
diff --git a/test/dataset/test_split.py b/test/dataset/test_split.py
index 15f710e372..ea52cab4f0 100644
--- a/test/dataset/test_split.py
+++ b/test/dataset/test_split.py
@@ -95,6 +95,21 @@ def test_time_series_slice():
             pd.Period("2021-01-01 11", "2H"),
             6,
         ),
+        (
+            pd.Period("2021-03-04", freq="2D"),
+            pd.Period("2021-03-02", freq="2D"),
+            0,
+        ),
+        (
+            pd.Period("2021-03-04", freq="2D"),
+            pd.Period("2021-03-04", freq="2D"),
+            1,
+        ),
+        (
+            pd.Period("2021-03-03 23:00", freq="30T"),
+            pd.Period("2021-03-03 03:29", freq="30T"),
+            0,
+        ),
     ],
 )
 def test_periods_between(start, end, count):
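
For reference, a minimal sketch of the two user-facing changes this backport touches, assuming GluonTS v0.11.11; the long-format frame and its column names (`time`, `item`, `value`) are made-up example data, not part of the patch:

```python
import pandas as pd

from gluonts.dataset.pandas import PandasDataset
from gluonts.dataset.split import periods_between

# After the #2670 fix, periods_between returns 0 whenever the end period
# precedes the start period (this input pair comes from the new test cases).
start = pd.Period("2021-03-04", freq="2D")
end = pd.Period("2021-03-02", freq="2D")
assert periods_between(start, end) == 0

# After #2665, from_long_dataframe accepts the timestamp column by name,
# so the long frame no longer needs a pre-built datetime index.
# (df and its column names are hypothetical.)
df = pd.DataFrame(
    {
        "time": list(pd.date_range("2021-01-01", periods=4, freq="D")) * 2,
        "item": ["A"] * 4 + ["B"] * 4,
        "value": range(8),
    }
)
ds = PandasDataset.from_long_dataframe(
    df, item_id="item", timestamp="time", target="value", freq="D"
)
assert len(list(ds)) == 2  # one entry per item ("A" and "B")
```

The `MultivariateGrouper` changes are internal: targets are now stacked with `np.vstack`, which raises on unequal-length inputs instead of silently producing ragged object arrays, per the #2671 title.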