Time series transformer: input projection and Std scaler (#21020)
* added loc and scale outputs from scalers * fix typo * fix tests * fixed formatting * initial StdScaler * move scaling to optional str * calculate std feature for scalers * undid change as it does not help * added StdScaler with weights * added input projection layer and d_model hyperparam * use linear proj * add back layernorm_embedding * add sin-cos pos embeddings * updated scalers * formatting * fix type * fixed test * fix repeated_past_values cal. * fix when keepdim=false * fix default_scale * backward compatibility of scaling config * update integration test expected output * fix style * fix docs * use the actual num_static_real_features in feature_dim cal * clarified docs * Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * prediction_length is not optional * fix for reviewer * Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * get rid of un-needed new lines * fix doc * remove unneeded new lines * fix style * static_categorical_features and static_real_features are optional * fix integration test * Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * fixing docs for multivariate setting * documentation for generate --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -14,7 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Time Series Transformer model configuration"""
|
""" Time Series Transformer model configuration"""
|
||||||
|
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
@@ -56,8 +56,9 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
input_size (`int`, *optional*, defaults to 1):
|
input_size (`int`, *optional*, defaults to 1):
|
||||||
The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
|
The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
|
||||||
multivariate targets.
|
multivariate targets.
|
||||||
scaling (`bool`, *optional* defaults to `True`):
|
scaling (`string` or `bool`, *optional* defaults to `"mean"`):
|
||||||
Whether to scale the input targets.
|
Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
|
||||||
|
scaler is set to "mean".
|
||||||
lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
|
lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
|
||||||
The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
|
The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
|
||||||
5, 6, 7]`.
|
5, 6, 7]`.
|
||||||
@@ -77,6 +78,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
The dimension of the embedding for each of the static categorical features. Should be a list of integers,
|
The dimension of the embedding for each of the static categorical features. Should be a list of integers,
|
||||||
having the same length as `num_static_categorical_features`. Cannot be `None` if
|
having the same length as `num_static_categorical_features`. Cannot be `None` if
|
||||||
`num_static_categorical_features` is > 0.
|
`num_static_categorical_features` is > 0.
|
||||||
|
d_model (`int`, *optional*, defaults to 64):
|
||||||
|
Dimensionality of the transformer layers.
|
||||||
encoder_layers (`int`, *optional*, defaults to 2):
|
encoder_layers (`int`, *optional*, defaults to 2):
|
||||||
Number of encoder layers.
|
Number of encoder layers.
|
||||||
decoder_layers (`int`, *optional*, defaults to 2):
|
decoder_layers (`int`, *optional*, defaults to 2):
|
||||||
@@ -132,13 +135,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_size: int = 1,
|
|
||||||
prediction_length: Optional[int] = None,
|
prediction_length: Optional[int] = None,
|
||||||
context_length: Optional[int] = None,
|
context_length: Optional[int] = None,
|
||||||
distribution_output: str = "student_t",
|
distribution_output: str = "student_t",
|
||||||
loss: str = "nll",
|
loss: str = "nll",
|
||||||
|
input_size: int = 1,
|
||||||
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
|
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
|
||||||
scaling: bool = True,
|
scaling: Optional[Union[str, bool]] = "mean",
|
||||||
num_dynamic_real_features: int = 0,
|
num_dynamic_real_features: int = 0,
|
||||||
num_static_categorical_features: int = 0,
|
num_static_categorical_features: int = 0,
|
||||||
num_static_real_features: int = 0,
|
num_static_real_features: int = 0,
|
||||||
@@ -153,6 +156,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
decoder_layers: int = 2,
|
decoder_layers: int = 2,
|
||||||
is_encoder_decoder: bool = True,
|
is_encoder_decoder: bool = True,
|
||||||
activation_function: str = "gelu",
|
activation_function: str = "gelu",
|
||||||
|
d_model: int = 64,
|
||||||
dropout: float = 0.1,
|
dropout: float = 0.1,
|
||||||
encoder_layerdrop: float = 0.1,
|
encoder_layerdrop: float = 0.1,
|
||||||
decoder_layerdrop: float = 0.1,
|
decoder_layerdrop: float = 0.1,
|
||||||
@@ -182,7 +186,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
)
|
)
|
||||||
self.cardinality = cardinality
|
self.cardinality = cardinality
|
||||||
else:
|
else:
|
||||||
self.cardinality = [1]
|
self.cardinality = [0]
|
||||||
if embedding_dimension and num_static_categorical_features > 0:
|
if embedding_dimension and num_static_categorical_features > 0:
|
||||||
if len(embedding_dimension) != num_static_categorical_features:
|
if len(embedding_dimension) != num_static_categorical_features:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -194,7 +198,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
self.num_parallel_samples = num_parallel_samples
|
self.num_parallel_samples = num_parallel_samples
|
||||||
|
|
||||||
# Transformer architecture configuration
|
# Transformer architecture configuration
|
||||||
self.d_model = input_size * len(lags_sequence) + self._number_of_features
|
self.feature_size = input_size * len(lags_sequence) + self._number_of_features
|
||||||
|
self.d_model = d_model
|
||||||
self.encoder_attention_heads = encoder_attention_heads
|
self.encoder_attention_heads = encoder_attention_heads
|
||||||
self.decoder_attention_heads = decoder_attention_heads
|
self.decoder_attention_heads = decoder_attention_heads
|
||||||
self.encoder_ffn_dim = encoder_ffn_dim
|
self.encoder_ffn_dim = encoder_ffn_dim
|
||||||
@@ -224,6 +229,6 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
|
|||||||
sum(self.embedding_dimension)
|
sum(self.embedding_dimension)
|
||||||
+ self.num_dynamic_real_features
|
+ self.num_dynamic_real_features
|
||||||
+ self.num_time_features
|
+ self.num_time_features
|
||||||
+ max(1, self.num_static_real_features) # there is at least one dummy static real feature
|
+ self.num_static_real_features
|
||||||
+ self.input_size # the log(scale)
|
+ self.input_size * 2 # the log1p(abs(loc)) and log(scale) features
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import random
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.distributions import (
|
from torch.distributions import (
|
||||||
@@ -255,6 +256,39 @@ class FeatureEmbedder(nn.Module):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StdScaler(nn.Module):
    """
    Standardize features by calculating the (weighted) mean and standard deviation along some given dimension `dim`,
    and then normalizing the data by subtracting the mean and dividing by the standard deviation.

    Args:
        dim (`int`):
            Dimension along which to calculate the mean and standard deviation. Must be > 0.
        keepdim (`bool`, *optional*, defaults to `False`):
            Controls whether to retain dimension `dim` (of length 1) in the returned loc and scale tensors, or
            suppress it.
        minimum_scale (`float`, *optional*, defaults to 1e-5):
            Small constant added to the variance before taking the square root, so the scale is never zero.
    """

    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
        super().__init__()
        if not dim > 0:
            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
        self.dim = dim
        self.keepdim = keepdim
        self.minimum_scale = minimum_scale

    @torch.no_grad()
    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Standardize `data` along `self.dim`.

        Args:
            data (`torch.Tensor`):
                Values to standardize.
            weights (`torch.Tensor`):
                Per-element weights multiplied with `data` (must broadcast against it), e.g. a 0/1 mask.

        Returns:
            A tuple `(scaled_data, loc, scale)`. `scaled_data` has the shape of `data`; `loc` and `scale` keep
            dimension `dim` (with length 1) only when `keepdim` is `True`.
        """
        # Always reduce with keepdim=True so that `loc` and `scale` broadcast against `data`;
        # the reduced dimension is only dropped from the returned statistics at the very end.
        denominator = weights.sum(self.dim, keepdim=True)
        # Avoid dividing by zero when the weights along `dim` sum to less than one.
        denominator = denominator.clamp_min(1.0)
        loc = (data * weights).sum(self.dim, keepdim=True) / denominator

        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=True) / denominator
        scale = torch.sqrt(variance + self.minimum_scale)
        scaled_data = (data - loc) / scale

        if not self.keepdim:
            loc = loc.squeeze(self.dim)
            scale = scale.squeeze(self.dim)

        return scaled_data, loc, scale
|
||||||
|
|
||||||
|
|
||||||
class MeanScaler(nn.Module):
|
class MeanScaler(nn.Module):
|
||||||
"""
|
"""
|
||||||
Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
|
Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
|
||||||
@@ -265,48 +299,49 @@ class MeanScaler(nn.Module):
|
|||||||
Dimension along which to compute the scale.
|
Dimension along which to compute the scale.
|
||||||
keepdim (`bool`, *optional*, defaults to `False`):
|
keepdim (`bool`, *optional*, defaults to `False`):
|
||||||
Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
|
Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
|
||||||
|
default_scale (`float`, *optional*, defaults to `None`):
|
||||||
|
Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
|
||||||
minimum_scale (`float`, *optional*, defaults to 1e-10):
|
minimum_scale (`float`, *optional*, defaults to 1e-10):
|
||||||
Default scale that is used for elements that are constantly zero along dimension `dim`.
|
Default minimum possible scale that is used for any item.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
|
def __init__(
|
||||||
|
self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
|
||||||
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if not dim > 0:
|
|
||||||
raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
|
|
||||||
self.dim = dim
|
self.dim = dim
|
||||||
self.keepdim = keepdim
|
self.keepdim = keepdim
|
||||||
self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
|
self.minimum_scale = minimum_scale
|
||||||
|
self.default_scale = default_scale
|
||||||
|
|
||||||
def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
@torch.no_grad()
|
||||||
# these will have shape (N, C)
|
def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
total_weight = weights.sum(dim=self.dim)
|
# shape: (N, [C], T=1)
|
||||||
weighted_sum = (data.abs() * weights).sum(dim=self.dim)
|
ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
|
||||||
|
num_observed = observed_indicator.sum(self.dim, keepdim=True)
|
||||||
|
|
||||||
# first compute a global scale per-dimension
|
scale = ts_sum / torch.clamp(num_observed, min=1)
|
||||||
total_observed = total_weight.sum(dim=0)
|
|
||||||
denominator = torch.max(total_observed, torch.ones_like(total_observed))
|
|
||||||
default_scale = weighted_sum.sum(dim=0) / denominator
|
|
||||||
|
|
||||||
# then compute a per-item, per-dimension scale
|
# If `default_scale` is provided, we use it, otherwise we use the scale
|
||||||
denominator = torch.max(total_weight, torch.ones_like(total_weight))
|
# of the batch.
|
||||||
scale = weighted_sum / denominator
|
if self.default_scale is None:
|
||||||
|
batch_sum = ts_sum.sum(dim=0)
|
||||||
|
batch_observations = torch.clamp(num_observed.sum(0), min=1)
|
||||||
|
default_scale = torch.squeeze(batch_sum / batch_observations)
|
||||||
|
else:
|
||||||
|
default_scale = self.default_scale * torch.ones_like(scale)
|
||||||
|
|
||||||
# use per-batch scale when no element is observed
|
# apply default scale where there are no observations
|
||||||
# or when the sequence contains only zeros
|
scale = torch.where(num_observed > 0, scale, default_scale)
|
||||||
scale = (
|
|
||||||
torch.max(
|
|
||||||
self.minimum_scale,
|
|
||||||
torch.where(
|
|
||||||
weighted_sum > torch.zeros_like(weighted_sum),
|
|
||||||
scale,
|
|
||||||
default_scale * torch.ones_like(total_weight),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
.detach()
|
|
||||||
.unsqueeze(dim=self.dim)
|
|
||||||
)
|
|
||||||
|
|
||||||
return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
|
# ensure the scale is at least `self.minimum_scale`
|
||||||
|
scale = torch.clamp(scale, min=self.minimum_scale)
|
||||||
|
scaled_data = data / scale
|
||||||
|
|
||||||
|
if not self.keepdim:
|
||||||
|
scale = scale.squeeze(dim=self.dim)
|
||||||
|
|
||||||
|
return scaled_data, torch.zeros_like(scale), scale
|
||||||
|
|
||||||
|
|
||||||
class NOPScaler(nn.Module):
|
class NOPScaler(nn.Module):
|
||||||
@@ -325,9 +360,12 @@ class NOPScaler(nn.Module):
|
|||||||
self.dim = dim
|
self.dim = dim
|
||||||
self.keepdim = keepdim
|
self.keepdim = keepdim
|
||||||
|
|
||||||
def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
def forward(
|
||||||
scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
|
self, data: torch.Tensor, observed_indicator: torch.Tensor
|
||||||
return data, scale
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||||
|
scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
|
||||||
|
loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
|
||||||
|
return data, loc, scale
|
||||||
|
|
||||||
|
|
||||||
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
|
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
|
||||||
@@ -394,6 +432,50 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
|
|||||||
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->TimeSeries
|
||||||
|
class TimeSeriesSinusoidalPositionalEmbedding(nn.Embedding):
    """Embedding table holding fixed (non-trainable) sinusoidal position encodings."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
        # `padding_idx` is accepted for signature compatibility but is not forwarded to `nn.Embedding`.
        super().__init__(num_positions, embedding_dim)
        self.weight = self._init_weight(self.weight)

    @staticmethod
    def _init_weight(out: nn.Parameter) -> nn.Parameter:
        """
        Fill `out` in place with sinusoidal encodings and return it.

        Unlike the interleaved XLM layout, all sine features occupy the first half of each row and all cosine
        features the second half (`[dim // 2:]`, rounded up for odd `dim`).
        """
        num_rows, num_cols = out.shape
        angles = np.array(
            [
                [pos / np.power(10000, 2 * (j // 2) / num_cols) for j in range(num_cols)]
                for pos in range(num_rows)
            ]
        )
        # Gradients must be disabled before the in-place writes below (avoids an error in pytorch-1.8+).
        out.requires_grad = False
        # Number of sine columns: half of the row, rounded up when the width is odd.
        split = (num_cols + 1) // 2
        sin_part = torch.FloatTensor(np.sin(angles[:, 0::2]))
        cos_part = torch.FloatTensor(np.cos(angles[:, 1::2]))
        out[:, :split] = sin_part
        out[:, split:] = cos_part
        out.detach_()
        return out

    @torch.no_grad()
    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        _, seq_len = input_ids_shape[:2]
        start = past_key_values_length
        stop = past_key_values_length + seq_len
        positions = torch.arange(start, stop, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)
|
||||||
|
|
||||||
|
|
||||||
|
class ValueEmbedding(nn.Module):
    """Bias-free linear projection from `feature_size` input features to `d_model` dimensions."""

    def __init__(self, feature_size, d_model):
        super().__init__()
        self.value_projection = nn.Linear(feature_size, d_model, bias=False)

    def forward(self, x):
        """Apply the linear projection to `x` and return the result."""
        projected = self.value_projection(x)
        return projected
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Seq2SeqTimeSeriesModelOutput(ModelOutput):
|
class Seq2SeqTimeSeriesModelOutput(ModelOutput):
|
||||||
"""
|
"""
|
||||||
@@ -443,9 +525,12 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||||
self-attention heads.
|
self-attention heads.
|
||||||
scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
||||||
|
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||||
|
magnitude and then used to shift back to the original magnitude.
|
||||||
|
scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
||||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||||
magnitude and then used to rescale to the original scale.
|
magnitude and then used to rescale back to the original magnitude.
|
||||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||||
"""
|
"""
|
||||||
@@ -458,6 +543,7 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
|
|||||||
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
|
loc: Optional[torch.FloatTensor] = None
|
||||||
scale: Optional[torch.FloatTensor] = None
|
scale: Optional[torch.FloatTensor] = None
|
||||||
static_features: Optional[torch.FloatTensor] = None
|
static_features: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
@@ -510,9 +596,12 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||||
self-attention heads.
|
self-attention heads.
|
||||||
scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
||||||
|
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||||
|
magnitude and then used to shift back to the original magnitude.
|
||||||
|
scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
||||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||||
magnitude and then used to rescale to the original scale.
|
magnitude and then used to rescale back to the original magnitude.
|
||||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||||
"""
|
"""
|
||||||
@@ -526,6 +615,7 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
|
|||||||
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
|
loc: Optional[torch.FloatTensor] = None
|
||||||
scale: Optional[torch.FloatTensor] = None
|
scale: Optional[torch.FloatTensor] = None
|
||||||
static_features: Optional[torch.FloatTensor] = None
|
static_features: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
@@ -889,6 +979,8 @@ class TimeSeriesTransformerPreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, TimeSeriesSinusoidalPositionalEmbedding):
|
||||||
|
pass
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
@@ -917,30 +1009,41 @@ TIME_SERIES_TRANSFORMER_START_DOCSTRING = r"""
|
|||||||
|
|
||||||
TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
|
TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
|
||||||
Args:
|
Args:
|
||||||
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
|
||||||
Past values of the time series, that serve as context in order to predict the future. These values may
|
Past values of the time series, that serve as context in order to predict the future. The sequence size of
|
||||||
contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
|
this tensor must be larger than the `context_length` of the model, since the model will use the larger size
|
||||||
|
to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
|
||||||
|
context".
|
||||||
|
|
||||||
|
The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
|
||||||
|
`lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
|
||||||
|
look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
|
||||||
|
the past.
|
||||||
|
|
||||||
The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
|
The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
|
||||||
`static_categorical_features`, `static_real_features`, `past_time_features`).
|
`static_categorical_features`, `static_real_features`, `past_time_features` and lags).
|
||||||
|
|
||||||
The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
|
Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
|
||||||
|
|
||||||
Missing values need to be replaced with zeros.
|
For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
|
||||||
|
variates in the time series per time step.
|
||||||
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
|
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
|
||||||
Optional time features, which the model internally will add to `past_values`. These could be things like
|
Required time features, which the model internally will add to `past_values`. These could be things like
|
||||||
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
|
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
|
||||||
could also be so-called "age" features, which basically help the model know "at which point in life" a
|
could also be so-called "age" features, which basically help the model know "at which point in life" a
|
||||||
time-series is. Age features have small values for distant past time steps and increase monotonically the
|
time-series is. Age features have small values for distant past time steps and increase monotonically the
|
||||||
more we approach the current time step.
|
more we approach the current time step. Holiday features are also a good example of time features.
|
||||||
|
|
||||||
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
|
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
|
||||||
the position encodings are learned from scratch internally as parameters of the model, the Time Series
|
the position encodings are learned from scratch internally as parameters of the model, the Time Series
|
||||||
Transformer requires to provide additional time features.
|
Transformer requires to provide additional time features. The Time Series Transformer only learns
|
||||||
|
additional embeddings for `static_categorical_features`.
|
||||||
|
|
||||||
The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
|
Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
|
||||||
|
must be known at prediction time.
|
||||||
|
|
||||||
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
|
||||||
|
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||||
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
|
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
|
||||||
`[0, 1]`:
|
`[0, 1]`:
|
||||||
|
|
||||||
@@ -954,35 +1057,50 @@ TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
|
|||||||
Static categorical features are features which have the same value for all time steps (static over time).
|
Static categorical features are features which have the same value for all time steps (static over time).
|
||||||
|
|
||||||
A typical example of a static categorical feature is a time series ID.
|
A typical example of a static categorical feature is a time series ID.
|
||||||
|
|
||||||
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
|
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
|
||||||
Optional static real features which the model will add to the values of the time series.
|
Optional static real features which the model will add to the values of the time series.
|
||||||
|
|
||||||
Static real features are features which have the same value for all time steps (static over time).
|
Static real features are features which have the same value for all time steps (static over time).
|
||||||
|
|
||||||
A typical example of a static real feature is promotion information.
|
A typical example of a static real feature is promotion information.
|
||||||
|
future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
|
||||||
future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
|
|
||||||
Future values of the time series, that serve as labels for the model. The `future_values` is what the
|
Future values of the time series, that serve as labels for the model. The `future_values` is what the
|
||||||
Transformer needs to learn to output, given the `past_values`.
|
Transformer needs during training to learn to output, given the `past_values`.
|
||||||
|
|
||||||
|
The sequence length here is equal to `prediction_length`.
|
||||||
|
|
||||||
See the demo notebook and code snippets for details.
|
See the demo notebook and code snippets for details.
|
||||||
|
|
||||||
Missing values need to be replaced with zeros.
|
Optionally, during training any missing values need to be replaced with zeros and indicated via the
|
||||||
|
`future_observed_mask`.
|
||||||
|
|
||||||
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
|
For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
|
||||||
Optional time features, which the model internally will add to `future_values`. These could be things like
|
variates in the time series per time step.
|
||||||
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
|
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
|
||||||
could also be so-called "age" features, which basically help the model know "at which point in life" a
|
Required time features for the prediction window, which the model internally will add to `future_values`.
|
||||||
time-series is. Age features have small values for distant past time steps and increase monotonically the
|
These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
|
||||||
more we approach the current time step.
|
Fourier features). These could also be so-called "age" features, which basically help the model know "at
|
||||||
|
which point in life" a time-series is. Age features have small values for distant past time steps and
|
||||||
|
increase monotonically the more we approach the current time step. Holiday features are also a good example
|
||||||
|
of time features.
|
||||||
|
|
||||||
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
|
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
|
||||||
the position encodings are learned from scratch internally as parameters of the model, the Time Series
|
the position encodings are learned from scratch internally as parameters of the model, the Time Series
|
||||||
Transformer requires to provide additional features.
|
Transformer requires additional time features to be provided. The Time Series Transformer only learns
|
||||||
|
additional embeddings for `static_categorical_features`.
|
||||||
|
|
||||||
The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
|
Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
|
||||||
|
must be known at prediction time.
|
||||||
|
|
||||||
|
The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
|
||||||
|
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||||
|
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
||||||
|
in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 for values that are **observed**,
|
||||||
|
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||||
|
|
||||||
|
This mask is used to filter out missing values for the final loss calculation.
|
||||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
|
Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
@@ -990,11 +1108,9 @@ TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
|
|||||||
- 0 for tokens that are **masked**.
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
[What are attention masks?](../glossary#attention-mask)
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
|
|
||||||
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
|
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
|
||||||
Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
|
Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
|
||||||
make sure the model can only look at previous inputs in order to predict the future.
|
make sure the model can only look at previous inputs in order to predict the future.
|
||||||
|
|
||||||
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
||||||
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
|
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
@@ -1032,7 +1148,6 @@ TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
|
|||||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||||
model's internal embedding lookup matrix.
|
model's internal embedding lookup matrix.
|
||||||
|
|
||||||
use_cache (`bool`, *optional*):
|
use_cache (`bool`, *optional*):
|
||||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||||
`past_key_values`).
|
`past_key_values`).
|
||||||
@@ -1062,10 +1177,12 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
self.dropout = config.dropout
|
self.dropout = config.dropout
|
||||||
self.layerdrop = config.encoder_layerdrop
|
self.layerdrop = config.encoder_layerdrop
|
||||||
|
|
||||||
embed_dim = config.d_model
|
self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
|
||||||
|
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
|
||||||
|
config.context_length + config.prediction_length, config.d_model
|
||||||
|
)
|
||||||
self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
|
self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
|
||||||
self.layernorm_embedding = nn.LayerNorm(embed_dim)
|
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
||||||
|
|
||||||
self.gradient_checkpointing = False
|
self.gradient_checkpointing = False
|
||||||
# Initialize weights and apply final processing
|
# Initialize weights and apply final processing
|
||||||
@@ -1114,8 +1231,10 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
)
|
)
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|
||||||
hidden_states = inputs_embeds
|
hidden_states = self.value_embedding(inputs_embeds)
|
||||||
hidden_states = self.layernorm_embedding(hidden_states)
|
embed_pos = self.embed_positions(inputs_embeds.size())
|
||||||
|
|
||||||
|
hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
|
||||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
|
|
||||||
# expand attention_mask
|
# expand attention_mask
|
||||||
@@ -1193,6 +1312,10 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
self.dropout = config.dropout
|
self.dropout = config.dropout
|
||||||
self.layerdrop = config.decoder_layerdrop
|
self.layerdrop = config.decoder_layerdrop
|
||||||
|
|
||||||
|
self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
|
||||||
|
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
|
||||||
|
config.context_length + config.prediction_length, config.d_model
|
||||||
|
)
|
||||||
self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
|
self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
|
||||||
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
||||||
|
|
||||||
@@ -1278,20 +1401,16 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
||||||
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
||||||
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||||
|
|
||||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
||||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
than the model's internal embedding lookup matrix.
|
than the model's internal embedding lookup matrix.
|
||||||
|
|
||||||
output_attentions (`bool`, *optional*):
|
output_attentions (`bool`, *optional*):
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||||
returned tensors for more detail.
|
returned tensors for more detail.
|
||||||
|
|
||||||
output_hidden_states (`bool`, *optional*):
|
output_hidden_states (`bool`, *optional*):
|
||||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
||||||
for more detail.
|
for more detail.
|
||||||
|
|
||||||
return_dict (`bool`, *optional*):
|
return_dict (`bool`, *optional*):
|
||||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
"""
|
"""
|
||||||
@@ -1316,9 +1435,9 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
|
|||||||
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
||||||
encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
||||||
|
|
||||||
hidden_states = inputs_embeds
|
hidden_states = self.value_embedding(inputs_embeds)
|
||||||
hidden_states = self.layernorm_embedding(hidden_states)
|
embed_pos = self.embed_positions(inputs_embeds.size(), past_key_values_length=self.config.context_length)
|
||||||
|
hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
|
||||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
|
|
||||||
# decoder layers
|
# decoder layers
|
||||||
@@ -1423,15 +1542,18 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
def __init__(self, config: TimeSeriesTransformerConfig):
|
def __init__(self, config: TimeSeriesTransformerConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
if config.scaling:
|
if config.scaling == "mean" or config.scaling:
|
||||||
self.scaler = MeanScaler(dim=1, keepdim=True)
|
self.scaler = MeanScaler(dim=1, keepdim=True)
|
||||||
|
elif config.scaling == "std":
|
||||||
|
self.scaler = StdScaler(dim=1, keepdim=True)
|
||||||
else:
|
else:
|
||||||
self.scaler = NOPScaler(dim=1, keepdim=True)
|
self.scaler = NOPScaler(dim=1, keepdim=True)
|
||||||
|
|
||||||
self.embedder = FeatureEmbedder(
|
if config.num_static_categorical_features > 0:
|
||||||
cardinalities=config.cardinality,
|
self.embedder = FeatureEmbedder(
|
||||||
embedding_dims=config.embedding_dimension,
|
cardinalities=config.cardinality,
|
||||||
)
|
embedding_dims=config.embedding_dimension,
|
||||||
|
)
|
||||||
|
|
||||||
# transformer encoder-decoder and mask initializer
|
# transformer encoder-decoder and mask initializer
|
||||||
self.encoder = TimeSeriesTransformerEncoder(config)
|
self.encoder = TimeSeriesTransformerEncoder(config)
|
||||||
@@ -1483,8 +1605,8 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
self,
|
self,
|
||||||
past_values: torch.Tensor,
|
past_values: torch.Tensor,
|
||||||
past_time_features: torch.Tensor,
|
past_time_features: torch.Tensor,
|
||||||
static_categorical_features: torch.Tensor,
|
static_categorical_features: Optional[torch.Tensor] = None,
|
||||||
static_real_features: torch.Tensor,
|
static_real_features: Optional[torch.Tensor] = None,
|
||||||
past_observed_mask: Optional[torch.Tensor] = None,
|
past_observed_mask: Optional[torch.Tensor] = None,
|
||||||
future_values: Optional[torch.Tensor] = None,
|
future_values: Optional[torch.Tensor] = None,
|
||||||
future_time_features: Optional[torch.Tensor] = None,
|
future_time_features: Optional[torch.Tensor] = None,
|
||||||
@@ -1508,12 +1630,12 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
|
|
||||||
context = past_values[:, -self.config.context_length :]
|
context = past_values[:, -self.config.context_length :]
|
||||||
observed_context = past_observed_mask[:, -self.config.context_length :]
|
observed_context = past_observed_mask[:, -self.config.context_length :]
|
||||||
_, scale = self.scaler(context, observed_context)
|
_, loc, scale = self.scaler(context, observed_context)
|
||||||
|
|
||||||
inputs = (
|
inputs = (
|
||||||
torch.cat((past_values, future_values), dim=1) / scale
|
(torch.cat((past_values, future_values), dim=1) - loc) / scale
|
||||||
if future_values is not None
|
if future_values is not None
|
||||||
else past_values / scale
|
else (past_values - loc) / scale
|
||||||
)
|
)
|
||||||
|
|
||||||
inputs_length = (
|
inputs_length = (
|
||||||
@@ -1533,34 +1655,29 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
else self.config.context_length
|
else self.config.context_length
|
||||||
)
|
)
|
||||||
|
|
||||||
# embeddings
|
|
||||||
embedded_cat = self.embedder(static_categorical_features)
|
|
||||||
# static features
|
# static features
|
||||||
|
log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
|
||||||
log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
|
log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
|
||||||
static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
|
static_feat = torch.cat((log_abs_loc, log_scale), dim=1)
|
||||||
|
|
||||||
|
if static_real_features is not None:
|
||||||
|
static_feat = torch.cat((static_real_features, static_feat), dim=1)
|
||||||
|
if static_categorical_features is not None:
|
||||||
|
embedded_cat = self.embedder(static_categorical_features)
|
||||||
|
static_feat = torch.cat((embedded_cat, static_feat), dim=1)
|
||||||
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
|
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
|
||||||
|
|
||||||
# all features
|
# all features
|
||||||
features = torch.cat((expanded_static_feat, time_feat), dim=-1)
|
features = torch.cat((expanded_static_feat, time_feat), dim=-1)
|
||||||
|
|
||||||
|
# lagged features
|
||||||
lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
|
lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
|
||||||
|
|
||||||
lags_shape = lagged_sequence.shape
|
lags_shape = lagged_sequence.shape
|
||||||
reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
|
reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
|
||||||
|
|
||||||
transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
|
transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
|
||||||
|
|
||||||
return transformer_inputs, scale, static_feat
|
return transformer_inputs, loc, scale, static_feat
|
||||||
|
|
||||||
def enc_dec_outputs(self, transformer_inputs):
|
|
||||||
enc_input = transformer_inputs[:, : self.config.context_length, ...]
|
|
||||||
dec_input = transformer_inputs[:, self.config.context_length :, ...]
|
|
||||||
|
|
||||||
encoder_outputs = self.encoder(inputs_embeds=enc_input)
|
|
||||||
decoder_outputs = self.decoder(
|
|
||||||
inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
|
|
||||||
)
|
|
||||||
return encoder_outputs, decoder_outputs
|
|
||||||
|
|
||||||
def get_encoder(self):
|
def get_encoder(self):
|
||||||
return self.encoder
|
return self.encoder
|
||||||
@@ -1575,8 +1692,8 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
past_values: torch.Tensor,
|
past_values: torch.Tensor,
|
||||||
past_time_features: torch.Tensor,
|
past_time_features: torch.Tensor,
|
||||||
past_observed_mask: torch.Tensor,
|
past_observed_mask: torch.Tensor,
|
||||||
static_categorical_features: torch.Tensor,
|
static_categorical_features: Optional[torch.Tensor] = None,
|
||||||
static_real_features: torch.Tensor,
|
static_real_features: Optional[torch.Tensor] = None,
|
||||||
future_values: Optional[torch.Tensor] = None,
|
future_values: Optional[torch.Tensor] = None,
|
||||||
future_time_features: Optional[torch.Tensor] = None,
|
future_time_features: Optional[torch.Tensor] = None,
|
||||||
decoder_attention_mask: Optional[torch.LongTensor] = None,
|
decoder_attention_mask: Optional[torch.LongTensor] = None,
|
||||||
@@ -1628,7 +1745,7 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|
||||||
transformer_inputs, scale, static_feat = self.create_network_inputs(
|
transformer_inputs, loc, scale, static_feat = self.create_network_inputs(
|
||||||
past_values=past_values,
|
past_values=past_values,
|
||||||
past_time_features=past_time_features,
|
past_time_features=past_time_features,
|
||||||
past_observed_mask=past_observed_mask,
|
past_observed_mask=past_observed_mask,
|
||||||
@@ -1670,7 +1787,7 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return decoder_outputs + encoder_outputs + (scale, static_feat)
|
return decoder_outputs + encoder_outputs + (loc, scale, static_feat)
|
||||||
|
|
||||||
return Seq2SeqTimeSeriesModelOutput(
|
return Seq2SeqTimeSeriesModelOutput(
|
||||||
last_hidden_state=decoder_outputs.last_hidden_state,
|
last_hidden_state=decoder_outputs.last_hidden_state,
|
||||||
@@ -1681,6 +1798,7 @@ class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
|
|||||||
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
||||||
encoder_hidden_states=encoder_outputs.hidden_states,
|
encoder_hidden_states=encoder_outputs.hidden_states,
|
||||||
encoder_attentions=encoder_outputs.attentions,
|
encoder_attentions=encoder_outputs.attentions,
|
||||||
|
loc=loc,
|
||||||
scale=scale,
|
scale=scale,
|
||||||
static_features=static_feat,
|
static_features=static_feat,
|
||||||
)
|
)
|
||||||
@@ -1724,11 +1842,11 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
return self.model.get_decoder()
|
return self.model.get_decoder()
|
||||||
|
|
||||||
@torch.jit.ignore
|
@torch.jit.ignore
|
||||||
def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
|
def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
|
||||||
sliced_params = params
|
sliced_params = params
|
||||||
if trailing_n is not None:
|
if trailing_n is not None:
|
||||||
sliced_params = [p[:, -trailing_n:] for p in params]
|
sliced_params = [p[:, -trailing_n:] for p in params]
|
||||||
return self.distribution_output.distribution(sliced_params, scale=scale)
|
return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
|
||||||
|
|
||||||
@add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
|
@add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
|
||||||
@replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
|
@replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||||
@@ -1737,8 +1855,8 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
past_values: torch.Tensor,
|
past_values: torch.Tensor,
|
||||||
past_time_features: torch.Tensor,
|
past_time_features: torch.Tensor,
|
||||||
past_observed_mask: torch.Tensor,
|
past_observed_mask: torch.Tensor,
|
||||||
static_categorical_features: torch.Tensor,
|
static_categorical_features: Optional[torch.Tensor] = None,
|
||||||
static_real_features: torch.Tensor,
|
static_real_features: Optional[torch.Tensor] = None,
|
||||||
future_values: Optional[torch.Tensor] = None,
|
future_values: Optional[torch.Tensor] = None,
|
||||||
future_time_features: Optional[torch.Tensor] = None,
|
future_time_features: Optional[torch.Tensor] = None,
|
||||||
future_observed_mask: Optional[torch.Tensor] = None,
|
future_observed_mask: Optional[torch.Tensor] = None,
|
||||||
@@ -1756,15 +1874,6 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
r"""
|
r"""
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
||||||
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
|
||||||
in `[0, 1]`:
|
|
||||||
|
|
||||||
- 1 for values that are **observed**,
|
|
||||||
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
|
||||||
|
|
||||||
This mask is used to filter out missing values for the final loss calculation.
|
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -1839,7 +1948,8 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
params = None
|
params = None
|
||||||
if future_values is not None:
|
if future_values is not None:
|
||||||
params = self.output_params(outputs[0]) # outputs.last_hidden_state
|
params = self.output_params(outputs[0]) # outputs.last_hidden_state
|
||||||
distribution = self.output_distribution(params, outputs[-2]) # outputs.scale
|
# loc is 3rd last and scale is 2nd last output
|
||||||
|
distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])
|
||||||
|
|
||||||
loss = self.loss(distribution, future_values)
|
loss = self.loss(distribution, future_values)
|
||||||
|
|
||||||
@@ -1867,6 +1977,7 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
|
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
|
||||||
encoder_hidden_states=outputs.encoder_hidden_states,
|
encoder_hidden_states=outputs.encoder_hidden_states,
|
||||||
encoder_attentions=outputs.encoder_attentions,
|
encoder_attentions=outputs.encoder_attentions,
|
||||||
|
loc=outputs.loc,
|
||||||
scale=outputs.scale,
|
scale=outputs.scale,
|
||||||
static_features=outputs.static_features,
|
static_features=outputs.static_features,
|
||||||
)
|
)
|
||||||
@@ -1874,15 +1985,102 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def generate(
|
def generate(
|
||||||
self,
|
self,
|
||||||
static_categorical_features: torch.Tensor,
|
|
||||||
static_real_features: torch.Tensor,
|
|
||||||
past_time_features: torch.Tensor,
|
|
||||||
past_values: torch.Tensor,
|
past_values: torch.Tensor,
|
||||||
past_observed_mask: torch.Tensor,
|
past_time_features: torch.Tensor,
|
||||||
future_time_features: Optional[torch.Tensor],
|
future_time_features: torch.Tensor,
|
||||||
|
past_observed_mask: Optional[torch.Tensor] = None,
|
||||||
|
static_categorical_features: Optional[torch.Tensor] = None,
|
||||||
|
static_real_features: Optional[torch.Tensor] = None,
|
||||||
output_attentions: Optional[bool] = None,
|
output_attentions: Optional[bool] = None,
|
||||||
output_hidden_states: Optional[bool] = None,
|
output_hidden_states: Optional[bool] = None,
|
||||||
) -> torch.Tensor:
|
) -> SampleTimeSeriesPredictionOutput:
|
||||||
|
r"""
|
||||||
|
Greedily generate sequences of sample predictions from a model with a probability distribution head.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
|
||||||
|
Past values of the time series, that serve as context in order to predict the future. The sequence size
|
||||||
|
of this tensor must be larger than the `context_length` of the model, since the model will use the
|
||||||
|
larger size to construct lag features, i.e. additional values from the past which are added in order to
|
||||||
|
serve as "extra context".
|
||||||
|
|
||||||
|
The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if
|
||||||
|
no `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
|
||||||
|
look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length
|
||||||
|
of the past.
|
||||||
|
|
||||||
|
The `past_values` is what the Transformer encoder gets as input (with optional additional features,
|
||||||
|
such as `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
|
||||||
|
|
||||||
|
Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
|
||||||
|
|
||||||
|
For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number
|
||||||
|
of variates in the time series per time step.
|
||||||
|
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
|
||||||
|
Required time features, which the model internally will add to `past_values`. These could be things
|
||||||
|
like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features).
|
||||||
|
These could also be so-called "age" features, which basically help the model know "at which point in
|
||||||
|
life" a time-series is. Age features have small values for distant past time steps and increase
|
||||||
|
monotonically the more we approach the current time step. Holiday features are also a good example of
|
||||||
|
time features.
|
||||||
|
|
||||||
|
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
|
||||||
|
where the position encodings are learned from scratch internally as parameters of the model, the Time
|
||||||
|
Series Transformer requires additional time features to be provided. The Time Series Transformer only
|
||||||
|
learns additional embeddings for `static_categorical_features`.
|
||||||
|
|
||||||
|
Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
|
||||||
|
features must be known at prediction time.
|
||||||
|
|
||||||
|
The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
|
||||||
|
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
|
||||||
|
Required time features for the prediction window, which the model internally will add to sampled
|
||||||
|
predictions. These could be things like "month of year", "day of the month", etc. encoded as vectors
|
||||||
|
(for instance as Fourier features). These could also be so-called "age" features, which basically help
|
||||||
|
the model know "at which point in life" a time-series is. Age features have small values for distant
|
||||||
|
past time steps and increase monotonically the more we approach the current time step. Holiday features
|
||||||
|
are also a good example of time features.
|
||||||
|
|
||||||
|
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
|
||||||
|
where the position encodings are learned from scratch internally as parameters of the model, the Time
|
||||||
|
Series Transformer requires additional time features to be provided. The Time Series Transformer only
|
||||||
|
learns additional embeddings for `static_categorical_features`.
|
||||||
|
|
||||||
|
Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
|
||||||
|
features must be known at prediction time.
|
||||||
|
|
||||||
|
The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
|
||||||
|
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||||
|
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
|
||||||
|
in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 for values that are **observed**,
|
||||||
|
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||||
|
|
||||||
|
static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
|
||||||
|
Optional static categorical features for which the model will learn an embedding, which it will add to
|
||||||
|
the values of the time series.
|
||||||
|
|
||||||
|
Static categorical features are features which have the same value for all time steps (static over
|
||||||
|
time).
|
||||||
|
|
||||||
|
A typical example of a static categorical feature is a time series ID.
|
||||||
|
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
|
||||||
|
Optional static real features which the model will add to the values of the time series.
|
||||||
|
|
||||||
|
Static real features are features which have the same value for all time steps (static over time).
|
||||||
|
|
||||||
|
A typical example of a static real feature is promotion information.
|
||||||
|
output_attentions (`bool`, *optional*):
|
||||||
|
Whether or not to return the attentions tensors of all attention layers.
|
||||||
|
output_hidden_states (`bool`, *optional*):
|
||||||
|
Whether or not to return the hidden states of all layers.
|
||||||
|
|
||||||
|
Return:
|
||||||
|
[`SampleTimeSeriesPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
|
||||||
|
number of samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)`
|
||||||
|
for multivariate predictions.
|
||||||
|
"""
|
||||||
outputs = self(
|
outputs = self(
|
||||||
static_categorical_features=static_categorical_features,
|
static_categorical_features=static_categorical_features,
|
||||||
static_real_features=static_real_features,
|
static_real_features=static_real_features,
|
||||||
@@ -1899,13 +2097,17 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
|
|
||||||
decoder = self.model.get_decoder()
|
decoder = self.model.get_decoder()
|
||||||
enc_last_hidden = outputs.encoder_last_hidden_state
|
enc_last_hidden = outputs.encoder_last_hidden_state
|
||||||
|
loc = outputs.loc
|
||||||
scale = outputs.scale
|
scale = outputs.scale
|
||||||
static_feat = outputs.static_features
|
static_feat = outputs.static_features
|
||||||
|
|
||||||
num_parallel_samples = self.config.num_parallel_samples
|
num_parallel_samples = self.config.num_parallel_samples
|
||||||
|
repeated_loc = loc.repeat_interleave(repeats=num_parallel_samples, dim=0)
|
||||||
repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
|
repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
|
||||||
|
|
||||||
repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
|
repeated_past_values = (
|
||||||
|
past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_loc
|
||||||
|
) / repeated_scale
|
||||||
|
|
||||||
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
|
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
|
||||||
features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
|
features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
|
||||||
@@ -1932,10 +2134,12 @@ class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
|
|||||||
dec_last_hidden = dec_output.last_hidden_state
|
dec_last_hidden = dec_output.last_hidden_state
|
||||||
|
|
||||||
params = self.parameter_projection(dec_last_hidden[:, -1:])
|
params = self.parameter_projection(dec_last_hidden[:, -1:])
|
||||||
distr = self.output_distribution(params, scale=repeated_scale)
|
distr = self.output_distribution(params, loc=repeated_loc, scale=repeated_scale)
|
||||||
next_sample = distr.sample()
|
next_sample = distr.sample()
|
||||||
|
|
||||||
repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
|
repeated_past_values = torch.cat(
|
||||||
|
(repeated_past_values, (next_sample - repeated_loc) / repeated_scale), dim=1
|
||||||
|
)
|
||||||
future_samples.append(next_sample)
|
future_samples.append(next_sample)
|
||||||
|
|
||||||
concat_future_samples = torch.cat(future_samples, dim=1)
|
concat_future_samples = torch.cat(future_samples, dim=1)
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class TimeSeriesTransformerModelTester:
|
|||||||
embedding_dimension=5,
|
embedding_dimension=5,
|
||||||
num_time_features=4,
|
num_time_features=4,
|
||||||
is_training=True,
|
is_training=True,
|
||||||
hidden_size=16,
|
hidden_size=64,
|
||||||
num_hidden_layers=2,
|
num_hidden_layers=2,
|
||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
intermediate_size=4,
|
intermediate_size=4,
|
||||||
@@ -98,6 +98,7 @@ class TimeSeriesTransformerModelTester:
|
|||||||
context_length=self.context_length,
|
context_length=self.context_length,
|
||||||
lags_sequence=self.lags_sequence,
|
lags_sequence=self.lags_sequence,
|
||||||
num_time_features=self.num_time_features,
|
num_time_features=self.num_time_features,
|
||||||
|
num_static_real_features=1,
|
||||||
num_static_categorical_features=1,
|
num_static_categorical_features=1,
|
||||||
cardinality=[self.cardinality],
|
cardinality=[self.cardinality],
|
||||||
embedding_dimension=[self.embedding_dimension],
|
embedding_dimension=[self.embedding_dimension],
|
||||||
@@ -149,7 +150,7 @@ class TimeSeriesTransformerModelTester:
|
|||||||
encoder.save_pretrained(tmpdirname)
|
encoder.save_pretrained(tmpdirname)
|
||||||
encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device)
|
encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device)
|
||||||
|
|
||||||
transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict)
|
transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict)
|
||||||
enc_input = transformer_inputs[:, : config.context_length, ...]
|
enc_input = transformer_inputs[:, : config.context_length, ...]
|
||||||
dec_input = transformer_inputs[:, config.context_length :, ...]
|
dec_input = transformer_inputs[:, config.context_length :, ...]
|
||||||
|
|
||||||
@@ -186,13 +187,18 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = TimeSeriesTransformerModelTester(self)
|
self.model_tester = TimeSeriesTransformerModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig, has_text_modality=False)
|
self.config_tester = ConfigTester(
|
||||||
|
self,
|
||||||
|
config_class=TimeSeriesTransformerConfig,
|
||||||
|
has_text_modality=False,
|
||||||
|
prediction_length=self.model_tester.prediction_length,
|
||||||
|
)
|
||||||
|
|
||||||
def test_config(self):
|
def test_config(self):
|
||||||
self.config_tester.run_common_tests()
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_save_load_strict(self):
|
def test_save_load_strict(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
|
config, _ = self.model_tester.prepare_config_and_inputs()
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|
||||||
@@ -303,7 +309,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
out_len = len(outputs)
|
out_len = len(outputs)
|
||||||
|
|
||||||
correct_outlen = 6
|
correct_outlen = 7
|
||||||
|
|
||||||
if "last_hidden_state" in outputs:
|
if "last_hidden_state" in outputs:
|
||||||
correct_outlen += 1
|
correct_outlen += 1
|
||||||
@@ -389,13 +395,13 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
|
|||||||
static_real_features=batch["static_real_features"],
|
static_real_features=batch["static_real_features"],
|
||||||
future_values=batch["future_values"],
|
future_values=batch["future_values"],
|
||||||
future_time_features=batch["future_time_features"],
|
future_time_features=batch["future_time_features"],
|
||||||
)[0]
|
).last_hidden_state
|
||||||
|
|
||||||
expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
|
expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
|
||||||
self.assertEqual(output.shape, expected_shape)
|
self.assertEqual(output.shape, expected_shape)
|
||||||
|
|
||||||
expected_slice = torch.tensor(
|
expected_slice = torch.tensor(
|
||||||
[[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device
|
[[-0.6322, -1.5771, -0.9340], [-0.1011, -1.0263, -0.7208], [0.4979, -0.6487, -0.7189]], device=torch_device
|
||||||
)
|
)
|
||||||
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
|
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
|
||||||
|
|
||||||
@@ -412,12 +418,12 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
|
|||||||
static_categorical_features=batch["static_categorical_features"],
|
static_categorical_features=batch["static_categorical_features"],
|
||||||
static_real_features=batch["static_real_features"],
|
static_real_features=batch["static_real_features"],
|
||||||
future_time_features=batch["future_time_features"],
|
future_time_features=batch["future_time_features"],
|
||||||
)[1]
|
).encoder_last_hidden_state
|
||||||
expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
|
expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
|
||||||
self.assertEqual(output.shape, expected_shape)
|
self.assertEqual(output.shape, expected_shape)
|
||||||
|
|
||||||
expected_slice = torch.tensor(
|
expected_slice = torch.tensor(
|
||||||
[[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device
|
[[0.8177, -1.7989, -0.3127], [1.6964, -1.0607, -0.1749], [1.8395, 0.1110, 0.0263]], device=torch_device
|
||||||
)
|
)
|
||||||
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
|
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
|
||||||
|
|
||||||
@@ -438,6 +444,6 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
|
|||||||
expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
|
expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
|
||||||
self.assertEqual(outputs.sequences.shape, expected_shape)
|
self.assertEqual(outputs.sequences.shape, expected_shape)
|
||||||
|
|
||||||
expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device)
|
expected_slice = torch.tensor([3883.5037, 4630.2251, 7562.1338], device=torch_device)
|
||||||
mean_prediction = outputs.sequences.mean(dim=1)
|
mean_prediction = outputs.sequences.mean(dim=1)
|
||||||
self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))
|
self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))
|
||||||
|
|||||||
Reference in New Issue
Block a user