First post on this mailing list.
I have been working with time series data for a project, and thought I
could contribute a new transformer to segment time series data using a
sliding window, with variable overlap. I have attached a demonstration of
how this would fit in the existing framework. The only challenge for me
here is that the transformer needs to transform both the X and y
variable in order to perform the segmentation. I am not sure from the
documentation how to implement this in the framework.
Overlapping segments are a great way to boost performance for time series
classifiers, so this may be a worthwhile contribution for some in this
area of ML. Ultimately, model_selection.TimeSeriesSplit would need to
be modified to support overlapping segments, or a new class created to
enable validation for this.
Please let me know if this would be a worthwhile contribution, and if so
how to go about transforming the target vector y in the framework /
pipeline?
Thanks!
David Burns
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class TimeSeriesSegment(BaseEstimator, TransformerMixin):
    '''
    Segments time series data with sliding window and variable step size / overlap

    fit() segments every series in X, repeats each series' target once per
    segment, and stores the results on the estimator as ``Xs`` and ``ys``.
    transform() returns those stored arrays.
    '''

    def __init__(self, width, step=None):
        '''
        :param width: sliding window length (time points), integer > 0
        :param step: number of time steps between windows, integer > 0
            if step < width, segments overlap
            if step > width, there is a gap between segments
            if step == width, a sliding window is created with no overlap or gaps
            if step is None, step is set equal to width
        :raises ValueError: if width or step is smaller than 1
        '''
        # Explicit exceptions instead of assert: asserts vanish under ``python -O``.
        if width < 1:
            raise ValueError("width must be an integer > 0, got %r" % (width,))
        if step is not None and step < 1:
            raise ValueError("step must be an integer > 0, got %r" % (step,))
        self.width = width
        self.step = width if step is None else step

    def fit(self, X, y):
        '''
        Segments all series in X and stores the segments and their targets

        :param X: numpy array shape (N,)
            each element is an array shape (Ti, D) corresponding to a time series of variable length Ti and dimension D
            D must be the same for all series
        :param y: target vector numpy array shape (N,)
        :return: self
        :raises ValueError: if X and y differ in length, X is empty, a series
            is not 2-dimensional, or the series do not share the same D
        '''
        # checking input shape
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of series, "
                             "got %d and %d" % (len(X), len(y)))
        if len(X) == 0:
            raise ValueError("X must contain at least one time series")
        if any(np.ndim(series) != 2 for series in X):
            raise ValueError("each time series in X must have shape (Ti, D)")
        if len({series.shape[1] for series in X}) != 1:
            raise ValueError("all time series in X must have the same dimension D")

        segments = [self._segment(series) for series in X]
        # every segment inherits the target of the series it was cut from
        targets = [np.full(len(windows), label)
                   for windows, label in zip(segments, y)]

        self.Xs = np.concatenate(segments)
        self.ys = np.concatenate(targets)
        return self

    def transform(self, X):
        '''
        Returns the segments and targets computed by fit()

        Note that the X passed here is not used: the arrays stored by the
        last call to fit() are returned unchanged.

        :param X: numpy array shape (N,)
            each element is an array shape (Ti, D) corresponding to a time series of variable length Ti and dimension D
            D must be the same for all series
        :return:
            Xs: segmented temporal tensor shape (Nw, width, D)
            ys: target shape (Nw,)
        :raises AttributeError: if fit() has not been called yet
        '''
        ### todo: need to be able to alter ys in the pipeline
        if not hasattr(self, 'Xs') or not hasattr(self, 'ys'):
            raise AttributeError("TimeSeriesSegment is not fitted yet; "
                                 "call fit(X, y) before transform(X)")
        return self.Xs, self.ys

    def _window_index(self, length):
        '''
        builds the index of every window position along the time axis
        :param length: number of time points Ti in the series
        :return: integer array shape (Nw, width); row k holds the time indices
            of window k. Nw is 0 when the series is shorter than width
        '''
        # Only windows that fit entirely inside the series are kept.
        starts = np.arange(0, length - self.width + 1, self.step)
        return starts[:, np.newaxis] + np.arange(self.width)

    def _segment(self, X):
        '''
        sliding window segmentation on tensor
        :param X: numpy array shape (Ti, D)
        :return: temporal tensor shape (Nw, width, D)
        '''
        X = np.asarray(X)
        # Indexing the time axis with an (Nw, width) index array gathers all
        # D columns at once, giving shape (Nw, width, D).
        return X[self._window_index(X.shape[0])]

    def _sliding_window(self, x):
        '''
        sliding window segmentation on vector
        :param x: vector numpy array shape (Ti,)
        :return: array shape (Nw, width)
        '''
        x = np.asarray(x)
        return x[self._window_index(x.shape[0])]
def main():
    '''
    Demonstration: segments two series (4 and 5 time points, 2 columns each)
    with width 2 and step 2, then prints the inputs and the segmented outputs
    '''
    short_series = np.array([[1, 2], [3, 4], [5, 6], [6, 7]])
    long_series = np.array([[2, 4], [6, 8], [10, 12], [12, 14], [18, 20]])
    # object dtype keeps the two series as separate arrays of different length
    X = np.array([short_series, long_series], dtype=object)
    y = np.array([0, 1])

    # fit() returns the estimator itself, so the calls can be chained
    segmenter = TimeSeriesSegment(2, 2).fit(X, y)
    Xs, ys = segmenter.transform(X)

    report = (
        ("Input time series data: \n", X),
        ("Input target vector: \n", y),
        ("Segmented time series: \n ", Xs),
        ("Segmented target vector: \n ", ys),
    )
    for heading, value in report:
        print(heading, value, "\n\n")
if __name__ == '__main__':
    main()
_______________________________________________
scikit-learn mailing list
[email protected]
https://mail.python.org/mailman/listinfo/scikit-learn