To all, I am working on a scikit-learn estimator that performs a version of SVC with a custom kernel. Unfortunately, I have run into a problem: when running a grid search (or even just cross_val_score), my estimator hits an overflow error while evaluating my kernel (specifically, in an array multiplication). What is particularly strange is that the error does not occur when I train the estimator on the whole dataset; the problem only appears when the data is split into folds. Has this been seen before, and how might I fix it?
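For concreteness, here is a minimal sketch of how the failure arises (the toy data is a stand-in for the binary set in the notebook; SeqSVC is defined in the attached code, and the import path assumes scikit-learn 0.18+):

import numpy as np
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.randn(20, 10)       # 20 one-dimensional series of length 10 (stand-in data)
y = rng.randint(0, 2, 20)   # binary labels

clf = SeqSVC(kernel='rbf', gamma=1.0, cut_ord_pair=(2, 1))
clf.fit(X, y)                              # training on the whole set succeeds
scores = cross_val_score(clf, X, y, cv=3)  # the overflow occurs inside the kernel here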
I have attached the source code below (in particular, see the notebook for how the problem arises). Best, Sam
import numpy as np
"""assume m positive"""
def shift( A,j,m):
newA=np.roll(A, m, axis=j)
for index, x in np.ndenumerate(newA):
if (index[j]<m):
newA[index]=0
return newA
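# For example, shift(np.array([[1, 2], [3, 4]]), 0, 1) pushes the rows down
# one place along axis 0 and zero-fills the top row: [[0, 0], [1, 2]].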
def getDiffMatrix(x, y, primary_kernel):
    """Evaluate primary_kernel on every pair (x[i], y[j]) and take
    differences of the resulting matrix along both axes."""
    shape = [x.shape[0] - 1, y.shape[0] - 1]
    left = np.zeros([shape[0], shape[0] + 1])
    right = np.zeros([shape[1] + 1, shape[1]])
    l = max(shape) + 1
    ones = np.diag(np.ones([l]))
    ones_up = np.diag(np.ones([l - 1]), 1)
    ones_down = np.diag(np.ones([l - 1]), -1)
    # Truncate the (l x l) difference operators to the required shapes.
    left += (ones - ones_up)[tuple(slice(0, n) for n in left.shape)]
    right += (ones - ones_down)[tuple(slice(0, n) for n in right.shape)]
    diff = np.zeros([shape[0] + 1, shape[1] + 1])
    for i in range(shape[0] + 1):
        for j in range(shape[1] + 1):
            diff[i, j] = primary_kernel(x[i], y[j])
    return np.dot(left, np.dot(diff, right))
def kernel(K, M, D):
    """Sequentialised kernel of cut-off M from the increment matrix K.
    The order argument D is unused here (see kernelHO)."""
    (lens, lent) = K.shape
    A = np.zeros([M, lens, lent])
    A[0] = K
    for m in range(1, M):
        Q = np.cumsum(np.cumsum(A[m - 1], axis=0), axis=1)
        Qshifted = shift(Q, 0, 1)
        Qshifted = shift(Qshifted, 1, 1)
        A[m] = K * (Qshifted + 1)
    return 1 + np.sum(A[M - 1])
def kernelHO(K, M, D):
    """Higher-order sequentialised kernel of cut-off M and order D from
    the increment matrix K."""
    (lens, lent) = K.shape
    # Use zeros rather than np.ndarray: the += accumulation and the final
    # np.sum below read entries that are never explicitly assigned, so
    # uninitialised memory would otherwise leak into the result.
    B = np.zeros([M, D, D, lens, lent])
    B[0, 0, 0] = K
    for m in range(1, M):
        D_ = min(D, m)
        K1 = np.sum(B[m - 1], axis=0)
        K2 = np.sum(K1, axis=0)
        K3 = np.cumsum(np.cumsum(K2, axis=0), axis=1)
        K3shifted = shift(K3, 0, 1)
        K3shifted2 = shift(K3shifted, 1, 1)
        B[m, 0, 0] = K * (K3shifted2 + 1)
        for d in range(1, D_):
            K1 = np.sum(B[m - 1, d - 1], axis=0)
            K2 = np.cumsum(K1, axis=1)
            K2shifted = shift(K2, 1, 1)
            B[m, d, 0] = K * K2shifted / (d + 1)
            K2_ = np.sum(B[m - 1], axis=0)
            K4_ = np.cumsum(K2_[d - 1], axis=0)
            K4shifted_ = shift(K4_, 0, 1)
            B[m, 0, d] = K * K4shifted_ / (d + 1)
            for d_ in range(1, D_):
                B[m, d, d_] += K * B[m - 1, d - 1, d_ - 1] / ((d + 1) * (d_ + 1))
    return 1 + np.sum(B[M - 1])
def seq_kernel_free(A, B, pri_kernel=np.dot, cut_off=2, order=1):
    """Compute the cross-kernel matrix between datasets A and B, whose
    rows represent time series."""
    gram_matrix = np.zeros((A.shape[0], B.shape[0]))
    for row1ind in range(A.shape[0]):
        for row2ind in range(B.shape[0]):
            K_temp = getDiffMatrix(A[row1ind], B[row2ind], pri_kernel)
            gram_matrix[row1ind, row2ind] = kernelHO(K_temp, cut_off, order)
    return gram_matrix
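For reference, a minimal call of the cross-kernel on made-up toy arrays (assuming the above is saved as kernelsqizer.py):

import numpy as np
from kernelsqizer import seq_kernel_free

A = np.random.randn(3, 5)   # 3 toy series of length 5
B = np.random.randn(4, 5)   # 4 toy series of length 5
G = seq_kernel_free(A, B, pri_kernel=np.dot, cut_off=2, order=1)
print(G.shape)              # (3, 4): one Gram entry per pair of series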
SeqSVC Toy Data Tests.ipynb
Description: Binary data
import numpy as np
from functools import partial
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from kernelsqizer import seq_kernel_free
from timeseriestools import timeLagMult, timeIndexMult
# LIST OF STANDARD PRESET KERNELS.
def kPolynom(x, y, coef0=0, gamma=1, degree=1):
    return (coef0 + gamma * np.inner(x, y)) ** degree

def kGauss(x, y, scale=1, gamma=1):
    return scale * np.exp(-gamma * np.sum(np.square(x - y)))

def kLinear(x, y, scale=1):
    return scale * np.inner(x, y)

def kSigmoid(x, y, gamma=1, coef0=0):
    return np.tanh(gamma * np.inner(x, y) + coef0)
def kernselect(kername, coef0, gamma, degree, scale):
    """If kername is one of the listed strings, return the corresponding
    kernel function with the specified parameters. Otherwise, return
    kername itself (one must still specify the now-redundant parameters).
    """
    # Each name must map to its matching kernel; otherwise, e.g.,
    # kernel='rbf' would silently evaluate the wrong primary kernel.
    switcher = {
        'linear': partial(kLinear, scale=scale),
        'rbf': partial(kGauss, scale=scale, gamma=gamma),
        'sigmoid': partial(kSigmoid, gamma=gamma, coef0=coef0),
        'poly': partial(kPolynom, coef0=coef0, gamma=gamma, degree=degree),
    }
    return switcher.get(kername, kername)
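# For instance, kernselect('rbf', coef0=0, gamma=0.5, degree=1, scale=1)
# returns partial(kGauss, scale=1, gamma=0.5); a user-supplied callable
# (any non-listed kername) is passed through unchanged.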
class TimeSeriesPreprocessor(object):
    """This class can be used to reshape (and recover) higher-dimensional
    time series as 2D arrays for use in the scikit-learn interface. This
    class itself is NOT a scikit-learn estimator, and so cannot be
    placed in a pipeline.

    Parameters
    ----------
    numfeatures : int
        The number of recordings in time for a single path realisation.
    """
    def __init__(self, numfeatures):
        self.numfeatures = numfeatures

    def flattenData(self, data):
        """Flatten all axes after the first into a single feature axis."""
        highShape = data.shape
        return data.reshape((highShape[0], np.prod(highShape[1:])))

    def recoverHighData(self, data):
        """Invert flattenData, recovering a time axis of length numfeatures."""
        flatShape = data.shape
        last_dim = int(flatShape[1] / self.numfeatures)
        newShape = (flatShape[0], self.numfeatures, last_dim)
        return data.reshape(newShape)
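# Sketch of the intended round trip (shapes are illustrative):
#   prep = TimeSeriesPreprocessor(numfeatures=50)
#   flat = prep.flattenData(data)       # (10, 50, 3) -> (10, 150)
#   high = prep.recoverHighData(flat)   # (10, 150) -> (10, 50, 3)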
class SeqSVC(BaseEstimator, ClassifierMixin):
"""C-Support Vector Classification with Sequentialisation.
The implementation is based on scikit-learn's svm.SVC.
The fit time complexity is more than quadratic with the number of samples
which makes it hard to scale to dataset with more than a couple of 10000
samples.
The multiclass support is handled according to a one-vs-one scheme.
For details on the precise mathematical formulation of the provided
kernel functions and how `gamma`, `coef0` and `degree` affect each
other, see the narrative documentation:
:ref:`svm_kernels`.
Read more in the :ref:`User Guide <svm_classification>`.
Parameters
----------
C : float, optional (default=1.0)
Penalty parameter C of the error term.
kernel : string, optional (default='rbf')
Specifies the base kernel type to be used in the sequentialising
algorithm.
It must be one of 'linear', 'poly', 'rbf', 'sigmoid', or a callable.
If none is given, 'rbf' will be used. If a callable is given its
sequentialisation is used to pre-compute the kernel matrix from data
matrices; that matrix should be an array of shape
``(n_samples, n_samples)``.
degree : int, optional (default=3)
Degree of the polynomial kernel function ('poly').
Ignored by all other kernels.
gamma : float, optional (default='auto')
Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
If gamma is 'auto' then 1/n_features will be used instead.
coef0 : float, optional (default=0.0)
Independent term in kernel function.
It is only significant in 'poly' and 'sigmoid'.
probability : boolean, optional (default=False)
Whether to enable probability estimates. This must be enabled prior
to calling `fit`, and will slow down that method.
shrinking : boolean, optional (default=True)
Whether to use the shrinking heuristic.
tol : float, optional (default=1e-3)
Tolerance for stopping criterion.
cache_size : float, optional
Specify the size of the kernel cache (in MB).
class_weight : {dict, 'balanced'}, optional
Set the parameter C of class i to class_weight[i]*C for
SVC. If not given, all classes are supposed to have
weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``
verbose : bool, default: False
Enable verbose output. Note that this setting takes advantage of a
per-process runtime setting in libsvm that, if enabled, may not work
properly in a multithreaded context.
max_iter : int, optional (default=-1)
Hard limit on iterations within solver, or -1 for no limit.
decision_function_shape : 'ovo', 'ovr' or None, default=None
Whether to return a one-vs-rest ('ovr') decision function of shape
(n_samples, n_classes) as all other classifiers, or the original
one-vs-one ('ovo') decision function of libsvm which has shape
(n_samples, n_classes * (n_classes - 1) / 2).
The default of None will currently behave as 'ovo' for backward
compatibility and raise a deprecation warning, but will change 'ovr'
in 0.19.
.. versionadded:: 0.17
*decision_function_shape='ovr'* is recommended.
.. versionchanged:: 0.17
Deprecated *decision_function_shape='ovo' and None*.
random_state : int seed, RandomState instance, or None (default)
The seed of the pseudo random number generator to use when
shuffling the data for probability estimation.
normalise : bool, optional (default = True)
Scale all the data with the scikit-learn StandardScaler tool.
The classifier is likely to fail if this is set to false, since
SVM classifiers are sensitive to the scale of the data.
scale : float, optional (default = 1)
Kernel coefficient for Gaussian and linear primary kernels.
cut_ord_pair : tuple (of ints), optional (default = (2, 1))
A tuple (M, D) representing the cut-off and order for the
sequentialised of the kernel. This is introduced as a pair in order to
prevent GridSearchCV from instantiating the classifier with the order
value strictly greater than the cut-off value.
n_iter_ : int, optional (default = 1)
A redundant parameter that serves as a hacky fix in order to pass
the check_estimator test.
preprocess : string or int, optional (default = 0)
Creates synthetic higher-dimensional time series out of a dataset of
one-dimensional time-series. If input is an int > 0, then classifier
uses 'lagged' version of dataset. If input is the str 'index', then
classifier adds the time-index to each feature. For more information,
see timeseriestools.
Examples - ADD WHEN FINISHED
--------
See also
--------
SVC
C-Support Vector Classification.
"""
    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma=1.0,
                 coef0=0.0, shrinking=True, probability=False, tol=0.001,
                 cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                 decision_function_shape=None, random_state=None, normalise=True,
                 scale=1.0, cut_ord_pair=(2, 1), n_iter_=1, preprocess=0):
        self.C = C
        self.shrinking = shrinking
        self.probability = probability
        self.tol = tol
        self.cache_size = cache_size
        self.class_weight = class_weight
        self.verbose = verbose
        self.max_iter = max_iter
        self.decision_function_shape = decision_function_shape
        self.random_state = random_state
        self.normalise = normalise
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.scale = scale
        self.cut_ord_pair = cut_ord_pair
        self.n_iter_ = n_iter_  # DOES NOTHING - ALLOWS ESTIMATOR TO PASS CHECK.
        self.preprocess = preprocess
    def fit(self, X, y=None):
        """Instantiate a standard SVM classifier with sequentialised
        kernel and fit it to the data-target pair X, y."""
        # Check that X and y have the correct shape.
        X, y = check_X_y(X, y)
        cut_off = self.cut_ord_pair[0]
        order = self.cut_ord_pair[1]
        # Store the classes seen during fit.
        self.classes_ = unique_labels(y)
        self.X_ = np.array(X)
        self.y_ = np.array(y)
        self.input_shape_ = X.shape
        if self.normalise:
            # Fit the scaler here and reuse it in predict, so that test
            # data is scaled consistently with the training data.
            self.scaler_ = StandardScaler().fit(X)
            X = self.scaler_.transform(X)
        ts_preprocessor = TimeSeriesPreprocessor(X.shape[1])
        if isinstance(self.preprocess, int) and self.preprocess > 0:
            X_high = timeLagMult(X, self.preprocess)
            X = ts_preprocessor.flattenData(X_high)
        elif self.preprocess == 'index':
            X_high = timeIndexMult(X)
            X = ts_preprocessor.flattenData(X_high)
        self.ord_svc_ = SVC(
            C=self.C,
            kernel=partial(seq_kernel_free,
                           pri_kernel=kernselect(self.kernel, self.coef0,
                                                 self.gamma, self.degree,
                                                 self.scale),
                           cut_off=cut_off, order=order),
            degree=self.degree, gamma=self.gamma,
            coef0=self.coef0, shrinking=self.shrinking,
            probability=self.probability, tol=self.tol,
            cache_size=self.cache_size, class_weight=self.class_weight,
            verbose=self.verbose, max_iter=self.max_iter,
            decision_function_shape=self.decision_function_shape,
            random_state=self.random_state)
        self.ord_svc_.fit(X, y)
        return self
    def predict(self, X):
        """Run the predict method of the previously-instantiated SVM
        classifier, returning the predicted classes for test set X."""
        # Check that fit has been called.
        check_is_fitted(self, ['X_', 'y_'])
        # Input validation.
        X = check_array(X)
        if self.normalise:
            # Apply the scaler fitted during training rather than
            # re-fitting a fresh scaler on the test data.
            X = self.scaler_.transform(X)
        ts_preprocessor = TimeSeriesPreprocessor(X.shape[1])
        if isinstance(self.preprocess, int) and self.preprocess > 0:
            X_high = timeLagMult(X, self.preprocess)
            X = ts_preprocessor.flattenData(X_high)
        elif self.preprocess == 'index':
            X_high = timeIndexMult(X)
            X = ts_preprocessor.flattenData(X_high)
        return self.ord_svc_.predict(X)
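And the grid search that triggers the same error looks roughly like this (parameter values are illustrative; X and y are as in the sketch at the top of this message):

from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1.0, 10.0],
    'gamma': [0.5, 1.0],
    # Passing (cut-off, order) as a single pair keeps GridSearchCV from
    # ever instantiating an order greater than the cut-off.
    'cut_ord_pair': [(2, 1), (3, 1), (3, 2)],
}
search = GridSearchCV(SeqSVC(kernel='rbf'), param_grid, cv=3)
search.fit(X, y)   # the overflow surfaces during the internal CV splits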
import numpy as np
def timeLag(vec, n):
    """Turn a one-dimensional array vec into a two-dimensional array such
    that, if vec = [x_0, ..., x_{N-1}], the output equals
    [[x_{-n}, ..., x_0], ..., [x_{N-1-n}, ..., x_{N-1}]].
    If k-n < 0, then x_{k-n} is set to x_0."""
    # Initialise the lagged array with zeros.
    vec_lag = np.zeros((len(vec), n + 1))
    vec_lag[:, n] = vec
    for ind in range(1, n + 1):
        vec_lag[:, n - ind] = np.roll(vec, ind)
        # Replace the wrapped-around entries with the first value.
        vec_lag[0:ind, n - ind] = vec_lag[0, n]
    return vec_lag
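# For example, timeLag(np.array([5., 6., 7.]), 2) gives
#   [[5., 5., 5.],
#    [5., 5., 6.],
#    [5., 6., 7.]]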
def timeIndex(vec):
    """Turn a one-dimensional array vec into a two-dimensional array such
    that, if vec = [x_0, x_1, x_2, ...], the output equals
    [[x_0, 0], [x_1, 1], [x_2, 2], ...]."""
    vec_index = np.zeros((len(vec), 2))
    vec_index[:, 0] = vec
    vec_index[:, 1] = np.arange(len(vec))
    return vec_index
def timeLagMult(arr, n):
    """Apply timeLag to each row of a 2D array."""
    arr_lag = np.zeros((arr.shape[0], arr.shape[1], n + 1))
    for row_ind in range(arr.shape[0]):
        arr_lag[row_ind, :, :] = timeLag(arr[row_ind, :], n)
    return arr_lag

def timeIndexMult(arr):
    """Apply timeIndex to each row of a 2D array."""
    arr_index = np.zeros((arr.shape[0], arr.shape[1], 2))
    for row_ind in range(arr.shape[0]):
        arr_index[row_ind, :, :] = timeIndex(arr[row_ind, :])
    return arr_index
