Dear PyMVPA users,
Since I couldn't find an implementation of "shrinkage LDA" (e.g.,
Pereira & Botvinick, NeuroImage, 2011) in PyMVPA, I implemented my own
based on Gaussian Discriminant Analysis (gda.py). I figured it could be
helpful for other users as well.
It is considerably faster than using the scikit-learn implementation
through the SKLLearnerAdapter, which I have been using until now:
SKLLearnerAdapter(LinearDiscriminantAnalysis(solver='lsqr',
shrinkage='auto'))
It is also a little more flexible because you can choose between the
Ledoit-Wolf estimator and the Oracle Approximating Shrinkage.
May it be useful.
Cheers,
Michael
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the PyMVPA package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Gaussian Discriminant Analyses: LDA and QDA
Basic implementation at the moment: no data sphering, nor
dimensionality reduction tricks are in place ATM
"""
"""
TODO:
* too much in common with GNB -- LDA/QDA/GNB could reuse much of machinery
* provide actual probabilities computation as in GNB
* LDA/QDA -- make use of data sphering and may be operating in the
subspace of centroids
Was based on GNB code
"""
__docformat__ = 'restructuredtext'
import numpy as np
from numpy import ones, zeros, sum, abs, isfinite, dot
from mvpa2.base import warning, externals
from mvpa2.clfs.base import Classifier, accepts_dataset_as_samples
from mvpa2.base.learner import DegenerateInputError
from mvpa2.base.param import Parameter
from mvpa2.base.constraints import EnsureChoice
from mvpa2.base.state import ConditionalAttribute
#from mvpa2.measures.base import Sensitivity
from sklearn.covariance import LedoitWolf, OAS
if __debug__:
from mvpa2.base import debug
__all__ = [ "LDA", "QDA", "ShrinkageLDA" ]
class GDA(Classifier):
    """Gaussian Discriminant Analysis -- base for LDA and QDA

    Computes per-class means and (unnormalized) SSCP matrices during
    training; scaling of those into covariance estimates and the actual
    decision function `_g_k` are provided by the subclasses.
    """

    __tags__ = ['binary', 'multiclass', 'oneclass']

    prior = Parameter('laplacian_smoothing',
        constraints=EnsureChoice('laplacian_smoothing', 'uniform', 'ratio'),
        doc="""How to compute prior distribution.""")

    allow_pinv = Parameter(True,
        constraints='bool',
        doc="""Allow pseudo-inverse in case of degenerate covariance(s).""")

    def __init__(self, **kwargs):
        """Initialize a GDA classifier.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        # means of features per class (nlabels x nfeatures), set in _train
        self.means = None
        # per-class covariance/SSCP matrices; named "cov" since "vars" is taken ;)
        self.cov = None
        # labels classifier was trained on
        self.ulabels = None
        # class prior probabilities
        self.priors = None
        # number of samples per class - used by derived classes
        self.nsamples_per_class = None
        # Define internal state of classifier
        self._norm_weight = None

    def _get_priors(self, nlabels, nsamples, nsamples_per_class):
        """Return prior probabilities given data

        Parameters
        ----------
        nlabels : int
          Number of distinct class labels
        nsamples : int
          Total number of training samples
        nsamples_per_class : array
          Per-class sample counts

        Raises
        ------
        ValueError
          If ``params.prior`` is not one of the known strategies.
        """
        prior = self.params.prior
        if prior == 'uniform':
            priors = np.ones((nlabels,))/nlabels
        elif prior == 'laplacian_smoothing':
            # add-one smoothing avoids zero priors for empty classes
            priors = (1+np.squeeze(nsamples_per_class)) \
                     / (float(nsamples) + nlabels)
        elif prior == 'ratio':
            priors = np.squeeze(nsamples_per_class) / float(nsamples)
        else:
            # raise E(msg) works on both Python 2 and 3, unlike 'raise E, msg'
            raise ValueError(
                "No idea on how to handle '%s' way to compute priors"
                % self.params.prior)
        return np.atleast_1d(priors)

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).

        Fills in means, per-class SSCP matrices (scaling into proper
        covariances is left to LDA/QDA), sample counts and priors.
        """
        targets_sa_name = self.get_space()
        targets_sa = dataset.sa[targets_sa_name]

        # get the dataset information into easy vars
        X = dataset.samples
        labels = targets_sa.value
        self.ulabels = ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))

        # set the feature dimensions
        nsamples = len(X)
        nfeatures = dataset.nfeatures

        self.means = means = \
            np.zeros((nlabels, nfeatures))
        # degenerate dimension are added for easy broadcasting later on
        # XXX might want to remove -- for now taken from GNB as is
        self.nsamples_per_class = nsamples_per_class \
            = np.zeros((nlabels, 1))
        self.cov = cov = \
            np.zeros((nlabels, nfeatures, nfeatures))

        # Estimate cov
        # better loop than repmat! ;)
        # .items() instead of Python-2-only .iteritems()
        for l, il in label2index.items():
            Xl = X[labels == l]
            nsamples_per_class[il] = len(Xl)
            # TODO: degenerate case... no samples for known label for
            #       some reason?
            means[il] = np.mean(Xl, axis=0)
            # since we have means already lets do manually cov here
            Xldm = Xl - means[il]
            cov[il] = np.dot(Xldm.T, Xldm)
            # scaling will be done correspondingly in LDA or QDA

        # Store prior probabilities
        self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class)

        if __debug__ and 'GDA' in debug.active:
            debug('GDA', "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (np.min(X), np.max(X)))

    def _untrain(self):
        """Untrain classifier and reset all learnt params
        """
        self.means = None
        self.cov = None
        self.ulabels = None
        self.priors = None
        # also reset per-class counts assigned in _train (was left stale)
        self.nsamples_per_class = None
        super(GDA, self)._untrain()

    @accepts_dataset_as_samples
    def _predict(self, data):
        """Predict the output for the provided data.

        Picks, per sample, the label with the maximal decision-function
        value as computed by the subclass' ``_g_k``.
        """
        self.ca.estimates = prob_cs_cp = self._g_k(data)

        # Take the class with maximal (log)probability
        # XXX in GNB it is axis=0, i.e. classes were first
        winners = prob_cs_cp.argmax(axis=1)
        predictions = [self.ulabels[c] for c in winners]

        if __debug__ and 'GDA' in debug.active:
            debug('GDA', "predict on data.shape=%s min:max(data)=%f:%f " %
                  (data.shape, np.min(data), np.max(data)))

        return predictions

    def _inv(self, cov):
        """Invert a covariance matrix, falling back to the pseudo-inverse.

        Raises
        ------
        DegenerateInputError
          If inversion fails and either pinv is disallowed or also fails.
        """
        try:
            return np.linalg.inv(cov)
        except Exception as exc:
            # keep our own reference: Python 3 unbinds the 'as' name once
            # the except block is left
            last_exc = exc
            if self.params.allow_pinv:
                try:
                    return np.linalg.pinv(cov)
                except Exception as exc:
                    last_exc = exc
            raise DegenerateInputError(
                "Data is probably singular, since inverse fails. Got %s"
                % (last_exc,))
class LDA(GDA):
    """Linear Discriminant Analysis.

    Pools the per-class SSCP matrices computed by GDA into a single
    common covariance estimate, which yields linear decision boundaries
    precomputed as a weight matrix ``_w`` and offsets ``_b``.
    """

    __tags__ = GDA.__tags__ + ['linear', 'lda']

    def _untrain(self):
        # reset the precomputed hyperplane parameters set in _train
        self._w = None
        self._b = None
        super(LDA, self)._untrain()

    def _train(self, dataset):
        super(LDA, self)._train(dataset)
        nlabels = len(self.ulabels)
        # Sum and scale the covariance: pooled within-class covariance with
        # N - nlabels degrees of freedom
        self.cov = cov = \
            np.sum(self.cov, axis=0) \
            / (np.sum(self.nsamples_per_class) - nlabels)

        # For now as simple as that -- see notes on top
        covi = self._inv(cov)

        # Precompute and store the actual separating hyperplane and offset
        self._w = np.dot(covi, self.means.T)
        self._b = b = np.zeros((nlabels,))
        # range() instead of Python-2-only xrange()
        for il in range(nlabels):
            m = self.means[il]
            # log prior minus half the Mahalanobis norm of the class mean
            b[il] = np.log(self.priors[il]) - 0.5 * np.dot(np.dot(m.T, covi), m)

    def _g_k(self, data):
        """Return decision function values"""
        return np.dot(data, self._w) + self._b
class QDA(GDA):
    """Quadratic Discriminant Analysis.

    Unlike LDA, keeps a separate covariance estimate per class, so the
    resulting decision boundaries are quadratic in the features.
    """

    __tags__ = GDA.__tags__ + ['non-linear', 'qda']

    def _untrain(self):
        # XXX theoretically we could use the same _w although with
        # different "content"
        self._icov = None
        self._b = None
        super(QDA, self)._untrain()

    def _train(self, dataset):
        super(QDA, self)._train(dataset)
        # XXX should we drag cov around at all then?
        # Normalize each per-class SSCP matrix into a covariance estimate
        # (in place, so self.cov ends up scaled too) and invert it
        self._icov = np.zeros(self.cov.shape)
        for idx in range(len(self.cov)):
            self.cov[idx] /= float(self.nsamples_per_class[idx])
            self._icov[idx] = self._inv(self.cov[idx])
        # Per-class offsets: log prior minus half the log-determinant
        self._b = np.array(
            [np.log(prior) - 0.5 * np.log(np.linalg.det(cov_))
             for prior, cov_ in zip(self.priors, self.cov)])

    def _g_k(self, data):
        """Return decision function values"""
        values = []
        for mean, icov, offset in zip(self.means, self._icov, self._b):
            centered = data - mean
            values.append(
                offset - 0.5 * np.sum(np.dot(centered, icov) * centered,
                                      axis=1))
        return np.array(values).T
class ShrinkageLDA(GDA):
    """LDA with a shrinkage estimate of the covariance matrix.

    The common covariance is estimated on class-mean-centered samples
    using either the Ledoit-Wolf estimator or the Oracle Approximating
    Shrinkage (OAS) estimator from scikit-learn.
    """

    __tags__ = GDA.__tags__ + ['linear', 'lda', 'ledoit-wolf_shrinkage']

    def __init__(self, shrinkage_estimator='LedoitWolf', block_size=1000,
                 **kwargs):
        """
        Parameters
        ----------
        shrinkage_estimator : {'LedoitWolf', 'OAS'}
          Which sklearn covariance estimator to use.
        block_size : int
          Block size passed to the Ledoit-Wolf estimator (ignored for OAS).

        Raises
        ------
        ValueError
          If an unknown estimator name is given.
        """
        super(ShrinkageLDA, self).__init__(**kwargs)
        self.block_size = block_size
        # == rather than 'is': identity comparison of strings works only by
        # CPython interning accident
        if shrinkage_estimator == 'LedoitWolf':
            self.shrinkage_estimator = LedoitWolf(block_size=block_size)
        elif shrinkage_estimator == 'OAS':
            self.shrinkage_estimator = OAS()
        else:
            raise ValueError("Unknown estimator '%s'" % shrinkage_estimator)

    def _untrain(self):
        # reset precomputed hyperplane parameters (mirrors LDA._untrain;
        # was previously not done at all for this class)
        self._w = None
        self._b = None
        super(ShrinkageLDA, self)._untrain()

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        # Adapted from GDA's method _train. Couldn't use that implementation
        # because it only computes the class-dependent SSCP matrices. The
        # shrinkage coefficient, however, needs to be computed from the whole
        # time series (with samples centered on their respective class means)
        targets_sa_name = self.get_space()
        targets_sa = dataset.sa[targets_sa_name]

        # get the dataset information into easy vars
        X = dataset.samples
        labels = targets_sa.value
        self.ulabels = ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))

        # set the feature dimensions
        nsamples = len(X)
        nfeatures = dataset.nfeatures

        self.means = means = \
            np.zeros((nlabels, nfeatures))
        # degenerate dimension are added for easy broadcasting later on
        # XXX might want to remove -- for now taken from GNB as is
        self.nsamples_per_class = nsamples_per_class \
            = np.zeros((nlabels, 1))
        # NOTE: no per-class covariance buffer here -- self.cov is assigned
        # the single pooled shrinkage estimate below (the original allocated
        # an (nlabels, nfeatures, nfeatures) zeros array just to discard it)

        # Subtract class means to compute within-class covariance later
        # better loop than repmat! ;)
        Xdm = list()
        for l, il in label2index.items():
            Xl = X[labels == l]
            nsamples_per_class[il] = len(Xl)
            # TODO: degenerate case... no samples for known label for
            #       some reason?
            means[il] = np.mean(Xl, axis=0)
            # Instead of calculating within-class sum of squares, only subtract
            # the class mean here.
            Xdm.append(Xl - means[il])
        # Concatenate the centered time series
        Xdm = np.vstack(Xdm)

        # Store prior probabilities
        self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class)

        if __debug__ and 'ShrinkageLDA' in debug.active:
            debug('ShrinkageLDA',
                  "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (np.min(X), np.max(X)))

        # Use shrinkage estimator to compute covariance matrix
        self.cov = cov = self.shrinkage_estimator.fit(Xdm).covariance_
        # For now as simple as that -- see notes on top
        covi = self._inv(cov)

        # Precompute and store the actual separating hyperplane and offset
        self._w = np.dot(covi, self.means.T)
        self._b = b = np.zeros((nlabels,))
        # range() instead of Python-2-only xrange()
        for il in range(nlabels):
            m = self.means[il]
            b[il] = np.log(self.priors[il]) - 0.5 * np.dot(np.dot(m.T, covi), m)

    def _g_k(self, data):
        """Return decision function values"""
        return np.dot(data, self._w) + self._b
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the PyMVPA package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Collection of classifiers to ease the exploration.
"""
__docformat__ = 'restructuredtext'
from mvpa2.base.types import is_sequence_type
# Define sets of classifiers
from mvpa2.clfs.meta import FeatureSelectionClassifier, SplitClassifier, \
MulticlassClassifier, RegressionAsClassifier
from mvpa2.clfs.smlr import SMLR
from mvpa2.clfs.knn import kNN
from mvpa2.clfs.gda import LDA, QDA, ShrinkageLDA
from mvpa2.clfs.gnb import GNB
from mvpa2.kernels.np import LinearKernel, SquaredExponentialKernel, \
GeneralizedLinearKernel
from mvpa2.featsel.rfe import SplitRFE
from mvpa2.clfs.dummies import RandomClassifier
# Helpers
from mvpa2.base import externals, cfg
from mvpa2.measures.anova import OneWayAnova
from mvpa2.mappers.fx import absolute_features, maxofabs_sample
from mvpa2.clfs.smlr import SMLRWeights
from mvpa2.featsel.helpers import FractionTailSelector, \
FixedNElementTailSelector, RangeElementSelector
from mvpa2.generators.partition import OddEvenPartitioner
from mvpa2.featsel.base import SensitivityBasedFeatureSelection
# Kernels
from mvpa2.kernels.libsvm import LinearLSKernel, RbfLSKernel, \
PolyLSKernel, SigmoidLSKernel
_KNOWN_INTERNALS = [ 'knn', 'binary', 'svm', 'linear',
'smlr', 'does_feature_selection', 'has_sensitivity',
'multiclass', 'non-linear', 'kernel-based', 'lars',
'regression', 'regression_based', 'random_tie_breaking',
'non-deterministic', 'needs_population',
'libsvm', 'sg', 'meta', 'retrainable', 'gpr',
'notrain2predict', 'ridge', 'blr', 'gnpp', 'enet', 'glmnet',
'gnb', 'plr', 'rpy2', 'swig', 'skl', 'lda', 'qda',
'random-forest', 'extra-trees', 'random',
# oneclass-binary can provide binary output for the labels
# oneclass would always output a single label but with
# estimates/probabilities of interest
'oneclass', 'oneclass-binary']
class Warehouse(object):
    """Class to keep known instantiated classifiers

    Should provide easy ways to select classifiers of needed kind:
    clfswh['linear', 'svm'] should return all linear SVMs
    clfswh['linear', 'multiclass'] should return all linear classifiers
    capable of doing multiclass classification
    """

    def __init__(self, known_tags=None, matches=None):
        """Initialize warehouse

        Parameters
        ----------
        known_tags : list of str
          List of known tags
        matches : dict
          Optional dictionary of additional matches. E.g. since any
          regression can be used as a binary classifier,
          matches={'binary':['regression']}, would allow to provide
          regressions also if 'binary' was requested
        """
        self._known_tags = set(known_tags)
        self.__items = []
        self.__keys = set()
        if matches is None:
            matches = {}
        self.__matches = matches
        # mapping of item.descr -> item for lookup by description
        self.__descriptions = {}

    def __getitem__(self, *args):
        """Return all registered items matching every given tag.

        A tag prefixed with '!' rejects items carrying that tag; a bare
        [:] slice returns all items.  Raises ValueError for unknown tags.
        """
        if isinstance(args[0], tuple):
            args = args[0]

        # so we explicitely handle [:]
        if args == (slice(None),):
            args = []

        # lets remove optional modifier '!'
        dargs = set([str(x).lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(dargs)>0:
            # raise E(msg) works on both Python 2 and 3, unlike 'raise E, msg'
            raise ValueError(
                "Unknown internals %s requested. Known are %s" %
                (list(dargs), list(self._known_tags)))

        # dummy implementation for now
        result = []
        # check every known item
        for item in self.__items:
            good = True
            # by default each one counts
            for arg in args:
                # check for rejection first
                if arg.startswith('!'):
                    if (arg[1:] in item.__tags__):
                        good = False
                        break
                    else:
                        continue
                # check for inclusion
                found = False
                for arg in [arg] + self.__matches.get(arg, []):
                    if (arg in item.__tags__):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        """Register an item (or a sequence of items).

        Raises ValueError if the item lacks __tags__, has empty __tags__,
        carries unknown tags, or reuses an existing description.
        """
        if is_sequence_type(item):
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '__tags__'):
                raise ValueError("Cannot register %s " % item +
                                 "which has no __tags__ defined")
            if item.descr in self.__descriptions:
                raise ValueError("Cannot register %s, " % item +
                                 "an item with descriptions '%s' already exists"
                                 % item.descr)
            if len(item.__tags__) == 0:
                raise ValueError("Cannot register %s " % item +
                                 "which has empty __tags__")
            clf_internals = set(item.__tags__)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError('Unknown clf internal(s) %s' %
                                 clf_internals.difference(self._known_tags))
            # access by descr
            self.__descriptions[item.descr] = item
        return self

    def get_by_descr(self, descr):
        """Return the registered item with the given description."""
        return self.__descriptions[descr]

    @property
    def internals(self):
        """Known internal tags of the classifiers
        """
        return self.__keys

    def listing(self):
        """Listing (description + internals) of registered items
        """
        return [(x.descr, x.__tags__) for x in self.__items]

    def print_registered(self, *args):
        """Print description and tags of registered items, sorted by descr."""
        if not len(args):
            # proper one-element tuple; the original's (slice(None)) was a
            # bare slice that __getitem__ only handled by accident
            args = (slice(None),)
        import numpy as np
        import textwrap
        # sort by description
        for lrn in sorted(self.__getitem__(args), key=lambda x: x.descr.lower()):
            # parenthesized call form prints identically on Python 2 and 3
            print('%s\n%s' % (
                lrn.descr,
                textwrap.fill(', '.join(np.unique(lrn.__tags__)), 70,
                              initial_indent=' ' * 4,
                              subsequent_indent=' ' * 4)
            ))

    @property
    def items(self):
        """Registered items
        """
        return self.__items

    @property
    def descriptions(self):
        """Descriptions of registered items"""
        return self.__descriptions.keys()
# The two module-level warehouses: every learner constructed below registers
# itself into one of these for tag-based lookup by the rest of the test suite.
clfswh = Warehouse(known_tags=_KNOWN_INTERNALS) # classifiers
regrswh = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions

# NB:
#  - Nu-classifiers are turned off since for haxby DS default nu
#    is an 'infisible' one
#  - Python's SMLR is turned off for the duration of development
#    since it is slow and results should be the same as of C version
#
clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
            SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
            #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"),
            #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"),
            #SMLR(implementation="Python", descr="SMLR(Python)")
            ]

clfswh += \
     [ MulticlassClassifier(SMLR(lm=0.1),
                            descr='Pairs+maxvote multiclass on SMLR(lm=0.1)') ]

# chance-level classifiers, useful as baselines
clfswh += [ RandomClassifier(descr="Random"),
            RandomClassifier(same=True, descr="RandomSame"),
            ]

if externals.exists('libsvm'):
    from mvpa2.clfs.libsvmc import svm as libsvm
    clfswh._known_tags.update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    # NOTE(review): C=-10.0 -- presumably a negative C means "scale the
    # data-derived default C by |C|"; confirm against the SVM implementation
    clfswh += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1),
               libsvm.SVM(
                   C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1),
               libsvm.SVM(
                   C=1.0, descr="libsvm.LinSVM(C=1)", probability=1),
               libsvm.SVM(svm_impl='NU_SVC',
                          descr="libsvm.LinNuSVM(nu=def)", probability=1)
               ]
    # non-linear libsvm kernels
    clfswh += [libsvm.SVM(kernel=RbfLSKernel(), descr="libsvm.RbfSVM()"),
               libsvm.SVM(kernel=RbfLSKernel(), svm_impl='NU_SVC',
                          descr="libsvm.RbfNuSVM(nu=def)"),
               libsvm.SVM(kernel=PolyLSKernel(),
                          descr='libsvm.PolySVM()', probability=1),
               #libsvm.svm.SVM(kernel=SigmoidLSKernel(),
               #               svm_impl='C_SVC',
               #               descr='libsvm.SigmoidSVM()'),
               ]

    # regressions
    regrswh._known_tags.update(['EPSILON_SVR', 'NU_SVR'])
    regrswh += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR'),
                libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR')]
# Register shogun-backed SVMs (classifiers and regressions), skipping
# implementations known to be broken.
if externals.exists('shogun'):
    from mvpa2.clfs import sg
    from mvpa2.kernels.sg import LinearSGKernel, PolySGKernel, RbfSGKernel
    clfswh._known_tags.update(sg.SVM._KNOWN_IMPLEMENTATIONS)

    # TODO: some classifiers are not yet ready to be used out-of-the-box in
    # PyMVPA, thus we don't populate warehouse with their instances
    bad_classifiers = [
        'mpd',  # was segfault, now non-training on testcases, and XOR.
                # and was described as "for educational purposes", thus
                # shouldn't be used for real data ;-)
                # Should be a drop-in replacement for lightsvm
        'gpbt', # fails to train for testAnalyzerWithSplitClassifier
                # also 'retraining' doesn't work -- fails to generalize
        'gmnp', # would fail with 'assertion Cache_Size > 2'
                # if shogun < 0.6.3, also refuses to train
        'svrlight', # fails to 'generalize' as a binary classifier
                    # after 'binning'
        'krr', # fails to generalize
        'svmocas', # fails to generalize
        'libsvr' # XXXregr removing regressions as classifiers
        ]
    if not externals.exists('sg_fixedcachesize'):
        # would fail with 'assertion Cache_Size > 2' if shogun < 0.6.3
        bad_classifiers.append('gnpp')

    for impl in sg.SVM._KNOWN_IMPLEMENTATIONS:
        # Uncomment the ones to disable
        if impl in bad_classifiers:
            continue
        clfswh += [
            sg.SVM(
                descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl),
            ]
        if not impl in ['svmocas']: # inherently linear only
            clfswh += [
                sg.SVM(kernel=RbfSGKernel(),
                       descr="sg.RbfSVM()/%s" % impl, svm_impl=impl),
                # sg.SVM(kernel=RbfSGKernel(),
                #        descr="sg.RbfSVM(gamma=0.1)/%s"
                #        % impl, svm_impl=impl, gamma=0.1),
                # sg.SVM(descr="sg.SigmoidSVM()/%s"
                #        % impl, svm_impl=impl, kernel=SigmoidSGKernel(),),
                ]

    # shogun regressions: libsvr always, krr only for new-enough shogun
    _optional_regressions = []
    if externals.exists('shogun.krr') and externals.versions['shogun'] >= '0.9':
        _optional_regressions += ['krr']
    for impl in ['libsvr'] + _optional_regressions:# \
        # XXX svrlight sucks in SG -- dont' have time to figure it out
        #+ ([], ['svrlight'])['svrlight' in sg.SVM._KNOWN_IMPLEMENTATIONS]:
        regrswh._known_tags.update([impl])
        regrswh += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl),
                     #sg.SVM(svm_impl=impl, kernel_type='RBF',
                     #       descr='sg.RBFSVMR()/%s' % impl),
                     ]
if len(clfswh['svm', 'linear']) > 0:
    # if any SVM implementation is known, import default ones
    from mvpa2.clfs.svm import *

# lars from R via RPy
if externals.exists('lars'):
    import mvpa2.clfs.lars as lars
    from mvpa2.clfs.lars import LARS

    for model in lars.known_models:
        # XXX create proper repository of classifiers!
        lars_clf = RegressionAsClassifier(
            LARS(descr="LARS(%s)" % model,
                 model_type=model),
            descr='LARS(model_type=%r) classifier' % model)
        clfswh += lars_clf

        # is a regression, too
        lars_regr = LARS(descr="_LARS(%s)" % model,
                         model_type=model)
        regrswh += lars_regr
        # clfswh += MulticlassClassifier(lars,
        #             descr='Multiclass %s' % lars.descr)

## Still fails unittests battery although overhauled otherwise.
## # enet from R via RPy2
## if externals.exists('elasticnet'):
##     from mvpa2.clfs.enet import ENET
##     clfswh += RegressionAsClassifier(ENET(),
##                                      descr="RegressionAsClassifier(ENET())")
##     regrswh += ENET(descr="ENET()")

# glmnet from R via RPy
if externals.exists('glmnet'):
    from mvpa2.clfs.glmnet import GLMNET_C, GLMNET_R
    clfswh += GLMNET_C(descr="GLMNET_C()")
    regrswh += GLMNET_R(descr="GLMNET_R()")

# LDA/QDA
clfswh += LDA(descr='LDA()')
clfswh += QDA(descr='QDA()')
# NOTE(review): ShrinkageLDA is imported at the top of this module but never
# registered here -- presumably an oversight; before registering it make sure
# its 'ledoit-wolf_shrinkage' tag is present in _KNOWN_INTERNALS, otherwise
# the warehouse raises ValueError.
# scikit-learn adapters
# NOTE(review): the version comparisons below rely on externals.versions
# returning proper version objects (not plain strings); plain lexicographic
# string comparison would misorder e.g. '0.10' vs '0.9' -- confirm.
if externals.exists('skl'):
    _skl_version = externals.versions['skl']
    _skl_api09 = _skl_version >= '0.9'

    def _skl_import(submod, class_):
        # import helper coping with the scikits.learn -> sklearn rename
        if _skl_api09:
            submod_ = __import__('sklearn.%s' % submod, fromlist=[submod])
        else:
            submod_ = __import__('scikits.learn.%s' % submod, fromlist=[submod])
        return getattr(submod_, class_)

    # LDA moved modules across sklearn releases
    if _skl_version >= "0.17":
        sklLDA = _skl_import('discriminant_analysis', 'LinearDiscriminantAnalysis')
    else:
        sklLDA = _skl_import('lda', 'LDA')
    from mvpa2.clfs.skl.base import SKLLearnerAdapter
    clfswh += SKLLearnerAdapter(sklLDA(),
                                tags=['lda', 'linear', 'multiclass', 'binary'],
                                descr='skl.LDA()')

    if _skl_version >= '0.10':
        # Out of Bag Estimates
        sklRandomForestClassifier = _skl_import('ensemble', 'RandomForestClassifier')
        clfswh += SKLLearnerAdapter(sklRandomForestClassifier(),
                                    tags=['random-forest', 'linear', 'non-linear',
                                          'binary', 'multiclass',
                                          'oneclass',
                                          'non-deterministic', 'needs_population',],
                                    descr='skl.RandomForestClassifier()')
        sklRandomForestRegression = _skl_import('ensemble', 'RandomForestRegressor')
        regrswh += SKLLearnerAdapter(sklRandomForestRegression(),
                                     tags=['random-forest', 'linear', 'non-linear',
                                           'regression',
                                           'non-deterministic', 'needs_population',],
                                     descr='skl.RandomForestRegression()')

        sklExtraTreesClassifier = _skl_import('ensemble', 'ExtraTreesClassifier')
        clfswh += SKLLearnerAdapter(sklExtraTreesClassifier(),
                                    tags=['extra-trees', 'linear', 'non-linear',
                                          'binary', 'multiclass',
                                          'oneclass',
                                          'non-deterministic', 'needs_population',],
                                    descr='skl.ExtraTreesClassifier()')
        sklExtraTreesRegression = _skl_import('ensemble', 'ExtraTreesRegressor')
        regrswh += SKLLearnerAdapter(sklExtraTreesRegression(),
                                     tags=['extra-trees', 'linear', 'non-linear',
                                           'regression',
                                           'non-deterministic', 'needs_population',],
                                     descr='skl.ExtraTreesRegression()')

    if _skl_version >= '0.8':
        # PLSRegression moved modules across sklearn releases
        if _skl_version >= '0.14':
            sklPLSRegression = _skl_import('cross_decomposition', 'PLSRegression')
        else:
            sklPLSRegression = _skl_import('pls', 'PLSRegression')
        # somewhat silly use of PLS, but oh well
        regrswh += SKLLearnerAdapter(sklPLSRegression(n_components=1),
                                     tags=['linear', 'regression'],
                                     enforce_dim=1,
                                     descr='skl.PLSRegression_1d()')

    if externals.versions['skl'] >= '0.6.0':
        # class names were upper-cased before the 0.9 API
        sklLars = _skl_import('linear_model',
                              _skl_api09 and 'Lars' or 'LARS')
        sklLassoLars = _skl_import('linear_model',
                                   _skl_api09 and 'LassoLars' or 'LassoLARS')
        sklElasticNet = _skl_import('linear_model', 'ElasticNet')

        _lars_tags = ['lars', 'linear', 'regression', 'does_feature_selection']
        _lars = SKLLearnerAdapter(sklLars(),
                                  tags=_lars_tags,
                                  descr='skl.Lars()')
        _lasso_lars = SKLLearnerAdapter(sklLassoLars(alpha=0.01),
                                        tags=_lars_tags,
                                        descr='skl.LassoLars()')
        # the 'rho' keyword was renamed to 'l1_ratio' in sklearn 0.13
        _elastic_net = SKLLearnerAdapter(
            sklElasticNet(alpha=.01,
                          **{'l1_ratio' if externals.versions['skl'] >= '0.13'
                             else 'rho': .3}),
            tags=['enet', 'regression', 'linear', # 'has_sensitivity',
                  'does_feature_selection'],
            descr='skl.ElasticNet()')

        regrswh += [_lars, _lasso_lars, _elastic_net]
        clfswh += [RegressionAsClassifier(_lars, descr="skl.Lars_C()"),
                   RegressionAsClassifier(_lasso_lars, descr="skl.LassoLars_C()"),
                   RegressionAsClassifier(_elastic_net, descr="skl.ElasticNet_C()"),
                   ]

        if _skl_version >= '0.10':
            sklLassoLarsIC = _skl_import('linear_model', 'LassoLarsIC')
            _lasso_lars_ic = SKLLearnerAdapter(sklLassoLarsIC(),
                                               tags=_lars_tags,
                                               descr='skl.LassoLarsIC()')
            regrswh += [_lasso_lars_ic]
            clfswh += [RegressionAsClassifier(_lasso_lars_ic,
                                              descr='skl.LassoLarsIC_C()')]
# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

# kNN on various feature-selection frontends
clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")

# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \
    FeatureSelectionClassifier(
        GNB(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="GNB on 5%(ANOVA)")

# GPR
if externals.exists('scipy'):
    from mvpa2.clfs.gpr import GPR

    regrswh += GPR(kernel=LinearKernel(), descr="GPR(kernel='linear')")
    regrswh += GPR(kernel=SquaredExponentialKernel(),
                   descr="GPR(kernel='sqexp')")

    # Add wrapped GPR as a classifier
    gprcb = RegressionAsClassifier(
        GPR(kernel=GeneralizedLinearKernel()), descr="GPRC(kernel='linear')")
    # lets remove multiclass label from it
    gprcb.__tags__.pop(gprcb.__tags__.index('multiclass'))
    clfswh += gprcb

    # and create a proper multiclass one
    clfswh += MulticlassClassifier(
        RegressionAsClassifier(
            GPR(kernel=GeneralizedLinearKernel())),
        descr="GPRCM(kernel='linear')")

# BLR
from mvpa2.clfs.blr import BLR
clfswh += RegressionAsClassifier(BLR(descr="BLR()"),
                                 descr="BLR Classifier")

#PLR
from mvpa2.clfs.plr import PLR
clfswh += PLR(descr="PLR()")
if externals.exists('scipy'):
    # NOTE(review): descr says 0.01 but the parameter is reduced=0.05 --
    # likely a typo in the description; confirm before changing the
    # registered descr since warehouse lookups key on it
    clfswh += PLR(reduced=0.05, descr="PLR(reduced=0.01)")
# SVM stuff
if len(clfswh['linear', 'svm']) > 0:
    # pick the linear SVM of the configured default backend as the workhorse
    # for the meta-classifiers below
    linearSVMC = clfswh['linear', 'svm',
                        cfg.get('svm', 'backend', default='libsvm').lower()
                        ][0]

    # "Interesting" classifiers
    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=0.1, implementation="C"),
                           postproc=maxofabs_sample()),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=0.1) non-0")

    # NOTE: the same cloned instance serves both as the final estimator and
    # as the classifier inside the RFE
    _rfeclf = linearSVMC.clone()
    clfswh += \
        FeatureSelectionClassifier(
            _rfeclf,
            SplitRFE(
                _rfeclf,
                OddEvenPartitioner(),
                fselector=FractionTailSelector(
                    0.2, mode='discard', tail='lower')),
            descr="LinSVM with nested-CV RFE")

    _clfs_splitrfe_svm_anova = \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SplitRFE(
                linearSVMC.clone(),
                OddEvenPartitioner(),
                fselector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                fmeasure=OneWayAnova()),
            descr="LinSVM with nested-CV Anova RFE")
    clfswh += _clfs_splitrfe_svm_anova

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C"),
                           postproc=maxofabs_sample()),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=1) non-0")

    # "Interesting" classifiers
    clfswh += \
        FeatureSelectionClassifier(
            RbfCSVMC(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C"),
                           postproc=maxofabs_sample()),
               RangeElementSelector(mode='select')),
            descr="RbfSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.get_sensitivity_analyzer(postproc=maxofabs_sample()),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(SVM)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.get_sensitivity_analyzer(postproc=maxofabs_sample()),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")

    ### Imports which are specific to RFEs
    # from mvpa2.datasets.splitters import OddEvenSplitter
    # from mvpa2.clfs.transerror import TransferError
    # from mvpa2.featsel.rfe import RFE
    # from mvpa2.featsel.helpers import FixedErrorThresholdStopCrit
    # from mvpa2.clfs.transerror import ConfusionBasedError

    # SVM with unbiased RFE -- transfer-error to another splits, or in
    # other terms leave-1-out error on the same dataset
    # Has to be bound outside of the RFE definition since both analyzer and
    # error should use the same instance.
    rfesvm_split = SplitClassifier(linearSVMC)#clfswh['LinearSVMC'][0])

    # "Almost" classical RFE. If this works it would differ only that
    # our transfer_error is based on internal splitting and classifier used
    # within RFE is a split classifier and its sensitivities per split will get
    # averaged
    #

    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(), #clfswh['LinearSVMC'][0],  # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
    #        transfer_error=ConfusionBasedError(
    #           rfesvm_split,
    #           confusion_state="confusion"),
    #           # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(splits_avg)' )
    #
    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),                 # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
    #        transfer_error=ConfusionBasedError(
    #           rfesvm_split,
    #           confusion_state="confusion"),
    #           # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=False),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(splits_avg,static)' )

    # NOTE(review): instantiated here but only referenced by the
    # commented-out definitions below -- kept for when those get revived
    rfesvm = LinearCSVMC()

    # This classifier will do RFE while taking transfer error to testing
    # set of that split. Resultant classifier is voted classifier on top
    # of all splits, let see what that would do ;-)
    #clfswh += \
    #  SplitClassifier(                      # which does splitting internally
    #   FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),
    #    feature_selection = RFE(             # on features selected via RFE
    #        sensitivity_analyzer=\
    #            rfesvm.get_sensitivity_analyzer(postproc=absolute_features()),
    #        transfer_error=TransferError(rfesvm),
    #        stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True)),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(N-Fold)')
    #
    #
    #clfswh += \
    #  SplitClassifier(                      # which does splitting internally
    #   FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),
    #    feature_selection = RFE(             # on features selected via RFE
    #        sensitivity_analyzer=\
    #            rfesvm.get_sensitivity_analyzer(postproc=absolute_features()),
    #        transfer_error=TransferError(rfesvm),
    #        stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True)),
    #        # update sensitivity at each step
    #   splitter = OddEvenSplitter(),
    #   descr='LinSVM+RFE(OddEven)')
_______________________________________________
Pkg-ExpPsy-PyMVPA mailing list
Pkg-ExpPsy-PyMVPA@lists.alioth.debian.org
http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-exppsy-pymvpa