I am using GridSearchCV to find the best parameter settings for my
sklearn.pipeline estimator. The pipeline consists of data transformation, UMAP
dimensionality reduction, and KMeans clustering.
The final KMeans clustering results are scored using silhouette_score. I tried
to verify that the whole pipeline/GridSearchCV setup works correctly by changing
only the order of the candidate values in param_grid
(e.g., changing 'reduce__n_neighbors': (5, 10) to 'reduce__n_neighbors': (10,
5)). I got totally different best parameters, although I expected that changing
the order of the values would not affect the
best parameters determined by GridSearchCV. Where did I make a mistake, and how
can I fix these unexpected results?
Below is the code. The Debug class is used to save the output of the 'reduce'
step. This saved output is used in cv_silhouette_scorer() to calculate the
silhouette_score. I suspect that the Debug class and cv_silhouette_scorer()
do not work as I expected.
I would really appreciate your help.
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that remembers the last data it transformed.

    Every call to ``transform`` prints its input, stores a copy of it on
    ``transX`` and hands the input back unchanged, so code holding a
    reference to this step can read the intermediate data afterwards.
    """

    def __init__(self):
        # Nothing has been captured until transform() runs at least once.
        self.transX = None

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data; return ``self``."""
        return self

    def transform(self, X):
        """Print ``X``, keep a copy of it on ``transX`` and return ``X`` itself."""
        print(X)
        snapshot = X.copy()
        self.transX = snapshot
        return X
def cv_silhouette_scorer(estimator, X, y=None):
    """Score a fitted pipeline by the silhouette of the clustering it stores.

    Args:
        estimator: Fitted pipeline whose ``named_steps`` contains a ``'debug'``
            step exposing ``transX`` (the data that step captured) and a
            ``'cluster'`` step exposing ``labels_``.
        X: Not used. The score is computed from the data the debug step
            captured on its last ``transform`` call, not from this argument.
            NOTE(review): under GridSearchCV that is presumably the training
            fold seen during ``fit`` rather than the held-out fold - confirm.
        y: Not used. Accepted so the function can also be called as
            ``scorer(estimator, X, y)`` without raising ``TypeError``.

    Returns:
        ``-1`` when all points share one label or every point has its own
        label; otherwise ``silhouette_score(transX, labels_)``.
    """
    # estimator.fit(X)
    steps = estimator.named_steps
    sdata = steps['debug'].transX
    cluster_labels = steps['cluster'].labels_

    num_labels = len(set(cluster_labels))
    num_samples = sdata.shape[0]

    # Degenerate clusterings get the worst possible silhouette value instead
    # of being passed on to silhouette_score.
    if num_labels == 1 or num_labels == num_samples:
        return -1
    return silhouette_score(sdata, cluster_labels)
# Step 1 - encoding: one-hot encode the nominal columns, ordinal-encode the
# ordinal columns, and pass every remaining column through unchanged.
ohe = OneHotEncoder(drop='if_binary', dtype=np.float32)
ore = OrdinalEncoder(dtype=np.float32)
ctenc = ColumnTransformer(
    transformers=[('ohe', ohe, nom_vars), ('ore', ore, ord_vars)],
    remainder='passthrough',
)

# Step 2 - missing-value indicators: apply the two project helper functions to
# fixed column ranges and drop columns 32-35; all other columns pass through.
# NOTE(review): the slices are positional, so they presumably refer to the
# column layout produced by the encoding step - confirm.
nftr = FunctionTransformer(
    nominal_indicator_missing,
    check_inverse=False,
    kw_args={'feat_names': ohecols, 'orig_cols': nom_vars},
)
oftr = FunctionTransformer(
    ordinal_indicator_missing,
    check_inverse=False,
    kw_args={'miss_value': 0.},
)
ctmiss = ColumnTransformer(
    transformers=[
        ('nftr', nftr, slice(0, 19)),
        ('oftr', oftr, slice(19, 20)),
        ('drop_cols', 'drop', slice(32, 36)),
    ],
    remainder='passthrough',
)

# Step 3 - imputation, seeded with RS.
mputer = IterativeImputer(
    random_state=RS,
    add_indicator=True,
    initial_strategy="most_frequent",
    skip_complete=True,
)

# Add the keep_cols transformer below to drop all demographic columns before
# passing the data to UMAP. Only the two listed column ranges are kept; with no
# `remainder` argument, ColumnTransformer drops everything else.
keep_cols = ColumnTransformer(
    transformers=[
        ('keep_cols1', 'passthrough', slice(17, 25)),
        ('keep_cols2', 'passthrough', slice(46, 54)),
    ],
)

scaler = StandardScaler()

# NOTE(review): np.transpose swaps rows and columns, so the steps after this
# one see the original columns as rows - confirm this is intended.
trans = FunctionTransformer(np.transpose, check_inverse=False)

dreduce = umap.UMAP(random_state=RS)

# Despite the variable name, this is a KMeans clusterer.
knn = KMeans(random_state=RS)
# The complete workflow, listed in execution order. The step names are the
# prefixes used for `<step>__<param>` keys when tuning the pipeline.
pipe = Pipeline(
    steps=[
        ('enc', ctenc),
        ('functr', ctmiss),
        ('mpute', mputer),
        ('keep_cols', keep_cols),
        ('scale', scaler),
        ('trans', trans),
        ('reduce', dreduce),
        # Sits right after 'reduce' so it captures that step's output.
        ("debug", Debug()),
        ('cluster', knn),
    ],
)
# Candidate values for the grid search, keyed as `<pipeline step>__<parameter>`.
parameters = {
    'mpute__max_iter': (15, 20),
    'reduce__n_neighbors': (5, 10),
    'reduce__min_dist': (0.02, 0.05),
    'reduce__n_components': (2, 3),
    'reduce__metric': ('euclidean', 'manhattan'),
    'cluster__n_clusters': (2, 3),
    'cluster__n_init': (10, 25),
}
# Changing the order of the candidate values as below makes GridSearchCV report
# different best parameters.
# NOTE(review): the original post spelled the metric 'eucidean' in this
# alternative grid. If the real code has the same typo, every candidate using
# it would presumably fail to fit, which alone could change the reported best
# parameters - confirm.
# parameters = {
#     'mpute__max_iter': (20, 15),
#     'reduce__n_neighbors': (10, 5),
#     'reduce__min_dist': (0.05, 0.02),
#     'reduce__n_components': (3, 2),
#     'reduce__metric': ('manhattan', 'euclidean'),
#     'cluster__n_clusters': (3, 2),
#     'cluster__n_init': (25, 10),
# }
# This function is defined more than once in this file; this later definition
# is the one in effect for code that follows it.
def cv_silhouette_scorer(estimator, X, y=None):
    """Score a fitted pipeline by the silhouette of the clustering it stores.

    Args:
        estimator: Fitted pipeline whose ``named_steps`` contains a ``'debug'``
            step exposing ``transX`` (the data that step captured) and a
            ``'cluster'`` step exposing ``labels_``.
        X: Not used. The score is computed from the data the debug step
            captured on its last ``transform`` call, not from this argument.
            NOTE(review): under GridSearchCV that is presumably the training
            fold seen during ``fit`` rather than the held-out fold - confirm.
        y: Not used. Accepted so the function can also be called as
            ``scorer(estimator, X, y)`` without raising ``TypeError``.

    Returns:
        ``-1`` when all points share one label or every point has its own
        label; otherwise ``silhouette_score(transX, labels_)``.
    """
    # estimator.fit(X)
    steps = estimator.named_steps
    sdata = steps['debug'].transX
    cluster_labels = steps['cluster'].labels_

    num_labels = len(set(cluster_labels))
    num_samples = sdata.shape[0]

    # Degenerate clusterings get the worst possible silhouette value instead
    # of being passed on to silhouette_score.
    if num_labels == 1 or num_labels == num_samples:
        return -1
    return silhouette_score(sdata, cluster_labels)
# Exhaustive search over `parameters` with 5-fold splitting, scored by the
# custom silhouette scorer and run on all available cores.
# NOTE(review): candidates with exactly equal mean scores share a rank, and the
# reported best candidate is presumably the first of them in grid order, so
# parameters that do not change the score can make best_params_ depend on the
# order of values in the grid - compare cv_results_ to confirm.
gsearch3 = GridSearchCV(
    pipe,
    parameters,
    scoring=cv_silhouette_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gsearch3.fit(dfnew)
--
https://mail.python.org/mailman/listinfo/python-list