I am using GridSearchCV to find the best parameter setting of my 
sklearn.pipeline estimator. The pipeline consists of data transformation, UMAP 
dimension reduction and Kmeans clustering.
The final KMeans clustering results are scored using silhouette_score. I tried 
to verify that the whole pipeline/GridSearchCV works correctly by changing only 
the order of the parameter values in param_grid 
(e.g., changing 'reduce__n_neighbors': (5, 10) to 'reduce__n_neighbors': (10, 
5)). I got totally different best parameters, although I expected that changing 
the parameter order would not affect the 
best parameters determined by GridSearchCV. Where did I make a mistake, and how 
can I fix these unexpected results?

Below is the code. The Debug class is used to save the output from the 'reduce' 
step. This saved output is used in cv_silhouette_scorer() to calculate the 
silhouette_score. I suspect that the Debug class and cv_silhouette_scorer() 
do not work as I expected. 

I really appreciate your help.

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that remembers the last data it transformed.

    The step learns nothing and returns its input unchanged. Its only effect
    is to print the input and keep a copy of it on ``self.transX`` so that
    other code can read the data as it looked at this point of the pipeline.
    """

    def __init__(self):
        # Copy of the most recent input given to transform(); None until
        # transform() has been called at least once.
        self.transX = None

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Print ``X``, store a copy on ``self.transX`` and return ``X`` as is."""
        print(X)
        snapshot = X.copy()
        # Each call overwrites the previous snapshot.
        self.transX = snapshot
        return X

def cv_silhouette_scorer(estimator, X):
    """Score a fitted pipeline by the silhouette of its stored clustering.

    Parameters
    ----------
    estimator : pipeline exposing ``named_steps`` with a ``'debug'`` step
        (providing ``transX``) and a ``'cluster'`` step (providing
        ``labels_``).
    X : ignored. The argument is accepted but never read; the score is
        computed only from what is stored on ``estimator``.

    Returns
    -------
    ``-1`` when the labels form a single cluster or one cluster per sample,
    otherwise ``silhouette_score(transX, labels_)``.

    NOTE(review): because ``X`` is unused, the result presumably describes
    the data the pipeline last pushed through the 'debug' step rather than
    the data handed to the scorer -- confirm this is intended.
    """
    steps = estimator.named_steps
    embedded = steps['debug'].transX
    labels = steps['cluster'].labels_

    n_clusters = len(set(labels))
    n_samples = embedded.shape[0]

    # Degenerate clusterings get the fixed score -1 instead of a silhouette.
    if n_clusters in (1, n_samples):
        return -1
    return silhouette_score(embedded, labels)


# Encoders for the categorical columns; both emit float32 values.
ohe = OneHotEncoder(drop='if_binary', dtype=np.float32)
ore = OrdinalEncoder(dtype=np.float32)

# One-hot encode the columns in nom_vars, ordinal-encode those in ord_vars,
# and pass every other column through unchanged.
ctenc = ColumnTransformer(
    transformers=[
        ('ohe', ohe, nom_vars),
        ('ore', ore, ord_vars),
    ],
    remainder='passthrough',
)
# Wrap the two project helpers as transformers. check_inverse=False skips the
# round-trip check, as no inverse function is supplied.
nftr = FunctionTransformer(
    nominal_indicator_missing,
    check_inverse=False,
    kw_args={'feat_names': ohecols, 'orig_cols': nom_vars},
)
oftr = FunctionTransformer(
    ordinal_indicator_missing,
    check_inverse=False,
    kw_args={'miss_value': 0.},
)

# Columns are addressed by position: 0-18 go to nftr, 19 goes to oftr,
# 32-35 are dropped, and everything else passes through.
# NOTE(review): the positions presumably refer to the column layout produced
# by the preceding encoding step -- verify if that layout changes.
ctmiss = ColumnTransformer(
    transformers=[
        ('nftr', nftr, slice(0, 19)),
        ('oftr', oftr, slice(19, 20)),
        ('drop_cols', 'drop', slice(32, 36)),
    ],
    remainder='passthrough',
)

# Iterative imputation seeded with RS; starts from the most frequent value,
# skips columns without missing values, and adds missing-indicator columns.
mputer = IterativeImputer(
    random_state=RS,
    add_indicator=True,
    initial_strategy="most_frequent",
    skip_complete=True,
)


# Add the keep_cols transformer below to drop all demographic columns before
# passing the data to UMAP.

# Keep only positional columns 17-24 and 46-53. No `remainder` is given, so
# ColumnTransformer's default handling applies to every other column.
keep_cols = ColumnTransformer(
    transformers=[
        ('keep_cols1', 'passthrough', slice(17, 25)),
        ('keep_cols2', 'passthrough', slice(46, 54)),
    ],
)

scaler = StandardScaler()

# np.transpose swaps rows and columns, so every step after 'trans' sees the
# scaled columns as rows.
# NOTE(review): confirm the transposition is intended; cross-validation splits
# the rows of the input, before this swap happens.
trans = FunctionTransformer(np.transpose, check_inverse=False)

# Both stochastic steps share the seed RS.
dreduce = umap.UMAP(random_state=RS)
knn = KMeans(random_state=RS)  # despite the name, this is a KMeans clusterer

# Full pipeline: encode -> missing-value handling -> impute -> column subset
# -> scale -> transpose -> UMAP -> snapshot ('debug') -> KMeans.
pipe = Pipeline(
    steps=[
        ('enc', ctenc),
        ('functr', ctmiss),
        ('mpute', mputer),
        ('keep_cols', keep_cols),
        ('scale', scaler),
        ('trans', trans),
        ('reduce', dreduce),
        ("debug", Debug()),
        ('cluster', knn),
    ]
)

# Search grid, keyed as '<pipeline step>__<parameter>'. Seven parameters with
# two candidate values each give 2**7 = 128 combinations.
parameters = {
    # imputation
    'mpute__max_iter': (15, 20),
    # UMAP dimension reduction
    'reduce__n_neighbors': (5, 10),
    'reduce__min_dist': (0.02, 0.05),
    'reduce__n_components': (2, 3),
    'reduce__metric': ('euclidean', 'manhattan'),
    # KMeans clustering
    'cluster__n_clusters': (2, 3),
    'cluster__n_init': (10, 25),
}
# Changing the order of the parameter values above to the order below makes
# GridSearchCV report different best parameters.

# parameters = {
#     'mpute__max_iter': (20, 15),
#     'reduce__n_neighbors': (10, 5),
#     'reduce__min_dist': (0.05, 0.02),
#     'reduce__n_components': (3, 2),
#     'reduce__metric': ('manhattan', 'euclidean'),
#     'cluster__n_clusters': (3, 2),
#     'cluster__n_init': (25, 10)
# }

def cv_silhouette_scorer(estimator, X):
    """Score a fitted pipeline by the silhouette of its stored clustering.

    Reads the array kept on the pipeline's ``'debug'`` step (``transX``) and
    the ``labels_`` of its ``'cluster'`` step. ``X`` is accepted but never
    read.

    Returns ``-1`` when the labels form a single cluster or one cluster per
    sample, otherwise ``silhouette_score(transX, labels_)``.

    NOTE(review): this repeats a definition with the same body given earlier
    in the file; being the later one, it is the binding in effect afterwards.
    NOTE(review): because ``X`` is unused, the result presumably describes
    the data last seen by the 'debug' step rather than the data handed to
    the scorer -- confirm this is intended.
    """
    named = estimator.named_steps
    reduced = named['debug'].transX
    assignments = named['cluster'].labels_

    distinct = len(set(assignments))
    rows = reduced.shape[0]

    # Degenerate clusterings get the fixed score -1 instead of a silhouette.
    if distinct != 1 and distinct != rows:
        return silhouette_score(reduced, assignments)
    return -1

# Exhaustive search over `parameters` with 5-fold cross-validation, scored by
# cv_silhouette_scorer and run on all available cores.
gsearch3 = GridSearchCV(
    pipe,
    parameters,
    scoring=cv_silhouette_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# Clustering is unsupervised, so no target is passed.
gsearch3.fit(dfnew)
-- 
https://mail.python.org/mailman/listinfo/python-list

Reply via email to