GridSearchCV generates unexpected different best parameters by only changing the parameters order in param_grid. Please help!

2021-02-25 Thread Yi Li
I am using GridSearchCV to find the best parameter settings for my
sklearn Pipeline estimator. The pipeline consists of data transformation, UMAP
dimension reduction, and KMeans clustering.
The final KMeans clustering results are scored using silhouette_score. To
verify that the whole pipeline/GridSearchCV works correctly, I changed only the
order of the parameter values in param_grid
(e.g., changed 'reduce__n_neighbors': (5, 10) to 'reduce__n_neighbors': (10, 5)).
I got totally different best parameters, although I expected that changing the
order of the values would not affect the best parameters determined by
GridSearchCV. Where did I make a mistake, and how can I fix this unexpected
result?

Below is the code. The Debug class is used to save the output of the 'reduce'
step. This saved output is used in cv_silhouette_scorer() to calculate the
silhouette_score. I suspect that the Debug class and cv_silhouette_scorer()
do not work as I expect.

I really appreciate your help.

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that remembers the data flowing through it.

    The step does not modify its input. Each call to ``transform`` stores a
    copy of the input in ``self.transX`` so that it can be inspected later.
    ``transX`` is overwritten on every ``transform`` call, so it always holds
    the data from the most recent call only.
    """

    def __init__(self):
        # Copy of the last input seen by transform(); None until it has run.
        self.transX = None

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; ``X``, ``y`` and ``fit_params`` are unused."""
        return self

    def transform(self, X):
        """Print ``X``, keep a copy in ``self.transX`` and return ``X`` unchanged."""
        print(X)
        # Store a copy so later in-place changes to X do not alter the snapshot.
        self.transX = X.copy()
        return X

def cv_silhouette_scorer(estimator, X):
    """Return the silhouette score of the clustering stored on ``estimator``.

    The score is computed from state already held by the pipeline steps, not
    from ``X``: the data comes from ``named_steps['debug'].transX`` and the
    labels from ``named_steps['cluster'].labels_``.

    Parameters
    ----------
    estimator : pipeline-like object
        Must expose ``named_steps`` with a ``'debug'`` step carrying ``transX``
        and a ``'cluster'`` step carrying ``labels_``.
    X : object
        Ignored. It is accepted so the function can be called as
        ``scorer(estimator, X)``.

    Returns
    -------
    int or float
        ``-1`` when there is a single cluster or one cluster per sample,
        otherwise the value returned by ``silhouette_score``.

    Raises
    ------
    AttributeError
        If the ``'debug'`` step has not stored any data yet.
    """
    embedding = estimator.named_steps['debug'].transX
    if embedding is None:
        # Fail with an explicit message instead of "'NoneType' has no shape".
        raise AttributeError(
            "The 'debug' step holds no data (transX is None); "
            "the pipeline must be fitted before it is scored."
        )
    cluster_labels = estimator.named_steps['cluster'].labels_

    n_labels = len(set(cluster_labels))
    n_samples = embedding.shape[0]

    # With one cluster, or one cluster per sample, return the worst possible
    # score rather than calling silhouette_score on a degenerate labelling.
    if n_labels in (1, n_samples):
        return -1
    return silhouette_score(embedding, cluster_labels)


# Encode nominal columns one-hot and ordinal columns as ordered codes;
# every other column is passed through untouched.
ohe = OneHotEncoder(drop='if_binary', dtype=np.float32)
ore = OrdinalEncoder(dtype=np.float32)
ctenc = ColumnTransformer(
    transformers=[('ohe', ohe, nom_vars), ('ore', ore, ord_vars)],
    remainder='passthrough',
)

# Missing-value handling on the encoded columns (helper functions are
# defined elsewhere).
nftr = FunctionTransformer(
    nominal_indicator_missing,
    check_inverse=False,
    kw_args={'feat_names': ohecols, 'orig_cols': nom_vars},
)
oftr = FunctionTransformer(
    ordinal_indicator_missing,
    check_inverse=False,
    kw_args={'miss_value': 0.},
)

ctmiss = ColumnTransformer(
    transformers=[
        ('nftr', nftr, slice(0, 19)),
        ('oftr', oftr, slice(19, 20)),
        ('drop_cols', 'drop', slice(32, 36)),
    ],
    remainder='passthrough',
)

mputer = IterativeImputer(
    random_state=RS,
    add_indicator=True,
    initial_strategy="most_frequent",
    skip_complete=True,
)

# The keep_cols transformer below drops all demographic columns before the
# data is passed to UMAP.
keep_cols = ColumnTransformer(
    transformers=[
        ('keep_cols1', 'passthrough', slice(17, 25)),
        ('keep_cols2', 'passthrough', slice(46, 54)),
    ]
)

scaler = StandardScaler()

# NOTE(review): np.transpose swaps the axes of its input, so for a 2-D
# (samples, features) array the steps after this one would receive the
# features as rows -- confirm this is intended.
trans = FunctionTransformer(np.transpose, check_inverse=False)

dreduce = umap.UMAP(random_state=RS)
knn = KMeans(random_state=RS)  # despite the name, this is a KMeans clusterer
pipe = Pipeline(
    steps=[
        ('enc', ctenc),
        ('functr', ctmiss),
        ('mpute', mputer),
        ('keep_cols', keep_cols),
        ('scale', scaler),
        ('trans', trans),
        ('reduce', dreduce),
        ("debug", Debug()),
        ('cluster', knn),
    ]
)

# Candidate values for the grid search, keyed as '<pipeline step>__<parameter>'.
parameters = {
    # 'mpute' step
    'mpute__max_iter': (15, 20),
    # 'reduce' step
    'reduce__n_neighbors': (5, 10),
    'reduce__min_dist': (0.02, 0.05),
    'reduce__n_components': (2, 3),
    'reduce__metric': ('euclidean', 'manhattan'),
    # 'cluster' step
    'cluster__n_clusters': (2, 3),
    'cluster__n_init': (10, 25),
}
# When the order of the parameter values is changed as below, GridSearchCV
# reports different best parameters.

# parameters = {
# 'mpute__max_iter': (20, 15),
# 'reduce__n_neighbors': (10, 5),
# 'reduce__min_dist': (0.05, 0.02),
# 'reduce__n_components': (3, 2),
# 'reduce__metric': ('manhattan', 'euclidean'),
# 'cluster__n_clusters': (3, 2),
# 'cluster__n_init': (25, 10)
# }


# Run the grid search over `parameters` with 5-fold cross-validation on all
# available cores, scoring each candidate with cv_silhouette_scorer.
gsearch3 = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring=cv_silhouette_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gsearch3.fit(dfnew)
-- 
https://mail.python.org/mailman/listinfo/python-list


GridSearchCV generates unexpectedly different best parameters when only the order of the parameter values in param_grid is changed. Where did I make a mistake, and how can I fix this unexpected result?

2021-02-25 Thread Yi Li
I am using GridSearchCV to find the best parameter settings for my
sklearn Pipeline estimator. The pipeline consists of data transformation, UMAP
dimension reduction, and KMeans clustering.
The final KMeans clustering results are scored using silhouette_score. To
verify that the whole pipeline/GridSearchCV works correctly, I changed only the
order of the parameter values in param_grid
(e.g., changed 'reduce__n_neighbors': (5, 10) to 'reduce__n_neighbors': (10, 5)).
I got totally different best parameters, although I expected that changing the
order of the values would not affect the best parameters determined by
GridSearchCV. Where did I make a mistake, and how can I fix this unexpected
result?

Below is the code. The Debug class is used to save the output of the 'reduce'
step. This saved output is used in cv_silhouette_scorer() to calculate the
silhouette_score. I suspect that the Debug class and cv_silhouette_scorer()
do not work as I expect.

I really appreciate your help.

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that remembers the data flowing through it.

    The step does not modify its input. Each call to ``transform`` stores a
    copy of the input in ``self.transX`` so that it can be inspected later.
    ``transX`` is overwritten on every ``transform`` call, so it always holds
    the data from the most recent call only.
    """

    def __init__(self):
        # Copy of the last input seen by transform(); None until it has run.
        self.transX = None

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; ``X``, ``y`` and ``fit_params`` are unused."""
        return self

    def transform(self, X):
        """Print ``X``, keep a copy in ``self.transX`` and return ``X`` unchanged."""
        print(X)
        # Store a copy so later in-place changes to X do not alter the snapshot.
        self.transX = X.copy()
        return X

def cv_silhouette_scorer(estimator, X):
    """Return the silhouette score of the clustering stored on ``estimator``.

    The score is computed from state already held by the pipeline steps, not
    from ``X``: the data comes from ``named_steps['debug'].transX`` and the
    labels from ``named_steps['cluster'].labels_``.

    Parameters
    ----------
    estimator : pipeline-like object
        Must expose ``named_steps`` with a ``'debug'`` step carrying ``transX``
        and a ``'cluster'`` step carrying ``labels_``.
    X : object
        Ignored. It is accepted so the function can be called as
        ``scorer(estimator, X)``.

    Returns
    -------
    int or float
        ``-1`` when there is a single cluster or one cluster per sample,
        otherwise the value returned by ``silhouette_score``.

    Raises
    ------
    AttributeError
        If the ``'debug'`` step has not stored any data yet.
    """
    embedding = estimator.named_steps['debug'].transX
    if embedding is None:
        # Fail with an explicit message instead of "'NoneType' has no shape".
        raise AttributeError(
            "The 'debug' step holds no data (transX is None); "
            "the pipeline must be fitted before it is scored."
        )
    cluster_labels = estimator.named_steps['cluster'].labels_

    n_labels = len(set(cluster_labels))
    n_samples = embedding.shape[0]

    # With one cluster, or one cluster per sample, return the worst possible
    # score rather than calling silhouette_score on a degenerate labelling.
    if n_labels in (1, n_samples):
        return -1
    return silhouette_score(embedding, cluster_labels)


# Encode nominal columns one-hot and ordinal columns as ordered codes;
# every other column is passed through untouched.
ohe = OneHotEncoder(drop='if_binary', dtype=np.float32)
ore = OrdinalEncoder(dtype=np.float32)
ctenc = ColumnTransformer(
    transformers=[('ohe', ohe, nom_vars), ('ore', ore, ord_vars)],
    remainder='passthrough',
)

# Missing-value handling on the encoded columns (helper functions are
# defined elsewhere).
nftr = FunctionTransformer(
    nominal_indicator_missing,
    check_inverse=False,
    kw_args={'feat_names': ohecols, 'orig_cols': nom_vars},
)
oftr = FunctionTransformer(
    ordinal_indicator_missing,
    check_inverse=False,
    kw_args={'miss_value': 0.},
)

ctmiss = ColumnTransformer(
    transformers=[
        ('nftr', nftr, slice(0, 19)),
        ('oftr', oftr, slice(19, 20)),
        ('drop_cols', 'drop', slice(32, 36)),
    ],
    remainder='passthrough',
)

mputer = IterativeImputer(
    random_state=RS,
    add_indicator=True,
    initial_strategy="most_frequent",
    skip_complete=True,
)

# The keep_cols transformer below drops all demographic columns before the
# data is passed to UMAP.
keep_cols = ColumnTransformer(
    transformers=[
        ('keep_cols1', 'passthrough', slice(17, 25)),
        ('keep_cols2', 'passthrough', slice(46, 54)),
    ]
)

scaler = StandardScaler()

# NOTE(review): np.transpose swaps the axes of its input, so for a 2-D
# (samples, features) array the steps after this one would receive the
# features as rows -- confirm this is intended.
trans = FunctionTransformer(np.transpose, check_inverse=False)

dreduce = umap.UMAP(random_state=RS)
knn = KMeans(random_state=RS)  # despite the name, this is a KMeans clusterer
pipe = Pipeline(
    steps=[
        ('enc', ctenc),
        ('functr', ctmiss),
        ('mpute', mputer),
        ('keep_cols', keep_cols),
        ('scale', scaler),
        ('trans', trans),
        ('reduce', dreduce),
        ("debug", Debug()),
        ('cluster', knn),
    ]
)

# Candidate values for the grid search, keyed as '<pipeline step>__<parameter>'.
parameters = {
    # 'mpute' step
    'mpute__max_iter': (15, 20),
    # 'reduce' step
    'reduce__n_neighbors': (5, 10),
    'reduce__min_dist': (0.02, 0.05),
    'reduce__n_components': (2, 3),
    'reduce__metric': ('euclidean', 'manhattan'),
    # 'cluster' step
    'cluster__n_clusters': (2, 3),
    'cluster__n_init': (10, 25),
}
# When the order of the parameter values is changed as below, GridSearchCV
# reports different best parameters.

# parameters = {
# 'mpute__max_iter': (20, 15),
# 'reduce__n_neighbors': (10, 5),
# 'reduce__min_dist': (0.05, 0.02),
# 'reduce__n_components': (3, 2),
# 'reduce__metric': ('manhattan', 'euclidean'),
# 'cluster__n_clusters': (3, 2),
# 'cluster__n_init': (25, 10)
# }

def cv_silhouette_scorer(estimator, X):
    """Return the silhouette score of the clustering stored on ``estimator``.

    The score is computed from state already held by the pipeline steps, not
    from ``X``: the data comes from ``named_steps['debug'].transX`` and the
    labels from ``named_steps['cluster'].labels_``.

    Parameters
    ----------
    estimator : pipeline-like object
        Must expose ``named_steps`` with a ``'debug'`` step carrying ``transX``
        and a ``'cluster'`` step carrying ``labels_``.
    X : object
        Ignored. It is accepted so the function can be called as
        ``scorer(estimator, X)``.

    Returns
    -------
    int or float
        ``-1`` when there is a single cluster or one cluster per sample,
        otherwise the value returned by ``silhouette_score``.

    Raises
    ------
    AttributeError
        If the ``'debug'`` step has not stored any data yet.
    """
    embedding = estimator.named_steps['debug'].transX
    if embedding is None:
        # Fail with an explicit message instead of "'NoneType' has no shape".
        raise AttributeError(
            "The 'debug' step holds no data (transX is None); "
            "the pipeline must be fitted before it is scored."
        )
    cluster_labels = estimator.named_steps['cluster'].labels_

    n_labels = len(set(cluster_labels))
    n_samples = embedding.shape[0]

    # With one cluster, or one cluster per sample, return the worst possible
    # score rather than calling silhouette_score on a degenerate labelling.
    if n_labels in (1, n_samples):
        return -1
    return silhouette_score(embedding, cluster_labels)

# Run the grid search over `parameters` with 5-fold cross-validation on all
# available cores, scoring each candidate with cv_silhouette_scorer.
gsearch3 = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring=cv_silhouette_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gsearch3.fit(dfnew)
-- 
https://mail.python.org/mailman/listinfo/python-list