EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/366005 )
Change subject: Finish implementation of zero-feature cli arg ...................................................................... Finish implementation of zero-feature cli arg The zero-feature cli argument to training_pipeline.py was only half finished; complete the implementation by actually calling the backend function. Also fix a couple of small problems identified by flake8. Change-Id: Iddbd75eb6d55f059ed95a4aab19586da47d6064d --- M mjolnir/cli/training_pipeline.py M mjolnir/feature_engineering.py M mjolnir/training/tuning.py 3 files changed, 8 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/05/366005/1 diff --git a/mjolnir/cli/training_pipeline.py b/mjolnir/cli/training_pipeline.py index 7e91457..fc08b8d 100644 --- a/mjolnir/cli/training_pipeline.py +++ b/mjolnir/cli/training_pipeline.py @@ -21,7 +21,7 @@ def main(sc, sqlContext, input_dir, output_dir, wikis, target_node_evaluations, - num_workers, num_cv_jobs, num_folds, test_dir): + num_workers, num_cv_jobs, num_folds, test_dir, zero_features): if os.path.exists(output_dir): logging.error('Output directory (%s) already exists' % (output_dir)) @@ -44,6 +44,10 @@ print '' continue + if zero_features: + df_hits_with_features = mjolnir.feature_engineering.zero_features( + df_hits_with_features, zero_features) + # Explore a hyperparameter space. 
Skip the most expensive part of tuning, # increasing the # of trees, with target_node_evaluations=None tune_results = mjolnir.training.xgboost.tune( diff --git a/mjolnir/feature_engineering.py b/mjolnir/feature_engineering.py index 8301547..9f86a53 100644 --- a/mjolnir/feature_engineering.py +++ b/mjolnir/feature_engineering.py @@ -41,6 +41,7 @@ """ features = df.schema['features'].metadata['features'] idxs = [features.index(name) for name in feature_names] + def zero_features(feat): raw = feat.toArray() for idx in idxs: @@ -63,6 +64,7 @@ pyspark.sql.DataFrame """ features = df.schema['features'].metadata['features'] + def extract_feature(features, idx): return features.toArray()[idx] extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType()) diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py index 8548f5d..2799db6 100644 --- a/mjolnir/training/tuning.py +++ b/mjolnir/training/tuning.py @@ -83,7 +83,7 @@ .collect()) df_splits = ( - sc.parallelize(split_rows(rows)) + df._sc.parallelize(split_rows(rows)) .toDF(['wikiid', 'norm_query_id', output_column])) return df.join(df_splits, how='inner', on=['wikiid', 'norm_query_id']) -- To view, visit https://gerrit.wikimedia.org/r/366005 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iddbd75eb6d55f059ed95a4aab19586da47d6064d Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits