DCausse has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/377343 )
Change subject: Allow selecting exact number of trees for training ...................................................................... Allow selecting exact number of trees for training The idea to base the number of trees off node evaluations, rather than some explicit number of trees, seems too abstract. Let's just allow explicitly setting the exact number of trees to use. Change-Id: Ie5a50865244af91e082d8acb04596d12dfbe3f0a --- M mjolnir/cli/training_pipeline.py M mjolnir/training/xgboost.py 2 files changed, 28 insertions(+), 27 deletions(-) Approvals: DCausse: Verified; Looks good to me, approved diff --git a/mjolnir/cli/training_pipeline.py b/mjolnir/cli/training_pipeline.py index 32d25cd..7425e87 100644 --- a/mjolnir/cli/training_pipeline.py +++ b/mjolnir/cli/training_pipeline.py @@ -20,7 +20,7 @@ from pyspark.sql import functions as F -def main(sc, sqlContext, input_dir, output_dir, wikis, target_node_evaluations, +def main(sc, sqlContext, input_dir, output_dir, wikis, initial_num_trees, final_num_trees, num_workers, num_cv_jobs, num_folds, test_dir, zero_features): if os.path.exists(output_dir): @@ -48,12 +48,11 @@ df_hits_with_features = mjolnir.feature_engineering.zero_features( df_hits_with_features, zero_features) - # Explore a hyperparameter space. Skip the most expensive part of tuning, - # increasing the # of trees, with target_node_evaluations=None tune_results = mjolnir.training.xgboost.tune( df_hits_with_features, num_folds=num_folds, num_cv_jobs=num_cv_jobs, num_workers=num_workers, - target_node_evaluations=target_node_evaluations) + initial_num_trees=initial_num_trees, + final_num_trees=final_num_trees) print 'CV test-ndcg@10: %.4f' % (tune_results['metrics']['cv-test']) print 'CV train-ndcg@10: %.4f' % (tune_results['metrics']['cv-train']) @@ -131,11 +130,12 @@ '-f', '--folds', dest='num_folds', default=5, type=int, help='Number of cross validation folds to use. 
(Default: 5)') parser.add_argument( - '-n', '--node-evaluations', dest='target_node_evaluations', type=int, default=None, - help='Approximate number of node evaluations per prediction that ' - + 'the final result will require. This controls the number of ' - + 'trees used in the final result. Default uses 100 trees rather ' - + 'than dynamically choosing based on max_depth. (Default: None)') + '--initial-trees', dest='initial_num_trees', default=100, type=int, + help='Number of trees to perform hyperparameter tuning with. (Default: 100)') + parser.add_argument( + '--final-trees', dest='final_num_trees', default=None, type=int, + help='Number of trees in the final ensemble. If not provided the value from ' + + '--initial-trees will be used. (Default: None)') parser.add_argument( '-t', '--test-path', dest='test_dir', type=str, required=False, default=None, help='A holdout test set to evaluate the final model against') diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py index 487a619..c010fb0 100644 --- a/mjolnir/training/xgboost.py +++ b/mjolnir/training/xgboost.py @@ -403,7 +403,7 @@ return eta_pred[idx] -def tune(df, num_folds=5, num_cv_jobs=5, num_workers=5, target_node_evaluations=5000): +def tune(df, num_folds=5, num_cv_jobs=5, num_workers=5, initial_num_trees=100, final_num_trees=500): """Find appropriate hyperparameters for training df This is far from perfect, hyperparameter tuning is a bit of a black art @@ -432,13 +432,14 @@ number of executors used will be (num_cv_jobs * num_workers). Generally prefer executors with more cpu's over a higher number of workers where possible. (Default: 5) - target_node_evaluations : int, optional - The approximate number of node evaluations per prediction that the - final result will require. This controls the number of trees used in - the final result. The number of trees will be (target_node_evaluations - / optimal_max_depth). 
This is by far the most expensive part to tune, - setting to None skips this and uses a constant 100 trees. - (Default: 5000) + initial_num_trees: int, optional + The number of trees to do most of the hyperparameter tuning with. This + should be large enough to be reasonably representative of the final + training size. (Default: 100) + final_num_trees: int, optional + The number of trees to do the final eta optimization with. If set to + None the final eta optimization will be skipped and initial_num_trees will + be kept. Returns ------- @@ -482,7 +483,7 @@ space = { 'objective': 'rank:ndcg', 'eval_metric': 'ndcg@10', - 'num_rounds': 100, + 'num_rounds': initial_num_trees, 'min_child_weight': 200, 'max_depth': 6, 'gamma': 0, @@ -535,19 +536,19 @@ space['colsample_bytree'] = best_noise['colsample_bytree'] pprint.pprint(space) - # Finally increase the number of trees to our target, which is mostly based - # on how computationally expensive it is to generate predictions with the final - # model. Find the optimal eta for this new # of trees. This step can take as - # much time as all previous steps combined, and then some, so it can be disabled - # with target_node_evalations of None. - if target_node_evaluations is None: + # Finally increase the number of trees to our target, if it was requested. + if final_num_trees is None or final_num_trees == initial_num_trees: trials_trees = None trials_final = trials_noise else: - space['num_rounds'] = target_node_evaluations / space['max_depth'] + space['num_rounds'] = final_num_trees # TODO: Is 30 steps right amount? too many? too few? This generally - # uses a large number of trees which takes 10 to 20 minutes per evaluation. - # That means evaluating 15 points is 2.5 to 5 hours. + # uses a large number of trees which takes 10 to 20 minutes per evaluation + # on large training sets. That means evaluating 15 points is 2.5 to 5 hours. 
+ # TODO: The appropriate space here really depends on the amount of data and + # the number of trees. A small wiki with 300k observations and 500 trees needs + # to search a very different space than a large wiki with 30M observations + # and the same 500 trees. etas = np.linspace(0.01, 0.3, 30) space['eta'] = hyperopt.hp.choice('eta', etas) best_trees, trials_trees = eval_space_grid(space) -- To view, visit https://gerrit.wikimedia.org/r/377343 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie5a50865244af91e082d8acb04596d12dfbe3f0a Gerrit-PatchSet: 2 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits