[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: JVM components to support file-based training
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/403545 ) Change subject: JVM components to support file-based training .. JVM components to support file-based training An upcoming refactor changes training_pipeline.py from dataframe based training to file based training, where we emit partitioned and formatted folds/splits to hdfs and load them into training by copying to a local file and pointing c++ at it. This is a separate patch so we can release a new version of the MjoLniR jar. Due to how our CI works python cannot test against new jvm code until it has been released. The entry points that python will be using are: * DataWriter.write * MlrXGBoost.trainWithFiles Change-Id: Ib5e8cd9d3e87e724f05b5ec0941c140aa5077d71 --- M .gitignore D jvm/mjolnir.iml M jvm/pom.xml A jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/MlrXGBoost.scala A jvm/src/test/resources/fixtures/datasets/test.txt A jvm/src/test/resources/fixtures/datasets/test.txt.query A jvm/src/test/resources/fixtures/datasets/train.txt A jvm/src/test/resources/fixtures/datasets/train.txt.query M jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala A jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/PythonUtilsSuite.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/SharedSparkContext.scala 15 files changed, 841 insertions(+), 165 deletions(-) Approvals: jenkins-bot: Verified DCausse: Looks good to me, approved diff --git a/.gitignore b/.gitignore index f2a9cf7..83930f6 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ # Editor temporary files .*.sw[po] /jvm/.idea +/jvm/mjolnir.iml # Vagrant, and cdh stuff in vagrant .vagrant diff --git a/jvm/mjolnir.iml b/jvm/mjolnir.iml 
deleted file mode 100644 index b341014..000 --- a/jvm/mjolnir.iml +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/jvm/pom.xml b/jvm/pom.xml index 479b4ec..d1fdc13 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -14,7 +14,7 @@ 2.1.0 2.11.8 2.11 -0.7-wmf-1 +0.8-wmf-1-SNAPSHOT @@ -146,6 +146,16 @@ jackson-module-scala_${scala.binary.version} 2.6.5 + +ml.dmlc +xgboost4j-spark +${xgboost.version} + + +ml.dmlc +xgboost4j +${xgboost.version} + diff --git a/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala new file mode 100644 index 000..45a00af --- /dev/null +++ b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala @@ -0,0 +1,28 @@ +package ml.dmlc.xgboost4j.scala.spark + +import ml.dmlc.xgboost4j.java.IRabitTracker +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.rabit.RabitTracker + +/** + * Provide access to package-private constructs of xgboost4j-spark + */ +object MjolnirUtils { + def model(booster: Booster, metrics: Map[String, Array[Float]], trainMatrix: String): XGBoostModel = { +// Arbitrarily take an 'other' matrix if available +val xgMetrics = metrics.keys.find(!_.equals(trainMatrix)).map{ name => Map( + "train" -> metrics(trainMatrix), + "test" -> metrics(name) +) }.getOrElse(Map( + "train" -> metrics(trainMatrix) +)) + +val model = new XGBoostRegressionModel(booster) +model.setSummary(XGBoostTrainingSummary(xgMetrics)) +model + } + + def scalaRabitTracker(nWorkers: Int): IRabitTracker = { +new RabitTracker(nWorkers) + } +} diff --git
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: JVM components to support file-based training
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403545 ) Change subject: JVM components to support file-based training .. JVM components to support file-based training An upcoming refactor changes training_pipeline.py from dataframe based training to file based training, where we emit partitioned and formatted folds/splits to hdfs and load them into training by copying to a local file and pointing c++ at it. This is a separate patch so we can release a new version of the MjoLniR jar. Due to how our CI works python cannot test against new jvm code until it has been released. The entry points that python will be using are: * DataWriter.write * MlrXGBoost.trainWithFiles Change-Id: Ib5e8cd9d3e87e724f05b5ec0941c140aa5077d71 --- M .gitignore D jvm/mjolnir.iml M jvm/pom.xml A jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/MlrXGBoost.scala A jvm/src/test/resources/fixtures/datasets/test.txt A jvm/src/test/resources/fixtures/datasets/test.txt.query A jvm/src/test/resources/fixtures/datasets/train.txt A jvm/src/test/resources/fixtures/datasets/train.txt.query M jvm/src/test/scala/org/wikimedia/search/mjolnir/PythonUtilsSuite.scala 12 files changed, 749 insertions(+), 163 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/45/403545/1 diff --git a/.gitignore b/.gitignore index f2a9cf7..83930f6 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ # Editor temporary files .*.sw[po] /jvm/.idea +/jvm/mjolnir.iml # Vagrant, and cdh stuff in vagrant .vagrant diff --git a/jvm/mjolnir.iml b/jvm/mjolnir.iml deleted file mode 100644 index b341014..000 --- a/jvm/mjolnir.iml +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/jvm/pom.xml b/jvm/pom.xml index 479b4ec..d1fdc13 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -14,7 +14,7 @@ 2.1.0 2.11.8 2.11 -0.7-wmf-1 +0.8-wmf-1-SNAPSHOT @@ -146,6 +146,16 @@ jackson-module-scala_${scala.binary.version} 2.6.5 + +ml.dmlc +xgboost4j-spark +${xgboost.version} + + +ml.dmlc +xgboost4j +${xgboost.version} + diff --git a/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala new file mode 100644 index 000..45a00af --- /dev/null +++ b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala @@ -0,0 +1,28 @@ +package ml.dmlc.xgboost4j.scala.spark + +import ml.dmlc.xgboost4j.java.IRabitTracker +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.rabit.RabitTracker + +/** + * Provide access to package-private constructs of xgboost4j-spark + */ +object MjolnirUtils { + def model(booster: Booster, metrics: Map[String, Array[Float]], trainMatrix: String): XGBoostModel = { +// Arbitrarily take an 'other' matrix if available +val xgMetrics = metrics.keys.find(!_.equals(trainMatrix)).map{ name => Map( + "train" -> metrics(trainMatrix), + "test" -> metrics(name) +) }.getOrElse(Map( + "train" -> metrics(trainMatrix) +)) + +val model = new XGBoostRegressionModel(booster) +model.setSummary(XGBoostTrainingSummary(xgMetrics)) +model + } + + def scalaRabitTracker(nWorkers: Int): IRabitTracker = { +new RabitTracker(nWorkers) + } +} diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala new file mode 100644 index 000..9962b3a --- /dev/null +++