Close #104: [HIVEMALL-101-2] Renamed train_regression to train_regressor
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7205de1e Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7205de1e Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7205de1e Branch: refs/heads/master Commit: 7205de1e959f0d9b96ac756e415d8a8ada7e92af Parents: 0737e23 Author: Makoto Yui <m...@apache.org> Authored: Thu Jul 20 20:25:27 2017 +0900 Committer: Makoto Yui <m...@apache.org> Committed: Thu Jul 20 20:25:27 2017 +0900 ---------------------------------------------------------------------- .../java/hivemall/regression/AdaDeltaUDTF.java | 2 +- .../java/hivemall/regression/AdaGradUDTF.java | 2 +- .../regression/GeneralRegressionUDTF.java | 71 ---- .../regression/GeneralRegressorUDTF.java | 71 ++++ .../java/hivemall/regression/LogressUDTF.java | 2 +- .../regression/GeneralRegressionUDTFTest.java | 331 ---------------- .../regression/GeneralRegressorUDTFTest.java | 331 ++++++++++++++++ docs/gitbook/FOOTER.md | 2 +- docs/gitbook/SUMMARY.md | 2 +- docs/gitbook/anomaly/sst.md | 2 +- docs/gitbook/binaryclass/a9a_dataset.md | 2 +- docs/gitbook/binaryclass/a9a_lr.md | 4 +- docs/gitbook/binaryclass/a9a_minibatch.md | 4 +- docs/gitbook/binaryclass/general.md | 2 +- docs/gitbook/binaryclass/kdd2010a_dataset.md | 4 +- docs/gitbook/binaryclass/kdd2010a_scw.md | 12 +- docs/gitbook/binaryclass/kdd2010b_arow.md | 6 +- docs/gitbook/binaryclass/kdd2010b_dataset.md | 2 +- docs/gitbook/binaryclass/news20_adagrad.md | 387 ++++++++++--------- docs/gitbook/binaryclass/news20_dataset.md | 4 +- docs/gitbook/binaryclass/news20_pa.md | 10 +- docs/gitbook/binaryclass/news20_scw.md | 18 +- docs/gitbook/binaryclass/webspam_dataset.md | 6 +- docs/gitbook/clustering/plsa.md | 2 +- docs/gitbook/eval/lr_datagen.md | 2 +- docs/gitbook/eval/rank.md | 2 +- docs/gitbook/ft_engineering/polynomial.md | 2 +- docs/gitbook/ft_engineering/quantify.md | 2 +- 
docs/gitbook/ft_engineering/scaling.md | 2 +- docs/gitbook/ft_engineering/tfidf.md | 335 ++++++++-------- docs/gitbook/geospatial/latlon.md | 2 +- docs/gitbook/getting_started/README.md | 2 +- docs/gitbook/getting_started/input-format.md | 4 +- docs/gitbook/misc/prediction.md | 10 +- docs/gitbook/misc/tokenizer.md | 2 +- docs/gitbook/multiclass/iris_randomforest.md | 2 +- docs/gitbook/multiclass/news20_dataset.md | 4 +- docs/gitbook/multiclass/news20_ensemble.md | 14 +- .../news20_one-vs-the-rest_dataset.md | 2 +- docs/gitbook/multiclass/news20_pa.md | 4 +- docs/gitbook/multiclass/news20_scw.md | 16 +- docs/gitbook/recommend/item_based_cf.md | 2 +- docs/gitbook/recommend/movielens_cf.md | 2 +- docs/gitbook/recommend/movielens_cv.md | 2 +- docs/gitbook/recommend/movielens_fm.md | 2 +- docs/gitbook/recommend/movielens_mf.md | 2 +- docs/gitbook/recommend/news20_bbit_minhash.md | 2 +- docs/gitbook/recommend/news20_jaccard.md | 2 +- docs/gitbook/regression/e2006_arow.md | 12 +- docs/gitbook/regression/e2006_dataset.md | 35 +- docs/gitbook/regression/general.md | 4 +- docs/gitbook/regression/kddcup12tr2_dataset.md | 2 +- docs/gitbook/regression/kddcup12tr2_lr.md | 2 +- .../regression/kddcup12tr2_lr_amplify.md | 2 +- docs/gitbook/tips/addbias.md | 10 +- docs/gitbook/tips/emr.md | 6 +- docs/gitbook/tips/ensemble_learning.md | 14 +- docs/gitbook/tips/hadoop_tuning.md | 2 +- docs/gitbook/tips/mixserver.md | 4 +- docs/gitbook/tips/rand_amplify.md | 2 +- docs/gitbook/tips/rt_prediction.md | 4 +- docs/gitbook/troubleshooting/asterisk.md | 2 +- .../troubleshooting/mapjoin_classcastex.md | 2 +- .../troubleshooting/mapjoin_task_error.md | 2 +- docs/gitbook/troubleshooting/num_mappers.md | 2 +- docs/gitbook/troubleshooting/oom.md | 2 +- resources/ddl/define-all-as-permanent.hive | 4 +- resources/ddl/define-all.hive | 4 +- resources/ddl/define-all.spark | 4 +- resources/ddl/define-udfs.td.hql | 2 +- 70 files changed, 917 insertions(+), 906 deletions(-) 
---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/main/java/hivemall/regression/AdaDeltaUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/regression/AdaDeltaUDTF.java b/core/src/main/java/hivemall/regression/AdaDeltaUDTF.java index 68cd35c..81ff550 100644 --- a/core/src/main/java/hivemall/regression/AdaDeltaUDTF.java +++ b/core/src/main/java/hivemall/regression/AdaDeltaUDTF.java @@ -37,7 +37,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; /** * ADADELTA: AN ADAPTIVE LEARNING RATE METHOD. * - * @deprecated Use {@link hivemall.regression.GeneralRegressionUDTF} instead + * @deprecated Use {@link hivemall.regression.GeneralRegressorUDTF} instead */ @Deprecated @Description( http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/main/java/hivemall/regression/AdaGradUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/regression/AdaGradUDTF.java b/core/src/main/java/hivemall/regression/AdaGradUDTF.java index 237566c..9f01874 100644 --- a/core/src/main/java/hivemall/regression/AdaGradUDTF.java +++ b/core/src/main/java/hivemall/regression/AdaGradUDTF.java @@ -37,7 +37,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; /** * ADAGRAD algorithm with element-wise adaptive learning rates. 
* - * @deprecated Use {@link hivemall.regression.GeneralRegressionUDTF} instead + * @deprecated Use {@link hivemall.regression.GeneralRegressorUDTF} instead */ @Deprecated @Description( http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/main/java/hivemall/regression/GeneralRegressionUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/regression/GeneralRegressionUDTF.java b/core/src/main/java/hivemall/regression/GeneralRegressionUDTF.java deleted file mode 100644 index a34a6e6..0000000 --- a/core/src/main/java/hivemall/regression/GeneralRegressionUDTF.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.regression; - -import hivemall.GeneralLearnerBaseUDTF; -import hivemall.annotations.Since; -import hivemall.model.FeatureValue; -import hivemall.optimizer.LossFunctions.LossFunction; -import hivemall.optimizer.LossFunctions.LossType; - -import javax.annotation.Nonnull; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; - -/** - * A general regression class with replaceable optimization functions. 
- */ -@Description(name = "train_regression", - value = "_FUNC_(list<string|int|bigint> features, double label [, const string options])" - + " - Returns a relation consists of <string|int|bigint feature, float weight>", - extended = "Build a prediction model by a generic regressor") -@Since(version = "0.5-rc.1") -public final class GeneralRegressionUDTF extends GeneralLearnerBaseUDTF { - - @Override - protected String getLossOptionDescription() { - return "Loss function [SquaredLoss (default), QuantileLoss, EpsilonInsensitiveLoss, " - + "SquaredEpsilonInsensitiveLoss, HuberLoss]"; - } - - @Override - protected LossType getDefaultLossType() { - return LossType.SquaredLoss; - } - - @Override - protected void checkLossFunction(@Nonnull LossFunction lossFunction) - throws UDFArgumentException { - if (!lossFunction.forRegression()) { - throw new UDFArgumentException("The loss function `" + lossFunction.getType() - + "` is not designed for regression"); - } - } - - @Override - protected void checkTargetValue(float label) throws UDFArgumentException {} - - @Override - protected void train(@Nonnull final FeatureValue[] features, final float target) { - float p = predict(features); - update(features, target, p); - } - -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/main/java/hivemall/regression/GeneralRegressorUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/regression/GeneralRegressorUDTF.java b/core/src/main/java/hivemall/regression/GeneralRegressorUDTF.java new file mode 100644 index 0000000..cbf8132 --- /dev/null +++ b/core/src/main/java/hivemall/regression/GeneralRegressorUDTF.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.regression; + +import hivemall.GeneralLearnerBaseUDTF; +import hivemall.annotations.Since; +import hivemall.model.FeatureValue; +import hivemall.optimizer.LossFunctions.LossFunction; +import hivemall.optimizer.LossFunctions.LossType; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; + +/** + * A general regression class with replaceable optimization functions. 
+ */ +@Description(name = "train_regressor", + value = "_FUNC_(list<string|int|bigint> features, double label [, const string options])" + + " - Returns a relation consists of <string|int|bigint feature, float weight>", + extended = "Build a prediction model by a generic regressor") +@Since(version = "0.5-rc.1") +public final class GeneralRegressorUDTF extends GeneralLearnerBaseUDTF { + + @Override + protected String getLossOptionDescription() { + return "Loss function [SquaredLoss (default), QuantileLoss, EpsilonInsensitiveLoss, " + + "SquaredEpsilonInsensitiveLoss, HuberLoss]"; + } + + @Override + protected LossType getDefaultLossType() { + return LossType.SquaredLoss; + } + + @Override + protected void checkLossFunction(@Nonnull LossFunction lossFunction) + throws UDFArgumentException { + if (!lossFunction.forRegression()) { + throw new UDFArgumentException("The loss function `" + lossFunction.getType() + + "` is not designed for regression"); + } + } + + @Override + protected void checkTargetValue(float label) throws UDFArgumentException {} + + @Override + protected void train(@Nonnull final FeatureValue[] features, final float target) { + float p = predict(features); + update(features, target, p); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/main/java/hivemall/regression/LogressUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/regression/LogressUDTF.java b/core/src/main/java/hivemall/regression/LogressUDTF.java index c5c5bae..a5670df 100644 --- a/core/src/main/java/hivemall/regression/LogressUDTF.java +++ b/core/src/main/java/hivemall/regression/LogressUDTF.java @@ -31,7 +31,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; /** * Logistic regression using SGD. 
* - * @deprecated Use {@link hivemall.regression.GeneralRegressionUDTF} instead + * @deprecated Use {@link hivemall.regression.GeneralRegressorUDTF} instead */ @Deprecated @Description( http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/test/java/hivemall/regression/GeneralRegressionUDTFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/regression/GeneralRegressionUDTFTest.java b/core/src/test/java/hivemall/regression/GeneralRegressionUDTFTest.java deleted file mode 100644 index f352b89..0000000 --- a/core/src/test/java/hivemall/regression/GeneralRegressionUDTFTest.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.regression; - -import static hivemall.utils.hadoop.HiveUtils.lazyInteger; -import static hivemall.utils.hadoop.HiveUtils.lazyLong; -import static hivemall.utils.hadoop.HiveUtils.lazyString; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import javax.annotation.Nonnull; - -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.Collector; -import org.apache.hadoop.hive.serde2.lazy.LazyInteger; -import org.apache.hadoop.hive.serde2.lazy.LazyLong; -import org.apache.hadoop.hive.serde2.lazy.LazyString; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.junit.Assert; -import org.junit.Test; - -public class GeneralRegressionUDTFTest { - private static final boolean DEBUG = false; - - @Test(expected = UDFArgumentException.class) - public void testUnsupportedOptimizer() throws Exception { - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - ObjectInspector 
params = ObjectInspectorUtils.getConstantObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-opt UnsupportedOpt"); - - udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); - } - - @Test(expected = UDFArgumentException.class) - public void testUnsupportedLossFunction() throws Exception { - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-loss UnsupportedLoss"); - - udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); - } - - @Test(expected = UDFArgumentException.class) - public void testInvalidLossFunction() throws Exception { - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-loss HingeLoss"); - - udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); - } - - @Test(expected = UDFArgumentException.class) - public void testUnsupportedRegularization() throws Exception { - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = 
ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-reg UnsupportedReg"); - - udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); - } - - @Test - public void testNoOptions() throws Exception { - List<String> x = Arrays.asList("1:-2", "2:-1"); - float y = 0.f; - - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector intOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - - udtf.initialize(new ObjectInspector[] {stringListOI, intOI}); - - udtf.process(new Object[] {x, y}); - - udtf.finalizeTraining(); - - float predicted = udtf.predict(udtf.parseFeatures(x)); - Assert.assertEquals(y, predicted, 1E-5); - } - - private <T> void testFeature(@Nonnull List<T> x, @Nonnull ObjectInspector featureOI, - @Nonnull Class<T> featureClass, @Nonnull Class<?> modelFeatureClass) throws Exception { - float y = 0.f; - - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ListObjectInspector featureListOI = ObjectInspectorFactory.getStandardListObjectInspector(featureOI); - - udtf.initialize(new ObjectInspector[] {featureListOI, valueOI}); - - final List<Object> modelFeatures = new ArrayList<Object>(); - udtf.setCollector(new Collector() { - @Override - public void collect(Object input) throws HiveException { - Object[] forwardMapObj = (Object[]) input; - modelFeatures.add(forwardMapObj[0]); - } - }); - - udtf.process(new Object[] {x, y}); - - udtf.close(); - - Assert.assertFalse(modelFeatures.isEmpty()); - for (Object modelFeature : modelFeatures) { - Assert.assertEquals("All model features must have 
same type", modelFeatureClass, - modelFeature.getClass()); - } - } - - @Test - public void testLazyStringFeature() throws Exception { - LazyStringObjectInspector oi = LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector( - false, (byte) 0); - List<LazyString> x = Arrays.asList(lazyString("ãã¹ã:-2", oi), lazyString("æ¼¢å:-333.0", oi), - lazyString("test:-1")); - testFeature(x, oi, LazyString.class, String.class); - } - - @Test - public void testStringFeature() throws Exception { - List<String> x = Arrays.asList("1:-2", "2:-1"); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - testFeature(x, featureOI, String.class, String.class); - } - - @Test(expected = IllegalArgumentException.class) - public void testIlleagalStringFeature() throws Exception { - List<String> x = Arrays.asList("1:-2jjjj", "2:-1"); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - testFeature(x, featureOI, String.class, String.class); - } - - @Test - public void testTextFeature() throws Exception { - List<Text> x = Arrays.asList(new Text("1:-2"), new Text("2:-1")); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; - testFeature(x, featureOI, Text.class, String.class); - } - - @Test - public void testIntegerFeature() throws Exception { - List<Integer> x = Arrays.asList(111, 222); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaIntObjectInspector; - testFeature(x, featureOI, Integer.class, Integer.class); - } - - @Test - public void testLazyIntegerFeature() throws Exception { - List<LazyInteger> x = Arrays.asList(lazyInteger(111), lazyInteger(222)); - ObjectInspector featureOI = LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR; - testFeature(x, featureOI, LazyInteger.class, Integer.class); - } - - @Test - public void testWritableIntFeature() throws Exception { - List<IntWritable> x = Arrays.asList(new IntWritable(111), new 
IntWritable(222)); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableIntObjectInspector; - testFeature(x, featureOI, IntWritable.class, Integer.class); - } - - @Test - public void testLongFeature() throws Exception { - List<Long> x = Arrays.asList(111L, 222L); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector; - testFeature(x, featureOI, Long.class, Long.class); - } - - @Test - public void testLazyLongFeature() throws Exception { - List<LazyLong> x = Arrays.asList(lazyLong(111), lazyLong(222)); - ObjectInspector featureOI = LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR; - testFeature(x, featureOI, LazyLong.class, Long.class); - } - - @Test - public void testWritableLongFeature() throws Exception { - List<LongWritable> x = Arrays.asList(new LongWritable(111L), new LongWritable(222L)); - ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector; - testFeature(x, featureOI, LongWritable.class, Long.class); - } - - private void run(@Nonnull String options) throws Exception { - println(options); - - int numSamples = 100; - - float x1Min = -5.f, x1Max = 5.f; - float x1Step = (x1Max - x1Min) / numSamples; - - float x2Min = -3.f, x2Max = 3.f; - float x2Step = (x2Max - x2Min) / numSamples; - - ArrayList<List<String>> samplesList = new ArrayList<List<String>>(numSamples); - ArrayList<Float> ys = new ArrayList<Float>(numSamples); - float x1 = x1Min, x2 = x2Min; - - for (int i = 0; i < numSamples; i++) { - samplesList.add(Arrays.asList("1:" + String.valueOf(x1), "2:" + String.valueOf(x2))); - - ys.add(x1 * 0.5f); - - x1 += x1Step; - x2 += x2Step; - } - - GeneralRegressionUDTF udtf = new GeneralRegressionUDTF(); - ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; - ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ListObjectInspector stringListOI = 
ObjectInspectorFactory.getStandardListObjectInspector(stringOI); - ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, options); - - udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); - - float accum = 0.f; - for (int i = 0; i < numSamples; i++) { - float y = ys.get(i).floatValue(); - float predicted = udtf.predict(udtf.parseFeatures(samplesList.get(i))); - accum += Math.abs(y - predicted); - } - float maeInit = accum / numSamples; - println("Mean absolute error before training: " + maeInit); - - for (int i = 0; i < numSamples; i++) { - udtf.process(new Object[] {samplesList.get(i), (Float) ys.get(i)}); - } - - udtf.finalizeTraining(); - - double cumLoss = udtf.getCumulativeLoss(); - println("Cumulative loss: " + cumLoss); - double normalizedLoss = cumLoss / numSamples; - Assert.assertTrue("cumLoss: " + cumLoss + ", normalizedLoss: " + normalizedLoss - + "\noptions: " + options, normalizedLoss < 0.1d); - - accum = 0.f; - for (int i = 0; i < numSamples; i++) { - float y = ys.get(i).floatValue(); - - float predicted = udtf.predict(udtf.parseFeatures(samplesList.get(i))); - println("Predicted: " + predicted + ", Actual: " + y); - - accum += Math.abs(y - predicted); - } - float mae = accum / numSamples; - println("Mean absolute error after training: " + mae); - Assert.assertTrue("accum: " + accum + ", mae (init):" + maeInit + ", mae:" + mae - + "\noptions: " + options, mae < maeInit); - } - - @Test - public void test() throws Exception { - String[] optimizers = new String[] {"SGD", "AdaDelta", "AdaGrad", "Adam"}; - String[] regularizations = new String[] {"NO", "L1", "L2", "ElasticNet", "RDA"}; - String[] lossFunctions = new String[] {"SquaredLoss", "QuantileLoss", - "EpsilonInsensitiveLoss", "SquaredEpsilonInsensitiveLoss", "HuberLoss"}; - - for (String opt : optimizers) { - for (String reg : regularizations) { - if (reg == "RDA" && opt != "AdaGrad") { - continue; - 
} - - for (String loss : lossFunctions) { - String options = "-opt " + opt + " -reg " + reg + " -loss " + loss - + " -iter 512"; - - // sparse - run(options); - - // mini-batch - if (opt != "AdaGrad") { - options += " -mini_batch 10"; - run(options); - } - - // dense - options += " -dense"; - run(options); - } - } - } - } - - private static void println(String msg) { - if (DEBUG) { - System.out.println(msg); - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/core/src/test/java/hivemall/regression/GeneralRegressorUDTFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/regression/GeneralRegressorUDTFTest.java b/core/src/test/java/hivemall/regression/GeneralRegressorUDTFTest.java new file mode 100644 index 0000000..efc0699 --- /dev/null +++ b/core/src/test/java/hivemall/regression/GeneralRegressorUDTFTest.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.regression; + +import static hivemall.utils.hadoop.HiveUtils.lazyInteger; +import static hivemall.utils.hadoop.HiveUtils.lazyLong; +import static hivemall.utils.hadoop.HiveUtils.lazyString; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.Collector; +import org.apache.hadoop.hive.serde2.lazy.LazyInteger; +import org.apache.hadoop.hive.serde2.lazy.LazyLong; +import org.apache.hadoop.hive.serde2.lazy.LazyString; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class GeneralRegressorUDTFTest { + private static final boolean DEBUG = false; + + @Test(expected = UDFArgumentException.class) + public void testUnsupportedOptimizer() throws Exception { + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + ObjectInspector params = 
ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-opt UnsupportedOpt"); + + udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); + } + + @Test(expected = UDFArgumentException.class) + public void testUnsupportedLossFunction() throws Exception { + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-loss UnsupportedLoss"); + + udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); + } + + @Test(expected = UDFArgumentException.class) + public void testInvalidLossFunction() throws Exception { + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-loss HingeLoss"); + + udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); + } + + @Test(expected = UDFArgumentException.class) + public void testUnsupportedRegularization() throws Exception { + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = 
ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-reg UnsupportedReg"); + + udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); + } + + @Test + public void testNoOptions() throws Exception { + List<String> x = Arrays.asList("1:-2", "2:-1"); + float y = 0.f; + + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector intOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + + udtf.initialize(new ObjectInspector[] {stringListOI, intOI}); + + udtf.process(new Object[] {x, y}); + + udtf.finalizeTraining(); + + float predicted = udtf.predict(udtf.parseFeatures(x)); + Assert.assertEquals(y, predicted, 1E-5); + } + + private <T> void testFeature(@Nonnull List<T> x, @Nonnull ObjectInspector featureOI, + @Nonnull Class<T> featureClass, @Nonnull Class<?> modelFeatureClass) throws Exception { + float y = 0.f; + + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ListObjectInspector featureListOI = ObjectInspectorFactory.getStandardListObjectInspector(featureOI); + + udtf.initialize(new ObjectInspector[] {featureListOI, valueOI}); + + final List<Object> modelFeatures = new ArrayList<Object>(); + udtf.setCollector(new Collector() { + @Override + public void collect(Object input) throws HiveException { + Object[] forwardMapObj = (Object[]) input; + modelFeatures.add(forwardMapObj[0]); + } + }); + + udtf.process(new Object[] {x, y}); + + udtf.close(); + + Assert.assertFalse(modelFeatures.isEmpty()); + for (Object modelFeature : modelFeatures) { + Assert.assertEquals("All model features must have same 
type", modelFeatureClass, + modelFeature.getClass()); + } + } + + @Test + public void testLazyStringFeature() throws Exception { + LazyStringObjectInspector oi = LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector( + false, (byte) 0); + List<LazyString> x = Arrays.asList(lazyString("テスト:-2", oi), lazyString("漢字:-333.0", oi), + lazyString("test:-1")); + testFeature(x, oi, LazyString.class, String.class); + } + + @Test + public void testStringFeature() throws Exception { + List<String> x = Arrays.asList("1:-2", "2:-1"); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testFeature(x, featureOI, String.class, String.class); + } + + @Test(expected = IllegalArgumentException.class) + public void testIlleagalStringFeature() throws Exception { + List<String> x = Arrays.asList("1:-2jjjj", "2:-1"); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testFeature(x, featureOI, String.class, String.class); + } + + @Test + public void testTextFeature() throws Exception { + List<Text> x = Arrays.asList(new Text("1:-2"), new Text("2:-1")); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + testFeature(x, featureOI, Text.class, String.class); + } + + @Test + public void testIntegerFeature() throws Exception { + List<Integer> x = Arrays.asList(111, 222); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaIntObjectInspector; + testFeature(x, featureOI, Integer.class, Integer.class); + } + + @Test + public void testLazyIntegerFeature() throws Exception { + List<LazyInteger> x = Arrays.asList(lazyInteger(111), lazyInteger(222)); + ObjectInspector featureOI = LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR; + testFeature(x, featureOI, LazyInteger.class, Integer.class); + } + + @Test + public void testWritableIntFeature() throws Exception { + List<IntWritable> x = Arrays.asList(new IntWritable(111), new 
IntWritable(222)); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableIntObjectInspector; + testFeature(x, featureOI, IntWritable.class, Integer.class); + } + + @Test + public void testLongFeature() throws Exception { + List<Long> x = Arrays.asList(111L, 222L); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector; + testFeature(x, featureOI, Long.class, Long.class); + } + + @Test + public void testLazyLongFeature() throws Exception { + List<LazyLong> x = Arrays.asList(lazyLong(111), lazyLong(222)); + ObjectInspector featureOI = LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR; + testFeature(x, featureOI, LazyLong.class, Long.class); + } + + @Test + public void testWritableLongFeature() throws Exception { + List<LongWritable> x = Arrays.asList(new LongWritable(111L), new LongWritable(222L)); + ObjectInspector featureOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector; + testFeature(x, featureOI, LongWritable.class, Long.class); + } + + private void run(@Nonnull String options) throws Exception { + println(options); + + int numSamples = 100; + + float x1Min = -5.f, x1Max = 5.f; + float x1Step = (x1Max - x1Min) / numSamples; + + float x2Min = -3.f, x2Max = 3.f; + float x2Step = (x2Max - x2Min) / numSamples; + + ArrayList<List<String>> samplesList = new ArrayList<List<String>>(numSamples); + ArrayList<Float> ys = new ArrayList<Float>(numSamples); + float x1 = x1Min, x2 = x2Min; + + for (int i = 0; i < numSamples; i++) { + samplesList.add(Arrays.asList("1:" + String.valueOf(x1), "2:" + String.valueOf(x2))); + + ys.add(x1 * 0.5f); + + x1 += x1Step; + x2 += x2Step; + } + + GeneralRegressorUDTF udtf = new GeneralRegressorUDTF(); + ObjectInspector floatOI = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ListObjectInspector stringListOI = 
ObjectInspectorFactory.getStandardListObjectInspector(stringOI); + ObjectInspector params = ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, options); + + udtf.initialize(new ObjectInspector[] {stringListOI, floatOI, params}); + + float accum = 0.f; + for (int i = 0; i < numSamples; i++) { + float y = ys.get(i).floatValue(); + float predicted = udtf.predict(udtf.parseFeatures(samplesList.get(i))); + accum += Math.abs(y - predicted); + } + float maeInit = accum / numSamples; + println("Mean absolute error before training: " + maeInit); + + for (int i = 0; i < numSamples; i++) { + udtf.process(new Object[] {samplesList.get(i), (Float) ys.get(i)}); + } + + udtf.finalizeTraining(); + + double cumLoss = udtf.getCumulativeLoss(); + println("Cumulative loss: " + cumLoss); + double normalizedLoss = cumLoss / numSamples; + Assert.assertTrue("cumLoss: " + cumLoss + ", normalizedLoss: " + normalizedLoss + + "\noptions: " + options, normalizedLoss < 0.1d); + + accum = 0.f; + for (int i = 0; i < numSamples; i++) { + float y = ys.get(i).floatValue(); + + float predicted = udtf.predict(udtf.parseFeatures(samplesList.get(i))); + println("Predicted: " + predicted + ", Actual: " + y); + + accum += Math.abs(y - predicted); + } + float mae = accum / numSamples; + println("Mean absolute error after training: " + mae); + Assert.assertTrue("accum: " + accum + ", mae (init):" + maeInit + ", mae:" + mae + + "\noptions: " + options, mae < maeInit); + } + + @Test + public void test() throws Exception { + String[] optimizers = new String[] {"SGD", "AdaDelta", "AdaGrad", "Adam"}; + String[] regularizations = new String[] {"NO", "L1", "L2", "ElasticNet", "RDA"}; + String[] lossFunctions = new String[] {"SquaredLoss", "QuantileLoss", + "EpsilonInsensitiveLoss", "SquaredEpsilonInsensitiveLoss", "HuberLoss"}; + + for (String opt : optimizers) { + for (String reg : regularizations) { + if (reg == "RDA" && opt != "AdaGrad") { + continue; + 
} + + for (String loss : lossFunctions) { + String options = "-opt " + opt + " -reg " + reg + " -loss " + loss + + " -iter 512"; + + // sparse + run(options); + + // mini-batch + if (opt != "AdaGrad") { + options += " -mini_batch 10"; + run(options); + } + + // dense + options += " -dense"; + run(options); + } + } + } + } + + private static void println(String msg) { + if (DEBUG) { + System.out.println(msg); + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/FOOTER.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/FOOTER.md b/docs/gitbook/FOOTER.md index b6f2c55..7c18fad 100644 --- a/docs/gitbook/FOOTER.md +++ b/docs/gitbook/FOOTER.md @@ -19,4 +19,4 @@ <sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. -</font></sub> \ No newline at end of file +</font></sub> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index f228cfe..f5ab81e 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -27,7 +27,7 @@ * [Input Format](getting_started/input-format.md) * [Tips for Effective Hivemall](tips/README.md) - * [Explicit addBias() for better prediction](tips/addbias.md) + * [Explicit add_bias() for better prediction](tips/addbias.md) * [Use rand_amplify() to better prediction results](tips/rand_amplify.md) * [Real-time Prediction on RDBMS](tips/rt_prediction.md) * [Ensemble learning for stable prediction](tips/ensemble_learning.md) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/anomaly/sst.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/anomaly/sst.md b/docs/gitbook/anomaly/sst.md index 7268eda..6fc49af 
100644 --- a/docs/gitbook/anomaly/sst.md +++ b/docs/gitbook/anomaly/sst.md @@ -151,4 +151,4 @@ For instance, partial outputs obtained as a result of this query are: |7560 | {"changepoint_score":0.0010629833145070489,"is_changepoint":false}| |...|...| -Obviously, the 7555-th sample is detected as a change-point in this example. \ No newline at end of file +Obviously, the 7555-th sample is detected as a change-point in this example. http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/a9a_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_dataset.md b/docs/gitbook/binaryclass/a9a_dataset.md index 76ccb0d..cd53d46 100644 --- a/docs/gitbook/binaryclass/a9a_dataset.md +++ b/docs/gitbook/binaryclass/a9a_dataset.md @@ -59,4 +59,4 @@ create external table a9atest ( label float, features ARRAY<STRING> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/a9a/test'; -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/a9a_lr.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_lr.md b/docs/gitbook/binaryclass/a9a_lr.md index 9bac63e..247d5a2 100644 --- a/docs/gitbook/binaryclass/a9a_lr.md +++ b/docs/gitbook/binaryclass/a9a_lr.md @@ -39,7 +39,7 @@ select avg(weight) as weight from (select - logress(addBias(features),label,"-total_steps ${total_steps}") as (feature,weight) + logress(add_bias(features),label,"-total_steps ${total_steps}") as (feature,weight) from a9atrain ) t @@ -59,7 +59,7 @@ select extract_feature(feature) as feature, extract_weight(feature) as value from - a9atest LATERAL VIEW explode(addBias(features)) t AS feature + a9atest LATERAL VIEW explode(add_bias(features)) t AS feature ) select t.rowid, 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/a9a_minibatch.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_minibatch.md b/docs/gitbook/binaryclass/a9a_minibatch.md index a79ed86..3fc5945 100644 --- a/docs/gitbook/binaryclass/a9a_minibatch.md +++ b/docs/gitbook/binaryclass/a9a_minibatch.md @@ -35,7 +35,7 @@ select avg(weight) as weight from (select - logress(addBias(features),label,"-total_steps ${total_steps} -mini_batch ${mini_batch_size}") as (feature,weight) + logress(add_bias(features),label,"-total_steps ${total_steps} -mini_batch ${mini_batch_size}") as (feature,weight) from a9atrain ) t @@ -52,4 +52,4 @@ where actual == predicted; | Stochastic Gradient Descent | Minibatch Gradient Descent | | ------------- | ------------- | -| 0.8430071862907684 | 0.8463239358761747 | \ No newline at end of file +| 0.8430071862907684 | 0.8463239358761747 | http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/general.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/general.md b/docs/gitbook/binaryclass/general.md index 931cc58..60483a4 100644 --- a/docs/gitbook/binaryclass/general.md +++ b/docs/gitbook/binaryclass/general.md @@ -133,4 +133,4 @@ from a9a_train ``` -Likewise, you can generate many different classifiers based on its options. \ No newline at end of file +Likewise, you can generate many different classifiers based on its options. 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/kdd2010a_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010a_dataset.md b/docs/gitbook/binaryclass/kdd2010a_dataset.md index 7634f66..d1a346c 100644 --- a/docs/gitbook/binaryclass/kdd2010a_dataset.md +++ b/docs/gitbook/binaryclass/kdd2010a_dataset.md @@ -77,7 +77,7 @@ select split(feature,":")[0] as feature, cast(split(feature,":")[1] as float) as value from - kdd10a_test LATERAL VIEW explode(addBias(features)) t AS feature; + kdd10a_test LATERAL VIEW explode(add_bias(features)) t AS feature; set hivevar:xtimes=3; set hivevar:shufflebuffersize=1000; @@ -88,4 +88,4 @@ select rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features) from kdd10a_train_orcfile; -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/kdd2010a_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010a_scw.md b/docs/gitbook/binaryclass/kdd2010a_scw.md index 5cb19fc..416124a 100644 --- a/docs/gitbook/binaryclass/kdd2010a_scw.md +++ b/docs/gitbook/binaryclass/kdd2010a_scw.md @@ -28,7 +28,7 @@ select voted_avg(weight) as weight from (select - train_pa1(addBias(features),label) as (feature,weight) + train_pa1(add_bias(features),label) as (feature,weight) from kdd10a_train_x3 ) t @@ -76,7 +76,7 @@ select argmin_kld(weight, covar) as weight from (select - train_cw(addBias(features),label) as (feature,weight,covar) + train_cw(add_bias(features),label) as (feature,weight,covar) from kdd10a_train_x3 ) t @@ -119,8 +119,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2alpha3 or later] from (select - -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1] - train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall 
v0.2 or later] + -- train_arow(add_bias(features),label) as (feature,weight) -- [hivemall v0.1] + train_arow(add_bias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] from kdd10a_train_x3 ) t @@ -162,7 +162,7 @@ select argmin_kld(weight, covar) as weight from (select - train_scw(addBias(features),label) as (feature,weight,covar) + train_scw(add_bias(features),label) as (feature,weight,covar) from kdd10a_train_x3 ) t @@ -201,4 +201,4 @@ where actual = predicted; | AROW | 0.8676038894615345 | | PA1 | 0.8677782959894337 | | CW | 0.8678037711002504 | -| SCW1 | 0.8678096499719774 | \ No newline at end of file +| SCW1 | 0.8678096499719774 | http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/kdd2010b_arow.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010b_arow.md b/docs/gitbook/binaryclass/kdd2010b_arow.md index 2ca0d90..ec58cef 100644 --- a/docs/gitbook/binaryclass/kdd2010b_arow.md +++ b/docs/gitbook/binaryclass/kdd2010b_arow.md @@ -28,8 +28,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2alpha3 or later] from (select - -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1] - train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + -- train_arow(add_bias(features),label) as (feature,weight) -- [hivemall v0.1] + train_arow(add_bias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] from kdd10b_train_x3 ) t @@ -67,4 +67,4 @@ from select count(1)/748401 from kdd10b_arow_submit1 where actual = predicted; ``` -> 0.8565808971393678 \ No newline at end of file +> 0.8565808971393678 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/kdd2010b_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010b_dataset.md 
b/docs/gitbook/binaryclass/kdd2010b_dataset.md index 291a783..1a1c3ce 100644 --- a/docs/gitbook/binaryclass/kdd2010b_dataset.md +++ b/docs/gitbook/binaryclass/kdd2010b_dataset.md @@ -65,7 +65,7 @@ select split(feature,":")[0] as feature, cast(split(feature,":")[1] as float) as value from - kdd10b_test LATERAL VIEW explode(addBias(features)) t AS feature; + kdd10b_test LATERAL VIEW explode(add_bias(features)) t AS feature; set hivevar:xtimes=3; set hivevar:shufflebuffersize=1000; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/news20_adagrad.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_adagrad.md b/docs/gitbook/binaryclass/news20_adagrad.md index cbcc0f2..e3dfb47 100644 --- a/docs/gitbook/binaryclass/news20_adagrad.md +++ b/docs/gitbook/binaryclass/news20_adagrad.md @@ -1,189 +1,200 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. --> - -_Note that this feature is supported since Hivemall v0.3-beta2 or later._ - -## UDF preparation -``` -add jar ./tmp/hivemall-with-dependencies.jar; -source ./tmp/define-all.hive; - -use news20; -``` - -#[AdaGradRDA] - -_Note that the current AdaGradRDA implmenetation can only be applied to classification, not to regression, because it uses hinge loss for the loss function._ - - -## model building -```sql -drop table news20b_adagrad_rda_model1; -create table news20b_adagrad_rda_model1 as -select - feature, - voted_avg(weight) as weight -from - (select - train_adagrad_rda(addBias(features),label) as (feature,weight) - from - news20b_train_x3 - ) t -group by feature; -``` - -## prediction -```sql -create or replace view news20b_adagrad_rda_predict1 -as -select - t.rowid, - sum(m.weight * t.value) as total_weight, - case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label -from - news20b_test_exploded t LEFT OUTER JOIN - news20b_adagrad_rda_model1 m ON (t.feature = m.feature) -group by - t.rowid; -``` - -## evaluation -```sql -create or replace view news20b_adagrad_rda_submit1 as -select - t.label as actual, - pd.label as predicted -from - news20b_test t JOIN news20b_adagrad_rda_predict1 pd - on (t.rowid = pd.rowid); -``` - -```sql -select count(1)/4996 from news20b_adagrad_rda_submit1 -where actual == predicted; -``` -> SCW1 0.9661729383506805 - -> ADAGRAD+RDA 0.9677742193755005 - -#[AdaGrad] - 
-_Note that AdaGrad is better suited for a regression problem because the current implementation only support logistic loss._ - -## model building -```sql -drop table news20b_adagrad_model1; -create table news20b_adagrad_model1 as -select - feature, - voted_avg(weight) as weight -from - (select - adagrad(addBias(features),convert_label(label)) as (feature,weight) - from - news20b_train_x3 - ) t -group by feature; -``` -_adagrad takes 0/1 for a label value and convert_label(label) converts a label value from -1/+1 to 0/1._ -## prediction -```sql -create or replace view news20b_adagrad_predict1 -as -select - t.rowid, - case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1 else -1 end as label -from - news20b_test_exploded t LEFT OUTER JOIN - news20b_adagrad_model1 m ON (t.feature = m.feature) -group by - t.rowid; -``` - -## evaluation -```sql -create or replace view news20b_adagrad_submit1 as -select - t.label as actual, - p.label as predicted -from - news20b_test t JOIN news20b_adagrad_predict1 p - on (t.rowid = p.rowid); -``` - -```sql -select count(1)/4996 from news20b_adagrad_submit1 -where actual == predicted; -``` -> 0.9549639711769415 (adagrad) - -#[AdaDelta] - -_Note that AdaDelta is better suited for regression problem because the current implementation only support logistic loss._ - -## model building -```sql -drop table news20b_adadelta_model1; -create table news20b_adadelta_model1 as -select - feature, - voted_avg(weight) as weight -from - (select - adadelta(addBias(features),convert_label(label)) as (feature,weight) - from - news20b_train_x3 - ) t -group by feature; -``` - -## prediction -```sql -create or replace view news20b_adadelta_predict1 -as -select - t.rowid, - case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1 else -1 end as label -from - news20b_test_exploded t LEFT OUTER JOIN - news20b_adadelta_model1 m ON (t.feature = m.feature) -group by - t.rowid; -``` - -## evaluation -```sql -create or replace view news20b_adadelta_submit1 as 
-select - t.label as actual, - p.label as predicted -from - news20b_test t JOIN news20b_adadelta_predict1 p - on (t.rowid = p.rowid); -``` - -```sql -select count(1)/4996 from news20b_adadelta_submit1 -where actual == predicted; -``` -> 0.9549639711769415 (adagrad) - -> 0.9545636509207366 (adadelta) - -_Note that AdaDelta often performs better than AdaGrad._ \ No newline at end of file + +<!-- toc --> + +> #### Note +> This feature is supported since Hivemall `v0.3-beta2` or later. + +## UDF preparation + +``` +add jar ./tmp/hivemall-with-dependencies.jar; +source ./tmp/define-all.hive; + +use news20; +``` + +#[AdaGradRDA] + +> #### Note +> The current AdaGradRDA implmenetation can only be applied to classification, not to regression, because it uses hinge loss for the loss function. + +## model building +```sql +drop table news20b_adagrad_rda_model1; +create table news20b_adagrad_rda_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + train_adagrad_rda(addBias(features),label) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_adagrad_rda_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_adagrad_rda_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_adagrad_rda_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_adagrad_rda_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_adagrad_rda_submit1 +where actual == predicted; +``` +> SCW1 0.9661729383506805 + +> ADAGRAD+RDA 0.9677742193755005 + +#[AdaGrad] + +_Note that AdaGrad is better suited for a regression problem because the current implementation only support logistic loss._ + +## model building +```sql 
+drop table news20b_adagrad_model1; +create table news20b_adagrad_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + adagrad(addBias(features),convert_label(label)) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +> #### Caution +> `adagrad` takes 0/1 for a label value and `convert_label(label)` converts a label value from -1/+1 to 0/1. + +## prediction +```sql +create or replace view news20b_adagrad_predict1 +as +select + t.rowid, + case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_adagrad_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_adagrad_submit1 as +select + t.label as actual, + p.label as predicted +from + news20b_test t JOIN news20b_adagrad_predict1 p + on (t.rowid = p.rowid); +``` + +```sql +select count(1)/4996 from news20b_adagrad_submit1 +where actual == predicted; +``` +> 0.9549639711769415 (adagrad) + +#[AdaDelta] + +> #### Caution +> AdaDelta can only be applied for regression problem because the current implementation only support logistic loss. 
+ +## model building +```sql +drop table news20b_adadelta_model1; +create table news20b_adadelta_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + adadelta(addBias(features),convert_label(label)) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_adadelta_predict1 +as +select + t.rowid, + case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_adadelta_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_adadelta_submit1 as +select + t.label as actual, + p.label as predicted +from + news20b_test t JOIN news20b_adadelta_predict1 p + on (t.rowid = p.rowid); +``` + + + +```sql +select count(1)/4996 from news20b_adadelta_submit1 +where actual == predicted; +``` + +_AdaDelta often performs better than AdaGrad._ + +> 0.9549639711769415 (adagrad) + +> 0.9545636509207366 (adadelta) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/news20_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_dataset.md b/docs/gitbook/binaryclass/news20_dataset.md index 87208cf..d50452d 100644 --- a/docs/gitbook/binaryclass/news20_dataset.md +++ b/docs/gitbook/binaryclass/news20_dataset.md @@ -99,5 +99,5 @@ select -- extract_feature(feature) as feature, -- extract_weight(feature) as value from - news20b_test LATERAL VIEW explode(addBias(features)) t AS feature; -``` \ No newline at end of file + news20b_test LATERAL VIEW explode(add_bias(features)) t AS feature; +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/news20_pa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_pa.md 
b/docs/gitbook/binaryclass/news20_pa.md index df082b9..d40b433 100644 --- a/docs/gitbook/binaryclass/news20_pa.md +++ b/docs/gitbook/binaryclass/news20_pa.md @@ -37,7 +37,7 @@ select voted_avg(weight) as weight from (select - perceptron(addBias(features),label) as (feature,weight) + perceptron(add_bias(features),label) as (feature,weight) from news20b_train_x3 ) t @@ -96,7 +96,7 @@ select voted_avg(weight) as weight from (select - train_pa(addBias(features),label) as (feature,weight) + train_pa(add_bias(features),label) as (feature,weight) from news20b_train_x3 ) t @@ -155,7 +155,7 @@ select voted_avg(weight) as weight from (select - train_pa1(addBias(features),label) as (feature,weight) + train_pa1(add_bias(features),label) as (feature,weight) from news20b_train_x3 ) t @@ -214,7 +214,7 @@ select voted_avg(weight) as weight from (select - train_pa2(addBias(features),label) as (feature,weight) + train_pa2(add_bias(features),label) as (feature,weight) from news20b_train_x3 ) t @@ -259,4 +259,4 @@ where actual == predicted; drop table news20b_pa2_model1; drop view news20b_pa2_predict1; drop view news20b_pa2_submit1; -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/news20_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_scw.md b/docs/gitbook/binaryclass/news20_scw.md index c3f51f4..f364c12 100644 --- a/docs/gitbook/binaryclass/news20_scw.md +++ b/docs/gitbook/binaryclass/news20_scw.md @@ -39,8 +39,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select - -- train_cw(addBias(features), label) as (feature, weight) -- [hivemall v0.1] - train_cw(addBias(features), label) as (feature, weight, covar) -- [hivemall v0.2 or later] + -- train_cw(add_bias(features), label) as (feature, weight) -- [hivemall v0.1] + train_cw(add_bias(features), label) as (feature, weight, covar) -- [hivemall 
v0.2 or later] from news20b_train_x3 ) t @@ -102,8 +102,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select - -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1] - train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + -- train_arow(add_bias(features),label) as (feature,weight) -- [hivemall v0.1] + train_arow(add_bias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] from news20b_train_x3 ) t @@ -164,8 +164,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select - -- train_scw(addBias(features),label) as (feature,weight) -- [hivemall v0.1] - train_scw(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + -- train_scw(add_bias(features),label) as (feature,weight) -- [hivemall v0.1] + train_scw(add_bias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] from news20b_train_x3 ) t @@ -226,8 +226,8 @@ select argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select - -- train_scw2(addBias(features),label) as (feature,weight) -- [hivemall v0.1] - train_scw2(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + -- train_scw2(add_bias(features),label) as (feature,weight) -- [hivemall v0.1] + train_scw2(add_bias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] from news20b_train_x3 ) t @@ -288,4 +288,4 @@ drop view news20b_scw2_submit1; | AROW | 0.9659727782225781 | | SCW1 | 0.9661729383506805 | -My recommendation is AROW for classification. \ No newline at end of file +My recommendation is AROW for classification. 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/binaryclass/webspam_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/webspam_dataset.md b/docs/gitbook/binaryclass/webspam_dataset.md index 40cac07..fe00111 100644 --- a/docs/gitbook/binaryclass/webspam_dataset.md +++ b/docs/gitbook/binaryclass/webspam_dataset.md @@ -67,7 +67,7 @@ INSERT OVERWRITE TABLE webspam_train_orcfile select s.rowid, label, - addBias(features) as features + add_bias(features) as features from webspam_raw s where not exists (select rowid from webspam_test t where s.rowid = t.rowid) CLUSTER BY rand(43); @@ -90,6 +90,6 @@ select split(feature,":")[0] as feature, cast(split(feature,":")[1] as float) as value from - webspam_test LATERAL VIEW explode(addBias(features)) t AS feature; + webspam_test LATERAL VIEW explode(add_bias(features)) t AS feature; ``` -*Caution:* For this dataset, use small *shufflebuffersize* because each training example has lots of features though (xtimes * shufflebuffersize * N) training examples are cached in memory. \ No newline at end of file +*Caution:* For this dataset, use small *shufflebuffersize* because each training example has lots of features though (xtimes * shufflebuffersize * N) training examples are cached in memory. http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/clustering/plsa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/clustering/plsa.md b/docs/gitbook/clustering/plsa.md index 31cc08d..cfdb0ec 100644 --- a/docs/gitbook/clustering/plsa.md +++ b/docs/gitbook/clustering/plsa.md @@ -175,4 +175,4 @@ For instance, [20 newsgroups dataset](http://qwone.com/~jason/20Newsgroups/) whi SELECT train_plsa(features, '-topics 20 -iter 10 -s 128 -delta 0.01 -alpha 512 -eps 0.1') ``` -Clearly, `alpha` is much larger than `0.01` which was used for the dummy data above. 
Let you keep in mind that an appropriate value of `alpha` highly depends on the number of documents and mini-batch size. \ No newline at end of file +Clearly, `alpha` is much larger than `0.01` which was used for the dummy data above. Let you keep in mind that an appropriate value of `alpha` highly depends on the number of documents and mini-batch size. http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/eval/lr_datagen.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/eval/lr_datagen.md b/docs/gitbook/eval/lr_datagen.md index c0cbce0..a48dad7 100644 --- a/docs/gitbook/eval/lr_datagen.md +++ b/docs/gitbook/eval/lr_datagen.md @@ -108,4 +108,4 @@ set mapred.reduce.tasks=-1; -- reset to the default setting hive> select count(1) from lrdata1k; OK 1000 -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/eval/rank.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/eval/rank.md b/docs/gitbook/eval/rank.md index b84979f..ed1a44c 100644 --- a/docs/gitbook/eval/rank.md +++ b/docs/gitbook/eval/rank.md @@ -259,4 +259,4 @@ select from rec t1 join truth t2 on (t1.userid = t2.userid) ; -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/ft_engineering/polynomial.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/polynomial.md b/docs/gitbook/ft_engineering/polynomial.md index 8f3d8cf..4a2cde4 100644 --- a/docs/gitbook/ft_engineering/polynomial.md +++ b/docs/gitbook/ft_engineering/polynomial.md @@ -70,4 +70,4 @@ The `powered_features(array<String> features, int degree [, boolean truncate=tru ```sql select powered_features(array("a:0.5","b:0.2"), 3); > ["a:0.5","a^2:0.25","a^3:0.125","b:0.2","b^2:0.040000003","b^3:0.008"] -``` \ No newline at end of 
file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/ft_engineering/quantify.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/quantify.md b/docs/gitbook/ft_engineering/quantify.md index 1bfaa73..86e0636 100644 --- a/docs/gitbook/ft_engineering/quantify.md +++ b/docs/gitbook/ft_engineering/quantify.md @@ -180,4 +180,4 @@ limit 10; 8 [41,0,0,0,0,121,0,0,0,13,5,16,6,176,5,3] 9 [52,8,0,1,0,1466,1,1,0,20,9,150,1,0,0,0] 10 [32,2,0,0,0,6217,0,1,0,18,9,486,2,181,2,1] -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/ft_engineering/scaling.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md index 7f388d6..b419254 100644 --- a/docs/gitbook/ft_engineering/scaling.md +++ b/docs/gitbook/ft_engineering/scaling.md @@ -191,4 +191,4 @@ group by t1.id ) ... -``` \ No newline at end of file +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/ft_engineering/tfidf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/tfidf.md b/docs/gitbook/ft_engineering/tfidf.md index 46e4fac..4bcaae7 100644 --- a/docs/gitbook/ft_engineering/tfidf.md +++ b/docs/gitbook/ft_engineering/tfidf.md @@ -1,168 +1,171 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. --> - -This document explains how to compute [TF-IDF](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) with Apache Hive/Hivemall. - -What you need to compute TF-IDF is a table/view composing (docid, word) pair, 2 views, and 1 query. - -_Note that this feature is supported since Hivemall v0.3-beta3 or later. 
Macro is supported since Hive 0.12 or later._ - -# Define macros used in the TF-IDF computation -```sql -create temporary macro max2(x INT, y INT) -if(x>y,x,y); - --- create temporary macro idf(df_t INT, n_docs INT) --- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); - -create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT) -tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); -``` - -# Data preparation -To calculate TF-IDF, you need to prepare a relation consists of (docid,word) tuples. -```sql -create external table wikipage ( - docid int, - page string -) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS TEXTFILE; - -cd ~/tmp -wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv - -LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage; - -create or replace view wikipage_exploded -as -select - docid, - word -from - wikipage LATERAL VIEW explode(tokenize(page,true)) t as word -where - not is_stopword(word); -``` -You can download the data of the wikipage table from [this link]( https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv). 
- -# Define views of TF/DF -```sql -create or replace view term_frequency -as -select - docid, - word, - freq -from ( -select - docid, - tf(word) as word2freq -from - wikipage_exploded -group by - docid -) t -LATERAL VIEW explode(word2freq) t2 as word, freq; - -create or replace view document_frequency -as -select - word, - count(distinct docid) docs -from - wikipage_exploded -group by - word; -``` - -# TF-IDF calculation for each docid/word pair -```sql --- set the total number of documents -select count(distinct docid) from wikipage; -set hivevar:n_docs=3; - -create or replace view tfidf -as -select - tf.docid, - tf.word, - -- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf - tfidf(tf.freq, df.docs, ${n_docs}) as tfidf -from - term_frequency tf - JOIN document_frequency df ON (tf.word = df.word) -order by - tfidf desc; -``` - -The result will be as follows: -``` -docid word tfidf -1 justice 0.1641245850805637 -3 knowledge 0.09484606645205085 -2 action 0.07033910867777095 -1 law 0.06564983513276658 -1 found 0.06564983513276658 -1 religion 0.06564983513276658 -1 discussion 0.06564983513276658 - ... - ... -2 act 0.017584777169442737 -2 virtues 0.017584777169442737 -2 well 0.017584777169442737 -2 willingness 0.017584777169442737 -2 find 0.017584777169442737 -2 1 0.014001086678120098 -2 experience 0.014001086678120098 -2 often 0.014001086678120098 -``` -The above result is considered to be appropriate as docid 1, 2, and 3 are the Wikipedia entries of Justice, Wisdom, and Knowledge, respectively. 
- -# Feature Vector with TF-IDF values - -```sql -select - docid, - -- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later - collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later - -- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13 -from - tfidf -group by - docid; -``` - -``` -1 ["justice:0.1641245850805637","found:0.06564983513276658","discussion:0.06564983513276658","law:0.065 -64983513276658","based:0.06564983513276658","religion:0.06564983513276658","viewpoints:0.03282491756638329"," -rationality:0.03282491756638329","including:0.03282491756638329","context:0.03282491756638329","concept:0.032 -82491756638329","rightness:0.03282491756638329","general:0.03282491756638329","many:0.03282491756638329","dif -fering:0.03282491756638329","fairness:0.03282491756638329","social:0.03282491756638329","broadest:0.032824917 -56638329","equity:0.03282491756638329","includes:0.03282491756638329","theology:0.03282491756638329","ethics: -0.03282491756638329","moral:0.03282491756638329","numerous:0.03282491756638329","philosophical:0.032824917566 -38329","application:0.03282491756638329","perspectives:0.03282491756638329","procedural:0.03282491756638329", -"realm:0.03282491756638329","divided:0.03282491756638329","concepts:0.03282491756638329","attainment:0.032824 -91756638329","fields:0.03282491756638329","often:0.026135361945200226","philosophy:0.026135361945200226","stu -dy:0.026135361945200226"] -2 ["action:0.07033910867777095","wisdom:0.05275433288400458","one:0.05275433288400458","understanding:0 -.04200326112968063","judgement:0.035169554338885474","apply:0.035169554338885474","disposition:0.035169554338 -885474","given:0.035169554338885474" -... + +This document explains how to compute [TF-IDF](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) with Apache Hive/Hivemall. + +What you need to compute TF-IDF is a table/view composing (docid, word) pair, 2 views, and 1 query. 
+ +<!-- toc --> + +> #### Note +> This feature is supported since Hivemall v0.3-beta3 or later. Macro is supported since Hive 0.12 or later. + +# Define macros used in the TF-IDF computation +```sql +create temporary macro max2(x INT, y INT) +if(x>y,x,y); + +-- create temporary macro idf(df_t INT, n_docs INT) +-- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); + +create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT) +tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); +``` + +# Data preparation +To calculate TF-IDF, you need to prepare a relation consists of (docid,word) tuples. +```sql +create external table wikipage ( + docid int, + page string +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE; + +cd ~/tmp +wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv + +LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage; + +create or replace view wikipage_exploded +as +select + docid, + word +from + wikipage LATERAL VIEW explode(tokenize(page,true)) t as word +where + not is_stopword(word); +``` +You can download the data of the wikipage table from [this link]( https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv). 
+ +# Define views of TF/DF +```sql +create or replace view term_frequency +as +select + docid, + word, + freq +from ( +select + docid, + tf(word) as word2freq +from + wikipage_exploded +group by + docid +) t +LATERAL VIEW explode(word2freq) t2 as word, freq; + +create or replace view document_frequency +as +select + word, + count(distinct docid) docs +from + wikipage_exploded +group by + word; +``` + +# TF-IDF calculation for each docid/word pair +```sql +-- set the total number of documents +select count(distinct docid) from wikipage; +set hivevar:n_docs=3; + +create or replace view tfidf +as +select + tf.docid, + tf.word, + -- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf + tfidf(tf.freq, df.docs, ${n_docs}) as tfidf +from + term_frequency tf + JOIN document_frequency df ON (tf.word = df.word) +order by + tfidf desc; +``` + +The result will be as follows: +``` +docid word tfidf +1 justice 0.1641245850805637 +3 knowledge 0.09484606645205085 +2 action 0.07033910867777095 +1 law 0.06564983513276658 +1 found 0.06564983513276658 +1 religion 0.06564983513276658 +1 discussion 0.06564983513276658 + ... + ... +2 act 0.017584777169442737 +2 virtues 0.017584777169442737 +2 well 0.017584777169442737 +2 willingness 0.017584777169442737 +2 find 0.017584777169442737 +2 1 0.014001086678120098 +2 experience 0.014001086678120098 +2 often 0.014001086678120098 +``` +The above result is considered to be appropriate as docid 1, 2, and 3 are the Wikipedia entries of Justice, Wisdom, and Knowledge, respectively. 
+ +# Feature Vector with TF-IDF values + +```sql +select + docid, + -- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later + collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later + -- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13 +from + tfidf +group by + docid; +``` + +``` +1 ["justice:0.1641245850805637","found:0.06564983513276658","discussion:0.06564983513276658","law:0.065 +64983513276658","based:0.06564983513276658","religion:0.06564983513276658","viewpoints:0.03282491756638329"," +rationality:0.03282491756638329","including:0.03282491756638329","context:0.03282491756638329","concept:0.032 +82491756638329","rightness:0.03282491756638329","general:0.03282491756638329","many:0.03282491756638329","dif +fering:0.03282491756638329","fairness:0.03282491756638329","social:0.03282491756638329","broadest:0.032824917 +56638329","equity:0.03282491756638329","includes:0.03282491756638329","theology:0.03282491756638329","ethics: +0.03282491756638329","moral:0.03282491756638329","numerous:0.03282491756638329","philosophical:0.032824917566 +38329","application:0.03282491756638329","perspectives:0.03282491756638329","procedural:0.03282491756638329", +"realm:0.03282491756638329","divided:0.03282491756638329","concepts:0.03282491756638329","attainment:0.032824 +91756638329","fields:0.03282491756638329","often:0.026135361945200226","philosophy:0.026135361945200226","stu +dy:0.026135361945200226"] +2 ["action:0.07033910867777095","wisdom:0.05275433288400458","one:0.05275433288400458","understanding:0 +.04200326112968063","judgement:0.035169554338885474","apply:0.035169554338885474","disposition:0.035169554338 +885474","given:0.035169554338885474" +... 
``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/geospatial/latlon.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/geospatial/latlon.md b/docs/gitbook/geospatial/latlon.md index 6c86bd1..4d6ec06 100644 --- a/docs/gitbook/geospatial/latlon.md +++ b/docs/gitbook/geospatial/latlon.md @@ -132,4 +132,4 @@ from |http://tile.openstreetmap.org/4/8/5.png|https://www.google.com/maps/@51.51202,0.02435,4z| |NULL|NULL| -![http://tile.openstreetmap.org/17/65544/43582.png](http://tile.openstreetmap.org/17/65544/43582.png "http://tile.openstreetmap.org/17/65544/43582.png") \ No newline at end of file +![http://tile.openstreetmap.org/17/65544/43582.png](http://tile.openstreetmap.org/17/65544/43582.png "http://tile.openstreetmap.org/17/65544/43582.png") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/getting_started/README.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/README.md b/docs/gitbook/getting_started/README.md index 98393aa..5ccdd69 100644 --- a/docs/gitbook/getting_started/README.md +++ b/docs/gitbook/getting_started/README.md @@ -17,4 +17,4 @@ under the License. --> -# Summary \ No newline at end of file +# Summary http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7205de1e/docs/gitbook/getting_started/input-format.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/input-format.md b/docs/gitbook/getting_started/input-format.md index 59e6a5f..7bd8573 100644 --- a/docs/gitbook/getting_started/input-format.md +++ b/docs/gitbook/getting_started/input-format.md @@ -80,7 +80,7 @@ Note 1.0 is used for the weight when omitting *weight*. Note that "0" is reserved for a Bias variable (called dummy variable in Statistics). 
-The [addBias](../tips/addbias.html) function is Hivemall appends "0:1.0" as an element of array in *features*. +The [add_bias](../tips/addbias.html) function of Hivemall appends "0:1.0" as an element of the *features* array. ## Feature hashing @@ -230,4 +230,4 @@ select click_or_not as label from table; -``` \ No newline at end of file +```