Author: srowen
Date: Wed Jan 26 23:21:41 2011
New Revision: 1063916
URL: http://svn.apache.org/viewvc?rev=1063916&view=rev
Log:
MAHOUT-594 Replace FileWriter and FileReader usage with an idiom that properly
specifies the character encoding
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/minhash/LastfmDataConverter.java
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
Wed Jan 26 23:21:41 2011
@@ -42,15 +42,18 @@ import org.apache.mahout.math.stats.Onli
import java.io.DataInputStream;
import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
+import java.io.Writer;
import java.lang.reflect.Type;
+import java.nio.charset.Charset;
import java.util.List;
/**
@@ -89,7 +92,7 @@ public final class ModelSerializer {
}
public static void writeJson(String path, OnlineLearner model) throws
IOException {
- OutputStreamWriter out = new FileWriter(path);
+ Writer out = new OutputStreamWriter(new FileOutputStream(new File(path)),
Charset.forName("UTF-8"));
try {
out.write(gson().toJson(model));
} finally {
@@ -414,6 +417,8 @@ public final class ModelSerializer {
}
public static void main(String[] args) throws FileNotFoundException {
- OnlineLogisticRegression m = ModelSerializer.loadJsonFrom(new
FileReader("/tmp/news-group-1000.model"), OnlineLogisticRegression.class);
+ loadJsonFrom(new InputStreamReader(new FileInputStream(new
File("/tmp/news-group-1000.model")),
+ Charset.forName("UTF-8")),
+ OnlineLogisticRegression.class);
}
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Wed Jan 26 23:21:41 2011
@@ -19,8 +19,10 @@ package org.apache.mahout.cf.taste.hadoo
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.FilenameFilter;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
@@ -212,7 +214,8 @@ public final class ItemSimilarityTest ex
return name.startsWith("part-");
}
})[0];
- BufferedReader reader = new BufferedReader(new FileReader(outPart));
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(new FileInputStream(outPart),
Charset.forName("UTF-8")));
String line;
int currentLine = 1;
@@ -308,7 +311,8 @@ public final class ItemSimilarityTest ex
return name.startsWith("part-");
}
})[0];
- BufferedReader reader = new BufferedReader(new FileReader(outPart));
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(new FileInputStream(outPart),
Charset.forName("UTF-8")));
String line;
int currentLine = 1;
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java
Wed Jan 26 23:21:41 2011
@@ -17,9 +17,11 @@
package org.apache.mahout.fpm.pfpgrowth;
-import java.io.BufferedWriter;
import java.io.File;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -54,7 +56,7 @@ public final class PFPGrowthTest extends
File input = new File(inputDir, "test.txt");
params.set(PFPGrowth.INPUT, input.getAbsolutePath());
params.set(PFPGrowth.OUTPUT, outputDir.getAbsolutePath());
- BufferedWriter writer = new BufferedWriter(new FileWriter(input));
+ Writer writer = new OutputStreamWriter(new FileOutputStream(input),
Charset.forName("UTF-8"));
try {
Collection<List<String>> transactions = new ArrayList<List<String>>();
transactions.add(Arrays.asList("E", "A", "D", "B"));
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
Wed Jan 26 23:21:41 2011
@@ -20,7 +20,6 @@ package org.apache.mahout.classifier.sgd
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
import com.google.gson.InstanceCreator;
import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
@@ -34,12 +33,13 @@ import org.apache.mahout.math.DenseMatri
import org.apache.mahout.math.Matrix;
import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.lang.reflect.Type;
+import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -145,7 +145,7 @@ public class LogisticModelParameters {
* @throws IOException If there is an error opening or closing the file.
*/
public static LogisticModelParameters loadFrom(File in) throws IOException {
- InputStreamReader input = new FileReader(in);
+ Reader input = new InputStreamReader(new FileInputStream(in),
Charset.forName("UTF-8"));
try {
return loadFrom(input);
} finally {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
Wed Jan 26 23:21:41 2011
@@ -31,11 +31,12 @@ import org.apache.mahout.vectorizer.enco
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
import java.util.List;
import java.util.Random;
@@ -84,7 +85,8 @@ public class SimpleCsvExamples {
}
out.close();
} else if ("--parse".equals(args[0])) {
- BufferedReader in = new BufferedReader(new FileReader(args[1]));
+ BufferedReader in = new BufferedReader(
+ new InputStreamReader(new FileInputStream(new File(args[1])),
Charset.forName("UTF-8")));
String line = in.readLine();
while (line != null) {
v.assign(0);
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
Wed Jan 26 23:21:41 2011
@@ -31,13 +31,16 @@ import org.apache.mahout.math.RandomAcce
import org.apache.mahout.math.Vector;
import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.FileWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
-import java.net.URL;
+import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.List;
@@ -101,7 +104,7 @@ public final class TrainLogistic {
in.close();
}
- OutputStreamWriter modelOutput = new FileWriter(outputFile);
+ Writer modelOutput = new OutputStreamWriter(new
FileOutputStream(outputFile), Charset.forName("UTF-8"));
try {
lmp.saveTo(modelOutput);
} finally {
@@ -299,13 +302,12 @@ public final class TrainLogistic {
}
static BufferedReader open(String inputFile) throws IOException {
- InputStreamReader s;
+ InputStream in;
try {
- URL resource = Resources.getResource(inputFile);
- s = new InputStreamReader(resource.openStream());
+ in= Resources.getResource(inputFile).openStream();
} catch (IllegalArgumentException e) {
- s = new FileReader(inputFile);
+ in = new FileInputStream(new File(inputFile));
}
- return new BufferedReader(s);
+ return new BufferedReader(new InputStreamReader(in,
Charset.forName("UTF-8")));
}
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
Wed Jan 26 23:21:41 2011
@@ -42,10 +42,12 @@ import org.apache.mahout.vectorizer.enco
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
+import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collection;
@@ -214,7 +216,8 @@ public final class TrainNewsGroups {
}
if (k % (bump * scale) == 0) {
if (learningAlgorithm.getBest() != null) {
- ModelSerializer.writeBinary("/tmp/news-group-" + k + ".model",
learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
+ ModelSerializer.writeBinary("/tmp/news-group-" + k + ".model",
+
learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
}
step += 0.25;
@@ -227,7 +230,8 @@ public final class TrainNewsGroups {
dissect(leakType, newsGroups, learningAlgorithm, files);
System.out.println("exiting main");
- ModelSerializer.writeBinary("/tmp/news-group.model",
learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
+ ModelSerializer.writeBinary("/tmp/news-group.model",
+
learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
List<Integer> counts = Lists.newArrayList();
System.out.printf("Word counts\n");
@@ -270,8 +274,9 @@ public final class TrainNewsGroups {
List<String> ngNames = Lists.newArrayList(newsGroups.values());
List<ModelDissector.Weight> weights = md.summary(100);
for (ModelDissector.Weight w : weights) {
- System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s\n", w.getFeature(),
w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
- w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
+ System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s\n",
+ w.getFeature(), w.getWeight(),
ngNames.get(w.getMaxImpact() + 1),
+ w.getCategory(1), w.getWeight(1), w.getCategory(2),
w.getWeight(2));
}
}
@@ -279,7 +284,8 @@ public final class TrainNewsGroups {
long date = (long) (1000 * (DATE_REFERENCE + actual * MONTH + 1 * WEEK *
rand.nextDouble()));
Multiset<String> words = ConcurrentHashMultiset.create();
- BufferedReader reader = new BufferedReader(new FileReader(file));
+ BufferedReader reader =
+ new BufferedReader(new InputStreamReader(new FileInputStream(file),
Charset.forName("UTF-8")));
try {
String line = reader.readLine();
Reader dateString = new StringReader(DATE_FORMATS[leakType %
3].format(new Date(date)));
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
Wed Jan 26 23:21:41 2011
@@ -19,8 +19,11 @@ package org.apache.mahout.clustering.dis
import java.awt.Graphics;
import java.awt.Graphics2D;
-import java.io.FileWriter;
-import java.io.PrintWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -54,16 +57,16 @@ class DisplaySpectralKMeans extends Disp
if (!fs.exists(output)) {
fs.mkdirs(output);
}
- FileWriter writer = new FileWriter(affinities.toString());
- PrintWriter out = new PrintWriter(writer);
+ Writer writer = new OutputStreamWriter(
+ new FileOutputStream(new File(affinities.toString())),
Charset.forName("UTF-8"));
try {
for (int i = 0; i < SAMPLE_DATA.size(); i++) {
for (int j = 0; j < SAMPLE_DATA.size(); j++) {
- out.println(i + "," + j + ',' +
measure.distance(SAMPLE_DATA.get(i).get(), SAMPLE_DATA.get(j).get()));
+ writer.write(i + "," + j + ',' +
measure.distance(SAMPLE_DATA.get(i).get(), SAMPLE_DATA.get(j).get()) + '\n');
}
}
} finally {
- out.close();
+ writer.close();
}
int maxIter = 10;
double convergenceDelta = 0.001;
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/minhash/LastfmDataConverter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/minhash/LastfmDataConverter.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/minhash/LastfmDataConverter.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/minhash/LastfmDataConverter.java
Wed Jan 26 23:21:41 2011
@@ -27,8 +27,11 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import java.io.BufferedReader;
-import java.io.FileReader;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -97,7 +100,8 @@ public final class LastfmDataConverter {
Map<String, Integer> featureIdxMap = new HashMap<String, Integer>();
Map<String, List<Integer>> itemFeaturesMap = new HashMap<String,
List<Integer>>();
String msg = usedMemory() + "Converting data to internal vector format: ";
- BufferedReader br = new BufferedReader(new FileReader(inputFile));
+ BufferedReader br = new BufferedReader(
+ new InputStreamReader(new FileInputStream(new File(inputFile)),
Charset.forName("UTF-8")));
try {
System.out.print(msg);
int prevPercentDone = 1;
@@ -194,8 +198,7 @@ public final class LastfmDataConverter {
return;
}
Lastfm dataSet = Lastfm.valueOf(args[2]);
- Map<String, List<Integer>> itemFeatures = convertToItemFeatures(args[0],
- dataSet);
+ Map<String, List<Integer>> itemFeatures = convertToItemFeatures(args[0],
dataSet);
if (itemFeatures.isEmpty()) {
throw new IllegalStateException("Error converting the data file: [" +
args[0] + ']');
}
Modified:
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
(original)
+++
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
Wed Jan 26 23:21:41 2011
@@ -17,9 +17,7 @@
package org.apache.mahout.classifier.sgd;
-import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
-import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import com.google.common.io.Resources;
@@ -30,23 +28,24 @@ import org.apache.mahout.math.Vector;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
-import java.io.FileReader;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.PrintStream;
+import java.io.Reader;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
+import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class TrainLogisticTest extends MahoutTestCase {
- private static final Splitter ON_WHITE_SPACE =
-
Splitter.on(CharMatcher.BREAKING_WHITESPACE).trimResults().omitEmptyStrings();
-
@Test
- public void example13_1() throws IOException, NoSuchFieldException,
IllegalAccessException, InvocationTargetException, NoSuchMethodException {
+ public void example13_1() throws Exception {
String outputFile = getTestTempFile("model").getAbsolutePath();
String trainOut = runMain(TrainLogistic.class, new String[]{
@@ -78,35 +77,57 @@ public class TrainLogisticTest extends M
verifyModel(lmp, csv, data, model, expectedValues);
// test saved model
- LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(new
FileReader(outputFile));
+ Reader in = new InputStreamReader(new FileInputStream(new
File(outputFile)), Charset.forName("UTF-8"));
+ LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(in);
+ in.close();
CsvRecordFactory csvOut = lmpOut.getCsvRecordFactory();
csvOut.firstLine(data.get(0));
OnlineLogisticRegression lrOut = lmpOut.createRegression();
verifyModel(lmpOut, csvOut, data, lrOut, expectedValues);
- String output = runMain(RunLogistic.class, new String[]{"--input",
"donut.csv", "--model", outputFile, "--auc", "--confusion"});
+ String output = runMain(RunLogistic.class, new String[]{
+ "--input", "donut.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ });
assertTrue(output.contains("AUC = 0.57"));
assertTrue(output.contains("confusion: [[27.0, 13.0], [0.0, 0.0]]"));
}
@Test
- public void example13_2() throws InvocationTargetException, IOException,
NoSuchMethodException, NoSuchFieldException, IllegalAccessException {
+ public void example13_2() throws Exception {
String outputFile = getTestTempFile("model").getAbsolutePath();
String trainOut = runMain(TrainLogistic.class, new String[]{
- "--input", "donut.csv", "--output", outputFile,
- "--target", "color", "--categories", "2",
- "--predictors", "x", "y", "a", "b", "c", "--types", "numeric",
- "--features", "20", "--passes", "100", "--rate", "50"
+ "--input", "donut.csv",
+ "--output", outputFile,
+ "--target", "color",
+ "--categories", "2",
+ "--predictors", "x", "y", "a", "b", "c",
+ "--types", "numeric",
+ "--features", "20",
+ "--passes", "100",
+ "--rate", "50"
});
assertTrue(trainOut.contains("a 0."));
assertTrue(trainOut.contains("b -1."));
assertTrue(trainOut.contains("c -25."));
- String output = runMain(RunLogistic.class, new String[]{"--input",
"donut.csv", "--model", outputFile, "--auc", "--confusion"});
+ String output = runMain(RunLogistic.class, new String[]{
+ "--input", "donut.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ });
assertTrue(output.contains("AUC = 1.00"));
- String heldout = runMain(RunLogistic.class, new String[]{"--input",
"donut-test.csv", "--model", outputFile, "--auc", "--confusion"});
+ String heldout = runMain(RunLogistic.class, new String[]{
+ "--input", "donut-test.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ });
assertTrue(heldout.contains("AUC = 0.9"));
}
@@ -124,7 +145,8 @@ public class TrainLogisticTest extends M
* @throws NoSuchMethodException If there isn't a main method.
* @throws InvocationTargetException If the main method throws an
exception.
*/
- private String runMain(Class clazz, String[] args) throws IOException,
NoSuchFieldException, IllegalAccessException, NoSuchMethodException,
InvocationTargetException {
+ private static String runMain(Class<?> clazz, String[] args)
+ throws NoSuchFieldException, IllegalAccessException,
NoSuchMethodException, InvocationTargetException {
ByteArrayOutputStream trainOutput = new ByteArrayOutputStream();
PrintStream printStream = new PrintStream(trainOutput);
@@ -139,7 +161,11 @@ public class TrainLogisticTest extends M
return new String(trainOutput.toByteArray(), Charsets.UTF_8);
}
- private void verifyModel(LogisticModelParameters lmp, CsvRecordFactory csv,
List<String> data, AbstractVectorClassifier model, Map<String, Double>
expectedValues) {
+ private static void verifyModel(LogisticModelParameters lmp,
+ RecordFactory csv,
+ List<String> data,
+ AbstractVectorClassifier model,
+ Map<String, Double> expectedValues) {
ModelDissector md = new ModelDissector();
for (String line : data.subList(1, data.size())) {
Vector v = new DenseVector(lmp.getNumFeatures());
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Wed Jan 26 23:21:41 2011
@@ -18,9 +18,11 @@
package org.apache.mahout.clustering.lda;
import java.io.File;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.PrintWriter;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -233,14 +235,17 @@ public final class LDAPrintTopics {
private static void writeTopWords(List<List<String>> topWords, File output)
throws IOException {
for (int i = 0; i < topWords.size(); ++i) {
List<String> topK = topWords.get(i);
- File out = new File(output, "topic-" + i);
- PrintWriter writer = new PrintWriter(new FileWriter(out));
- writer.println("Topic " + i);
- writer.println("===========");
- for (String word : topK) {
- writer.println(word);
+ Writer writer = new OutputStreamWriter(
+ new FileOutputStream(new File(output, "topic-" + i)),
Charset.forName("UTF-8"));
+ try {
+ writer.write("Topic " + i + '\n');
+ writer.write("===========\n");
+ for (String word : topK) {
+ writer.write(word + '\n');
+ }
+ } finally {
+ writer.close();
}
- writer.close();
}
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
Wed Jan 26 23:21:41 2011
@@ -17,10 +17,12 @@
package org.apache.mahout.utils;
-import java.io.FileWriter;
+import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
@@ -86,7 +88,8 @@ public final class SequenceFileDumper {
Writer writer;
if (cmdLine.hasOption(outputOpt)) {
- writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
+ writer = new OutputStreamWriter(
+ new FileOutputStream(new
File(cmdLine.getValue(outputOpt).toString())), Charset.forName("UTF-8"));
} else {
writer = new OutputStreamWriter(System.out);
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Wed Jan 26 23:21:41 2011
@@ -18,10 +18,11 @@
package org.apache.mahout.utils.clustering;
import java.io.File;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -158,7 +159,12 @@ public final class ClusterDumper extends
}
}
- Writer writer = this.outputFile == null ? new
OutputStreamWriter(System.out) : new FileWriter(this.outputFile);
+ Writer writer;
+ if (this.outputFile == null) {
+ writer = new OutputStreamWriter(System.out);
+ } else {
+ writer = new OutputStreamWriter(new FileOutputStream(new
File(this.outputFile)), Charset.forName("UTF-8"));
+ }
try {
FileSystem fs = seqFileDir.getFileSystem(conf);
for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*")))
{
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
Wed Jan 26 23:21:41 2011
@@ -38,9 +38,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
/**
* Can read in a {@link SequenceFile} of {@link Vector}s and dump
@@ -129,9 +130,13 @@ public final class VectorDumper {
Writable valueWritable =
reader.getValueClass().asSubclass(Writable.class).newInstance();
boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
try {
- Writer writer = cmdLine.hasOption(outputOpt)
- ? new FileWriter(cmdLine.getValue(outputOpt).toString())
- : new OutputStreamWriter(System.out);
+ Writer writer;
+ if (cmdLine.hasOption(outputOpt)) {
+ writer = new OutputStreamWriter(
+ new FileOutputStream(new
File(cmdLine.getValue(outputOpt).toString())), Charset.forName("UTF-8"));
+ } else {
+ writer = new OutputStreamWriter(System.out);
+ }
try {
boolean printKey = cmdLine.hasOption(printKeyOpt);
long i = 0;
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
Wed Jan 26 23:21:41 2011
@@ -20,7 +20,6 @@ package org.apache.mahout.utils.vectors.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
@@ -61,7 +60,7 @@ public class ARFFVectorIterable implemen
private final ARFFModel model;
public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
- this(new FileReader(file), model);
+ this(file, Charset.forName("UTF-8"), model);
}
public ARFFVectorIterable(File file, Charset encoding, ARFFModel model)
throws IOException {
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
Wed Jan 26 23:21:41 2011
@@ -20,7 +20,6 @@ package org.apache.mahout.utils.vectors.
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
-import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
@@ -173,7 +172,8 @@ public final class Driver {
vectorWriter = getSeqFileWriter(outFile);
} else {
if ("file".equals(outWriter)) {
- vectorWriter = new JWriterVectorWriter(new BufferedWriter(new FileWriter(outFile)));
+ vectorWriter = new JWriterVectorWriter(
+ new OutputStreamWriter(new FileOutputStream(new File(outFile)), Charset.forName("UTF-8")));
} else {
vectorWriter = getSeqFileWriter(outFile);
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Wed Jan 26 23:21:41 2011
@@ -18,10 +18,11 @@
package org.apache.mahout.utils.vectors.lucene;
import java.io.File;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -144,7 +145,12 @@ public class ClusterLabels {
public void getLabels() throws IOException {
- Writer writer = this.output == null ? new OutputStreamWriter(System.out) : new FileWriter(this.output);
+ Writer writer;
+ if (this.output == null) {
+ writer = new OutputStreamWriter(System.out);
+ } else {
+ writer = new OutputStreamWriter(new FileOutputStream(new File(this.output)), Charset.forName("UTF-8"));
+ }
try {
for (Map.Entry<Integer, List<WeightedVectorWritable>> integerListEntry :
clusterIdToPoints.entrySet()) {
List<WeightedVectorWritable> wvws = integerListEntry.getValue();
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1063916&r1=1063915&r2=1063916&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Wed Jan 26 23:21:41 2011
@@ -17,12 +17,11 @@
package org.apache.mahout.utils.vectors.lucene;
-import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.io.Writer;
import java.nio.charset.Charset;
import org.apache.commons.cli2.CommandLine;
@@ -214,7 +213,7 @@ public final class Driver {
if (cmdLine.hasOption(outWriterOpt)) {
String outWriter = cmdLine.getValue(outWriterOpt).toString();
if ("file".equals(outWriter)) {
- BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+ Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outFile)), Charset.forName("UTF8"));
vectorWriter = new JWriterVectorWriter(writer);
} else {
vectorWriter = getSeqFileWriter(outFile);
@@ -231,8 +230,7 @@ public final class Driver {
File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
log.info("Dictionary Output file: {}", dictOutFile);
- BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+ Writer writer = new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8"));
JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer,
delimiter, field);
tiWriter.write(termInfo);
tiWriter.close();