Repository: mahout Updated Branches: refs/heads/master a551b15e2 -> 670a7d219
MAHOUT-1649: Upgrade to Lucene 4.10.x, this closes apache/mahout#114 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/670a7d21 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/670a7d21 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/670a7d21 Branch: refs/heads/master Commit: 670a7d219e4eab8c7735083d52cefa13e81197fb Parents: a551b15 Author: Suneel Marthi <[email protected]> Authored: Mon Apr 6 22:25:12 2015 -0400 Committer: Suneel Marthi <[email protected]> Committed: Mon Apr 6 22:25:12 2015 -0400 ---------------------------------------------------------------------- .../mahout/classifier/NewsgroupHelper.java | 2 +- integration/pom.xml | 10 + .../mahout/text/LuceneSegmentInputFormat.java | 4 +- .../mahout/text/LuceneSegmentInputSplit.java | 4 +- .../mahout/text/LuceneSegmentRecordReader.java | 3 +- .../mahout/text/LuceneStorageConfiguration.java | 4 +- .../text/MailArchivesClusteringAnalyzer.java | 22 +- .../text/ReadOnlyFileSystemDirectory.java | 354 ------------------- .../text/SequenceFilesFromLuceneStorage.java | 1 - .../SequenceFilesFromLuceneStorageDriver.java | 3 +- .../SequenceFilesFromMailArchivesMapper.java | 29 +- .../text/wikipedia/WikipediaAnalyzer.java | 10 +- .../mahout/utils/regex/AnalyzerTransformer.java | 2 +- .../mahout/common/lucene/AnalyzerUtils.java | 4 +- .../encoders/InteractionValueEncoder.java | 6 +- .../mahout/classifier/ConfusionMatrixTest.java | 4 +- .../classifier/df/DecisionForestTest.java | 1 - .../apache/mahout/classifier/df/data/Utils.java | 10 +- .../mapreduce/partial/PartialBuilderTest.java | 16 +- .../classifier/mlp/TestNeuralNetwork.java | 11 +- .../classifier/naivebayes/NaiveBayesTest.java | 17 +- .../encoders/TextValueEncoderTest.java | 2 +- pom.xml | 4 +- 23 files changed, 75 insertions(+), 448 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java index 3674a57..2c857cc 100644 --- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java +++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java @@ -60,7 +60,7 @@ public final class NewsgroupHelper { private static final long WEEK = 7 * 24 * 3600; private final Random rand = RandomUtils.getRandom(); - private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); + private final Analyzer analyzer = new StandardAnalyzer(); private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body"); private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept"); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/pom.xml ---------------------------------------------------------------------- diff --git a/integration/pom.xml b/integration/pom.xml index fcb85cb..9dcc03a 100644 --- a/integration/pom.xml +++ b/integration/pom.xml @@ -139,6 +139,16 @@ </dependency> <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-core</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <version>3.1</version> + </dependency> + <dependency> <groupId>org.mongodb</groupId> <artifactId>mongo-java-driver</artifactId> <version>2.11.2</version> http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java index 1c4f8de..60d48ce 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java @@ -32,6 +32,7 @@ import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; +import org.apache.solr.store.hdfs.HdfsDirectory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,8 +53,7 @@ public class LuceneSegmentInputFormat extends InputFormat { List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths(); for (Path indexPath : indexPaths) { - ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, - false, configuration); + HdfsDirectory directory = new HdfsDirectory(indexPath, configuration); SegmentInfos segmentInfos = new SegmentInfos(); segmentInfos.read(directory); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java index 1441e32..f30c7fb 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java @@ -24,6 +24,7 @@ import org.apache.hadoop.mapreduce.InputSplit; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentInfos; +import org.apache.solr.store.hdfs.HdfsDirectory; import java.io.DataInput; import java.io.DataOutput; @@ -88,8 +89,7 @@ public class LuceneSegmentInputSplit extends InputSplit implements Writable { * @throws IOException if an error occurs when accessing the directory */ public SegmentCommitInfo getSegment(Configuration configuration) throws IOException { - ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, - false, configuration); + HdfsDirectory directory = new HdfsDirectory(indexPath, configuration); SegmentInfos segmentInfos = new SegmentInfos(); segmentInfos.read(directory); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java index 485e856..d41ead2 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java @@ -62,9 +62,8 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable> for (String field : lucene2SeqConfiguration.getFields()) { LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, field); } - Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher); - scorer = weight.scorer(segmentReader.getContext(), false, false, null); + scorer = weight.scorer(segmentReader.getContext(), segmentReader.getLiveDocs()); if (scorer == null) { throw new IllegalArgumentException("Could not create query scorer for query: " + lucene2SeqConfiguration.getQuery()); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java index b36f3e9..7eed822 100644 --- a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java +++ b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java @@ -40,12 +40,12 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.Version; import org.apache.mahout.common.Pair; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import static org.apache.lucene.util.Version.LUCENE_46; /** * Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file @@ -213,7 +213,7 @@ public class LuceneStorageConfiguration implements Writable { } idField = in.readUTF(); fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS)); - query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF()); + query = new QueryParser(Version.LUCENE_4_10_3, "query", new StandardAnalyzer(Version.LUCENE_4_10_3)).parse(in.readUTF()); maxHits = in.readInt(); } catch (ParseException e) { throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java index 8776c5f..4f6ba78 100644 --- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java @@ -21,7 +21,6 @@ import java.io.Reader; import java.util.Arrays; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -42,13 +41,13 @@ import org.apache.lucene.util.Version; * stop words, excluding non-alpha-numeric tokens, and porter stemming. */ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase { - private static final Version LUCENE_VERSION = Version.LUCENE_46; - + private static final Version LUCENE_VERSION = Version.LUCENE_4_10_3; + // extended set of stop words composed of common mail terms like "hi", // HTML tags, and Java keywords asmany of the messages in the archives // are subversion check-in notifications - private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList( + private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList( "3d","7bit","a0","about","above","abstract","across","additional","after", "afterwards","again","against","align","all","almost","alone","along", "already","also","although","always","am","among","amongst","amoungst", @@ -108,22 +107,17 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase { private static final Matcher MATCHER = ALPHA_NUMERIC.matcher(""); public MailArchivesClusteringAnalyzer() { - super(LUCENE_VERSION, STOP_SET); + super(STOP_SET); } - public MailArchivesClusteringAnalyzer(CharArraySet stopSet) { - super(LUCENE_VERSION, stopSet); - - } - @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader); - TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer); - result = new LowerCaseFilter(LUCENE_VERSION, result); + Tokenizer tokenizer = new StandardTokenizer(reader); + TokenStream result = new StandardFilter(tokenizer); + result = new LowerCaseFilter(result); result = new ASCIIFoldingFilter(result); result = new AlphaNumericMaxLengthFilter(result); - result = new StopFilter(LUCENE_VERSION, result, STOP_SET); + result = new StopFilter(result, STOP_SET); result = new PorterStemFilter(result); return new TokenStreamComponents(tokenizer, result); } http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java deleted file mode 100644 index e97e35b..0000000 --- a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java +++ /dev/null @@ -1,354 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.BaseDirectory; -import org.apache.lucene.store.BufferedIndexInput; -import org.apache.lucene.store.BufferedIndexOutput; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.Lock; -import org.apache.lucene.store.LockFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; - -//TODO: is there a better way of doing this in Lucene 4.x? - -/** - * This class implements a read-only Lucene Directory on top of a general FileSystem. - * Currently it does not support locking. - * <p/> - * // TODO: Rename to FileSystemReadOnlyDirectory - */ -public class ReadOnlyFileSystemDirectory extends BaseDirectory { - - private final FileSystem fs; - private final Path directory; - private final int ioFileBufferSize; - - private static final Logger log = LoggerFactory.getLogger(ReadOnlyFileSystemDirectory.class); - - /** - * Constructor - * - * @param fs - filesystem - * @param directory - directory path - * @param create - if true create the directory - * @param conf - MR Job Configuration - * @throws IOException - */ - - public ReadOnlyFileSystemDirectory(FileSystem fs, Path directory, boolean create, - Configuration conf) throws IOException { - - this.fs = fs; - this.directory = directory; - this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096); - - if (create) { - create(); - } - - boolean isDir = false; - try { - FileStatus status = fs.getFileStatus(directory); - if (status != null) { - isDir = status.isDir(); - } - } catch (IOException e) { - log.error(e.getMessage(), e); - } - if (!isDir) { - throw new IOException(directory + " is not a directory"); - } - } - - - private void create() throws IOException { - if (!fs.exists(directory)) { - fs.mkdirs(directory); - } - - boolean isDir = false; - try { - FileStatus status = fs.getFileStatus(directory); - if (status != null) { - isDir = status.isDir(); - } - } catch (IOException e) { - log.error(e.getMessage(), e); - } - if (!isDir) { - throw new IOException(directory + " is not a directory"); - } - - // clear old index files - FileStatus[] fileStatus = - fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); - for (FileStatus status : fileStatus) { - if (!fs.delete(status.getPath(), true)) { - throw new IOException("Cannot delete index file " - + status.getPath()); - } - } - } - - public String[] list() throws IOException { - FileStatus[] fileStatus = - fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); - String[] result = new String[fileStatus.length]; - for (int i = 0; i < fileStatus.length; i++) { - result[i] = fileStatus[i].getPath().getName(); - } - return result; - } - - @Override - public String[] listAll() throws IOException { - return list(); - } - - @Override - public boolean fileExists(String name) throws IOException { - return fs.exists(new Path(directory, name)); - } - - @Override - public long fileLength(String name) throws IOException { - return fs.getFileStatus(new Path(directory, name)).getLen(); - } - - @Override - public void deleteFile(String name) throws IOException { - if (!fs.delete(new Path(directory, name), true)) { - throw new IOException("Cannot delete index file " + name); - } - } - - @Override - public IndexOutput createOutput(String name, IOContext context) throws IOException { - //TODO: What should we be doing with the IOContext here, if anything? - Path file = new Path(directory, name); - if (fs.exists(file) && !fs.delete(file, true)) { - // delete the existing one if applicable - throw new IOException("Cannot overwrite index file " + file); - } - - return new FileSystemIndexOutput(file, ioFileBufferSize); - } - - @Override - public void sync(Collection<String> names) throws IOException { - // do nothing, as this is read-only - } - - @Override - public IndexInput openInput(String name, IOContext context) throws IOException { - return new FileSystemIndexInput(new Path(directory, name), ioFileBufferSize); - } - - @Override - public Lock makeLock(final String name) { - return new Lock() { - public boolean obtain() { - return true; - } - - public void release() { - } - - public boolean isLocked() { - throw new UnsupportedOperationException(); - } - - public String toString() { - return "Lock@" + new Path(directory, name); - } - }; - } - - @Override - public void clearLock(String name) throws IOException { - // do nothing - } - - @Override - public void close() throws IOException { - // do not close the file system - } - - @Override - public void setLockFactory(LockFactory lockFactory) throws IOException { - // do nothing - } - - @Override - public LockFactory getLockFactory() { - return null; - } - - @Override - public String toString() { - return this.getClass().getName() + "@" + directory; - } - - private class FileSystemIndexInput extends BufferedIndexInput implements Cloneable { - - // shared by clones - private class Descriptor { - public final FSDataInputStream in; - public long position; // cache of in.getPos() - - public Descriptor(Path file, int ioFileBufferSize) throws IOException { - this.in = fs.open(file, ioFileBufferSize); - } - } - - private final Path filePath; // for debugging - private final Descriptor descriptor; - private final long length; - private boolean isOpen; - private boolean isClone; - - public FileSystemIndexInput(Path path, int ioFileBufferSize) - throws IOException { - super("FSII_" + path.getName(), ioFileBufferSize); - filePath = path; - descriptor = new Descriptor(path, ioFileBufferSize); - length = fs.getFileStatus(path).getLen(); - isOpen = true; - } - - @Override - protected void readInternal(byte[] b, int offset, int len) - throws IOException { - long position = getFilePointer(); - if (position != descriptor.position) { - descriptor.in.seek(position); - descriptor.position = position; - } - int total = 0; - do { - int i = descriptor.in.read(b, offset + total, len - total); - if (i == -1) { - throw new IOException("Read past EOF"); - } - descriptor.position += i; - total += i; - } while (total < len); - } - - @Override - public void close() throws IOException { - if (!isClone) { - if (isOpen) { - descriptor.in.close(); - isOpen = false; - } else { - throw new IOException("Index file " + filePath + " already closed"); - } - } - } - - @Override - protected void seekInternal(long position) { - // handled in readInternal() - } - - @Override - public long length() { - return length; - } - - @Override - protected void finalize() throws Throwable { - super.finalize(); - if (!isClone && isOpen) { - close(); // close the file - } - } - - @Override - public BufferedIndexInput clone() { - FileSystemIndexInput clone = (FileSystemIndexInput) super.clone(); - clone.isClone = true; - return clone; - } - } - - private class FileSystemIndexOutput extends BufferedIndexOutput { - - private final Path filePath; // for debugging - private final FSDataOutputStream out; - private boolean isOpen; - - public FileSystemIndexOutput(Path path, int ioFileBufferSize) - throws IOException { - filePath = path; - // overwrite is true by default - out = fs.create(path, true, ioFileBufferSize); - isOpen = true; - } - - @Override - public void flushBuffer(byte[] b, int offset, int size) throws IOException { - out.write(b, offset, size); - } - - @Override - public void close() throws IOException { - if (isOpen) { - super.close(); - out.close(); - isOpen = false; - } else { - throw new IOException("Index file " + filePath + " already closed"); - } - } - - @Override - public void seek(long pos) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long length() throws IOException { - return out.getPos(); - } - - @Override - protected void finalize() throws Throwable { - super.finalize(); - if (isOpen) { - close(); // close the file - } - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java index b7fd495..4906d3a 100644 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java +++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java @@ -82,7 +82,6 @@ public class SequenceFilesFromLuceneStorage { processedDocs = writerCollector.processedDocs; Closeables.close(sequenceFileWriter, false); directory.close(); - //searcher.close(); reader.close(); } } http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java index 1bd3f3e..4de372f 100644 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java +++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java @@ -96,8 +96,7 @@ public class SequenceFilesFromLuceneStorageDriver extends AbstractJob { if (hasOption(OPTION_QUERY)) { try { String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll(""); - QueryParser queryParser = new QueryParser(Version.LUCENE_46, queryString, - new StandardAnalyzer(Version.LUCENE_46)); + QueryParser queryParser = new QueryParser(queryString, new StandardAnalyzer()); query = queryParser.parse(queryString); } catch (ParseException e) { throw new IllegalArgumentException(e.getMessage(), e); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java index 203e8fb..07226d3 100644 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java +++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java @@ -17,9 +17,21 @@ package org.apache.mahout.text; +import java.io.ByteArrayInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import com.google.common.base.Joiner; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; @@ -32,17 +44,6 @@ import org.apache.mahout.common.iterator.FileLineIterable; import org.apache.mahout.utils.email.MailOptions; import org.apache.mahout.utils.email.MailProcessor; -import java.io.ByteArrayInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.Charset; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_OPTION; import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_SEPARATOR_OPTION; import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHARSET_OPTION; @@ -93,13 +94,13 @@ public class SequenceFilesFromMailArchivesMapper extends Mapper<IntWritable, Byt options.setCharset(charset); } - List<Pattern> patterns = Lists.newArrayListWithCapacity(5); + List<Pattern> patterns = new ArrayList<>(5); // patternOrder is used downstream so that we can know what order the // text is in instead // of encoding it in the string, which // would require more processing later to remove it pre feature // selection. - Map<String, Integer> patternOrder = Maps.newHashMap(); + Map<String, Integer> patternOrder = new HashMap<>(); int order = 0; if (!configuration.get(FROM_OPTION[1], "").equals("")) { patterns.add(MailProcessor.FROM_PREFIX); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java index ad55ba7..eae3d6d 100644 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java +++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java @@ -34,19 +34,19 @@ import org.apache.lucene.util.Version; public class WikipediaAnalyzer extends StopwordAnalyzerBase { public WikipediaAnalyzer() { - super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + super(StopAnalyzer.ENGLISH_STOP_WORDS_SET); } public WikipediaAnalyzer(CharArraySet stopSet) { - super(Version.LUCENE_46, stopSet); + super(stopSet); } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new WikipediaTokenizer(reader); - TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer); - result = new LowerCaseFilter(Version.LUCENE_46, result); - result = new StopFilter(Version.LUCENE_46, result, getStopwordSet()); + TokenStream result = new StandardFilter(tokenizer); + result = new LowerCaseFilter(result); + result = new StopFilter(result, getStopwordSet()); return new TokenStreamComponents(tokenizer, result); } } http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java index 36b166a..16623c9 100644 --- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java +++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java @@ -37,7 +37,7 @@ public class AnalyzerTransformer implements RegexTransformer { private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class); public AnalyzerTransformer() { - this(new StandardAnalyzer(Version.LUCENE_46), "text"); + this(new StandardAnalyzer()); } public AnalyzerTransformer(Analyzer analyzer) { http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java index 37ca383..cfaac07 100644 --- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java +++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java @@ -32,7 +32,7 @@ public final class AnalyzerUtils { * @throws ClassNotFoundException - {@link ClassNotFoundException} */ public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException { - return createAnalyzer(analyzerClassName, Version.LUCENE_46); + return createAnalyzer(analyzerClassName, Version.LUCENE_4_10_3); } public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException { @@ -47,7 +47,7 @@ public final class AnalyzerUtils { * @return {@link Analyzer} */ public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) { - return createAnalyzer(analyzerClass, Version.LUCENE_46); + return createAnalyzer(analyzerClass, Version.LUCENE_4_10_3); } public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) { http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java ---------------------------------------------------------------------- diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java index 0be8823..e0f6ce1 100644 --- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java +++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java @@ -18,11 +18,9 @@ package org.apache.mahout.vectorizer.encoders; import java.util.Locale; - +import org.apache.commons.io.Charsets; import org.apache.mahout.math.Vector; -import com.google.common.base.Charsets; - public class InteractionValueEncoder extends FeatureVectorEncoder { private final FeatureVectorEncoder firstEncoder; private final FeatureVectorEncoder secondEncoder; @@ -88,7 +86,7 @@ public class InteractionValueEncoder extends FeatureVectorEncoder { int n = (k + j) % data.size(); if (isTraceEnabled()) { trace(String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2, - Charsets.UTF_8)), n); + Charsets.UTF_8)), n); } data.set(n, data.get(n) + w); } http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/ConfusionMatrixTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/ConfusionMatrixTest.java b/mr/src/test/java/org/apache/mahout/classifier/ConfusionMatrixTest.java index 3ffff85..8edc99b 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/ConfusionMatrixTest.java +++ b/mr/src/test/java/org/apache/mahout/classifier/ConfusionMatrixTest.java @@ -17,11 +17,11 @@ package org.apache.mahout.classifier; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Map; -import com.google.common.collect.Lists; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.math.Matrix; import org.junit.Test; @@ -102,7 +102,7 @@ public final class ConfusionMatrixTest extends MahoutTestCase { } private static ConfusionMatrix fillConfusionMatrix(int[][] values, String[] labels, String defaultLabel) { - Collection<String> labelList = Lists.newArrayList(); + Collection<String> labelList = new ArrayList<>(); labelList.add(labels[0]); labelList.add(labels[1]); ConfusionMatrix confusionMatrix = new ConfusionMatrix(labelList, defaultLabel); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/df/DecisionForestTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/df/DecisionForestTest.java b/mr/src/test/java/org/apache/mahout/classifier/df/DecisionForestTest.java index f1ec07f..d7ab09c 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/df/DecisionForestTest.java +++ b/mr/src/test/java/org/apache/mahout/classifier/df/DecisionForestTest.java @@ -143,7 +143,6 @@ public final class DecisionForestTest extends MahoutTestCase { Data testData = DataLoader.loadData(dataset, TEST_DATA); double noValue = dataset.valueOf(4, "no"); - double yesValue = dataset.valueOf(4, "yes"); assertEquals(noValue, forest.classify(testData.getDataset(), rng, testData.get(0)), EPSILON); // This one is tie-broken -- 1 is OK too //assertEquals(yesValue, forest.classify(testData.getDataset(), rng, testData.get(1)), EPSILON); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/df/data/Utils.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/df/data/Utils.java b/mr/src/test/java/org/apache/mahout/classifier/df/data/Utils.java index 1cf8b6a..db62d85 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/df/data/Utils.java +++ b/mr/src/test/java/org/apache/mahout/classifier/df/data/Utils.java @@ -23,9 +23,8 @@ import java.io.IOException; import java.util.Arrays; import java.util.Random; -import com.google.common.base.Charsets; -import com.google.common.io.Closeables; import com.google.common.io.Files; +import org.apache.commons.io.Charsets; import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -234,17 +233,12 @@ public final class Utils { } private static void writeDataToFile(String[] sData, Path path) throws IOException { - BufferedWriter output = null; - try { - output = Files.newWriter(new File(path.toString()), Charsets.UTF_8); + try (BufferedWriter output = Files.newWriter(new File(path.toString()), Charsets.UTF_8)){ for (String line : sData) { output.write(line); output.write('\n'); } - } finally { - Closeables.close(output, false); } - } public static Path writeDataToTestFile(String[] sData) throws IOException { http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilderTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilderTest.java b/mr/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilderTest.java index 3903c33..e41071c 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilderTest.java +++ b/mr/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilderTest.java @@ -18,25 +18,24 @@ package org.apache.mahout.classifier.df.mapreduce.partial; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Random; -import com.google.common.collect.Lists; -import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.Writer; import org.apache.hadoop.mapreduce.Job; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.RandomUtils; import org.apache.mahout.classifier.df.builder.DefaultTreeBuilder; import org.apache.mahout.classifier.df.builder.TreeBuilder; import org.apache.mahout.classifier.df.mapreduce.MapredOutput; import org.apache.mahout.classifier.df.node.Leaf; import org.apache.mahout.classifier.df.node.Node; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.common.RandomUtils; import org.junit.Test; public final class PartialBuilderTest extends MahoutTestCase { @@ -66,15 +65,10 @@ public final class PartialBuilderTest extends MahoutTestCase { FileSystem fs = base.getFileSystem(conf); Path outputFile = new Path(base, "PartialBuilderTest.seq"); - Writer writer = SequenceFile.createWriter(fs, conf, outputFile, - TreeID.class, MapredOutput.class); - - try { + try (Writer writer = SequenceFile.createWriter(fs, conf, outputFile, TreeID.class, MapredOutput.class)){ for (int index = 0; index < NUM_TREES; index++) { writer.append(keys[index], values[index]); } - } finally { - Closeables.close(writer, false); } // load the output and make sure its valid @@ -116,7 +110,7 @@ public final class PartialBuilderTest extends MahoutTestCase { private static void randomKeyValues(Random rng, TreeID[] keys, MapredOutput[] values, int[] firstIds) { int index = 0; int firstId = 0; - Collection<Integer> partitions = Lists.newArrayList(); + Collection<Integer> partitions = new ArrayList<>(); for (int p = 0; p < NUM_MAPS; p++) { // select a random partition, not yet selected http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/mlp/TestNeuralNetwork.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/mlp/TestNeuralNetwork.java b/mr/src/test/java/org/apache/mahout/classifier/mlp/TestNeuralNetwork.java index ebe5424..917bf1a 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/mlp/TestNeuralNetwork.java +++ b/mr/src/test/java/org/apache/mahout/classifier/mlp/TestNeuralNetwork.java @@ -19,11 +19,14 @@ package org.apache.mahout.classifier.mlp; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import com.google.common.io.Files; import org.apache.commons.csv.CSVUtils; +import org.apache.commons.io.Charsets; import org.apache.mahout.classifier.mlp.NeuralNetwork.TrainingMethod; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.math.DenseMatrix; @@ -32,10 +35,6 @@ import org.apache.mahout.math.Matrix; import org.apache.mahout.math.Vector; import org.junit.Test; -import com.google.common.base.Charsets; -import com.google.common.collect.Lists; -import com.google.common.io.Files; - /** Test the functionality of {@link NeuralNetwork}. */ public class TestNeuralNetwork extends MahoutTestCase { @@ -218,7 +217,7 @@ public class TestNeuralNetwork extends MahoutTestCase { File cancerDataset = getTestTempFile("cancer.csv"); writeLines(cancerDataset, Datasets.CANCER); - List<Vector> records = Lists.newArrayList(); + List<Vector> records = new ArrayList<>(); // Returns a mutable list of the data List<String> cancerDataSetList = Files.readLines(cancerDataset, Charsets.UTF_8); // Skip the header line, hence remove the first element in the list @@ -272,7 +271,7 @@ public class TestNeuralNetwork extends MahoutTestCase { writeLines(irisDataset, Datasets.IRIS); int numOfClasses = 3; - List<Vector> records = Lists.newArrayList(); + List<Vector> records = new ArrayList<>(); // Returns a mutable list of the data List<String> irisDataSetList = Files.readLines(irisDataset, Charsets.UTF_8); // Skip the header line, hence remove the first element in the list http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/classifier/naivebayes/NaiveBayesTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/classifier/naivebayes/NaiveBayesTest.java b/mr/src/test/java/org/apache/mahout/classifier/naivebayes/NaiveBayesTest.java index abd666e..b0672bf 100644 --- a/mr/src/test/java/org/apache/mahout/classifier/naivebayes/NaiveBayesTest.java +++ b/mr/src/test/java/org/apache/mahout/classifier/naivebayes/NaiveBayesTest.java @@ -19,7 +19,6 @@ package org.apache.mahout.classifier.naivebayes; import java.io.File; -import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -65,10 +64,8 @@ public class NaiveBayesTest extends MahoutTestCase { outputDir.delete(); tempDir = getTestTempDir("tmp"); - SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, - new Path(inputFile.getAbsolutePath()), Text.class, VectorWritable.class); - - try { + try (SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, + new Path(inputFile.getAbsolutePath()), Text.class, VectorWritable.class)) { writer.append(LABEL_STOLEN, trainingInstance(COLOR_RED, TYPE_SPORTS, ORIGIN_DOMESTIC)); writer.append(LABEL_NOT_STOLEN, trainingInstance(COLOR_RED, TYPE_SPORTS, ORIGIN_DOMESTIC)); writer.append(LABEL_STOLEN, trainingInstance(COLOR_RED, TYPE_SPORTS, ORIGIN_DOMESTIC)); @@ -79,8 +76,6 @@ public class NaiveBayesTest extends MahoutTestCase { writer.append(LABEL_NOT_STOLEN, trainingInstance(COLOR_YELLOW, TYPE_SUV, ORIGIN_DOMESTIC)); writer.append(LABEL_NOT_STOLEN, trainingInstance(COLOR_RED, TYPE_SUV, ORIGIN_IMPORTED)); writer.append(LABEL_STOLEN, trainingInstance(COLOR_RED, TYPE_SPORTS, ORIGIN_IMPORTED)); - } finally { - Closeables.close(writer, false); } } @@ -88,8 +83,8 @@ public class NaiveBayesTest extends MahoutTestCase { public void toyData() throws Exception { TrainNaiveBayesJob trainNaiveBayes = new TrainNaiveBayesJob(); trainNaiveBayes.setConf(conf); - trainNaiveBayes.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(), - "--tempDir", tempDir.getAbsolutePath() }); + trainNaiveBayes.run(new String[]{"--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(), + "--tempDir", tempDir.getAbsolutePath()}); NaiveBayesModel naiveBayesModel = NaiveBayesModel.materialize(new Path(outputDir.getAbsolutePath()), conf); @@ -107,9 +102,9 @@ public class NaiveBayesTest extends MahoutTestCase { public void toyDataComplementary() throws Exception { TrainNaiveBayesJob trainNaiveBayes = new TrainNaiveBayesJob(); trainNaiveBayes.setConf(conf); - trainNaiveBayes.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(), + trainNaiveBayes.run(new String[]{"--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(), "--trainComplementary", - "--tempDir", tempDir.getAbsolutePath() }); + "--tempDir", tempDir.getAbsolutePath()}); NaiveBayesModel naiveBayesModel = NaiveBayesModel.materialize(new Path(outputDir.getAbsolutePath()), conf); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java index 4446fef..3b7c93e 100644 --- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java +++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java @@ -70,7 +70,7 @@ public final class TextValueEncoderTest extends MahoutTestCase { @Test public void testLuceneEncoding() throws Exception { LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text"); - enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46)); + enc.setAnalyzer(new WhitespaceAnalyzer()); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); http://git-wip-us.apache.org/repos/asf/mahout/blob/670a7d21/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 80b4a20..ab1734d 100644 --- a/pom.xml +++ b/pom.xml @@ -115,8 +115,8 @@ <mfindbugs.version>2.5.2</mfindbugs.version> <mjavadoc.version>2.9.1</mjavadoc.version> <hbase.version>1.0.0</hbase.version> - <lucene.version>4.6.1</lucene.version> - <slf4j.version>1.7.10</slf4j.version> + <lucene.version>4.10.3</lucene.version> + <slf4j.version>1.7.12</slf4j.version> <scala.compat.version>2.10</scala.compat.version> <scala.version>2.10.4</scala.version> <spark.version>1.1.1</spark.version>
