[ https://issues.apache.org/jira/browse/NUTCH-2038?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14592945#comment-14592945 ]
ASF GitHub Bot commented on NUTCH-2038:
---------------------------------------

Github user chrismattmann commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/32#discussion_r32798921

    --- Diff: src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java ---
    @@ -0,0 +1,234 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.nutch.urlfilter.model;
    +
    +import java.io.BufferedReader;
    +import java.io.FileReader;
    +import java.io.IOException;
    +import java.io.StringReader;
    +import java.util.HashMap;
    +import java.util.Map;
    +
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.FileSystem;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.IntWritable;
    +import org.apache.hadoop.io.LongWritable;
    +import org.apache.hadoop.io.SequenceFile;
    +import org.apache.hadoop.io.SequenceFile.Writer;
    +import org.apache.hadoop.io.Text;
    +import org.apache.lucene.analysis.Analyzer;
    +import org.apache.lucene.analysis.TokenStream;
    +import org.apache.lucene.analysis.standard.StandardAnalyzer;
    +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    +import org.apache.lucene.util.Version;
    +import org.apache.mahout.classifier.naivebayes.BayesUtils;
    +import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
    +import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
    +import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
    +import org.apache.mahout.common.Pair;
    +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
    +import org.apache.mahout.math.RandomAccessSparseVector;
    +import org.apache.mahout.math.Vector;
    +import org.apache.mahout.math.Vector.Element;
    +import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
    +import org.apache.mahout.vectorizer.TFIDF;
    +
    +import com.google.common.collect.ConcurrentHashMultiset;
    +import com.google.common.collect.Multiset;
    +
    +public class NBClassifier {
    +
    +  public static Map<String, Integer> readDictionnary(Configuration conf,
    +      Path dictionnaryPath) {
    +    Map<String, Integer> dictionnary = new HashMap<String, Integer>();
    +    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(
    +        dictionnaryPath, true, conf)) {
    +      dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
    +    }
    +    return dictionnary;
    +  }
    +
    +  public static Map<Integer, Long> readDocumentFrequency(Configuration conf,
    +      Path documentFrequencyPath) {
    +    Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
    +    for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
    +        documentFrequencyPath, true, conf)) {
    +      documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
    +    }
    +    return documentFrequency;
    +  }
    +
    +  public static void createModel(String inputTrainFilePath) throws Exception {
    +
    +    String[] args1 = new String[4];
    +    args1[0] = "-i";
    +    args1[1] = "outseq";
    +    args1[2] = "-o";
    +    args1[3] = "vectors";
    +
    +    String[] args2 = new String[9];
    +    args2[0] = "-i";
    +    args2[1] = "vectors/tfidf-vectors";
    +    args2[2] = "-el";
    +    args2[3] = "-li";
    +    args2[4] = "labelindex";
    +    args2[5] = "-o";
    +    args2[6] = "model";
    +    args2[7] = "-ow";
    +    args2[8] = "-c";
    +
    +    convertToSeq(inputTrainFilePath, "outseq");
    +
    +    SparseVectorsFromSequenceFiles.main(args1);
    +
    +    TrainNaiveBayesJob.main(args2);
    +  }
    +
    +  public static String classify(String text) throws IOException {
    +    return classify(text, "model", "labelindex",
    +        "vectors/dictionary.file-0", "vectors/df-count/part-r-00000");
    +  }
    +
    +  public static String classify(String text, String modelPath,
    +      String labelIndexPath, String dictionaryPath,
    +      String documentFrequencyPath) throws IOException {
    +
    +    Configuration configuration = new Configuration();
    +
    +    // model is a matrix (wordId, labelId) => probability score
    +    NaiveBayesModel model = NaiveBayesModel.materialize(
    +        new Path(modelPath), configuration);
    +
    +    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(
    +        model);
    +
    +    // labels is a map label => classId
    +    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration,
    +        new Path(labelIndexPath));
    +    Map<String, Integer> dictionary = readDictionnary(configuration,
    +        new Path(dictionaryPath));
    +    Map<Integer, Long> documentFrequency = readDocumentFrequency(
    +        configuration, new Path(documentFrequencyPath));
    +
    +    // analyzer used to extract word from text
    +    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    +    // int labelCount = labels.size();
    +    int documentCount = documentFrequency.get(-1).intValue();
    +
    +    Multiset<String> words = ConcurrentHashMultiset.create();
    +
    +    // extract words from text
    +    TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
    +    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    +    ts.reset();
    +    int wordCount = 0;
    +    while (ts.incrementToken()) {
    +      if (termAtt.length() > 0) {
    +        String word = ts.getAttribute(CharTermAttribute.class)
    +            .toString();
    +        Integer wordId = dictionary.get(word);
    +        // if the word is not in the dictionary, skip it
    +        if (wordId != null) {
    +          words.add(word);
    +          wordCount++;
    +        }
    +      }
    +    }
    +
    +    ts.end();
    +    ts.close();
    +
    +    // create vector wordId => weight using tfidf
    +    Vector vector = new RandomAccessSparseVector(10000);
    +    TFIDF tfidf = new TFIDF();
    +    for (Multiset.Entry<String> entry : words.entrySet()) {
    +      String word = entry.getElement();
    +      int count = entry.getCount();
    +      Integer wordId = dictionary.get(word);
    +      Long freq = documentFrequency.get(wordId);
    +      double tfIdfValue = tfidf.calculate(count, freq.intValue(),
    +          wordCount, documentCount);
    +      vector.setQuick(wordId, tfIdfValue);
    +    }
    +
    +    // one score for each label
    +    Vector resultVector = classifier.classifyFull(vector);
    +    double bestScore = -Double.MAX_VALUE;
    +    int bestCategoryId = -1;
    +    for (Element element : resultVector.all()) {
    +      int categoryId = element.index();
    +      double score = element.get();
    +      if (score > bestScore) {
    +        bestScore = score;
    +        bestCategoryId = categoryId;
    +      }
    +    }
    +
    +    analyzer.close();
    +    return labels.get(bestCategoryId);
    +  }
    +
    +  static void convertToSeq(String inputFileName, String outputDirName)
    +      throws IOException {
    +    Configuration configuration = new Configuration();
    +    FileSystem fs = FileSystem.get(configuration);
    +    Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
    +        outputDirName + "/chunk-0"), Text.class, Text.class);
    +
    +    BufferedReader reader = new BufferedReader(
    +        new FileReader(inputFileName));
    +    Text key = new Text();
    +    Text value = new Text();
    +    while (true) {
    +      String line = reader.readLine();
    +      if (line == null) {
    +        break;
    +      }
    +      String[] tokens = line.split("\t", 3);
    +      if (tokens.length != 3) {
    +        // System.out.println("Skip line: " + line);
    +        continue;
    +      }
    +      String category = tokens[0];
    +      String id = tokens[1];
    +      String message = tokens[2];
    +      key.set("/" + category + "/" + id);
    +      value.set(message);
    +      writer.append(key, value);
    +    }
    +    reader.close();
    +    writer.close();
    +  }
    +
    +  public static void main(String args[]) throws Exception {
    --- End diff --

+1
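For readers following the patch, a minimal driver for the class above might look like the sketch below. This is hedged: the training-file name and sample text are hypothetical, while "model", "labelindex" and the "vectors/..." paths are the defaults hard-coded in createModel() and classify().

    // Hypothetical driver for NBClassifier (not part of the patch).
    // Assumes a tab-separated training file with one example per line,
    //   <category>\t<id>\t<text>
    // which is the format convertToSeq() parses.
    public class NBClassifierExample {
      public static void main(String[] args) throws Exception {
        // Train: convert the TSV to a SequenceFile, build TF-IDF vectors
        // with Mahout, then train the Naive Bayes model.
        NBClassifier.createModel("train.tsv");

        // Classify: tokenize the text, weight in-dictionary terms with
        // TF-IDF, and return the label with the highest Bayes score.
        String label = NBClassifier.classify("some page text to classify");
        System.out.println("Predicted label: " + label);
      }
    }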

> Naive Bayes classifier based url filter
> ----------------------------------------
>
>                 Key: NUTCH-2038
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2038
>             Project: Nutch
>          Issue Type: New Feature
>          Components: fetcher, injector, parser
>            Reporter: Asitang Mishra
>            Assignee: Chris A. Mattmann
>              Labels: memex, nutch
>             Fix For: 1.11
>
>
> A URL filter that, after the parsing stage, drops the URLs coming from pages
> the classifier marks as irrelevant, keeping only those URLs that contain
> "hot words" supplied in a list.
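As a rough illustration of how the classifier could back the filter this issue describes, here is a minimal sketch assuming Nutch's standard org.apache.nutch.net.URLFilter extension point. The class name, the "relevant" label, and the choice to classify the URL string itself (a URLFilter never sees the parsed page text) are illustrative assumptions, not the actual patch.

    package org.apache.nutch.urlfilter.model;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.net.URLFilter;

    // Illustrative sketch only: wires NBClassifier into the URLFilter
    // extension point. Returning null tells Nutch to drop the URL;
    // returning the URL unchanged keeps it.
    public class ModelURLFilter implements URLFilter {

      private Configuration conf;

      @Override
      public String filter(String urlString) {
        try {
          // "relevant" is an assumed label name from the training data.
          String label = NBClassifier.classify(urlString);
          return "relevant".equals(label) ? urlString : null;
        } catch (IOException e) {
          // If the model cannot be read, err on the side of keeping
          // the URL rather than silently dropping it.
          return urlString;
        }
      }

      @Override
      public void setConf(Configuration conf) {
        this.conf = conf;
      }

      @Override
      public Configuration getConf() {
        return conf;
      }
    }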