[ https://issues.apache.org/jira/browse/NUTCH-2038?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14592945#comment-14592945 ]

ASF GitHub Bot commented on NUTCH-2038:
---------------------------------------

Github user chrismattmann commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/32#discussion_r32798921
  
    --- Diff: src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java ---
    @@ -0,0 +1,234 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.nutch.urlfilter.model;
    +
    +import java.io.BufferedReader;
    +import java.io.FileReader;
    +import java.io.IOException;
    +import java.io.StringReader;
    +import java.util.HashMap;
    +import java.util.Map;
    +
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.FileSystem;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.IntWritable;
    +import org.apache.hadoop.io.LongWritable;
    +import org.apache.hadoop.io.SequenceFile;
    +import org.apache.hadoop.io.SequenceFile.Writer;
    +import org.apache.hadoop.io.Text;
    +import org.apache.lucene.analysis.Analyzer;
    +import org.apache.lucene.analysis.TokenStream;
    +import org.apache.lucene.analysis.standard.StandardAnalyzer;
    +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    +import org.apache.lucene.util.Version;
    +import org.apache.mahout.classifier.naivebayes.BayesUtils;
    +import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
    +import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
    +import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
    +import org.apache.mahout.common.Pair;
    +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
    +import org.apache.mahout.math.RandomAccessSparseVector;
    +import org.apache.mahout.math.Vector;
    +import org.apache.mahout.math.Vector.Element;
    +import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
    +import org.apache.mahout.vectorizer.TFIDF;
    +
    +import com.google.common.collect.ConcurrentHashMultiset;
    +import com.google.common.collect.Multiset;
    +
    +public class NBClassifier {
    +
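    +   // Load the term -> term id dictionary written by Mahout's seq2sparse job.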
    +   public static Map<String, Integer> readDictionnary(Configuration conf,
    +                   Path dictionnaryPath) {
    +           Map<String, Integer> dictionnary = new HashMap<String, Integer>();
    +           for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(
    +                           dictionnaryPath, true, conf)) {
    +                   dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
    +           }
    +           return dictionnary;
    +   }
    +
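    +   // Load term id -> document frequency counts; Mahout stores the total
    +   // document count under the special key -1 (used below in classify()).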
    +   public static Map<Integer, Long> readDocumentFrequency(Configuration conf,
    +                   Path documentFrequencyPath) {
    +           Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
    +           for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
    +                           documentFrequencyPath, true, conf)) {
    +                   documentFrequency
    +                                   .put(pair.getFirst().get(), pair.getSecond().get());
    +           }
    +           return documentFrequency;
    +   }
    +
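    +   // Train a model: convert the tab-separated training file to a SequenceFile,
    +   // vectorize it with seq2sparse (TF-IDF), then run the Mahout Naive Bayes
    +   // trainer. Outputs land in "outseq", "vectors", "labelindex" and "model"
    +   // under the working directory.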
    +   public static void createModel(String inputTrainFilePath) throws Exception {
    +
    +           String[] args1 = new String[4];
    +
    +           args1[0] = "-i";
    +           args1[1] = "outseq";
    +           args1[2] = "-o";
    +           args1[3] = "vectors";
    +
    +           String[] args2 = new String[9];
    +
    +           args2[0] = "-i";
    +           args2[1] = "vectors/tfidf-vectors";
    +           args2[2] = "-el";
    +           args2[3] = "-li";
    +           args2[4] = "labelindex";
    +           args2[5] = "-o";
    +           args2[6] = "model";
    +           args2[7] = "-ow";
    +           args2[8] = "-c";
    +
    +           convertToSeq(inputTrainFilePath, "outseq");
    +
    +           SparseVectorsFromSequenceFiles.main(args1);
    +
    +           TrainNaiveBayesJob.main(args2);
    +   }
    +
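    +   // Convenience overload using the default output locations written by createModel().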
    +   public static String classify(String text) throws IOException {
    +           return classify(text, "model", "labelindex",
    +                           "vectors/dictionary.file-0", "vectors/df-count/part-r-00000");
    +   }
    +
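    +   // Classify a piece of text against a trained model and return the
    +   // highest-scoring label.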
    +   public static String classify(String text, String modelPath,
    +                   String labelIndexPath, String dictionaryPath,
    +                   String documentFrequencyPath) throws IOException {
    +
    +           Configuration configuration = new Configuration();
    +
    +           // model is a matrix (wordId, labelId) => probability score
    +           NaiveBayesModel model = NaiveBayesModel.materialize(
    +                           new Path(modelPath), configuration);
    +
    +           StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(
    +                           model);
    +
    +           // labels is a map label => classId
    +           Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration,
    +                           new Path(labelIndexPath));
    +           Map<String, Integer> dictionary = readDictionnary(configuration,
    +                           new Path(dictionaryPath));
    +           Map<Integer, Long> documentFrequency = readDocumentFrequency(
    +                           configuration, new Path(documentFrequencyPath));
    +
    +           // analyzer used to extract word from text
    +           Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    +           // int labelCount = labels.size();
    +           int documentCount = documentFrequency.get(-1).intValue();
    +
    +           Multiset<String> words = ConcurrentHashMultiset.create();
    +
    +           // extract words from text
    +           TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
    +           CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    +           ts.reset();
    +           int wordCount = 0;
    +           while (ts.incrementToken()) {
    +                   if (termAtt.length() > 0) {
    +                           String word = ts.getAttribute(CharTermAttribute.class)
    +                                           .toString();
    +                           Integer wordId = dictionary.get(word);
    +                           // if the word is not in the dictionary, skip it
    +                           if (wordId != null) {
    +                                   words.add(word);
    +                                   wordCount++;
    +                           }
    +                   }
    +           }
    +
    +           ts.end();
    +           ts.close();
    +           // create vector wordId => weight using tfidf
    +           Vector vector = new RandomAccessSparseVector(10000);
    +           TFIDF tfidf = new TFIDF();
    +           for (Multiset.Entry<String> entry : words.entrySet()) {
    +                   String word = entry.getElement();
    +                   int count = entry.getCount();
    +                   Integer wordId = dictionary.get(word);
    +                   Long freq = documentFrequency.get(wordId);
    +                   double tfIdfValue = tfidf.calculate(count, freq.intValue(),
    +                                   wordCount, documentCount);
    +                   vector.setQuick(wordId, tfIdfValue);
    +           }
    +           // one score for each label
    +
    +           Vector resultVector = classifier.classifyFull(vector);
    +           double bestScore = -Double.MAX_VALUE;
    +           int bestCategoryId = -1;
    +           for (Element element : resultVector.all()) {
    +                   int categoryId = element.index();
    +                   double score = element.get();
    +                   if (score > bestScore) {
    +                           bestScore = score;
    +                           bestCategoryId = categoryId;
    +                   }
    +
    +           }
    +
    +           analyzer.close();
    +           return labels.get(bestCategoryId);
    +
    +   }
    +
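    +   // Convert a training file of "category<TAB>id<TAB>text" lines into a
    +   // SequenceFile of (/category/id, text) pairs, the layout seq2sparse expects.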
    +   static void convertToSeq(String inputFileName, String outputDirName)
    +                   throws IOException {
    +           Configuration configuration = new Configuration();
    +           FileSystem fs = FileSystem.get(configuration);
    +           Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
    +                           outputDirName + "/chunk-0"), Text.class, Text.class);
    +
    +           BufferedReader reader = new BufferedReader(
    +                           new FileReader(inputFileName));
    +           Text key = new Text();
    +           Text value = new Text();
    +           while (true) {
    +                   String line = reader.readLine();
    +                   if (line == null) {
    +                           break;
    +                   }
    +                   String[] tokens = line.split("\t", 3);
    +                   if (tokens.length != 3) {
    +                           // System.out.println("Skip line: " + line);
    +                           continue;
    +                   }
    +                   String category = tokens[0];
    +                   String id = tokens[1];
    +                   String message = tokens[2];
    +                   key.set("/" + category + "/" + id);
    +                   value.set(message);
    +                   writer.append(key, value);
    +
    +           }
    +           reader.close();
    +           writer.close();
    +
    +   }
    +
    +   public static void main(String args[]) throws Exception {
    --- End diff --
    
    +1
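    
    For anyone trying this locally, a minimal usage sketch (the file name
    "train.tsv" and the sample text are illustrative; createModel writes
    "outseq", "vectors", "labelindex" and "model" into the working directory,
    which the one-argument classify() then reads back):
    
        import org.apache.nutch.urlfilter.model.NBClassifier;
        
        public class NBClassifierDemo {
          public static void main(String[] args) throws Exception {
            // Train from a file of category<TAB>id<TAB>text lines.
            NBClassifier.createModel("train.tsv");
        
            // Score a piece of text against the freshly trained model.
            String label = NBClassifier.classify("example page text to score");
            System.out.println("Predicted label: " + label);
          }
        }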


> Naive Bayes classifier based url filter
> ---------------------------------------
>
>                 Key: NUTCH-2038
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2038
>             Project: Nutch
>          Issue Type: New Feature
>          Components: fetcher, injector, parser
>            Reporter: Asitang Mishra
>            Assignee: Chris A. Mattmann
>              Labels: memex, nutch
>             Fix For: 1.11
>
>
> A URL filter that runs after the parsing stage and filters out URLs from
> pages the classifier marks as irrelevant, keeping only those URLs that
> contain "hot words" supplied in a list.
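
For context, one way such a filter could plug into Nutch's URLFilter
extension point (a hedged sketch, not the code in this pull request; the
class name, property name and hot-word handling are illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.net.URLFilter;

    // Illustrative only: keeps a URL when it contains one of the configured
    // "hot words"; in the real plugin the Naive Bayes classifier's verdict
    // on the source page would gate this decision.
    public class ModelUrlFilter implements URLFilter {

      private Configuration conf;
      private String[] hotWords = new String[0];

      @Override
      public String filter(String urlString) {
        for (String hot : hotWords) {
          if (urlString.contains(hot)) {
            return urlString; // keep the URL
          }
        }
        return null; // returning null tells Nutch to drop the URL
      }

      @Override
      public void setConf(Configuration conf) {
        this.conf = conf;
        // Hypothetical configuration property for the hot-word list.
        hotWords = conf.getStrings("urlfilter.model.hotwords", new String[0]);
      }

      @Override
      public Configuration getConf() {
        return conf;
      }
    }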



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
