lang: HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java LanguageQueryFilter.java NGramProfile.java

jerome Fri, 26 Aug 2005 07:54:27 -0700

Author: jerome
Date: Fri Aug 26 07:54:16 2005
New Revision: 240254

URL: http://svn.apache.org/viewcvs?rev=240254&view=rev
Log:
Javadoc updates, corrections on input stream reading


Modified:
    
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
    
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
    
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Fri Aug 26 07:54:16 2005
@@ -23,20 +23,37 @@
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
 
-/** Adds metadata identifying language of document if found
- * We could also run statistical analysis here but we'd miss all other formats
+/**
+ * An {@link org.apache.nutch.parse.HtmlParseFilter} that looks for possible
+ * indications of content language.
+ *
+ * If some indication is found, it is added in the {@link #META_LANG_NAME}
+ * attribute of the {@link org.apache.nutch.parse.ParseData} metadata.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron
  */
 public class HTMLLanguageParser implements HtmlParseFilter {
+
+  /** The language meta data attribute name */
   public static final String META_LANG_NAME="X-meta-lang";
-  public static final Logger LOG = LogFormatter
+  
+  private static final Logger LOG = LogFormatter
     .getLogger(HTMLLanguageParser.class.getName());
 
   /**
-   * Scan the HTML document looking at possible indications of content language<br>
-   * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
-   * <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
-   * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
-   * <br>Only the first occurence of language is stored.
+   * Scan the HTML document looking at possible indications of content language.
+   * <ol>
+   * <li>html lang attribute
+   *     (<a href="http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1">
+   *     http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1</a>),</li>
+   * <li>meta dc.language (<a href="http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language">
+   *     http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language</a>),</li>
+   * <li>meta http-equiv (content-language) (
+   *     <a href="http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2">
+   *     http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2</a>).</li>
+   * </ol>
+   * Only the first occurence of language is stored.
    */
   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
     String lang = findLanguage(doc);

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 Fri Aug 26 07:54:16 2005
@@ -20,6 +20,7 @@
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.List;
@@ -48,6 +49,10 @@
 
 
 /**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ *      Language Codes</a>.
  * 
  * @author Sami Siren
  * @author Jerome Charron
@@ -59,8 +64,8 @@
   
   private final static float SCORE_THRESOLD = 0.00F;
 
-  public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
-
+  private final static Logger LOG =
+          LogFormatter.getLogger(LanguageIdentifier.class.getName());
   
   private ArrayList languages = new ArrayList();
 
@@ -168,7 +173,8 @@
   }
 
   /**
-   * return handle to singleton instance
+   * Get a LanguageIdentifier instance.
+   * @return the LanguageIdentifier singleton instance.
    */
   public static LanguageIdentifier getInstance() {
     if (identifier == null) {
@@ -182,13 +188,25 @@
   }
 
   /**
-   * main method used for testing
-   * 
-   * @param args
+   * Main method used for command line process.
+   * <br/>Usage is:
+   * <pre>
+   * LanguageIdentifier [-identifyrows filename maxlines]
+   *                    [-identifyfile charset filename]
+   *                    [-identifyfileset charset files]
+   *                    [-identifytext text]
+   *                    [-identifyurl url]
+   * </pre>
+   * @param args arguments.
    */
   public static void main(String args[]) {
 
-    String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
+    String usage = "Usage: LanguageIdentifier "            +
+                      "[-identifyrows filename maxlines] " +
+                      "[-identifyfile charset filename] "  +
+                      "[-identifyfileset charset files] "  +
+                      "[-identifytext text] "              +
+                      "[-identifyurl url]";
     int command = 0;
 
     final int IDFILE = 1;
@@ -199,6 +217,7 @@
 
     Vector fileset = new Vector();
     String filename = "";
+    String charset = "";
     String url = "";
     String text = "";
     int max = 0;
@@ -211,6 +230,7 @@
     for (int i = 0; i < args.length; i++) { // parse command line
       if (args[i].equals("-identifyfile")) {
         command = IDFILE;
+        charset = args[++i];
         filename = args[++i];
       }
 
@@ -233,6 +253,7 @@
 
       if (args[i].equals("-identifyfileset")) {
         command = IDFILESET;
+        charset = args[++i];
         for (i++; i < args.length; i++) {
           File[] files = null;
           File f = new File(args[i]);
@@ -264,7 +285,7 @@
         case IDFILE:
           f = new File(filename);
           fis = new FileInputStream(f);
-          lang = idfr.identify(fis);
+          lang = idfr.identify(fis, charset);
           fis.close();
           break;
 
@@ -302,7 +323,7 @@
               filename = (String) i.next();
               f = new File(filename);
               fis = new FileInputStream(f);
-              lang = idfr.identify(fis);
+              lang = idfr.identify(fis, charset);
               fis.close();
             } catch (Exception e) {
               System.out.println(e);
@@ -349,22 +370,26 @@
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
-  public String identify(String text) {
-    return identify(new StringBuffer(text));
+  public String identify(String content) {
+    return identify(new StringBuffer(content));
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
   public String identify(StringBuffer content) {
 
@@ -405,26 +430,48 @@
   }
 
   /**
-   * Identify language from inputstream
-   * 
-   * @param is
-   * @return language code
-   * @throws IOException
+   * Identify language from input stream.
+   * This method uses the platform default encoding to read the input stream.
+   * For using a specific encoding, use the
+   * {@link #identify(InputStream, String)} method.
+   *
+   * @param is is the input stream to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
    */
   public String identify(InputStream is) throws IOException {
+    return identify(is, null);
+  }
+  
+  /**
+   * Identify language from input stream.
+   * 
+   * @param is is the input stream to analyze.
+   * @param charset is the charset to use to read the input stream.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
+   */
+  public String identify(InputStream is, String charset) throws IOException {
 
-    StringBuffer text = new StringBuffer();
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
     byte[] buffer = new byte[2048];
     int len = 0;
 
     while (((len = is.read(buffer)) != -1) &&
-           ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+           ((analyzeLength == 0) || (out.size() < analyzeLength))) {
       if (analyzeLength != 0) {
-          len = Math.min(len, analyzeLength - text.length());
+          len = Math.min(len, analyzeLength - out.size());
       }
-      text.append(new String(buffer, 0, len));
+      out.write(buffer, 0, len);
     }
-    return identify(text);
+    return identify((charset == null) ? out.toString()
+                                      : out.toString(charset));
   }
 
 }

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
 Fri Aug 26 07:54:16 2005
@@ -37,7 +37,7 @@
  *       information</li>
  *   <li>Then, checking if a <code>Content-Language</code> HTTP header can be
  *       found</li>
- *   <li>Finaly by analyzing the document content</li>
+ *   <li>Finaly by statisticaly analyzing the document content</li>
  * </ul>
  *   
  * @author Sami Siren

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
 Fri Aug 26 07:54:16 2005
@@ -18,10 +18,19 @@
 
 import org.apache.nutch.searcher.RawFieldQueryFilter;
 
-/** Handles "lang:" query clauses, causing them to search the "lang" field
- * indexed by LanguageIdentifier. */
+/**
+ * A {@link org.apache.nutch.searcher.QueryFilter} that handles
+ * <code>"lang:"</code> query clauses.
+ * It search the <code>"lang"</code> field indexed by the
+ * LanguageIdentifier.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron
+ */
 public class LanguageQueryFilter extends RawFieldQueryFilter {
+
   public LanguageQueryFilter() {
     super("lang");
   }
+
 }

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
 Fri Aug 26 07:54:16 2005
@@ -25,6 +25,8 @@
 import java.io.FileOutputStream;
 import java.io.InputStreamReader;
 import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Date;
 import java.util.List;
 import java.util.Iterator;
@@ -42,19 +44,20 @@
 
 
 /**
- * This class runs a ngram analysis over submitted text, results might be used
- * for automatic language identifiaction.
- * 
- * The similarity calculation is at experimental level. You have been warned.
- * 
- * Methods are provided to build new NGramProfiles profiles.
+ * This class represents a ngram profile.
+ * A ngram profile is a set of the most frequently used sequences of chars
+ * in a text or set of texts.
+ * This class can be used to runs a ngram analysis over submitted text and
+ * then to build new NGramProfiles.
+ * A profile can then be serialized into a textual file, or a profile can
+ * be initialized from a ngram profile file (ngp files).
  * 
  * @author Sami Siren
- * @author Jerome Charron - http://frutch.free.fr/
+ * @author Jerome Charron
  */
 public class NGramProfile {
 
-  public static final Logger LOG = LogFormatter
+  static final Logger LOG = LogFormatter
       .getLogger("org.apache.nutch.analysis.lang.NGramProfile");
 
   /** The minimum length allowed for a ngram. */
@@ -119,7 +122,8 @@
   }
 
   /**
-   * @return Returns the name.
+   * Returns the profile name.
+   * @return the profile name.
    */
   public String getName() {
     return name;
@@ -178,9 +182,9 @@
   }
 
   /**
-   * Analyze a piece of text
+   * Analyze a piece of text.
    * 
-   * @param text the text to be analyzed
+   * @param text is the text to be analyzed
    */
   public void analyze(StringBuffer text) {
 
@@ -248,9 +252,11 @@
   }
 
   /**
-   * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
+   * Return a sorted list of ngrams.
+   * The list is sorted by:
+   * <ol><li>frequency</li><li>sequence</li></ol>
    * 
-   * @return sorted vector of ngrams
+   * @return A sorted list of ngrams
    */
   public List getSorted() {
     // make sure sorting is done only once
@@ -285,10 +291,10 @@
 
   /**
    * Calculate a score how well NGramProfiles match each other
+   * The similarity calculation is at experimental level. You have been warned.
    * 
-   * @param another
-   *          ngram profile to compare against
-   * @return similarity 0=exact match
+   * @param another is the ngram profile to compare against
+   * @return a similarity indicator, where 0 stands for an exact match.
    */
   public float getSimilarity(NGramProfile another) {
       
@@ -322,9 +328,10 @@
   }
 
   /**
-   * Loads a ngram profile from an InputStream
-   * (assumes UTF-8 encoded content)
-   * @param is the InputStream to read
+   * Loads a ngram profile from an InputStream.
+   * <i>Please notice, that this method assumes that the stream is UTF-8
+   * encoded</i>.
+   * @param is is the InputStream to read
    */
   public void load(InputStream is) throws IOException {
 
@@ -352,40 +359,43 @@
   }
 
   /**
-   * Create a new Language profile from (preferably quite large) text file
+   * Create a new ngram profile from an input stream.
+   * <i>Please notice that the size of the submitted content must be quite
+   * large for a good result</i>.
    * 
-   * @param name is thename of profile
-   * @param is is the stream to read
-   * @param encoding is the encoding of stream
-   */
-  public static NGramProfile create(String name, InputStream is, String encoding) {
+   * @param name is the name of the profile.
+   * @param is is the stream to read.
+   * @param encoding is the encoding of the stream.
+   */
+  public static NGramProfile create(String name,
+                                    InputStream is,
+                                    String encoding)
+                                    throws UnsupportedEncodingException {
 
     NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
                                                      
ABSOLUTE_MAX_NGRAM_LENGTH);
     BufferedInputStream bis = new BufferedInputStream(is);
-
+    ByteArrayOutputStream bao = new ByteArrayOutputStream();
     byte buffer[] = new byte[4096];
-    StringBuffer text = new StringBuffer();
     int len;
 
     try {
       while ((len = bis.read(buffer)) != -1) {
-        text.append(new String(buffer, 0, len, encoding));
+        bao.write(buffer, 0, len);
       }
     } catch (IOException e) {
       e.printStackTrace();
     }
-
-    newProfile.analyze(text);
+    newProfile.analyze(new StringBuffer(bao.toString(encoding)));
     return newProfile;
   }
 
   /**
-   * Writes NGramProfile content into OutputStream, content is outputted with
-   * UTF-8 encoding
+   * Writes NGramProfile content into OutputStream.
+   * The content is outputted using UTF-8 encoding.
    * 
-   * @param os the Stream to output to
-   * @throws IOException
+   * @param os is the stream to output to.
+   * @throws IOException if something wrong occurs on the output stream.
    */
   public void save(OutputStream os) throws IOException {
 
@@ -424,9 +434,14 @@
   }
 
   /**
-   * main method used for testing only
-   * 
-   * @param args
+   * Main method used for command line process.
+   * <br/>Usage is:
+   * <pre>
+   * NGramProfile [-create profilename filename encoding]
+   *              [-similarity file1 file2]
+   *              [-score profile-name filename encoding]
+   * </pre>
+   * @param args arguments.
    */
   public static void main(String args[]) {

svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang: HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java LanguageQueryFilter.java NGramProfile.java

Reply via email to