Author: jerome Date: Fri Aug 26 07:54:16 2005 New Revision: 240254 URL: http://svn.apache.org/viewcvs?rev=240254&view=rev Log: Javadoc updates, corrections on input stream reading
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=240254&r1=240253&r2=240254&view=diff ============================================================================== --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Aug 26 07:54:16 2005 @@ -23,20 +23,37 @@ import java.util.logging.Logger; import org.apache.nutch.util.LogFormatter; -/** Adds metadata identifying language of document if found - * We could also run statistical analysis here but we'd miss all other formats +/** + * An {@link org.apache.nutch.parse.HtmlParseFilter} that looks for possible + * indications of content language. + * + * If some indication is found, it is added in the {@link #META_LANG_NAME} + * attribute of the {@link org.apache.nutch.parse.ParseData} metadata. 
+ * + * @author Sami Siren + * @author Jerome Charron */ public class HTMLLanguageParser implements HtmlParseFilter { + + /** The language meta data attribute name */ public static final String META_LANG_NAME="X-meta-lang"; - public static final Logger LOG = LogFormatter + + private static final Logger LOG = LogFormatter .getLogger(HTMLLanguageParser.class.getName()); /** - * Scan the HTML document looking at possible indications of content language<br> - * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) - * <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language) - * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) - * <br>Only the first occurence of language is stored. + * Scan the HTML document looking at possible indications of content language. + * <ol> + * <li>html lang attribute + * (<a href="http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1"> + * http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1</a>),</li> + * <li>meta dc.language (<a href="http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language"> + * http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language</a>),</li> + * <li>meta http-equiv (content-language) ( + * <a href="http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2"> + * http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2</a>).</li> + * </ol> + * Only the first occurence of language is stored. 
*/ public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { String lang = findLanguage(doc); Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=240254&r1=240253&r2=240254&view=diff ============================================================================== --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Fri Aug 26 07:54:16 2005 @@ -20,6 +20,7 @@ import java.io.InputStream; import java.io.IOException; import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.List; @@ -48,6 +49,10 @@ /** + * Identify the language of a content, based on statistical analysis. + * + * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639 + * Language Codes</a>. * * @author Sami Siren * @author Jerome Charron @@ -59,8 +64,8 @@ private final static float SCORE_THRESOLD = 0.00F; - public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName()); - + private final static Logger LOG = + LogFormatter.getLogger(LanguageIdentifier.class.getName()); private ArrayList languages = new ArrayList(); @@ -168,7 +173,8 @@ } /** - * return handle to singleton instance + * Get a LanguageIdentifier instance. + * @return the LanguageIdentifier singleton instance. */ public static LanguageIdentifier getInstance() { if (identifier == null) { @@ -182,13 +188,25 @@ } /** - * main method used for testing - * - * @param args + * Main method used for command line process. 
+ * <br/>Usage is: + * <pre> + * LanguageIdentifier [-identifyrows filename maxlines] + * [-identifyfile charset filename] + * [-identifyfileset charset files] + * [-identifytext text] + * [-identifyurl url] + * </pre> + * @param args arguments. */ public static void main(String args[]) { - String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]"; + String usage = "Usage: LanguageIdentifier " + + "[-identifyrows filename maxlines] " + + "[-identifyfile charset filename] " + + "[-identifyfileset charset files] " + + "[-identifytext text] " + + "[-identifyurl url]"; int command = 0; final int IDFILE = 1; @@ -199,6 +217,7 @@ Vector fileset = new Vector(); String filename = ""; + String charset = ""; String url = ""; String text = ""; int max = 0; @@ -211,6 +230,7 @@ for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-identifyfile")) { command = IDFILE; + charset = args[++i]; filename = args[++i]; } @@ -233,6 +253,7 @@ if (args[i].equals("-identifyfileset")) { command = IDFILESET; + charset = args[++i]; for (i++; i < args.length; i++) { File[] files = null; File f = new File(args[i]); @@ -264,7 +285,7 @@ case IDFILE: f = new File(filename); fis = new FileInputStream(f); - lang = idfr.identify(fis); + lang = idfr.identify(fis, charset); fis.close(); break; @@ -302,7 +323,7 @@ filename = (String) i.next(); f = new File(filename); fis = new FileInputStream(f); - lang = idfr.identify(fis); + lang = idfr.identify(fis, charset); fis.close(); } catch (Exception e) { System.out.println(e); @@ -349,22 +370,26 @@ } /** - * Identify language based on submitted content + * Identify language of a content. * - * @param text to analyze - * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if - * unknown + * @param content is the content to analyze. 
+ * @return The 2 letter + * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639 + * language code</a> (en, fi, sv, ...) of the language that best + * matches the specified content. */ - public String identify(String text) { - return identify(new StringBuffer(text)); + public String identify(String content) { + return identify(new StringBuffer(content)); } /** - * Identify language based on submitted content + * Identify language of a content. * - * @param text to analyze - * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if - * unknown + * @param content is the content to analyze. + * @return The 2 letter + * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639 + * language code</a> (en, fi, sv, ...) of the language that best + * matches the specified content. */ public String identify(StringBuffer content) { @@ -405,26 +430,48 @@ } /** - * Identify language from inputstream - * - * @param is - * @return language code - * @throws IOException + * Identify language from input stream. + * This method uses the platform default encoding to read the input stream. + * For using a specific encoding, use the + * {@link #identify(InputStream, String)} method. + * + * @param is is the input stream to analyze. + * @return The 2 letter + * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639 + * language code</a> (en, fi, sv, ...) of the language that best + * matches the content of the specified input stream. + * @throws IOException if something wrong occurs on the input stream. */ public String identify(InputStream is) throws IOException { + return identify(is, null); + } + + /** + * Identify language from input stream. + * + * @param is is the input stream to analyze. + * @param charset is the charset to use to read the input stream. + * @return The 2 letter + * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639 + * language code</a> (en, fi, sv, ...) 
of the language that best + * matches the content of the specified input stream. + * @throws IOException if something wrong occurs on the input stream. + */ + public String identify(InputStream is, String charset) throws IOException { - StringBuffer text = new StringBuffer(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); byte[] buffer = new byte[2048]; int len = 0; while (((len = is.read(buffer)) != -1) && - ((analyzeLength == 0) || (text.length() < analyzeLength))) { + ((analyzeLength == 0) || (out.size() < analyzeLength))) { if (analyzeLength != 0) { - len = Math.min(len, analyzeLength - text.length()); + len = Math.min(len, analyzeLength - out.size()); } - text.append(new String(buffer, 0, len)); + out.write(buffer, 0, len); } - return identify(text); + return identify((charset == null) ? out.toString() + : out.toString(charset)); } } Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=240254&r1=240253&r2=240254&view=diff ============================================================================== --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Aug 26 07:54:16 2005 @@ -37,7 +37,7 @@ * information</li> * <li>Then, checking if a <code>Content-Language</code> HTTP header can be * found</li> - * <li>Finaly by analyzing the document content</li> + * <li>Finaly by statisticaly analyzing the document content</li> * </ul> * * @author Sami Siren Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java 
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=240254&r1=240253&r2=240254&view=diff ============================================================================== --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java Fri Aug 26 07:54:16 2005 @@ -18,10 +18,19 @@ import org.apache.nutch.searcher.RawFieldQueryFilter; -/** Handles "lang:" query clauses, causing them to search the "lang" field - * indexed by LanguageIdentifier. */ +/** + * A {@link org.apache.nutch.searcher.QueryFilter} that handles + * <code>"lang:"</code> query clauses. + * It search the <code>"lang"</code> field indexed by the + * LanguageIdentifier. + * + * @author Sami Siren + * @author Jerome Charron + */ public class LanguageQueryFilter extends RawFieldQueryFilter { + public LanguageQueryFilter() { super("lang"); } + } Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=240254&r1=240253&r2=240254&view=diff ============================================================================== --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Fri Aug 26 07:54:16 2005 @@ -25,6 +25,8 @@ import java.io.FileOutputStream; import java.io.InputStreamReader; import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import 
java.io.UnsupportedEncodingException; import java.util.Date; import java.util.List; import java.util.Iterator; @@ -42,19 +44,20 @@ /** - * This class runs a ngram analysis over submitted text, results might be used - * for automatic language identifiaction. - * - * The similarity calculation is at experimental level. You have been warned. - * - * Methods are provided to build new NGramProfiles profiles. + * This class represents a ngram profile. + * A ngram profile is a set of the most frequently used sequences of chars + * in a text or set of texts. + * This class can be used to runs a ngram analysis over submitted text and + * then to build new NGramProfiles. + * A profile can then be serialized into a textual file, or a profile can + * be initialized from a ngram profile file (ngp files). * * @author Sami Siren - * @author Jerome Charron - http://frutch.free.fr/ + * @author Jerome Charron */ public class NGramProfile { - public static final Logger LOG = LogFormatter + static final Logger LOG = LogFormatter .getLogger("org.apache.nutch.analysis.lang.NGramProfile"); /** The minimum length allowed for a ngram. */ @@ -119,7 +122,8 @@ } /** - * @return Returns the name. + * Returns the profile name. + * @return the profile name. */ public String getName() { return name; @@ -178,9 +182,9 @@ } /** - * Analyze a piece of text + * Analyze a piece of text. * - * @param text the text to be analyzed + * @param text is the text to be analyzed */ public void analyze(StringBuffer text) { @@ -248,9 +252,11 @@ } /** - * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence) + * Return a sorted list of ngrams. + * The list is sorted by: + * <ol><li>frequency</li><li>sequence</li></ol> * - * @return sorted vector of ngrams + * @return A sorted list of ngrams */ public List getSorted() { // make sure sorting is done only once @@ -285,10 +291,10 @@ /** * Calculate a score how well NGramProfiles match each other + * The similarity calculation is at experimental level. 
You have been warned. * - * @param another - * ngram profile to compare against - * @return similarity 0=exact match + * @param another is the ngram profile to compare against + * @return a similarity indicator, where 0 stands for an exact match. */ public float getSimilarity(NGramProfile another) { @@ -322,9 +328,10 @@ } /** - * Loads a ngram profile from an InputStream - * (assumes UTF-8 encoded content) - * @param is the InputStream to read + * Loads a ngram profile from an InputStream. + * <i>Please notice, that this method assumes that the stream is UTF-8 + * encoded</i>. + * @param is is the InputStream to read */ public void load(InputStream is) throws IOException { @@ -352,40 +359,43 @@ } /** - * Create a new Language profile from (preferably quite large) text file + * Create a new ngram profile from an input stream. + * <i>Please notice that the size of the submitted content must be quite + * large for a good result</i>. * - * @param name is thename of profile - * @param is is the stream to read - * @param encoding is the encoding of stream - */ - public static NGramProfile create(String name, InputStream is, String encoding) { + * @param name is the name of the profile. + * @param is is the stream to read. + * @param encoding is the encoding of the stream. 
+ */ + public static NGramProfile create(String name, + InputStream is, + String encoding) + throws UnsupportedEncodingException { NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH); BufferedInputStream bis = new BufferedInputStream(is); - + ByteArrayOutputStream bao = new ByteArrayOutputStream(); byte buffer[] = new byte[4096]; - StringBuffer text = new StringBuffer(); int len; try { while ((len = bis.read(buffer)) != -1) { - text.append(new String(buffer, 0, len, encoding)); + bao.write(buffer, 0, len); } } catch (IOException e) { e.printStackTrace(); } - - newProfile.analyze(text); + newProfile.analyze(new StringBuffer(bao.toString(encoding))); return newProfile; } /** - * Writes NGramProfile content into OutputStream, content is outputted with - * UTF-8 encoding + * Writes NGramProfile content into OutputStream. + * The content is outputted using UTF-8 encoding. * - * @param os the Stream to output to - * @throws IOException + * @param os is the stream to output to. + * @throws IOException if something wrong occurs on the output stream. */ public void save(OutputStream os) throws IOException { @@ -424,9 +434,14 @@ } /** - * main method used for testing only - * - * @param args + * Main method used for command line process. + * <br/>Usage is: + * <pre> + * NGramProfile [-create profilename filename encoding] + * [-similarity file1 file2] + * [-score profile-name filename encoding] + * </pre> + * @param args arguments. */ public static void main(String args[]) {