Author: jerome Date: Wed Aug 17 09:10:23 2005 New Revision: 233192 URL: http://svn.apache.org/viewcvs?rev=233192&view=rev Log: Fix the issue reported by Andrzej Bialecki in http://www.mail-archive.com/nutch-dev%40lucene.apache.org/msg00065.html. The HTMLLanguageParser: * can now extract the language code from a string that is not compliant with RFC 1766 * now checks that the extracted language is a valid language code.
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=233192&r1=233191&r2=233192&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Wed Aug 17 09:10:23 2005 @@ -14,23 +14,62 @@ * limitations under the License. 
*/ package org.apache.nutch.analysis.lang; + +// JDK imports +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.logging.Logger; + +// Nutch imports import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.protocol.Content; -import org.w3c.dom.*; - -import java.util.logging.Logger; import org.apache.nutch.util.LogFormatter; -/** Adds metadata identifying language of document if found +// DOM imports +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + + +/** + * Adds metadata identifying language of document if found * We could also run statistical analysis here but we'd miss all other formats */ public class HTMLLanguageParser implements HtmlParseFilter { + public static final String META_LANG_NAME="X-meta-lang"; public static final Logger LOG = LogFormatter .getLogger(HTMLLanguageParser.class.getName()); + /* A static Map of ISO-639 language codes */ + private static Map LANGUAGES_MAP = new HashMap(); + static { + try { + Properties p = new Properties(); + p.load(HTMLLanguageParser.class + .getResourceAsStream("langmappings.properties")); + Enumeration keys = p.keys(); + while (keys.hasMoreElements()) { + String key = (String) keys.nextElement(); + String[] values = p.getProperty(key).split(",", -1); + LANGUAGES_MAP.put(key, key); + for (int i=0; i<values.length; i++) { + LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key); + } + } + } catch (Exception e) { + LOG.severe(e.toString()); + } + } + + + /** * Scan the HTML document looking at possible indications of content language<br> * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) @@ -39,60 +78,122 @@ * <br>Only the first occurence of language is stored. 
*/ public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { - String lang = findLanguage(doc); + + // Trying to find the document's language + LanguageParser parser = new LanguageParser(doc); + String lang = parser.getLanguage(); if (lang != null) { parse.getData().getMetadata().put(META_LANG_NAME, lang); } - return parse; } - - private String findLanguage(Node node) { - String lang = null; - if (node.getNodeType() == Node.ELEMENT_NODE) { - - //lang attribute - lang = ((Element) node).getAttribute("lang"); - if (lang != null && lang.length()>1) { - return lang; - } - if ("meta".equalsIgnoreCase(node.getNodeName())) { + static class LanguageParser { + + private String dublinCore = null; + private String htmlAttribute = null; + private String httpEquiv = null; + private String language = null; + + LanguageParser(Node node) { + parse(node); + if (htmlAttribute != null) { language = htmlAttribute; } + else if (dublinCore != null) { language = dublinCore; } + else {language = httpEquiv; } + } + + String getLanguage() { + return language; + } + + void parse(Node node) { - NamedNodeMap attrs=node.getAttributes(); + String lang = null; + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + // Check for the lang HTML attribute + if (htmlAttribute == null) { + htmlAttribute = parseLanguage(((Element) node).getAttribute("lang")); + } - //dc.language - for(int i=0;i<attrs.getLength();i++){ - Node attrnode=attrs.item(i); - if("name".equalsIgnoreCase(attrnode.getNodeName())){ - if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){ - Node valueattr=attrs.getNamedItem("content"); - lang = (valueattr!=null)?valueattr.getNodeValue():null; + // Check for Meta + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + + // Check for the dc.language Meta + if (dublinCore == null) { + for (int i=0; i<attrs.getLength(); i++) { + Node attrnode = attrs.item(i); + if 
("name".equalsIgnoreCase(attrnode.getNodeName())) { + if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) { + Node valueattr = attrs.getNamedItem("content"); + if (valueattr != null) { + dublinCore = parseLanguage(valueattr.getNodeValue()); + } + } + } } } - } - - //http-equiv content-language - for(int i=0;i<attrs.getLength();i++){ - Node attrnode=attrs.item(i); - if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){ - if("content-language".equals(attrnode.getNodeValue().toLowerCase())){ - Node valueattr=attrs.getNamedItem("content"); - lang = (valueattr!=null)?valueattr.getNodeValue():null; + + // Check for the http-equiv content-language + if (httpEquiv == null) { + for (int i=0; i<attrs.getLength(); i++){ + Node attrnode = attrs.item(i); + if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) { + if ("content-language".equals(attrnode.getNodeValue().toLowerCase())) { + Node valueattr = attrs.getNamedItem("content"); + if (valueattr != null) { + httpEquiv = parseLanguage(valueattr.getNodeValue()); + } + } + } } } } } + + // Recurse + NodeList children = node.getChildNodes(); + for (int i=0; children != null && i<children.getLength(); i++) { + parse(children.item(i)); + if ((dublinCore != null) && + (htmlAttribute != null) && + (httpEquiv != null)) { + return; + } + } } - - //recurse - NodeList children = node.getChildNodes(); - for (int i = 0; children != null && i < children.getLength(); i++) { - lang = findLanguage(children.item(i)); - if(lang != null && lang.length()>1) return lang; + + /** + * Parse a language string and return an ISO 639 primary code, + * or <code>null</code> if something wrong occurs, or if no language is found. 
+ */ + final static String parseLanguage(String lang) { + + if (lang == null) { return null; } + + String code = null; + String language = null; + + // First, split multi-valued values + String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1); + + int i = 0; + while ((language == null) && (i<langs.length)) { + // Then, get the primary code + code = langs[i].split("-")[0]; + code = code.split("_")[0]; + // Find the ISO 639 code + language = (String) LANGUAGES_MAP.get(code.toLowerCase()); + i++; + } + + return language; } - - return lang; + } + + } Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties?rev=233192&r1=233191&r2=233192&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties Wed Aug 17 09:10:23 2005 @@ -1,187 +1,188 @@ -aa=aar -ab=abk +# Defines mappings between commonly seen erroneous language codes and +# the ISO 639 two-letter language codes.
+aa=aar,Afar +ab=abk,Abkhazian ae=ave -af=afr +af=afr,Afrikaans ak=aka -am=amh +am=amh,Amharic an=arg -ar=ara -as=asm +ar=ara,Arabic +as=asm,Assamese av=ava -ay=aym -az=aze -ba=bak -be=bel -bg=bul -bh=bih -bi=bis +ay=aym,Aymara +az=aze,Azerbaijani +ba=bak,Bashkir +be=bel,Byelorussian +bg=bul,Bulgarian +bh=bih,Bihari +bi=bis,Bislama bm=bam -bn=ben -bo=tib/bod -br=bre +bn=ben,Bengali +bo=bod,tib,Tibetan +br=bre,Breton bs=bos -ca=cat +ca=cat,Catalan ce=che ch=cha -co=cos +co=cos,Corsican cr=cre -cs=cze/ces +cs=ces,cze,Czech cu=chu cv=chv -cy=wel/cym -da=dan -de=ger/deu +cy=cym,wel,Welsh +da=dan,Danish +de=deu,ger,German dv=div -dz=dzo +dz=dzo,Dzongkha ee=ewe -el=gre/ell -en=eng -eo=epo -es=spa -et=est -eu=baq/eus -fa=per/fas +el=ell,gre,Greek +en=eng,English +eo=epo,Esperanto +es=esl,spa,Spanish +et=est,Estonian +eu=baq,eus,Basque +fa=fas,per,Persian ff=ful -fi=fin -fj=fij -fo=fao -fr=fre/fra -fy=fry -ga=gle +fi=fin,Finnish +fj=fij,Fijian +fo=fao,Faroese +fr=fra,fre,French +fy=fry,Frisian +ga=gai,iri,Irish gd=gla -gl=glg -gn=grn -gu=guj +gl=glg,Gallegan +gn=grn,Guarani +gu=guj,Gujarati gv=glv -ha=hau -he=heb -hi=hin +ha=hau,Hausa +he=heb,Hebrew +hi=hin,Hindi ho=hmo -hr=scr/hrv +hr=scr,hrv,Croatian ht=hat -hu=hun -hy=arm/hye +hu=hun,Hungarian +hy=arm,hye,Armenian hz=her -ia=ina -id=ind +ia=ina,Interlingua +id=ind,Indonesian ie=ile ig=ibo ii=iii -ik=ipk +ik=ipk,Inupiak io=ido -is=ice/isl -it=ita -iu=iku -ja=jpn -jv=jav -ka=geo/kat +is=ice,isl,Icelandic +it=ita,Italian +iu=iku,Inuktitut +ja=jpn,Japanese +jv=jw,jav,jaw,Javanese +ka=geo,kat,Georgian kg=kon ki=kik -ki=kik -kj=kua kj=kua -kk=kaz -kl=kal -km=khm -kn=kan -ko=kor +kk=kaz,Kazakh +kl=kal,Greenlandic +km=khm,Khmer +kn=kan,Kannada +ko=kor,Korean kr=kau -ks=kas -ku=kur +ks=kas,Kashmiri +ku=kur,Kurdish kv=kom kw=cor -ky=kir -la=lat +ky=kir,Kirghiz +la=lat,Latin lb=ltz lg=lug li=lim -ln=lin -lo=lao -lt=lit +ln=lin,Lingala +lo=lao,Lao +lt=lit,Lithuanian lu=lub -lv=lav -mg=mlg +lv=lav,Latvian +mg=mlg,Malagasy mh=mah 
-mi=mao/mri -mk=mac/mkd -ml=mal -mn=mon -mo=mol -mr=mar -ms=may/msa +mi=mao,mri,Maori +mk=mac,mak,Macedonian +ml=mal,mlt,Maltese +mn=mon,Mongolian +mo=mol,Moldavian +mr=mar,Marathi +ms=may,msa,Malay mt=mlt -my=bur/mya -na=nau +my=bur,mya,Burmese +na=nau,Nauru nb=nob nd=nde -ne=nep +ne=nep,Nepali ng=ndo -nl=dut/nld +nl=dut,nla,Dutch nn=nno -no=nor +no=nor,Norwegian nr=nbl nv=nav ny=nya -oc=oci +oc=oci,Langue d'Oc oj=oji -om=orm -or=ori +om=orm,Oromo +or=ori,Oriya os=oss -pa=pan +pa=pan,Panjabi pi=pli -pl=pol -ps=pus -pt=por -qu=que -rm=roh -rn=run -ro=rum/ron -ru=rus -rw=kin -sa=san +pl=pol,Polish +ps=pus,Pushto +pt=por,Portuguese +qu=que,Quechua +rm=roh,Rhaeto-Romance +rn=run,Rundi +ro=ron,rum,Romanian +ru=rus,Russian +rw=kin,Kinyarwanda +sa=san,Sanskrit sc=srd -sd=snd +sd=snd,Sindhi se=sme -sg=sag -si=sin -sk=slo/slk -sl=slv -sm=smo -sn=sna -so=som -sq=alb/sqi -sr=scc/srp -ss=ssw -st=sot -su=sun -sv=swe -sw=swa -ta=tam -te=tel -tg=tgk -th=tha -ti=tir -tk=tuk -tl=tgl -tn=tsn -to=ton -tr=tur -ts=tso -tt=tat -tw=twi +sg=sag,Sango +sh=scr,Serbo-Croatian +si=sin,Singhalese +sk=slk,slo,Slovak +sl=slv,Slovenian +sm=smo,Samoan +sn=sna,Shona +so=som,Somali +sq=alb,sqi,Albanian +sr=scc,srp,Serbian +ss=ssw,Siswant +st=sot,Sotho +su=sun,Sudanese +sv=sve,swe,Swedish,Svenska,Sweden +sw=swa,Swahili +ta=tam,Tamil +te=tel,Telugu +tg=tgk,Tajik +th=tha,Thai +ti=tir,Tigrinya +tk=tuk,Turkmen +tl=tgl,Tagalog +tn=tsn,Tswana +to=tog,Tonga +tr=tur,Turkish +ts=tso,Tsonga +tt=tat,Tatar +tw=twi,Twi ty=tah -ug=uig -uk=ukr -ur=urd -uz=uzb +ug=uig,Uighur +uk=ukr,Ukrainian +ur=urd,Urdu +uz=uzb,Uzbek ve=ven -vi=vie -vo=vol +vi=vie,Vietnamese +vo=vol,Volapük wa=wln -wo=wol -xh=xho -yi=yid -yo=yor -za=zha -zh=chi/zho -zu=zul +wo=wol,Wolof +xh=xho,Xhosa +yi=yid,Yiddish +yo=yor,Yoruba +za=zha,Zhuang +zh=chi,zho,Chinese +zu=zul,Zulu Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=233192&r1=233191&r2=233192&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Wed Aug 17 09:10:23 2005 @@ -15,14 +15,19 @@ */ package org.apache.nutch.analysis.lang; +// JDK imports import java.util.Properties; +// JUnit imports import junit.framework.TestCase; + +// Nutch imports import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.ParserFactory; import org.apache.nutch.protocol.Content; + public class TestHTMLLanguageParser extends TestCase { private static String URL = "http://foo.bar/"; @@ -61,6 +66,61 @@ } + /** Test of <code>LanguageParser.parseLanguage(String)</code> method. 
*/ + public void testParseLanguage() { + String tests[][] = { + { "(SCHEME=ISO.639-1) sv", "sv" }, + { "(SCHEME=RFC1766) sv-FI", "sv" }, + { "(SCHEME=Z39.53) SWE", "sv" }, + { "EN_US, SV, EN, EN_UK", "en" }, + { "English Swedish", "en" }, + { "English, swedish", "en" }, + { "English,Swedish", "en" }, + { "Other (Svenska)", "sv" }, + { "SE", "se" }, + { "SV", "sv" }, + { "SV charset=iso-8859-1", "sv" }, + { "SV-FI", "sv" }, + { "SV; charset=iso-8859-1", "sv" }, + { "SVE", "sv" }, + { "SW", "sw" }, + { "SWE", "sv" }, + { "SWEDISH", "sv" }, + { "Sv", "sv" }, + { "Sve", "sv" }, + { "Svenska", "sv" }, + { "Swedish", "sv" }, + { "Swedish, svenska", "sv" }, + { "en, sv", "en" }, + { "sv", "sv" }, + { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, + { "sv,en", "sv" }, + { "sv-FI", "sv" }, + { "sv-SE", "sv" }, + { "sv-en", "sv" }, + { "sv-fi", "sv" }, + { "sv-se", "sv" }, + { "sv; Content-Language: sv", "sv" }, + { "sv_SE", "sv" }, + { "sve", "sv" }, + { "svenska, swedish, engelska, english", "sv" }, + { "sw", "sw" }, + { "swe", "sv" }, + { "swe.SPR.", "sv" }, + { "sweden", "sv" }, + { "swedish", "sv" }, + { "swedish,", "sv" }, + { "text/html; charset=sv-SE", "sv" }, + { "text/html; sv", "sv" }, + { "torp, stuga, uthyres, bed & breakfast", null } + }; + + for (int i=0; i<44; i++) { + assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0])); + } + } + + private Content getContent(String text) { Properties p = new Properties(); p.put("Content-Type", "text/html"); @@ -68,4 +128,5 @@ Content content = new Content(URL, BASE, text.getBytes(), "text/html", p); return content; } + }