On 19/01/2013 17:04, Daniel Naber wrote:
> On 19.01.2013, 16:28:42 Mauro Condarelli wrote:
>
> Hi Mauro,
>
>> I would like to submit present code and get feedback before doing
>> further work.
> please send your code as a patch file (created with "svn diff") to this list.
>
>
Here they come. The patches cover two distinct areas:
1) an implementation of the Italian disambiguator (with token rules);
2) an implementation of MultiTagger and its use in Italian tagging.
Regards Mauro ============================================================== Index: src/main/java/org/languagetool/language/Italian.java =================================================================== --- src/main/java/org/languagetool/language/Italian.java (revision 9109) +++ src/main/java/org/languagetool/language/Italian.java (working copy) @@ -31,11 +31,14 @@ import org.languagetool.rules.WordRepeatRule; import org.languagetool.rules.it.MorfologikItalianSpellerRule; import org.languagetool.tagging.Tagger; +import org.languagetool.tagging.disambiguation.Disambiguator; +import org.languagetool.tagging.disambiguation.rules.it.ItalianRuleDisambiguator; import org.languagetool.tagging.it.ItalianTagger; public class Italian extends Language { private Tagger tagger; + private Disambiguator disambiguator; @Override public String getName() { @@ -71,6 +74,14 @@ } @Override + public final Disambiguator getDisambiguator() { + if (disambiguator == null) { + disambiguator = new ItalianRuleDisambiguator(); + } + return disambiguator; + } + + @Override public Contributor[] getMaintainers() { final Contributor contributor = new Contributor("Paolo Bianchini"); return new Contributor[] { contributor }; Index: src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java =================================================================== --- src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java (revision 9109) +++ src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java (working copy) @@ -20,6 +20,7 @@ package org.languagetool.rules.spelling.morfologik; import org.languagetool.AnalyzedSentence; +import org.languagetool.AnalyzedToken; import org.languagetool.AnalyzedTokenReadings; import org.languagetool.JLanguageTool; import org.languagetool.Language; @@ -78,11 +79,17 @@ return toRuleMatchArray(ruleMatches); } } + skip: for (AnalyzedTokenReadings token : tokens) { final String word = 
token.getToken(); if (ignoreWord(word) || token.isImmunized()) { continue; } + for (AnalyzedToken at : token.getReadings()) { + if (!at.hasNoTag()) + continue skip; // if it HAS a POS tag then it is a known word. + } + if (tokenizingPattern() == null) { ruleMatches.addAll(getRuleMatch(word, token.getStartPos())); } else { Index: src/main/java/org/languagetool/tagging/BaseTagger.java =================================================================== --- src/main/java/org/languagetool/tagging/BaseTagger.java (revision 9109) +++ src/main/java/org/languagetool/tagging/BaseTagger.java (working copy) @@ -56,9 +56,6 @@ @Override public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException { - List<AnalyzedToken> taggerTokens; - List<AnalyzedToken> lowerTaggerTokens; - List<AnalyzedToken> upperTaggerTokens; final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); int pos = 0; // caching IStemmer instance - lazy init @@ -70,32 +67,37 @@ for (String word : sentenceTokens) { final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); final String lowerWord = word.toLowerCase(conversionLocale); - taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word)); - lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)); final boolean isLowercase = word.equals(lowerWord); //normal case - addTokens(taggerTokens, l); + { + List<AnalyzedToken> taggerTokens; + taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word)); + addTokens(taggerTokens, l); + } if (!isLowercase) { //lowercase + List<AnalyzedToken> lowerTaggerTokens; + lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)); addTokens(lowerTaggerTokens, l); } //uppercase - if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) { - if (isLowercase) { - upperTaggerTokens = asAnalyzedTokenList(word, - dictLookup.lookup(StringTools.uppercaseFirstChar(word))); - if (!upperTaggerTokens.isEmpty()) { - 
addTokens(upperTaggerTokens, l); - } else { - l.add(new AnalyzedToken(word, null, null)); - } - } else { - l.add(new AnalyzedToken(word, null, null)); + if (isLowercase && l.isEmpty()) { + List<AnalyzedToken> upperTaggerTokens; + upperTaggerTokens = asAnalyzedTokenList(word, + dictLookup.lookup(StringTools.uppercaseFirstChar(word))); + if (!upperTaggerTokens.isEmpty()) { + addTokens(upperTaggerTokens, l); } } + + //still empty? last resort... + if (l.isEmpty()) { + l.add(new AnalyzedToken(word, null, null)); + } + tokenReadings.add(new AnalyzedTokenReadings(l, pos)); pos += word.length(); } Index: src/main/java/org/languagetool/tagging/MultiTagger.java =================================================================== --- src/main/java/org/languagetool/tagging/MultiTagger.java (revision 0) +++ src/main/java/org/languagetool/tagging/MultiTagger.java (revision 0) @@ -0,0 +1,291 @@ +package org.languagetool.tagging; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; + +import org.apache.commons.io.FilenameUtils; +import org.languagetool.AnalyzedToken; +import org.languagetool.AnalyzedTokenReadings; +import org.languagetool.JLanguageTool; +import org.languagetool.tools.StringTools; + +public class MultiTagger implements Tagger { + + private Set<String> addDi = new HashSet<>(); + private Object[] dicts; + private Locale conversionLocale = Locale.getDefault(); + + public void setLocale(Locale locale) { + if (!conversionLocale.equals(locale)) { + conversionLocale = locale; + if (dicts != null) + reinit(); + } + } + + public void addDictionaries(String[] additional) { + boolean reinitialize = false; 
+ for (String d : additional) { + if (!addDi.contains(d)) { + addDi.add(d); + reinitialize = true; + } + } + if ((dicts != null) && reinitialize) + reinit(); + } + + public MultiTagger() { + // TODO Auto-generated constructor stub + } + + protected boolean initialize() { + if (dicts == null) { + return reinit(); + } + return true; + } + + protected boolean reinit() { + dicts = null; + + List<Object> sl = new ArrayList<>(4); + String ps = System.getProperty("file.separator", "/"); + String fn; + String id = conversionLocale.getLanguage(); + String ln = conversionLocale.getDisplayLanguage(Locale.ENGLISH).toLowerCase(); + + // default dictionary + fn = ps + id + ps + ln + ".dict"; + try { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(fn); + IStemmer is = new DictionaryLookup(Dictionary.read(url)); + sl.add(is); + } catch (IllegalArgumentException | IOException e) { + // TODO: we could try to locate alternate .dict (hunspell?) + } + + // user dictionary + String home = System.getProperty("user.home"); + if (home != null) { + File f = new File(home + ps + "." + ln + ".dict"); + if (f.canRead()) { + // morfologik dictionary + try { + final URL url = f.toURI().toURL(); + IStemmer is = new DictionaryLookup(Dictionary.read(url)); + sl.add(is); + } catch (Exception e) { + System.err.println(e.getMessage()); + } + } else { + f = new File(home + ps + "." + ln + ".dic"); + if (f.canRead()) { + // manual dictionary + try { + InputStream fis = new FileInputStream(f); + ManualTagger mt = new ManualTagger(fis); + sl.add(mt); + } catch (IOException e) { + System.err.println(e.getMessage()); + } + } + } + } + + // application dictionary + String file = System.getProperty("file.name"); // surrounding app can set this before calling us + if (file != null) { + String bn = FilenameUtils.getBaseName(file); + String fp = FilenameUtils.getFullPath(file); + File f = new File(fp + "." 
+ bn + ".dict"); + if (f.canRead()) { + // morfologik dictionary + try { + final URL url = f.toURI().toURL(); + IStemmer is = new DictionaryLookup(Dictionary.read(url)); + sl.add(is); + } catch (Exception e) { + System.err.println(e.getMessage()); + } + } else { + f = new File(fp + "." + bn + ".dic"); + if (f.canRead()) { + // manual dictionary + try { + InputStream fis = new FileInputStream(f); + ManualTagger mt = new ManualTagger(fis); + sl.add(mt); + } catch (IOException e) { + System.err.println(e.getMessage()); + } + } + } + } + + // additional dictionaries + for (String fp : addDi) { + File f = new File(fp); + switch ("." + FilenameUtils.getExtension(fp)) { + case ".dict": + // morfologik dictionary + try { + final URL url = f.toURI().toURL(); + IStemmer is = new DictionaryLookup(Dictionary.read(url)); + sl.add(is); + } catch (Exception e) { + System.err.println(e.getMessage()); + } + break; + case ".dic": + // manual dictionary + try { + InputStream fis = new FileInputStream(f); + ManualTagger mt = new ManualTagger(fis); + sl.add(mt); + } catch (IOException e) { + System.err.println(e.getMessage()); + } + break; + default: + System.err.println("File '" + fp + "' ignored: Unknown extension '" + FilenameUtils.getExtension(fp) + "'"); + } + } + + if (!sl.isEmpty()) + dicts = sl.toArray(new Object[sl.size()]); + + return (dicts != null); + } + + @Override + public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException { + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching IStemmer instance - lazy init + initialize(); + + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final String lowerWord = word.toLowerCase(conversionLocale); + final boolean isLowercase = word.equals(lowerWord); + + // normal case + { + List<AnalyzedToken> taggerTokens; + taggerTokens = asAnalyzedTokenList(word, lookup(word)); + 
addTokens(taggerTokens, l); + } + + if (!isLowercase) { + // lowercase + List<AnalyzedToken> lowerTaggerTokens; + lowerTaggerTokens = asAnalyzedTokenList(word, lookup(lowerWord)); + addTokens(lowerTaggerTokens, l); + } + + // uppercase + if (isLowercase && l.isEmpty()) { + List<AnalyzedToken> upperTaggerTokens; + upperTaggerTokens = asAnalyzedTokenList(word, lookup(StringTools.uppercaseFirstChar(word))); + if (!upperTaggerTokens.isEmpty()) { + addTokens(upperTaggerTokens, l); + } + } + + // still empty? last resort... + if (l.isEmpty()) { + l.add(new AnalyzedToken(word, null, null)); + } + + tokenReadings.add(new AnalyzedTokenReadings(l, pos)); + pos += word.length(); + } + + return tokenReadings; + } + + protected List<AnalyzedToken> lookup(String word) { + Set<AnalyzedToken> set = new HashSet<>(); + for (Object o : dicts) { + if (o instanceof IStemmer) { + IStemmer is = (IStemmer) o; + List<WordData> wds = is.lookup(word); + for (WordData wd : wds) { + AnalyzedToken at = new AnalyzedToken(word, StringTools.asString(wd.getTag()), StringTools.asString(wd.getStem())); + set.add(at); + } + } else if (o instanceof ManualTagger) { + ManualTagger mt = (ManualTagger) o; + String[] sds = mt.lookup(word); + if (sds != null) { + for (int i = 0; i < sds.length; i += 2) { + AnalyzedToken at = new AnalyzedToken(word, sds[i + 1], sds[i]); + set.add(at); + } + } + } + } + return new ArrayList<>(set); + } + + protected List<AnalyzedToken> asAnalyzedTokenList(final String word, final List<AnalyzedToken> tList) { + final List<AnalyzedToken> aTokenList = new ArrayList<AnalyzedToken>(); + for (AnalyzedToken t : tList) { + aTokenList.add(new AnalyzedToken(word, t.getPOSTag(), t.getLemma())); + } + return aTokenList; + } + + private void addTokens(final List<AnalyzedToken> taggedTokens, final List<AnalyzedToken> l) { + if (taggedTokens != null) { + for (AnalyzedToken at : taggedTokens) { + /* + * if (!StringTools.isEmpty(at.getPOSTag())) { l.add(at); } else { l.add(new 
AnalyzedToken(at.getToken(), null, null)); } + */ + l.add(at); + } + } + } + + @Override + public AnalyzedTokenReadings createNullToken(String token, int startPos) { + return new AnalyzedTokenReadings(createToken(token, null), startPos); + } + + @Override + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } + + @SuppressWarnings("unchecked") + public List<String> getCorpus() { + Set<String> set = new HashSet<>(); + for (Object o : dicts) { + if (o instanceof Iterable<?>) { + Iterable<WordData> is = (Iterable<WordData>) o; + for (WordData wd : is) { + set.add(wd.getWord().toString()); + } + } else if (o instanceof ManualTagger) { + ManualTagger mt = (ManualTagger) o; + set.addAll(mt.getCorpus()); + } + } + return new ArrayList<>(set); + } + +} Index: src/main/java/org/languagetool/tagging/it/ItalianTagger.java =================================================================== --- src/main/java/org/languagetool/tagging/it/ItalianTagger.java (revision 9109) +++ src/main/java/org/languagetool/tagging/it/ItalianTagger.java (working copy) @@ -20,7 +20,7 @@ import java.util.Locale; -import org.languagetool.tagging.BaseTagger; +import org.languagetool.tagging.MultiTagger; /** * Italian tagger @@ -31,12 +31,13 @@ * * @author Marcin Milkowski */ -public class ItalianTagger extends BaseTagger { +public class ItalianTagger extends MultiTagger { - @Override - public final String getFileName() { - return "/it/italian.dict"; - } public ItalianTagger() { super(); Index: src/main/java/org/languagetool/tagging/ManualTagger.java =================================================================== --- src/main/java/org/languagetool/tagging/ManualTagger.java (revision 9109) +++ src/main/java/org/languagetool/tagging/ManualTagger.java (working copy) @@ -97,6 +97,10 @@ } return map; } + + public List<String> getCorpus() { + return new ArrayList<String>(mapping.keySet()); + } } Index: 
src/main/java/org/languagetool/tagging/disambiguation/rules/it/ItalianRuleDisambiguator.java =================================================================== --- src/main/java/org/languagetool/tagging/disambiguation/rules/it/ItalianRuleDisambiguator.java (revision 0) +++ src/main/java/org/languagetool/tagging/disambiguation/rules/it/ItalianRuleDisambiguator.java (revision 0) @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package org.languagetool.tagging.disambiguation.rules.it; + +import org.languagetool.Language; +import org.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class ItalianRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.ITALIAN; + } + +} Index: src/main/resources/org/languagetool/resource/it/disambiguation.xml =================================================================== --- src/main/resources/org/languagetool/resource/it/disambiguation.xml (revision 0) +++ src/main/resources/org/languagetool/resource/it/disambiguation.xml (revision 0) @@ -0,0 +1,63 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- Italian Disambiguation Rules for LanguageTool Copyright (C) 2012 Mauro Condarelli. + See disambiguation.xsd for syntax. $Id: $ --> +<rules lang="it" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:noNamespaceSchemaLocation="../disambiguation.xsd"> + <unification feature="numero"> + <equivalence type="S"> + <token postag=".+?\+s(\+.+)?" postag_regexp="yes"></token> + </equivalence> + <equivalence type="P"> + <token postag=".+?\+p(\+.+)?" postag_regexp="yes"></token> + </equivalence> + </unification> + <unification feature="genere"> + <equivalence type="M"> + <token postag="(NOUN|ART)-M.+|.+?\+m(\+.+)?" postag_regexp="yes"></token> + </equivalence> + <equivalence type="F"> + <token postag="(NOUN|ART)-F.+|.+?\+f(\+.+)?" 
postag_regexp="yes"></token> + </equivalence> + </unification> + <unification feature="persona"> + <equivalence type="prima"> + <token postag="VER:.+?\+1\+.+" postag_regexp="yes"></token> + </equivalence> + <equivalence type="seconda"> + <token postag="VER:.+?\+2\+.+" postag_regexp="yes"></token> + </equivalence> + <equivalence type="terza"> + <token postag="VER:.+?\+3\+.+" postag_regexp="yes"></token> + </equivalence> + </unification> + <rulegroup id="art-" name="ART+VER→delete"> + <!-- "gli chiese" --> + <rule id="art-nom"> + <pattern> + <unify feature="numero,genere" negate="yes"> + <token postag="ART.+" postag_regexp="yes" /> + <marker> + <and> + <token postag="NOUN.+" postag_regexp="yes" /> + <token postag="VER.+" postag_regexp="yes" /> + </and> + </marker> + </unify> + </pattern> + <disambig action="filter" postag="VER.+"/> + </rule> + <!-- "gli chiese" --> + <rule id="art-ver"> + <pattern> + <token postag="PRE|ART.*" postag_regexp="yes"><exception postag="PRO.*" postag_regexp="yes"/></token> + <marker> + <and> + <token postag="VER.*" postag_regexp="yes"></token> + <token postag="NOUN.*|ADJ.*" postag_regexp="yes"></token> + </and> + </marker> + </pattern> + <disambig action="filter" postag="[^V].*" postag_regexp="yes"></disambig> + </rule> + </rulegroup> +</rules> Index: src/test/java/org/languagetool/TestTools.java =================================================================== --- src/test/java/org/languagetool/TestTools.java (revision 9109) +++ src/test/java/org/languagetool/TestTools.java (working copy) @@ -37,6 +37,7 @@ import morfologik.stemming.WordData; import org.languagetool.tagging.BaseTagger; +import org.languagetool.tagging.MultiTagger; import org.languagetool.tagging.Tagger; import org.languagetool.tagging.disambiguation.Disambiguator; import org.languagetool.tokenizers.SentenceTokenizer; @@ -229,4 +230,24 @@ } } + public static void testDictionary(MultiTagger tagger, Language language) throws IOException { + 
//tagger.setLocale(language.getLocale()); + for (String word : tagger.getCorpus()) { + List<String> l = new ArrayList<String>(1); + l.add(word); + List<AnalyzedTokenReadings> atrs = tagger.tag(l); + if (atrs == null || atrs.isEmpty()) { + System.err.println("**** Warning: " + language + ": the word " + word + " not found in dictionary."); + for (AnalyzedTokenReadings atr : atrs) { + for (AnalyzedToken at : atr.getReadings()) { + String pos = at.getPOSTag(); + if (pos == null || pos.isEmpty()) { + System.err.println("**** Warning: " + language + ": the word " + at.getToken() + "/" + at.getLemma() + " lacks a POS tag in dictionary."); + } + } + } + } + } + } + } ------------------------------------------------------------------------------ Master Visual Studio, SharePoint, SQL, ASP.NET, C# 2012, HTML5, CSS, MVC, Windows 8 Apps, JavaScript and much more. Keep your skills current with LearnDevNow - 3,200 step-by-step video tutorials by Microsoft MVPs and experts. SALE $99.99 this month only -- learn more at: http://p.sf.net/sfu/learnmore_122912 _______________________________________________ Languagetool-devel mailing list Languagetool-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-devel