Revision: 8077 http://languagetool.svn.sourceforge.net/languagetool/?rev=8077&view=rev Author: dnaber Date: 2012-09-22 10:28:48 +0000 (Sat, 22 Sep 2012) Log Message: ----------- introduce an error limit for the corpus check to avoid flooding the database
Modified Paths: -------------- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java Added Paths: ----------- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java Modified: trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java =================================================================== --- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java 2012-09-22 10:03:55 UTC (rev 8076) +++ trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java 2012-09-22 10:28:48 UTC (rev 8077) @@ -44,34 +44,39 @@ protected Date dumpDate; protected String langCode; + protected int maxErrors = 0; + protected int errorCount = 0; private final JLanguageTool languageTool; private int ruleMatchCount = 0; + private int maxArticles = 0; private int articleCount = 0; - private int maxArticles = 0; private boolean inText = false; private StringBuilder text = new StringBuilder(); - + private String title; + private TextFilter textFilter = new BlikiWikipediaTextFilter(); - private String title; - private final Language lang; - //=========================================================== // SAX DocumentHandler methods //=========================================================== - protected BaseWikipediaDumpHandler(JLanguageTool languageTool, int maxArticles, Date dumpDate, - String langCode, Language lang) { - this.lang = lang; + protected BaseWikipediaDumpHandler(JLanguageTool languageTool, Date dumpDate, String langCode, Language lang) { this.languageTool = languageTool; - this.maxArticles = maxArticles; this.dumpDate = dumpDate; 
this.langCode = langCode; textFilter = TextFilterTools.getTextFilter(lang); } + public void setMaximumArticles(int maxArticles) { + this.maxArticles = maxArticles; + } + + public void setMaximumErrors(int maxErrors) { + this.maxErrors= maxErrors; + } + @Override @SuppressWarnings("unused") public void startElement(String namespaceURI, String lName, String qName, @@ -90,12 +95,8 @@ title = text.toString(); text = new StringBuilder(); } else if (qName.equals("text")) { - //System.err.println(text.length() + " " + text.substring(0, Math.min(50, text.length()))); final String textToCheck = textFilter.filter(text.toString()); - //System.out.println(textToCheck); if (!textToCheck.contains("#REDIRECT")) { - //System.err.println("#########################"); - //System.err.println(textToCheck); try { articleCount++; if (maxArticles > 0 && articleCount > maxArticles) { @@ -109,6 +110,8 @@ ", found " + ruleMatches.size() + " matches"); try { handleResult(title, ruleMatches, textToCheck, languageTool.getLanguage()); + } catch (ErrorLimitReachedException e) { + throw e; } catch (Exception e) { throw new RuntimeException(e); } Modified: trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java =================================================================== --- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java 2012-09-22 10:03:55 UTC (rev 8076) +++ trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java 2012-09-22 10:28:48 UTC (rev 8077) @@ -16,11 +16,6 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ - -/* - * - * Created on 21.12.2006 - */ package org.languagetool.dev.wikipedia; import java.io.File; @@ -79,15 +74,13 @@ addDisabledRules("all", disabledRuleIds, disabledRules); addDisabledRules(languageCode, disabledRuleIds, disabledRules); } - int maxArticles = 0; - if (args.length == 6) { - maxArticles = Integer.parseInt(args[5]); - } + final 
int maxArticles = Integer.parseInt(args[5]); + final int maxErrors = Integer.parseInt(args[6]); String[] ruleIds = null; if (!"-".equals(args[4])) { ruleIds = args[4].split(","); } - prg.run(propFile, disabledRuleIds, languageCode, args[3], ruleIds, maxArticles); + prg.run(propFile, disabledRuleIds, languageCode, args[3], ruleIds, maxArticles, maxErrors); } private static void addDisabledRules(String languageCode, Set<String> disabledRuleIds, Properties disabledRules) { @@ -99,27 +92,28 @@ } private static void ensureCorrectUsageOrExit(String[] args) { - if (args.length < 5 || args.length > 6) { - System.err.println("Usage: CheckWikipediaDump <propertyFile> <rulePropertyFile> <language> <filename> <ruleIds> [maxArticleCheck]"); - System.err.println("\tpropertyFile a file to set database access properties. Use '-' to print results to stdout."); - System.err.println("\trulePropertyFile a file to set rules which should be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4). Use '-' to ignore."); - System.err.println("\tlanguage languagecode like 'en' or 'de'"); - System.err.println("\tfilename path to unpacked Wikipedia XML dump"); - System.err.println("\truleIds comma-separated list of rule-ids to activate. Use '-' to activate the default rules."); - System.err.println("\tmaxArticleCheck optional: maximum number of articles to check"); + if (args.length != 7) { + System.err.println("Usage: CheckWikipediaDump <propertyFile> <rulePropertyFile> <language> <filename> <ruleIds> <maxArticles> <maxErrors>"); + System.err.println(" propertyFile a file to set database access properties. Use '-' to print results to stdout."); + System.err.println(" rulePropertyFile a file to set rules which should be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4). 
Use '-' to ignore."); + System.err.println(" language language code like 'en' or 'de'"); + System.err.println(" filename path to unpacked Wikipedia XML dump"); + System.err.println(" ruleIds comma-separated list of rule-ids to activate. Use '-' to activate the default rules."); + System.err.println(" maxArticles maximum number of articles to check, 0 for no limit"); + System.err.println(" maxErrors stop when reaching this many errors, 0 for no limit"); System.exit(1); } } - private void run(File propFile, Set<String> disabledRules, String language, String textFilename, String[] ruleIds, int maxArticles) + private void run(File propFile, Set<String> disabledRules, String langCode, String textFilename, String[] ruleIds, int maxArticles, int maxErrors) throws IOException, SAXException, ParserConfigurationException { final File file = new File(textFilename); if (!file.exists() || !file.isFile()) { throw new IOException("File doesn't exist or isn't a file: " + textFilename); } - final Language lang = Language.getLanguageForShortName(language); + final Language lang = Language.getLanguageForShortName(langCode); if (lang == null) { - System.err.println("Language not supported: " + language); + System.err.println("Language not supported: " + langCode); System.exit(1); } final JLanguageTool languageTool = new JLanguageTool(lang); @@ -130,16 +124,24 @@ applyRuleDeactivation(languageTool, disabledRules); } final Date dumpDate = getDumpFileDate(file); - System.out.println("Dump date: " + dumpDate + ", language: " + language); - final BaseWikipediaDumpHandler handler; - if (propFile != null) { - handler = new DatabaseDumpHandler(languageTool, maxArticles, dumpDate, language, propFile, lang); - } else { - handler = new OutputDumpHandler(languageTool, maxArticles, dumpDate, language, lang); + System.out.println("Dump date: " + dumpDate + ", language: " + langCode); + BaseWikipediaDumpHandler xmlHandler = null; + try { + if (propFile != null) { + xmlHandler = new 
DatabaseDumpHandler(languageTool, dumpDate, langCode, propFile, lang); + } else { + xmlHandler = new OutputDumpHandler(languageTool, dumpDate, langCode, lang); + } + xmlHandler.setMaximumArticles(maxArticles); + xmlHandler.setMaximumErrors(maxErrors); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.parse(file, xmlHandler); + } catch (ErrorLimitReachedException e) { + System.out.println(e); + } finally { + if (xmlHandler != null) { xmlHandler.close(); } } - final SAXParserFactory factory = SAXParserFactory.newInstance(); - final SAXParser saxParser = factory.newSAXParser(); - saxParser.parse(file, handler); } private void enableSpecifiedRules(String[] ruleIds, JLanguageTool languageTool) { @@ -149,7 +151,7 @@ for (String ruleId : ruleIds) { languageTool.enableRule(ruleId); } - System.err.println("Only these rules are enabled: " + Arrays.toString(ruleIds)); + System.out.println("Only these rules are enabled: " + Arrays.toString(ruleIds)); } private void applyRuleDeactivation(JLanguageTool languageTool, Set<String> disabledRules) throws IOException { @@ -157,7 +159,7 @@ for (String disabledRuleId : disabledRules) { languageTool.disableRule(disabledRuleId); } - System.err.println("These rules are disabled: " + languageTool.getDisabledRules()); + System.out.println("These rules are disabled: " + languageTool.getDisabledRules()); } private Date getDumpFileDate(File file) throws IOException { Modified: trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java =================================================================== --- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java 2012-09-22 10:03:55 UTC (rev 8076) +++ trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java 2012-09-22 10:28:48 UTC (rev 8077) @@ -1,5 +1,20 @@ -/* - * Created on 04.04.2010 +/* LanguageTool, a natural 
language style checker + * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA */ package org.languagetool.dev.wikipedia; @@ -27,9 +42,9 @@ private final Connection conn; - DatabaseDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode, + DatabaseDumpHandler(JLanguageTool lt, Date dumpDate, String langCode, File propertiesFile, Language lang) throws IOException { - super(lt, maxArticles, dumpDate, langCode, lang); + super(lt, dumpDate, langCode, lang); final Properties dbProperties = new Properties(); final FileInputStream inStream = new FileInputStream(propertiesFile); try { @@ -97,6 +112,10 @@ prepSt.setDate(8, nowDate); prepSt.setString(9, URL_PREFIX.replaceAll(LANG_MARKER, langCode) + title); prepSt.executeUpdate(); + errorCount++; + if (maxErrors > 0 && errorCount > maxErrors) { + throw new ErrorLimitReachedException(maxErrors); + } } } finally { prepSt.close(); Added: trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java =================================================================== --- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java (rev 0) +++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java 2012-09-22 10:28:48 UTC (rev 8077) @@ -0,0 +1,34 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.dev.wikipedia; + +class ErrorLimitReachedException extends RuntimeException { + + private final int limit; + + ErrorLimitReachedException(int limit) { + this.limit = limit; + } + + @Override + public String getMessage() { + return "Maximum number of errors (" + limit + ") reached"; + } + +} Modified: trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java =================================================================== --- trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java 2012-09-22 10:03:55 UTC (rev 8076) +++ trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java 2012-09-22 10:28:48 UTC (rev 8077) @@ -34,9 +34,9 @@ */ class OutputDumpHandler extends BaseWikipediaDumpHandler { - OutputDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode, + OutputDumpHandler(JLanguageTool lt, Date dumpDate, String langCode, 
Language lang) { - super(lt, maxArticles, dumpDate, langCode, lang); + super(lt, dumpDate, langCode, lang); } @Override @@ -65,9 +65,12 @@ if (!replacements.isEmpty()) { System.out.println("Suggestion: " + StringTools.listToString(replacements, "; ")); } - System.out.println(StringTools.getContext(match.getFromPos(), match - .getToPos(), text, CONTEXT_SIZE)); + System.out.println(StringTools.getContext(match.getFromPos(), match.getToPos(), text, CONTEXT_SIZE)); i++; + errorCount++; + if (maxErrors > 0 && errorCount > maxErrors) { + throw new ErrorLimitReachedException(maxErrors); + } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ How fast is your code? 3 out of 4 devs don't know how their code performs in production. Find out how slow your code is with AppDynamics Lite. http://ad.doubleclick.net/clk;262219672;13503038;z? http://info.appdynamics.com/FreeJavaPerformanceDownload.html _______________________________________________ Languagetool-commits mailing list Languagetool-commits@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-commits