Revision: 8077
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=8077&view=rev
Author:   dnaber
Date:     2012-09-22 10:28:48 +0000 (Sat, 22 Sep 2012)
Log Message:
-----------
introduce an error limit for the corpus check to avoid flooding the database

Modified Paths:
--------------
    
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
    
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java
    
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java
    
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java

Added Paths:
-----------
    
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java

Modified: 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
===================================================================
--- 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
       2012-09-22 10:03:55 UTC (rev 8076)
+++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
       2012-09-22 10:28:48 UTC (rev 8077)
@@ -44,34 +44,39 @@
 
   protected Date dumpDate;
   protected String langCode;
+  protected int maxErrors = 0;
+  protected int errorCount = 0;
 
   private final JLanguageTool languageTool;
   private int ruleMatchCount = 0;
+  private int maxArticles = 0;
   private int articleCount = 0;
-  private int maxArticles = 0;
 
   private boolean inText = false;
   private StringBuilder text = new StringBuilder();
-  
+  private String title;
+
   private TextFilter textFilter = new BlikiWikipediaTextFilter();
 
-  private String title;
-  private final Language lang;
-
   //===========================================================
   // SAX DocumentHandler methods
   //===========================================================
 
-  protected BaseWikipediaDumpHandler(JLanguageTool languageTool, int 
maxArticles, Date dumpDate,
-      String langCode, Language lang) {
-    this.lang = lang;
+  protected BaseWikipediaDumpHandler(JLanguageTool languageTool, Date 
dumpDate, String langCode, Language lang) {
     this.languageTool = languageTool;
-    this.maxArticles = maxArticles;
     this.dumpDate = dumpDate;
     this.langCode = langCode;
     textFilter = TextFilterTools.getTextFilter(lang);
   }
 
+  public void setMaximumArticles(int maxArticles) {
+    this.maxArticles = maxArticles;
+  }
+
+  public void setMaximumErrors(int maxErrors) {
+    this.maxErrors= maxErrors;
+  }
+
   @Override
   @SuppressWarnings("unused")
   public void startElement(String namespaceURI, String lName, String qName,
@@ -90,12 +95,8 @@
       title = text.toString();
       text = new StringBuilder();
     } else if (qName.equals("text")) {
-      //System.err.println(text.length() + " " + text.substring(0, 
Math.min(50, text.length())));
       final String textToCheck = textFilter.filter(text.toString());
-      //System.out.println(textToCheck);
       if (!textToCheck.contains("#REDIRECT")) {
-        //System.err.println("#########################");
-        //System.err.println(textToCheck);
         try {
           articleCount++;
           if (maxArticles > 0 && articleCount > maxArticles) {
@@ -109,6 +110,8 @@
               ", found " + ruleMatches.size() + " matches");
           try {
             handleResult(title, ruleMatches, textToCheck, 
languageTool.getLanguage());
+          } catch (ErrorLimitReachedException e) {
+            throw e;
           } catch (Exception e) {
             throw new RuntimeException(e);
           }

Modified: 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java
===================================================================
--- 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java
     2012-09-22 10:03:55 UTC (rev 8076)
+++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/CheckWikipediaDump.java
     2012-09-22 10:28:48 UTC (rev 8077)
@@ -16,11 +16,6 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  * USA
  */
-
-/*
- *
- * Created on 21.12.2006
- */
 package org.languagetool.dev.wikipedia;
 
 import java.io.File;
@@ -79,15 +74,13 @@
       addDisabledRules("all", disabledRuleIds, disabledRules);
       addDisabledRules(languageCode, disabledRuleIds, disabledRules);
     }
-    int maxArticles = 0;
-    if (args.length == 6) {
-      maxArticles = Integer.parseInt(args[5]);
-    }
+    final int maxArticles = Integer.parseInt(args[5]);
+    final int maxErrors = Integer.parseInt(args[6]);
     String[] ruleIds = null;
     if (!"-".equals(args[4])) {
       ruleIds = args[4].split(",");
     }
-    prg.run(propFile, disabledRuleIds, languageCode, args[3], ruleIds, 
maxArticles);
+    prg.run(propFile, disabledRuleIds, languageCode, args[3], ruleIds, 
maxArticles, maxErrors);
   }
 
   private static void addDisabledRules(String languageCode, Set<String> 
disabledRuleIds, Properties disabledRules) {
@@ -99,27 +92,28 @@
   }
 
   private static void ensureCorrectUsageOrExit(String[] args) {
-    if (args.length < 5 || args.length > 6) {
-      System.err.println("Usage: CheckWikipediaDump <propertyFile> 
<rulePropertyFile> <language> <filename> <ruleIds> [maxArticleCheck]");
-      System.err.println("\tpropertyFile a file to set database access 
properties. Use '-' to print results to stdout.");
-      System.err.println("\trulePropertyFile a file to set rules which should 
be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4). Use '-' to 
ignore.");
-      System.err.println("\tlanguage languagecode like 'en' or 'de'");
-      System.err.println("\tfilename path to unpacked Wikipedia XML dump");
-      System.err.println("\truleIds comma-separated list of rule-ids to 
activate. Use '-' to activate the default rules.");
-      System.err.println("\tmaxArticleCheck optional: maximum number of 
articles to check");
+    if (args.length != 7) {
+      System.err.println("Usage: CheckWikipediaDump <propertyFile> 
<rulePropertyFile> <language> <filename> <ruleIds> <maxArticles> <maxErrors>");
+      System.err.println("  propertyFile      a file to set database access 
properties. Use '-' to print results to stdout.");
+      System.err.println("  rulePropertyFile  a file to set rules which should 
be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4). Use '-' to 
ignore.");
+      System.err.println("  language          language code like 'en' or 
'de'");
+      System.err.println("  filename          path to unpacked Wikipedia XML 
dump");
+      System.err.println("  ruleIds           comma-separated list of rule-ids 
to activate. Use '-' to activate the default rules.");
+      System.err.println("  maxArticles       maximum number of articles to 
check, 0 for no limit");
+      System.err.println("  maxErrors         stop when reaching this many 
errors, 0 for no limit");
       System.exit(1);
     }
   }
 
-  private void run(File propFile, Set<String> disabledRules, String language, 
String textFilename, String[] ruleIds, int maxArticles)
+  private void run(File propFile, Set<String> disabledRules, String langCode, 
String textFilename, String[] ruleIds, int maxArticles, int maxErrors)
       throws IOException, SAXException, ParserConfigurationException {
     final File file = new File(textFilename);
     if (!file.exists() || !file.isFile()) {
       throw new IOException("File doesn't exist or isn't a file: " + 
textFilename);
     }
-    final Language lang = Language.getLanguageForShortName(language);
+    final Language lang = Language.getLanguageForShortName(langCode);
     if (lang == null) {
-      System.err.println("Language not supported: " + language);
+      System.err.println("Language not supported: " + langCode);
       System.exit(1);
     }
     final JLanguageTool languageTool = new JLanguageTool(lang);
@@ -130,16 +124,24 @@
       applyRuleDeactivation(languageTool, disabledRules);
     }
     final Date dumpDate = getDumpFileDate(file);
-    System.out.println("Dump date: " + dumpDate + ", language: " + language);
-    final BaseWikipediaDumpHandler handler;
-    if (propFile != null) {
-      handler = new DatabaseDumpHandler(languageTool, maxArticles, dumpDate, 
language, propFile, lang);
-    } else {
-      handler = new OutputDumpHandler(languageTool, maxArticles, dumpDate, 
language, lang);
+    System.out.println("Dump date: " + dumpDate + ", language: " + langCode);
+    BaseWikipediaDumpHandler xmlHandler = null;
+    try {
+      if (propFile != null) {
+        xmlHandler = new DatabaseDumpHandler(languageTool, dumpDate, langCode, 
propFile, lang);
+      } else {
+        xmlHandler = new OutputDumpHandler(languageTool, dumpDate, langCode, 
lang);
+      }
+      xmlHandler.setMaximumArticles(maxArticles);
+      xmlHandler.setMaximumErrors(maxErrors);
+      final SAXParserFactory factory = SAXParserFactory.newInstance();
+      final SAXParser saxParser = factory.newSAXParser();
+      saxParser.parse(file, xmlHandler);
+    } catch (ErrorLimitReachedException e) {
+      System.out.println(e);
+    } finally {
+      if (xmlHandler != null) { xmlHandler.close(); }
     }
-    final SAXParserFactory factory = SAXParserFactory.newInstance();
-    final SAXParser saxParser = factory.newSAXParser();
-    saxParser.parse(file, handler);
   }
 
   private void enableSpecifiedRules(String[] ruleIds, JLanguageTool 
languageTool) {
@@ -149,7 +151,7 @@
     for (String ruleId : ruleIds) {
       languageTool.enableRule(ruleId);
     }
-    System.err.println("Only these rules are enabled: " + 
Arrays.toString(ruleIds));
+    System.out.println("Only these rules are enabled: " + 
Arrays.toString(ruleIds));
   }
 
   private void applyRuleDeactivation(JLanguageTool languageTool, Set<String> 
disabledRules) throws IOException {
@@ -157,7 +159,7 @@
     for (String disabledRuleId : disabledRules) {
       languageTool.disableRule(disabledRuleId);
     }
-    System.err.println("These rules are disabled: " + 
languageTool.getDisabledRules());
+    System.out.println("These rules are disabled: " + 
languageTool.getDisabledRules());
   }
 
   private Date getDumpFileDate(File file) throws IOException {

Modified: 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java
===================================================================
--- 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java
    2012-09-22 10:03:55 UTC (rev 8076)
+++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/DatabaseDumpHandler.java
    2012-09-22 10:28:48 UTC (rev 8077)
@@ -1,5 +1,20 @@
-/*
- * Created on 04.04.2010
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
  */
 package org.languagetool.dev.wikipedia;
 
@@ -27,9 +42,9 @@
 
     private final Connection conn;
 
-    DatabaseDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, 
String langCode,
+    DatabaseDumpHandler(JLanguageTool lt, Date dumpDate, String langCode,
             File propertiesFile, Language lang) throws IOException {
-    super(lt, maxArticles, dumpDate, langCode, lang);
+    super(lt, dumpDate, langCode, lang);
     final Properties dbProperties = new Properties();
     final FileInputStream inStream = new FileInputStream(propertiesFile);
     try {
@@ -97,6 +112,10 @@
           prepSt.setDate(8, nowDate);
           prepSt.setString(9, URL_PREFIX.replaceAll(LANG_MARKER, langCode) + 
title);
           prepSt.executeUpdate();
+          errorCount++;
+          if (maxErrors > 0 && errorCount > maxErrors) {
+            throw new ErrorLimitReachedException(maxErrors);
+          }
         }
       } finally {
         prepSt.close();

Added: 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java
===================================================================
--- 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java
                             (rev 0)
+++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/ErrorLimitReachedException.java
     2012-09-22 10:28:48 UTC (rev 8077)
@@ -0,0 +1,34 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.dev.wikipedia;
+
+class ErrorLimitReachedException extends RuntimeException {
+
+  private final int limit;
+
+  ErrorLimitReachedException(int limit) {
+    this.limit = limit;
+  }
+
+  @Override
+  public String getMessage() {
+    return "Maximum number of errors (" + limit + ") reached";
+  }
+
+}

Modified: 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java
===================================================================
--- 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java
      2012-09-22 10:03:55 UTC (rev 8076)
+++ 
trunk/JLanguageTool/src/main/dev/org/languagetool/dev/wikipedia/OutputDumpHandler.java
      2012-09-22 10:28:48 UTC (rev 8077)
@@ -34,9 +34,9 @@
  */
 class OutputDumpHandler extends BaseWikipediaDumpHandler {
 
-    OutputDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String 
langCode,
+    OutputDumpHandler(JLanguageTool lt, Date dumpDate, String langCode,
             Language lang) {
-      super(lt, maxArticles, dumpDate, langCode, lang);
+      super(lt, dumpDate, langCode, lang);
     }
     
     @Override
@@ -65,9 +65,12 @@
           if (!replacements.isEmpty()) {
             System.out.println("Suggestion: " + 
StringTools.listToString(replacements, "; "));
           }
-          System.out.println(StringTools.getContext(match.getFromPos(), match
-              .getToPos(), text, CONTEXT_SIZE));
+          System.out.println(StringTools.getContext(match.getFromPos(), 
match.getToPos(), text, CONTEXT_SIZE));
           i++;
+          errorCount++;
+          if (maxErrors > 0 && errorCount > maxErrors) {
+            throw new ErrorLimitReachedException(maxErrors);
+          }
         }
       }
     }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
How fast is your code?
3 out of 4 devs don't know how their code performs in production.
Find out how slow your code is with AppDynamics Lite.
http://ad.doubleclick.net/clk;262219672;13503038;z?
http://info.appdynamics.com/FreeJavaPerformanceDownload.html
_______________________________________________
Languagetool-commits mailing list
Languagetool-commits@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

Reply via email to