http://www.mediawiki.org/wiki/Special:Code/MediaWiki/65639

Revision: 65639
Author:   daniel
Date:     2010-04-28 21:52:24 +0000 (Wed, 28 Apr 2010)

Log Message:
-----------
better default word pattern for phrase detection

Modified Paths:
--------------
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
     2010-04-28 21:51:30 UTC (rev 65638)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
     2010-04-28 21:52:24 UTC (rev 65639)
@@ -98,7 +98,7 @@
        }
        
        public void defaults() throws IOException {
-               if (this.wordPattern==null) this.wordPattern = 
Pattern.compile("\\p{L}+|\\p{Nd}+"); 
+               if (this.wordPattern==null) this.wordPattern = 
Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
 
 
                this.sentenceManglers.add( new 
RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks 
                this.sentenceManglers.add( new 
RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 
0) ); //strip leading cruft (lines without any characters)



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to