http://www.mediawiki.org/wiki/Special:Code/MediaWiki/65639
Revision: 65639 Author: daniel Date: 2010-04-28 21:52:24 +0000 (Wed, 28 Apr 2010) Log Message: ----------- bette rdefault word pattern for phrase detection Modified Paths: -------------- trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java Modified: trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java =================================================================== --- trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java 2010-04-28 21:51:30 UTC (rev 65638) +++ trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java 2010-04-28 21:52:24 UTC (rev 65639) @@ -98,7 +98,7 @@ } public void defaults() throws IOException { - if (this.wordPattern==null) this.wordPattern = Pattern.compile("\\p{L}+|\\p{Nd}+"); + if (this.wordPattern==null) this.wordPattern = Pattern.compile("[\\p{L}'']+(?:[\\p{Pc}\\p{Pd}][\\p{L}'']+)*|\\p{Nd}+(?:.\\p{Nd}+)?"); this.sentenceManglers.add( new RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks this.sentenceManglers.add( new RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 0) ); //strip leading cruft (lines without any characters) _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs