http://www.mediawiki.org/wiki/Special:Code/MediaWiki/61856
Revision: 61856 Author: philip Date: 2010-02-02 15:09:01 +0000 (Tue, 02 Feb 2010) Log Message: ----------- Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into wordSegmentation and normalizeForSearch, so that wordSegmentation can be called by search engines separately. Modified Paths: -------------- trunk/phase3/includes/Title.php trunk/phase3/includes/search/SearchIBM_DB2.php trunk/phase3/includes/search/SearchMySQL.php trunk/phase3/includes/search/SearchOracle.php trunk/phase3/includes/search/SearchSqlite.php trunk/phase3/includes/search/SearchUpdate.php trunk/phase3/languages/Language.php trunk/phase3/languages/classes/LanguageGan.php trunk/phase3/languages/classes/LanguageJa.php trunk/phase3/languages/classes/LanguageYue.php trunk/phase3/languages/classes/LanguageZh.php trunk/phase3/languages/classes/LanguageZh_hans.php Modified: trunk/phase3/includes/Title.php =================================================================== --- trunk/phase3/includes/Title.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/Title.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -435,7 +435,7 @@ global $wgContLang; $lc = SearchEngine::legalSearchChars() . '&#;'; - $t = $wgContLang->stripForSearch( $title ); + $t = $wgContLang->normalizeForSearch( $title ); $t = preg_replace( "/[^{$lc}]+/", ' ', $t ); $t = $wgContLang->lc( $t ); Modified: trunk/phase3/includes/search/SearchIBM_DB2.php =================================================================== --- trunk/phase3/includes/search/SearchIBM_DB2.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/search/SearchIBM_DB2.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -158,10 +158,10 @@ if( is_array( $temp_terms )) { $temp_terms = array_unique( array_values( $temp_terms )); foreach( $temp_terms as $t ) - $q[] = $terms[1] . $wgContLang->stripForSearch( $t ); + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t ); } else - $q[] = $terms[1] .
$wgContLang->stripForSearch( $terms[2] ); + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] ); if (!empty($terms[3])) { $regexp = preg_quote( $terms[3], '/' ); Modified: trunk/phase3/includes/search/SearchMySQL.php =================================================================== --- trunk/phase3/includes/search/SearchMySQL.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/search/SearchMySQL.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -80,7 +80,7 @@ // fulltext engine. // For Chinese this also inserts spaces between adjacent Han characters. $strippedVariants = array_map( - array( $wgContLang, 'stripForSearch' ), + array( $wgContLang, 'normalizeForSearch' ), $variants ); // Some languages such as Chinese force all variants to a canonical @@ -95,7 +95,7 @@ $stripped = $this->normalizeText( $stripped ); if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { // Hack for Chinese: we need to toss in quotes for - // multiple-character phrases since stripForSearch() + // multiple-character phrases since normalizeForSearch() // added spaces between them to make word breaks. $stripped = '"' . trim( $stripped ) . '"'; } @@ -324,13 +324,16 @@ global $wgContLang; wfProfileIn( __METHOD__ ); + + // Some languages such as Chinese require word segmentation + $out = $wgContLang->wordSegmentation( $string ); // MySQL fulltext index doesn't grok utf-8, so we // need to fold cases and convert to hex $out = preg_replace_callback( "/([\\xc0-\\xff][\\x80-\\xbf]*)/", array( $this, 'stripForSearchCallback' ), - $wgContLang->lc( $string ) ); + $wgContLang->lc( $out ) ); // And to add insult to injury, the default indexing // ignores short words... 
Pad them so we can pass them Modified: trunk/phase3/includes/search/SearchOracle.php =================================================================== --- trunk/phase3/includes/search/SearchOracle.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/search/SearchOracle.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -217,7 +217,7 @@ private function escapeTerm($t) { global $wgContLang; - $t = $wgContLang->stripForSearch($t); + $t = $wgContLang->normalizeForSearch($t); $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t; $t = preg_replace('/^"(.*)"$/', '($1)', $t); $t = preg_replace('/([-&|])/', '\\\\$1', $t); Modified: trunk/phase3/includes/search/SearchSqlite.php =================================================================== --- trunk/phase3/includes/search/SearchSqlite.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/search/SearchSqlite.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -92,7 +92,7 @@ // fulltext engine. // For Chinese this also inserts spaces between adjacent Han characters. $strippedVariants = array_map( - array( $wgContLang, 'stripForSearch' ), + array( $wgContLang, 'normalizeForSearch' ), $variants ); // Some languages such as Chinese force all variants to a canonical @@ -106,7 +106,7 @@ foreach( $strippedVariants as $stripped ) { if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { // Hack for Chinese: we need to toss in quotes for - // multiple-character phrases since stripForSearch() + // multiple-character phrases since normalizeForSearch() // added spaces between them to make word breaks. $stripped = '"' . trim( $stripped ) . 
'"'; } Modified: trunk/phase3/includes/search/SearchUpdate.php =================================================================== --- trunk/phase3/includes/search/SearchUpdate.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/includes/search/SearchUpdate.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -43,7 +43,7 @@ } # Language-specific strip/conversion - $text = $wgContLang->stripForSearch( $this->mText ); + $text = $wgContLang->normalizeForSearch( $this->mText ); wfProfileIn( $fname.'-regexps' ); $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", Modified: trunk/phase3/languages/Language.php =================================================================== --- trunk/phase3/languages/Language.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/Language.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -1686,15 +1686,26 @@ function hasWordBreaks() { return true; } + + /** + * Some languages such as Chinese require word segmentation, + * Specify such segmentation when overridden in derived class. + * + * @param $string String + * @return String + */ + function wordSegmentation( $string ) { + return $string; + } /** - * Some languages have special punctuation to strip out. + * Some languages have special punctuation need to be normalized. * Make such changes here. 
* * @param $string String * @return String */ - function stripForSearch( $string, $doStrip = true ) { + function normalizeForSearch( $string ) { return $string; } @@ -1708,7 +1719,7 @@ return $string; } - protected static function wordSegmentation( $string, $pattern ) { + protected static function insertSpace( $string, $pattern ) { $string = preg_replace( $pattern, " $1 ", $string ); $string = preg_replace( '/ +/', ' ', $string ); return $string; Modified: trunk/phase3/languages/classes/LanguageGan.php =================================================================== --- trunk/phase3/languages/classes/LanguageGan.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/classes/LanguageGan.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -135,9 +135,9 @@ } // word segmentation - function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) { - // LanguageZh::stripForSearch - return parent::stripForSearch( $string, $doStrip, $autoVariant ); + function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) { + // LanguageZh::normalizeForSearch + return parent::normalizeForSearch( $string, $autoVariant ); } function convertForSearchResult( $termsArray ) { Modified: trunk/phase3/languages/classes/LanguageJa.php =================================================================== --- trunk/phase3/languages/classes/LanguageJa.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/classes/LanguageJa.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -6,30 +6,29 @@ * @ingroup Language */ class LanguageJa extends Language { - function stripForSearch( $string, $doStrip = true ) { + function wordSegmentation( $string ) { + // Strip known punctuation ? 
+ // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f - $s = $string; + // Space strings of like hiragana/katakana/kanji + $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f + $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff + $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' + . '|[\xe4-\xe8][\x80-\xbf]{2}' + . '|\xe9[\x80-\xa5][\x80-\xbf]' + . '|\xe9\xa6[\x80-\x99])'; + # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 + $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } - if ( $doStrip == true ) { - // Strip known punctuation ? - // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f - - // Space strings of like hiragana/katakana/kanji - $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f - $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff - $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' - . '|[\xe4-\xe8][\x80-\xbf]{2}' - . '|\xe9[\x80-\xa5][\x80-\xbf]' - . 
'|\xe9\xa6[\x80-\x99])'; - # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 - $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; - $s = self::wordSegmentation( $s, $reg ); - } + function normalizeForSearch( $string ) { // Double-width roman characters - $s = self::convertDoubleWidth( $s ); + $s = self::convertDoubleWidth( $string ); # Do general case folding and UTF-8 armoring - return parent::stripForSearch( $s, $doStrip ); + return parent::normalizeForSearch( $s ); } # Italic is not appropriate for Japanese script Modified: trunk/phase3/languages/classes/LanguageYue.php =================================================================== --- trunk/phase3/languages/classes/LanguageYue.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/classes/LanguageYue.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -3,24 +3,29 @@ * @ingroup Language */ class LanguageYue extends Language { - function stripForSearch( $string, $doStrip = true ) { + function hasWordBreaks() { + return false; + } + + /** + * Eventually this should be a word segmentation; + * for now just treat each character as a word. + * @todo Fixme: only do this for Han characters... + */ + function wordSegmentation( $string ) { + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } + + function normalizeForSearch( $string ) { wfProfileIn( __METHOD__ ); // Double-width roman characters $s = self::convertDoubleWidth( $string ); - - if ( $doStrip == true ) { - // eventually this should be a word segmentation; - // for now just treat each character as a word. - // @todo Fixme: only do this for Han characters... 
- $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; - $s = self::wordSegmentation( $s, $reg ); - } - $s = trim( $s ); + $s = parent::normalizeForSearch( $s ); - // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; } Modified: trunk/phase3/languages/classes/LanguageZh.php =================================================================== --- trunk/phase3/languages/classes/LanguageZh.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/classes/LanguageZh.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -170,8 +170,23 @@ "\"$1\"", $text); } - // word segmentation - function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) { + /** + * word segmentation + */ + function wordSegmentation( $string ) { + // LanguageZh_hans::wordSegmentation + $s = parent::wordSegmentation( $string ); + return $s; + } + + /** + * auto convert to zh-hans and normalize special characters. + * + * @param $string String + * @param $autoVariant String, default to 'zh-hans' + * @return String + */ + function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) { wfProfileIn( __METHOD__ ); // always convert to zh-hans before indexing. 
it should be @@ -179,8 +194,8 @@ // Traditional to Simplified is less ambiguous than the // other way around $s = $this->mConverter->autoConvert( $string, $autoVariant ); - // LanguageZh_hans::stripForSearch - $s = parent::stripForSearch( $s, $doStrip ); + // LanguageZh_hans::normalizeForSearch + $s = parent::normalizeForSearch( $s ); wfProfileOut( __METHOD__ ); return $s; Modified: trunk/phase3/languages/classes/LanguageZh_hans.php =================================================================== --- trunk/phase3/languages/classes/LanguageZh_hans.php 2010-02-02 15:03:08 UTC (rev 61855) +++ trunk/phase3/languages/classes/LanguageZh_hans.php 2010-02-02 15:09:01 UTC (rev 61856) @@ -7,25 +7,26 @@ function hasWordBreaks() { return false; } - - function stripForSearch( $string, $doStrip = true ) { + + /** + * Eventually this should be a word segmentation; + * for now just treat each character as a word. + * @todo Fixme: only do this for Han characters... + */ + function wordSegmentation( $string ) { + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } + + function normalizeForSearch( $string ) { wfProfileIn( __METHOD__ ); // Double-width roman characters $s = self::convertDoubleWidth( $string ); - - if ( $doStrip == true ) { - // Eventually this should be a word segmentation; - // for now just treat each character as a word. - // @todo Fixme: only do this for Han characters... - $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; - $s = self::wordSegmentation( $s, $reg ); - } - $s = trim( $s ); + $s = parent::normalizeForSearch( $s ); - // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; } _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs