http://www.mediawiki.org/wiki/Special:Code/MediaWiki/61856

Revision: 61856
Author:   philip
Date:     2010-02-02 15:09:01 +0000 (Tue, 02 Feb 2010)

Log Message:
-----------
Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch
into wordSegmentation and normalizeForSearch, so that wordSegmentation can be
called by search engines separately.

Modified Paths:
--------------
    trunk/phase3/includes/Title.php
    trunk/phase3/includes/search/SearchIBM_DB2.php
    trunk/phase3/includes/search/SearchMySQL.php
    trunk/phase3/includes/search/SearchOracle.php
    trunk/phase3/includes/search/SearchSqlite.php
    trunk/phase3/includes/search/SearchUpdate.php
    trunk/phase3/languages/Language.php
    trunk/phase3/languages/classes/LanguageGan.php
    trunk/phase3/languages/classes/LanguageJa.php
    trunk/phase3/languages/classes/LanguageYue.php
    trunk/phase3/languages/classes/LanguageZh.php
    trunk/phase3/languages/classes/LanguageZh_hans.php

Modified: trunk/phase3/includes/Title.php
===================================================================
--- trunk/phase3/includes/Title.php     2010-02-02 15:03:08 UTC (rev 61855)
+++ trunk/phase3/includes/Title.php     2010-02-02 15:09:01 UTC (rev 61856)
@@ -435,7 +435,7 @@
                global $wgContLang;
 
                $lc = SearchEngine::legalSearchChars() . '&#;';
-               $t = $wgContLang->stripForSearch( $title );
+               $t = $wgContLang->normalizeForSearch( $title );
                $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
                $t = $wgContLang->lc( $t );
 

Modified: trunk/phase3/includes/search/SearchIBM_DB2.php
===================================================================
--- trunk/phase3/includes/search/SearchIBM_DB2.php      2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/includes/search/SearchIBM_DB2.php      2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -158,10 +158,10 @@
                                if( is_array( $temp_terms )) {
                                        $temp_terms = array_unique( 
array_values( $temp_terms ));
                                        foreach( $temp_terms as $t )
-                                               $q[] = $terms[1] . 
$wgContLang->stripForSearch( $t );
+                                               $q[] = $terms[1] . 
$wgContLang->normalizeForSearch( $t );
                                }
                                else
-                                       $q[] = $terms[1] . 
$wgContLang->stripForSearch( $terms[2] );
+                                       $q[] = $terms[1] . 
$wgContLang->normalizeForSearch( $terms[2] );
 
                                if (!empty($terms[3])) {
                                        $regexp = preg_quote( $terms[3], '/' );

Modified: trunk/phase3/includes/search/SearchMySQL.php
===================================================================
--- trunk/phase3/includes/search/SearchMySQL.php        2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/includes/search/SearchMySQL.php        2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -80,7 +80,7 @@
                                // fulltext engine.
                                // For Chinese this also inserts spaces between 
adjacent Han characters.
                                $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 
'normalizeForSearch' ),
                                        $variants );
                                
                                // Some languages such as Chinese force all 
variants to a canonical
@@ -95,7 +95,7 @@
                                        $stripped = $this->normalizeText( 
$stripped );
                                        if( $nonQuoted && strpos( $stripped, ' 
' ) !== false ) {
                                                // Hack for Chinese: we need to 
toss in quotes for
-                                               // multiple-character phrases 
since stripForSearch()
+                                               // multiple-character phrases 
since normalizeForSearch()
                                                // added spaces between them to 
make word breaks.
                                                $stripped = '"' . trim( 
$stripped ) . '"';
                                        }
@@ -324,13 +324,16 @@
                global $wgContLang;
 
                wfProfileIn( __METHOD__ );
+               
+               // Some languages such as Chinese require word segmentation
+               $out = $wgContLang->wordSegmentation( $string );
 
                // MySQL fulltext index doesn't grok utf-8, so we
                // need to fold cases and convert to hex
                $out = preg_replace_callback(
                        "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
                        array( $this, 'stripForSearchCallback' ),
-                       $wgContLang->lc( $string ) );
+                       $wgContLang->lc( $out ) );
 
                // And to add insult to injury, the default indexing
                // ignores short words... Pad them so we can pass them

Modified: trunk/phase3/includes/search/SearchOracle.php
===================================================================
--- trunk/phase3/includes/search/SearchOracle.php       2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/includes/search/SearchOracle.php       2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -217,7 +217,7 @@
 
        private function escapeTerm($t) {
                global $wgContLang;
-               $t = $wgContLang->stripForSearch($t);
+               $t = $wgContLang->normalizeForSearch($t);
                $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : 
$t;
                $t = preg_replace('/^"(.*)"$/', '($1)', $t);
                $t = preg_replace('/([-&|])/', '\\\\$1', $t);

Modified: trunk/phase3/includes/search/SearchSqlite.php
===================================================================
--- trunk/phase3/includes/search/SearchSqlite.php       2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/includes/search/SearchSqlite.php       2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -92,7 +92,7 @@
                                // fulltext engine.
                                // For Chinese this also inserts spaces between 
adjacent Han characters.
                                $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 
'normalizeForSearch' ),
                                        $variants );
                                
                                // Some languages such as Chinese force all 
variants to a canonical
@@ -106,7 +106,7 @@
                                foreach( $strippedVariants as $stripped ) {
                                        if( $nonQuoted && strpos( $stripped, ' 
' ) !== false ) {
                                                // Hack for Chinese: we need to 
toss in quotes for
-                                               // multiple-character phrases 
since stripForSearch()
+                                               // multiple-character phrases 
since normalizeForSearch()
                                                // added spaces between them to 
make word breaks.
                                                $stripped = '"' . trim( 
$stripped ) . '"';
                                        }

Modified: trunk/phase3/includes/search/SearchUpdate.php
===================================================================
--- trunk/phase3/includes/search/SearchUpdate.php       2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/includes/search/SearchUpdate.php       2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -43,7 +43,7 @@
                }
 
                # Language-specific strip/conversion
-               $text = $wgContLang->stripForSearch( $this->mText );
+               $text = $wgContLang->normalizeForSearch( $this->mText );
 
                wfProfileIn( $fname.'-regexps' );
                $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",

Modified: trunk/phase3/languages/Language.php
===================================================================
--- trunk/phase3/languages/Language.php 2010-02-02 15:03:08 UTC (rev 61855)
+++ trunk/phase3/languages/Language.php 2010-02-02 15:09:01 UTC (rev 61856)
@@ -1686,15 +1686,26 @@
        function hasWordBreaks() {
                return true;
        }
+       
+       /**
+        * Some languages such as Chinese require word segmentation.
+        * Override this method in a derived class to specify the segmentation.
+        * 
+        * @param $string String
+        * @return String
+        */
+       function wordSegmentation( $string ) {
+               return $string;
+       }
 
        /**
-        * Some languages have special punctuation to strip out.
+        * Some languages have special punctuation that needs to be normalized.
         * Make such changes here.
         *
         * @param $string String
         * @return String
         */
-       function stripForSearch( $string, $doStrip = true ) {
+       function normalizeForSearch( $string ) {
                return $string;
        }
 
@@ -1708,7 +1719,7 @@
                return $string;
        }
 
-       protected static function wordSegmentation( $string, $pattern ) {
+       protected static function insertSpace( $string, $pattern ) {
                $string = preg_replace( $pattern, " $1 ", $string );
                $string = preg_replace( '/ +/', ' ', $string );
                return $string;

Modified: trunk/phase3/languages/classes/LanguageGan.php
===================================================================
--- trunk/phase3/languages/classes/LanguageGan.php      2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/languages/classes/LanguageGan.php      2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -135,9 +135,9 @@
        }
 
        // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 
'gan-hans' ) {
-               // LanguageZh::stripForSearch
-               return parent::stripForSearch( $string, $doStrip, $autoVariant 
);
+       function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) {
+               // LanguageZh::normalizeForSearch
+               return parent::normalizeForSearch( $string, $autoVariant );
        }
 
        function convertForSearchResult( $termsArray ) {

Modified: trunk/phase3/languages/classes/LanguageJa.php
===================================================================
--- trunk/phase3/languages/classes/LanguageJa.php       2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/languages/classes/LanguageJa.php       2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -6,30 +6,29 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function wordSegmentation( $string ) {
+               // Strip known punctuation ?
+               // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # 
U3000-303f
 
-               $s = $string;
+               // Space strings of like hiragana/katakana/kanji
+               $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # 
U3040-309f
+               $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # 
U30a0-30ff
+               $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+                       . '|[\xe4-\xe8][\x80-\xbf]{2}'
+                       . '|\xe9[\x80-\xa5][\x80-\xbf]'
+                       . '|\xe9\xa6[\x80-\x99])';
+                       # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+               $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
 
-               if ( $doStrip == true ) {
-                       // Strip known punctuation ?
-                       // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s 
); # U3000-303f
-
-                       // Space strings of like hiragana/katakana/kanji
-                       $hiragana = 
'(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-                       $katakana = 
'(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-                       $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-                               . '|[\xe4-\xe8][\x80-\xbf]{2}'
-                               . '|\xe9[\x80-\xa5][\x80-\xbf]'
-                               . '|\xe9\xa6[\x80-\x99])';
-                               # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-                       $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
+       function normalizeForSearch( $string ) {
                // Double-width roman characters
-               $s = self::convertDoubleWidth( $s );
+               $s = self::convertDoubleWidth( $string );
                
                # Do general case folding and UTF-8 armoring
-               return parent::stripForSearch( $s, $doStrip );
+               return parent::normalizeForSearch( $s );
        }
 
        # Italic is not appropriate for Japanese script

Modified: trunk/phase3/languages/classes/LanguageYue.php
===================================================================
--- trunk/phase3/languages/classes/LanguageYue.php      2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/languages/classes/LanguageYue.php      2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -3,24 +3,29 @@
  * @ingroup Language
  */
 class LanguageYue extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function hasWordBreaks() {
+               return false;
+       }
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+       
+       function normalizeForSearch( $string ) {
                wfProfileIn( __METHOD__ );
 
                // Double-width roman characters
                $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
 
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }

Modified: trunk/phase3/languages/classes/LanguageZh.php
===================================================================
--- trunk/phase3/languages/classes/LanguageZh.php       2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/languages/classes/LanguageZh.php       2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -170,8 +170,23 @@
                        "\"$1\"", $text);
        }
 
-       // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 
'zh-hans' ) {
+       /**
+        * word segmentation
+        */
+       function wordSegmentation( $string ) {
+               // LanguageZh_hans::wordSegmentation
+               $s = parent::wordSegmentation( $string );
+               return $s;
+       }
+
+       /**
+        * auto convert to zh-hans and normalize special characters.
+        *
+        * @param $string String
+        * @param $autoVariant String, default to 'zh-hans'
+        * @return String
+        */
+       function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) {
                wfProfileIn( __METHOD__ );
 
                // always convert to zh-hans before indexing. it should be
@@ -179,8 +194,8 @@
                // Traditional to Simplified is less ambiguous than the
                // other way around
                $s = $this->mConverter->autoConvert( $string, $autoVariant );
-               // LanguageZh_hans::stripForSearch
-               $s = parent::stripForSearch( $s, $doStrip );
+               // LanguageZh_hans::normalizeForSearch
+               $s = parent::normalizeForSearch( $s );
                wfProfileOut( __METHOD__ );
                return $s;
 

Modified: trunk/phase3/languages/classes/LanguageZh_hans.php
===================================================================
--- trunk/phase3/languages/classes/LanguageZh_hans.php  2010-02-02 15:03:08 UTC 
(rev 61855)
+++ trunk/phase3/languages/classes/LanguageZh_hans.php  2010-02-02 15:09:01 UTC 
(rev 61856)
@@ -7,25 +7,26 @@
        function hasWordBreaks() {
                return false;
        }
-       
-       function stripForSearch( $string, $doStrip = true ) {
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+
+       function normalizeForSearch( $string ) {
                wfProfileIn( __METHOD__ );
 
                // Double-width roman characters
                $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // Eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
 
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to