Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/130799

Change subject: If Hebrew guess if " is part of word
......................................................................

If Hebrew guess if " is part of word

In Hebrew " can stand in for ״ which isn't a phrase marker at all, actually
belongs inside the word.  This tries to guess if the " is a phrase marker
or a stand in for ״.

Bug 64350

Change-Id: Ic6ab79600c580103f2eaecf9959103b9c27c3593
---
M CirrusSearch.php
A includes/SearchEscaper.php
M includes/Searcher.php
A tests/unit/SearchEscaperTest.php
4 files changed, 157 insertions(+), 26 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/99/130799/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index b277a95..bc79b54 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -327,6 +327,7 @@
 $wgAutoloadClasses['CirrusSearch\ResultSet'] = $includes . 'ResultSet.php';
 $wgAutoloadClasses['CirrusSearch\ResultsType'] = $includes . 'ResultsType.php';
 $wgAutoloadClasses['CirrusSearch\Searcher'] = $includes . 'Searcher.php';
+$wgAutoloadClasses['CirrusSearch\SearchEscaper'] = $includes . 
'SearchEscaper.php';
 $wgAutoloadClasses['CirrusSearch\TitleResultsType'] = $includes . 
'ResultsType.php';
 $wgAutoloadClasses['CirrusSearch\UpdateSearchIndexConfig'] = __DIR__ . 
'/maintenance/updateSearchIndexConfig.php';
 $wgAutoloadClasses['CirrusSearch\UpdateVersionIndex'] = __DIR__ . 
'/maintenance/updateVersionIndex.php';
diff --git a/includes/SearchEscaper.php b/includes/SearchEscaper.php
new file mode 100644
index 0000000..6e34c3d
--- /dev/null
+++ b/includes/SearchEscaper.php
@@ -0,0 +1,70 @@
+<?php
+
+namespace CirrusSearch;
+
+use \ProfileSection;
+
+/**
+ * Escapes queries.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class SearchEscaper {
+       private $language;
+
+       public function __construct( $language ) {
+               $this->language = $language;
+       }
+
+       public function escapeQuotes( $text ) {
+               $profiler = new ProfileSection( __METHOD__ );
+               if ( $this->language === 'he' ) {
+                       // Hebrew uses the double quote (") character as a 
standin for quotation marks (“”)
+                       // which delineate phrases.  It also uses double quotes 
as a standin for another
+                       // character (״), call a Gershayim, which mark 
acronyms.  Here we guess if the intent
+                       // was to mark a phrase, in which case we leave the 
quotes alone, or to mark an
+                       // acronym, in which case we escape them.
+                       return preg_replace( '/(\S+)"(\S)/', '\1\\"\2', $text );
+               }
+               return $text;
+       }
+
+       public function balanceQuotes( $text ) {
+                               wfDebugLog( 'CirrusSearch', "asfasdf   " . 
$text );
+
+               $profiler = new ProfileSection( __METHOD__ );
+               $inQuote = false;
+               $inEscape = false;
+               $len = strlen( $text );
+               for ( $i = 0; $i < $len; $i++ ) {
+                       if ( $inEscape ) {
+                               $inEscape = false;
+                               continue;
+                       }
+                       switch ( $text[ $i ] ) {
+                       case '"':
+                               $inQuote = !$inQuote;
+                               break;
+                       case '\\':
+                               $inEscape = true;
+                       }
+               }
+               if ( $inQuote ) {
+                       $text = $text . '"';
+               }
+               return $text;
+       }
+}
diff --git a/includes/Searcher.php b/includes/Searcher.php
index cf660f5..58d8cbc 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -147,6 +147,11 @@
        private $highlightQuery = null;
 
        /**
+        * @var SearchEscaper escapes queries
+        */
+       private $escaper;
+
+       /**
         * Constructor
         * @param int $offset Offset the results by this much
         * @param int $limit Limit the results to this many
@@ -155,13 +160,15 @@
         * @param string $index Base name for index to search from, defaults to 
wfWikiId()
         */
        public function __construct( $offset, $limit, $namespaces, $user, 
$index = false ) {
-               global $wgCirrusSearchSlowSearch;
+               global $wgCirrusSearchSlowSearch,
+                       $wgLanguageCode;
 
                parent::__construct( $user, $wgCirrusSearchSlowSearch );
                $this->offset = min( $offset, self::MAX_OFFSET );
                $this->limit = $limit;
                $this->namespaces = $namespaces;
                $this->indexBaseName = $index ?: wfWikiId();
+               $this->escaper = new SearchEscaper( $wgLanguageCode );
        }
 
        /**
@@ -389,12 +396,15 @@
                $this->boostTemplates = $boostTemplates;
                $this->searchContainedSyntax = $searchContainedSyntax;
                wfProfileOut( __METHOD__ . '-other-filters' );
+
+               $this->term = $this->escaper->escapeQuotes( $this->term );
+
                wfProfileIn( __METHOD__ . '-find-phrase-queries' );
                // Match quoted phrases including those containing escaped 
quotes
                // Those phrases can optionally be followed by ~ then a number 
(this is the phrase slop)
                // That can optionally be followed by a ~ (this matches stemmed 
words in phrases)
                // The following all match: "a", "a boat", "a\"boat", "a 
boat"~, "a boat"~9, "a boat"~9~
-               $query = self::replacePartsOfQuery( $this->term, 
'/(?<main>"((?:[^"]|(?:\"))+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
+               $query = self::replacePartsOfQuery( $this->term, 
'/(?<![\]])(?<main>"((?:[^"]|(?:\"))+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
                        function ( $matches ) use ( $searcher ) {
                                $main = $searcher->fixupQueryStringPart( 
$matches[ 'main' ][ 0 ] );
                                if ( !isset( $matches[ 'fuzzy' ] ) ) {
@@ -1006,32 +1016,11 @@
                                ]|
                                \^|     (?# no user supplied boosts at this 
point, though I cant think why)
                                :|              (?# no specifying your own 
fields)
-                               \\\
+                               \\\(?!") (?# the only acceptable escaping is 
for quotes)
                        )/x', '\\\$1', $string );
 
-               // If the string doesn't have balanced quotes then add a quote 
on the end so Elasticsearch
-               // can parse it.
-               $inQuote = false;
-               $inEscape = false;
-               $len = strlen( $string );
-               for ( $i = 0; $i < $len; $i++ ) {
-                       if ( $inEscape ) {
-                               $inEscape = false;
-                               continue;
-                       }
-                       switch ( $string[ $i ] ) {
-                       case '"':
-                               $inQuote = !$inQuote;
-                               break;
-                       case '\\':
-                               $inEscape = true;
-                       }
-               }
-               if ( $inQuote ) {
-                       $string = $string . '"';
-               }
-
-               return $string;
+               // Elasticsearch's query strings can't abide unbalanced quotes
+               return $this->escaper->balanceQuotes( $string );
        }
 
        /**
diff --git a/tests/unit/SearchEscaperTest.php b/tests/unit/SearchEscaperTest.php
new file mode 100644
index 0000000..01112fa
--- /dev/null
+++ b/tests/unit/SearchEscaperTest.php
@@ -0,0 +1,71 @@
+<?php
+
+namespace CirrusSearch;
+
+use \MediaWikiTestCase;
+
+/**
+ * Make sure cirrus doens't break any hooks.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class SearchEscaperTest extends MediaWikiTestCase {
+       /**
+        * @dataProvider quoteEscapeTestCases
+        */
+       public function testQuoteEscape( $language, $input, $expected ) {
+               $escaper = new SearchEscaper( $language );
+               $actual = $escaper->escapeQuotes( $input );
+               $this->assertEquals( $expected, $actual );
+       }
+
+       public static function quoteEscapeTestCases() {
+               return array(
+                       array( 'en', 'foo', 'foo' ),
+                       array( 'en', 'fo"o', 'fo"o' ),
+                       array( 'el', 'fo"o', 'fo"o' ),
+                       array( 'de', 'fo"o', 'fo"o' ),
+                       array( 'he', 'מלבי"ם', 'מלבי\"ם' ),
+                       array( 'he', '"מלבי"', '"מלבי"' ),
+                       array( 'he', '"מלבי"ם"', '"מלבי\"ם"' ),
+                       array( 'he', 'מַ"כִּית', 'מַ\"כִּית' ),
+                       array( 'he', 'הוּא שִׂרְטֵט עַיִ"ן', 'הוּא שִׂרְטֵט 
עַיִ\"ן' ),
+                       array( 'he', '"הוּא שִׂרְטֵט עַיִ"ן"', '"הוּא שִׂרְטֵט 
עַיִ\"ן"' ),
+               );
+       }
+
+       /**
+        * @dataProvider balanceQuotesTestCases
+        */
+       public function testBalanceQuotes( $input, $expected ) {
+               $escaper = new SearchEscaper( 'en' ); // Language doesn't 
matter here
+               $actual = $escaper->balanceQuotes( $input);
+               $this->assertEquals( $expected, $actual );
+       }
+
+       public static function balanceQuotesTestCases() {
+               return array(
+                       array( 'foo', 'foo' ),
+                       array( '"foo', '"foo"' ),
+                       array( '"foo" bar', '"foo" bar' ),
+                       array( '"foo" ba"r', '"foo" ba"r"' ),
+                       array( '"foo" ba\\"r', '"foo" ba\\"r' ),
+                       array( '"foo\\" ba\\"r', '"foo\\" ba\\"r"' ),
+                       array( '\\"foo\\" ba\\"r', '\\"foo\\" ba\\"r' ),
+                       array( '"fo\\o bar', '"fo\\o bar"' ),
+               );
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/130799
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic6ab79600c580103f2eaecf9959103b9c27c3593
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to