Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/130799
Change subject: If Hebrew guess if " is part of word ...................................................................... If Hebrew guess if " is part of word In Hebrew " can stand in for ״ which isn't a phrase marker at all, actually belongs inside the word. This tries to guess if the " is a phrase marker or a stand in for ״. Bug 64350 Change-Id: Ic6ab79600c580103f2eaecf9959103b9c27c3593 --- M CirrusSearch.php A includes/SearchEscaper.php M includes/Searcher.php A tests/unit/SearchEscaperTest.php 4 files changed, 157 insertions(+), 26 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/99/130799/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index b277a95..bc79b54 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -327,6 +327,7 @@ $wgAutoloadClasses['CirrusSearch\ResultSet'] = $includes . 'ResultSet.php'; $wgAutoloadClasses['CirrusSearch\ResultsType'] = $includes . 'ResultsType.php'; $wgAutoloadClasses['CirrusSearch\Searcher'] = $includes . 'Searcher.php'; +$wgAutoloadClasses['CirrusSearch\SearchEscaper'] = $includes . 'SearchEscaper.php'; $wgAutoloadClasses['CirrusSearch\TitleResultsType'] = $includes . 'ResultsType.php'; $wgAutoloadClasses['CirrusSearch\UpdateSearchIndexConfig'] = __DIR__ . '/maintenance/updateSearchIndexConfig.php'; $wgAutoloadClasses['CirrusSearch\UpdateVersionIndex'] = __DIR__ . '/maintenance/updateVersionIndex.php'; diff --git a/includes/SearchEscaper.php b/includes/SearchEscaper.php new file mode 100644 index 0000000..6e34c3d --- /dev/null +++ b/includes/SearchEscaper.php @@ -0,0 +1,70 @@ +<?php + +namespace CirrusSearch; + +use \ProfileSection; + +/** + * Escapes queries. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +class SearchEscaper { + private $language; + + public function __construct( $language ) { + $this->language = $language; + } + + public function escapeQuotes( $text ) { + $profiler = new ProfileSection( __METHOD__ ); + if ( $this->language === 'he' ) { + // Hebrew uses the double quote (") character as a standin for quotation marks (“”) + // which delineate phrases. It also uses double quotes as a standin for another + // character (״), call a Gershayim, which mark acronyms. Here we guess if the intent + // was to mark a phrase, in which case we leave the quotes alone, or to mark an + // acronym, in which case we escape them. + return preg_replace( '/(\S+)"(\S)/', '\1\\"\2', $text ); + } + return $text; + } + + public function balanceQuotes( $text ) { + wfDebugLog( 'CirrusSearch', "asfasdf " . $text ); + + $profiler = new ProfileSection( __METHOD__ ); + $inQuote = false; + $inEscape = false; + $len = strlen( $text ); + for ( $i = 0; $i < $len; $i++ ) { + if ( $inEscape ) { + $inEscape = false; + continue; + } + switch ( $text[ $i ] ) { + case '"': + $inQuote = !$inQuote; + break; + case '\\': + $inEscape = true; + } + } + if ( $inQuote ) { + $text = $text . '"'; + } + return $text; + } +} diff --git a/includes/Searcher.php b/includes/Searcher.php index cf660f5..58d8cbc 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -147,6 +147,11 @@ private $highlightQuery = null; /** + * @var SearchEscaper escapes queries + */ + private $escaper; + + /** * Constructor * @param int $offset Offset the results by this much * @param int $limit Limit the results to this many @@ -155,13 +160,15 @@ * @param string $index Base name for index to search from, defaults to wfWikiId() */ public function __construct( $offset, $limit, $namespaces, $user, $index = false ) { - global $wgCirrusSearchSlowSearch; + global $wgCirrusSearchSlowSearch, + $wgLanguageCode; parent::__construct( $user, $wgCirrusSearchSlowSearch ); $this->offset = min( $offset, self::MAX_OFFSET ); $this->limit = $limit; $this->namespaces = $namespaces; $this->indexBaseName = $index ?: wfWikiId(); + $this->escaper = new SearchEscaper( $wgLanguageCode ); } /** @@ -389,12 +396,15 @@ $this->boostTemplates = $boostTemplates; $this->searchContainedSyntax = $searchContainedSyntax; wfProfileOut( __METHOD__ . '-other-filters' ); + + $this->term = $this->escaper->escapeQuotes( $this->term ); + wfProfileIn( __METHOD__ . '-find-phrase-queries' ); // Match quoted phrases including those containing escaped quotes // Those phrases can optionally be followed by ~ then a number (this is the phrase slop) // That can optionally be followed by a ~ (this matches stemmed words in phrases) // The following all match: "a", "a boat", "a\"boat", "a boat"~, "a boat"~9, "a boat"~9~ - $query = self::replacePartsOfQuery( $this->term, '/(?<main>"((?:[^"]|(?:\"))+)"(?:~[0-9]+)?)(?<fuzzy>~)?/', + $query = self::replacePartsOfQuery( $this->term, '/(?<![\]])(?<main>"((?:[^"]|(?:\"))+)"(?:~[0-9]+)?)(?<fuzzy>~)?/', function ( $matches ) use ( $searcher ) { $main = $searcher->fixupQueryStringPart( $matches[ 'main' ][ 0 ] ); if ( !isset( $matches[ 'fuzzy' ] ) ) { @@ -1006,32 +1016,11 @@ ]| \^| (?# no user supplied boosts at this point, though I cant think why) :| (?# no specifying your own fields) - \\\ + \\\(?!") (?# the only acceptable escaping is for quotes) )/x', '\\\$1', $string ); - // If the string doesn't have balanced quotes then add a quote on the end so Elasticsearch - // can parse it. - $inQuote = false; - $inEscape = false; - $len = strlen( $string ); - for ( $i = 0; $i < $len; $i++ ) { - if ( $inEscape ) { - $inEscape = false; - continue; - } - switch ( $string[ $i ] ) { - case '"': - $inQuote = !$inQuote; - break; - case '\\': - $inEscape = true; - } - } - if ( $inQuote ) { - $string = $string . '"'; - } - - return $string; + // Elasticsearch's query strings can't abide unbalanced quotes + return $this->escaper->balanceQuotes( $string ); } /** diff --git a/tests/unit/SearchEscaperTest.php b/tests/unit/SearchEscaperTest.php new file mode 100644 index 0000000..01112fa --- /dev/null +++ b/tests/unit/SearchEscaperTest.php @@ -0,0 +1,71 @@ +<?php + +namespace CirrusSearch; + +use \MediaWikiTestCase; + +/** + * Make sure cirrus doens't break any hooks. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +class SearchEscaperTest extends MediaWikiTestCase { + /** + * @dataProvider quoteEscapeTestCases + */ + public function testQuoteEscape( $language, $input, $expected ) { + $escaper = new SearchEscaper( $language ); + $actual = $escaper->escapeQuotes( $input ); + $this->assertEquals( $expected, $actual ); + } + + public static function quoteEscapeTestCases() { + return array( + array( 'en', 'foo', 'foo' ), + array( 'en', 'fo"o', 'fo"o' ), + array( 'el', 'fo"o', 'fo"o' ), + array( 'de', 'fo"o', 'fo"o' ), + array( 'he', 'מלבי"ם', 'מלבי\"ם' ), + array( 'he', '"מלבי"', '"מלבי"' ), + array( 'he', '"מלבי"ם"', '"מלבי\"ם"' ), + array( 'he', 'מַ"כִּית', 'מַ\"כִּית' ), + array( 'he', 'הוּא שִׂרְטֵט עַיִ"ן', 'הוּא שִׂרְטֵט עַיִ\"ן' ), + array( 'he', '"הוּא שִׂרְטֵט עַיִ"ן"', '"הוּא שִׂרְטֵט עַיִ\"ן"' ), + ); + } + + /** + * @dataProvider balanceQuotesTestCases + */ + public function testBalanceQuotes( $input, $expected ) { + $escaper = new SearchEscaper( 'en' ); // Language doesn't matter here + $actual = $escaper->balanceQuotes( $input); + $this->assertEquals( $expected, $actual ); + } + + public static function balanceQuotesTestCases() { + return array( + array( 'foo', 'foo' ), + array( '"foo', '"foo"' ), + array( '"foo" bar', '"foo" bar' ), + array( '"foo" ba"r', '"foo" ba"r"' ), + array( '"foo" ba\\"r', '"foo" ba\\"r' ), + array( '"foo\\" ba\\"r', '"foo\\" ba\\"r"' ), + array( '\\"foo\\" ba\\"r', '\\"foo\\" ba\\"r' ), + array( '"fo\\o bar', '"fo\\o bar"' ), + ); + } +} -- To view, visit https://gerrit.wikimedia.org/r/130799 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic6ab79600c580103f2eaecf9959103b9c27c3593 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits