jenkins-bot has submitted this change and it was merged. Change subject: Various optimizations ......................................................................
Various optimizations * Remove DeterministicMultiStringMatcher in favor of a hybrid approach, implemented in MultiStringMatcher, which caches the result of failure transitions. * Operate on bytes rather than characters. * Add a crude benchmarking script. Change-Id: I9b44ffd375716131e0494e0b511845055692a38f --- A bench/bench.php A bench/check.php D src/DeterministicMultiStringMatcher.php M src/MultiStringMatcher.php M src/MultiStringReplacer.php M tests/AhoCorasickTest.php 6 files changed, 163 insertions(+), 157 deletions(-) Approvals: Ori.livneh: Looks good to me, approved jenkins-bot: Verified diff --git a/bench/bench.php b/bench/bench.php new file mode 100644 index 0000000..7562be1 --- /dev/null +++ b/bench/bench.php @@ -0,0 +1,75 @@ +<?php +require_once __DIR__ . '/../src/MultiStringMatcher.php'; +require_once __DIR__ . '/../src/MultiStringReplacer.php'; + +use AhoCorasick\MultiStringReplacer; +use AhoCorasick\MultiStringMatcher; + +if ( !file_exists( __DIR__ . '/23835-0.txt' ) ) { + die( 'Please download http://www.gutenberg.org/files/23835/23835-0.txt' ); +} + +if ( !file_exists( __DIR__ . '/ZhConversion.php' ) ) { + die( 'You need ZhConversion.php, from http://git.io/vIMst' ); +} + +require_once __DIR__ . '/ZhConversion.php'; + +$text = file_get_contents( __DIR__ . '/23835-0.txt' ); + +$options = getopt( '', array( 'count:', 'input:', 'profile', 'fss', 'msr', 'strtr' ) ); +$text = file_get_contents( isset( $options['input'] ) ? $options['input'] : 'SueiTangYanYi.txt' ); +$loops = isset( $options['count'] ) ? intval( $options['count'] ) : 5; +if ( !isset( $options['fss'] ) && !isset( $options['msr'] ) && !isset( $options['strtr'] ) ) { + $options['fss'] = true; + $options['msr'] = true; + $options['strtr'] = true; +} +$profile = false; +if ( isset( $options['profile'] ) ) { + $profile = true; + $options['msr'] = true; + unset( $options['fss'] ); + unset( $options['strtr'] ); +} + +if ( isset( $options['msr'] ) ) { + $replacer = new MultiStringReplacer( $zh2Hant ); + if ( $profile ) { + xhprof_enable( XHPROF_FLAGS_CPU ); + } + $startTime = microtime( true ); + for ( $i = 0; $i < $loops; $i++ ) { + $replacer->searchAndReplace( $text ); + } + $endTime = microtime( true ); + $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops ); + printf( "%-'.40s %.2fms\n", 'MultiStringRepeater::searchAndReplace(): ', $wallTime ); + if ( $profile ) { + $profile = xhprof_disable(); + foreach ( $profile as $func => $data ) { + printf( "%s: %.2f\n", $func, $data['cpu'] / $data['ct'] ); + } + } +} + +if ( function_exists( 'fss_prep_replace' ) && isset( $options['fss'] ) ) { + $fss = fss_prep_replace( $zh2Hant ); + $startTime = microtime( true ); + for ( $i = 0; $i < $loops; $i++ ) { + fss_exec_replace( $fss, $text ); + } + $endTime = microtime( true ); + $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops ); + printf( "%-'.40s %.2fms\n", 'fss_exec_replace(): ', $wallTime ); +} + +if ( isset( $options['strtr'] ) ) { + $startTime = microtime( true ); + for ( $i = 0; $i < $loops; $i++ ) { + strtr( $text, $zh2Hant ); + } + $endTime = microtime( true ); + $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops ); + printf( "%-'.40s %.2fms\n", 'strtr(): ', $wallTime ); +} diff --git a/bench/check.php b/bench/check.php new file mode 100644 index 0000000..d3cebd5 --- /dev/null +++ b/bench/check.php @@ -0,0 +1,43 @@ +<?php +require_once __DIR__ . '/../src/MultiStringMatcher.php'; +require_once __DIR__ . '/../src/MultiStringReplacer.php'; + +use AhoCorasick\MultiStringReplacer; +use AhoCorasick\MultiStringMatcher; + +if ( !file_exists( __DIR__ . '/23835-0.txt' ) ) { + die( 'Please download http://www.gutenberg.org/files/23835/23835-0.txt' ); +} + +if ( !file_exists( __DIR__ . '/ZhConversion.php' ) ) { + die( 'You need ZhConversion.php, from http://git.io/vIMst' ); +} + +require_once __DIR__ . '/ZhConversion.php'; + +$text = file_get_contents( __DIR__ . '/23835-0.txt' ); + +$status = 0; +$expected = strtr( $text, $zh2Hant ); + +echo "MultiStringReplacer::searchAndReplace(): "; +$replacer = new MultiStringReplacer( $zh2Hant ); +if ( $replacer->searchAndReplace( $text ) !== $expected ) { + echo "ERROR\n"; + $status = 1; +} else { + echo "OK\n"; +} + +if ( function_exists( 'fss_exec_replace' ) ) { + echo "fss_exec_replace(): "; + $fss = fss_prep_replace( $zh2Hant ); + if ( fss_exec_replace( $fss, $text ) !== $expected ) { + echo "ERROR\n"; + $status = 1; + } else { + echo "OK\n"; + } +} + +exit( $status ); diff --git a/src/DeterministicMultiStringMatcher.php b/src/DeterministicMultiStringMatcher.php deleted file mode 100644 index 922db6a..0000000 --- a/src/DeterministicMultiStringMatcher.php +++ /dev/null @@ -1,88 +0,0 @@ -<?php -/** - * AhoCorasick PHP Library - * - * Copyright (C) 2015 Ori Livneh <o...@wikimedia.org> - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * @file - * @author Ori Livneh <o...@wikimedia.org> - */ - -namespace AhoCorasick; - -use AhoCorasick\MultiStringMatcher; - -/** - * Represents a variant of the Aho-Corasick string matching algorithm - * which uses a deterministic finite automaton. - * - * The time it takes to construct the finite state machine is - * proportional to the sum of the lengths of the search keywords. - * Once constructed, the machine can locate all occurences of all - * search keywords in a body of text in a single pass, making exactly - * one state transition per input character. - */ -class DeterministicMultiStringMatcher extends MultiStringMatcher { - - /** @var array[] Mapping of all possible state transitions. **/ - protected $stateMachine = null; - - /** - * Constructor. - * - * @param string[] $searchKeywords The set of keywords to be matched. - */ - public function __construct( array $searchKeywords ) { - parent::__construct( $searchKeywords ); - $this->computeFiniteStateMachine(); - } - - - /** - * Map the current state and input character to the next state. - * - * @param int $currentState The current state of the string-matching - * automaton. - * @param string $inputChar The character the string-matching - * automaton is currently processing. - * @return int The state the automaton should transition to. - */ - public function nextState( $state, $ch ) { - return isset( $this->stateMachine[$state][$ch] ) ? - $this->stateMachine[$state][$ch] : 0; - } - - /** - * Construct the string-matching finite state machine. - * - * The machine will make one state transition per input symbol. - */ - protected function computeFiniteStateMachine() { - for ( $r = 0; $r < $this->numStates; $r++ ) { - foreach ( $this->searchChars as $ch ) { - $state = $r; - while ( $state !== 0 && !isset( $this->yesTransitions[$state][$ch] ) ) { - $state = $this->noTransitions[$state]; - } - if ( isset( $this->yesTransitions[$state][$ch] ) ) { - $this->stateMachine[$r][$ch] = $this->yesTransitions[$state][$ch]; - } else { - $this->stateMachine[$r][$ch] = isset( $this->noTransitions[$state] ) - ? $this->noTransitions[$state] : 0; - } - } - } - } -} diff --git a/src/MultiStringMatcher.php b/src/MultiStringMatcher.php index 2083a97..6d3764e 100644 --- a/src/MultiStringMatcher.php +++ b/src/MultiStringMatcher.php @@ -47,10 +47,6 @@ * an aid to bibliographic search", CACM, 18(6):333-340, June 1975. * * @link http://xlinux.nist.gov/dads//HTML/ahoCorasick.html - * - * @param string $text The input string. - * @param array $keywords An array of strings to search for. - * @return array[] An array of (offset, substring) arrays. */ class MultiStringMatcher { @@ -66,8 +62,10 @@ /** @var array Mapping of states to outputs. **/ protected $outputs = array(); + /** @var array Mapping of failure transitions. **/ protected $noTransitions = array(); + /** @var array Mapping of success transitions. **/ protected $yesTransitions = array(); @@ -78,8 +76,8 @@ */ public function __construct( array $searchKeywords ) { foreach ( $searchKeywords as $keyword ) { - if ( $keyword !== '' && !in_array( $keyword, $this->searchKeywords ) ) { - $this->searchKeywords[] = $keyword; + if ( $keyword !== '' ) { + $this->searchKeywords[$keyword] = strlen( $keyword ); } } @@ -88,8 +86,8 @@ return; } - $this->computeSuccessTransitions(); - $this->computeFailTransitions(); + $this->computeYesTransitions(); + $this->computeNoTransitions(); } @@ -99,7 +97,7 @@ * @return string[] Search keywords. */ public function getKeywords() { - return $this->searchKeywords; + return array_keys( $this->searchKeywords ); } @@ -113,14 +111,21 @@ * @return int The state the automaton should transition to. */ public function nextState( $currentState, $inputChar ) { - while ( - $currentState !== 0 && - !isset( $this->yesTransitions[$currentState][$inputChar] ) - ) { + $initialState = $currentState; + while ( true ) { + if ( isset( $this->yesTransitions[$currentState][$inputChar] ) ) { + $nextState = $this->yesTransitions[$currentState][$inputChar]; + // Avoid failure transitions next time. + if ( $currentState !== $initialState ) { + $this->yesTransitions[$initialState][$inputChar] = $nextState; + } + return $nextState; + } + if ( $currentState === 0 ) { + return 0; + } $currentState = $this->noTransitions[$currentState]; } - return isset( $this->yesTransitions[$currentState][$inputChar] ) ? - $this->yesTransitions[$currentState][$inputChar] : 0; } @@ -145,14 +150,14 @@ $state = 0; $results = array(); - $length = mb_strlen( $text ); + $length = strlen( $text ); for ( $i = 0; $i < $length; $i++ ) { - $ch = mb_substr( $text, $i, 1 ); + $ch = $text[$i]; $state = $this->nextState( $state, $ch ); - if ( !empty( $this->outputs[$state] ) ) { + if ( isset( $this->outputs[$state] ) ) { foreach ( $this->outputs[$state] as $match ) { - $offset = $i - mb_strlen( $match ) + 1; + $offset = $i - $this->searchKeywords[$match] + 1; $results[] = array( $offset, $match ); } } @@ -169,16 +174,13 @@ * Constructs a directed tree with a root node which represents the * initial state of the string-matching automaton and from which a * path exists which spells out each search keyword. - * - * @return array[] */ - protected function computeSuccessTransitions() { - foreach ( $this->searchKeywords as $keyword ) { + protected function computeYesTransitions() { + foreach ( $this->searchKeywords as $keyword => $length ) { $state = 0; - $length = mb_strlen( $keyword ); for ( $i = 0; $i < $length; $i++ ) { - $ch = mb_substr( $keyword, $i, 1 ); + $ch = $keyword[$i]; if ( !in_array( $ch, $this->searchChars ) ) { $this->searchChars[] = $ch; } @@ -198,12 +200,8 @@ /** * Get the state transitions which the string-matching automaton * shall make when a partial match proves false. - * - * @param array[] $this->yesTransitions The array created by - * MultiStringMatcher::computeSuccessTransitions. - * @return array[] */ - protected function computeFailTransitions() { + protected function computeNoTransitions() { $queue = array(); foreach ( $this->yesTransitions[0] as $ch => $toState ) { if ( $toState !== 0 ) { @@ -212,29 +210,28 @@ } } - while ( ( $r = array_shift( $queue ) ) !== null ) { - if ( empty( $this->yesTransitions[$r] ) ) { + while ( $queue ) { + $fromState = array_shift( $queue ); + if ( !isset( $this->yesTransitions[$fromState] ) ) { continue; } - foreach ( $this->yesTransitions[$r] as $ch => $toState ) { + foreach ( $this->yesTransitions[$fromState] as $ch => $toState ) { $queue[] = $toState; - $state = $this->noTransitions[$r]; + $state = $this->noTransitions[$fromState]; while ( $state !== 0 && empty( $this->yesTransitions[$state][$ch] ) ) { $state = $this->noTransitions[$state]; } - $failState = isset( $this->yesTransitions[$state][$ch] ) ? + $noState = isset( $this->yesTransitions[$state][$ch] ) ? $this->yesTransitions[$state][$ch] : 0; - $this->noTransitions[$toState] = $failState; - if ( isset( $this->outputs[$failState] ) ) { + $this->noTransitions[$toState] = $noState; + if ( isset( $this->outputs[$noState] ) ) { $this->outputs[$toState] = empty( $this->outputs[$toState] ) - ? $this->outputs[$failState] - : array_merge( $this->outputs[$toState], $this->outputs[$failState] ); + ? $this->outputs[$noState] + : array_merge( $this->outputs[$toState], $this->outputs[$noState] ); } } } - - return $this->noTransitions; } } diff --git a/src/MultiStringReplacer.php b/src/MultiStringReplacer.php index 184d1b9..54d87f9 100644 --- a/src/MultiStringReplacer.php +++ b/src/MultiStringReplacer.php @@ -79,10 +79,7 @@ $matches = array(); foreach ( $this->searchIn( $text ) as $result ) { list( $offset, $match ) = $result; - if ( !isset( $matches[$offset] ) || - mb_strlen( $match ) > mb_strlen( $matches[$offset] ) ) { - $matches[$offset] = $match; - } + $matches[$offset] = $match; } ksort( $matches ); @@ -90,12 +87,11 @@ $lastInsert = 0; foreach ( $matches as $offset => $match ) { if ( $offset >= $lastInsert ) { - $buf .= mb_substr( $text, $lastInsert, $offset - $lastInsert ); - $buf .= $this->replacePairs[$match]; - $lastInsert = $offset + mb_strlen( $match ); + $buf .= substr( $text, $lastInsert, $offset - $lastInsert ) . $this->replacePairs[$match]; + $lastInsert = $offset + $this->searchKeywords[$match]; } } - $buf .= mb_substr( $text, $lastInsert ); + $buf .= substr( $text, $lastInsert ); return $buf; } diff --git a/tests/AhoCorasickTest.php b/tests/AhoCorasickTest.php index 0907b88..14b225f 100644 --- a/tests/AhoCorasickTest.php +++ b/tests/AhoCorasickTest.php @@ -31,7 +31,6 @@ namespace AhoCorasick\Test; use AhoCorasick\MultiStringMatcher; -use AhoCorasick\DeterministicMultiStringMatcher; use AhoCorasick\MultiStringReplacer; /** @@ -43,10 +42,9 @@ /** @param string $text The text to search in. */ public function searchIn( $text ) { $matches = array(); - foreach ( $this->searchKeywords as $keyword ) { - $length = mb_strlen( $keyword ); + foreach ( $this->searchKeywords as $keyword => $length ) { $offset = 0; - while ( ( $offset = mb_strpos( $text, $keyword, $offset ) ) !== false ) { + while ( ( $offset = strpos( $text, $keyword, $offset ) ) !== false ) { $matches[] = array( $offset, $keyword ); $offset = $offset + $length; } @@ -72,7 +70,7 @@ // then by search keyword. usort( $matches, function ( $a, $b ) { return ( $a[0] - $b[0] ) - ?: ( mb_strlen( $a[1] ) - mb_strlen( $b[1] ) ) + ?: ( strlen( $a[1] ) - strlen( $b[1] ) ) ?: strcmp( $a[1], $b[1] ); } ); } @@ -128,20 +126,6 @@ } - /** @dataProvider matcherCaseProvider */ - public function testDeterministicMultiStringMatcher( $inputText, $searchKeywords ) { - $referenceMatcher = new NaiveMultiStringMatcher( $searchKeywords ); - $referenceResults = $referenceMatcher->searchIn( $inputText ); - $this->sortMatcherResults( $referenceResults ); - - $actualMatcher = new DeterministicMultiStringMatcher( $searchKeywords ); - $actualResults = $actualMatcher->searchIn( $inputText ); - $this->sortMatcherResults( $actualResults ); - - $this->assertEquals( $referenceResults, $actualResults ); - } - - public function replacerCaseProvider() { return array( array( @@ -172,5 +156,4 @@ $expected = strtr( $inputText, $replacePairs ); $this->assertEquals( $expected, $actual ); } - } -- To view, visit https://gerrit.wikimedia.org/r/217454 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I9b44ffd375716131e0494e0b511845055692a38f Gerrit-PatchSet: 1 Gerrit-Project: AhoCorasick Gerrit-Branch: master Gerrit-Owner: Ori.livneh <o...@wikimedia.org> Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits