jenkins-bot has submitted this change and it was merged.

Change subject: Various optimizations
......................................................................


Various optimizations

* Remove DeterministicMultiStringMatcher in favor of a hybrid approach,
  implemented in MultiStringMatcher, which caches the result of failure
  transitions.
* Operate on bytes rather than characters.
* Add a crude benchmarking script.

Change-Id: I9b44ffd375716131e0494e0b511845055692a38f
---
A bench/bench.php
A bench/check.php
D src/DeterministicMultiStringMatcher.php
M src/MultiStringMatcher.php
M src/MultiStringReplacer.php
M tests/AhoCorasickTest.php
6 files changed, 163 insertions(+), 157 deletions(-)

Approvals:
  Ori.livneh: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/bench/bench.php b/bench/bench.php
new file mode 100644
index 0000000..7562be1
--- /dev/null
+++ b/bench/bench.php
@@ -0,0 +1,75 @@
+<?php
+require_once __DIR__ . '/../src/MultiStringMatcher.php';
+require_once __DIR__ . '/../src/MultiStringReplacer.php';
+
+use AhoCorasick\MultiStringReplacer;
+use AhoCorasick\MultiStringMatcher;
+
+if ( !file_exists( __DIR__ . '/23835-0.txt' ) ) {
+       die( 'Please download http://www.gutenberg.org/files/23835/23835-0.txt' 
);
+}
+
+if ( !file_exists( __DIR__ . '/ZhConversion.php' ) ) {
+       die( 'You need ZhConversion.php, from http://git.io/vIMst' );
+}
+
+require_once __DIR__ . '/ZhConversion.php';
+
+$text = file_get_contents( __DIR__ . '/23835-0.txt' );
+
+$options = getopt( '', array( 'count:', 'input:', 'profile', 'fss', 'msr', 
'strtr' ) );
+$text = file_get_contents( isset( $options['input'] ) ? $options['input'] : 
'SueiTangYanYi.txt' );
+$loops = isset( $options['count'] ) ? intval( $options['count'] ) : 5;
+if ( !isset( $options['fss'] ) && !isset( $options['msr'] ) && !isset( 
$options['strtr'] ) ) {
+       $options['fss'] = true;
+       $options['msr'] = true;
+       $options['strtr'] = true;
+}
+$profile = false;
+if ( isset( $options['profile'] ) ) {
+       $profile = true;
+       $options['msr'] = true;
+       unset( $options['fss'] );
+       unset( $options['strtr'] );
+}
+
+if ( isset( $options['msr'] ) ) {
+       $replacer = new MultiStringReplacer( $zh2Hant );
+       if ( $profile ) {
+               xhprof_enable( XHPROF_FLAGS_CPU );
+       }
+       $startTime = microtime( true );
+       for ( $i = 0; $i < $loops; $i++ ) {
+               $replacer->searchAndReplace( $text );
+       }
+       $endTime = microtime( true );
+       $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops );
+       printf( "%-'.40s %.2fms\n", 'MultiStringRepeater::searchAndReplace(): 
', $wallTime );
+       if ( $profile ) {
+               $profile = xhprof_disable();
+               foreach ( $profile as $func => $data ) {
+                       printf( "%s: %.2f\n", $func, $data['cpu'] / $data['ct'] 
);
+               }
+       }
+}
+
+if ( function_exists( 'fss_prep_replace' ) && isset( $options['fss'] ) ) {
+       $fss = fss_prep_replace( $zh2Hant );
+       $startTime = microtime( true );
+       for ( $i = 0; $i < $loops; $i++ ) {
+               fss_exec_replace( $fss, $text );
+       }
+       $endTime = microtime( true );
+       $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops );
+       printf( "%-'.40s %.2fms\n", 'fss_exec_replace(): ', $wallTime );
+}
+
+if ( isset( $options['strtr'] ) ) {
+       $startTime = microtime( true );
+       for ( $i = 0; $i < $loops; $i++ ) {
+               strtr( $text, $zh2Hant );
+       }
+       $endTime = microtime( true );
+       $wallTime = 1000 * ( ( $endTime - $startTime ) / $loops );
+       printf( "%-'.40s %.2fms\n", 'strtr(): ', $wallTime );
+}
diff --git a/bench/check.php b/bench/check.php
new file mode 100644
index 0000000..d3cebd5
--- /dev/null
+++ b/bench/check.php
@@ -0,0 +1,43 @@
+<?php
+require_once __DIR__ . '/../src/MultiStringMatcher.php';
+require_once __DIR__ . '/../src/MultiStringReplacer.php';
+
+use AhoCorasick\MultiStringReplacer;
+use AhoCorasick\MultiStringMatcher;
+
+if ( !file_exists( __DIR__ . '/23835-0.txt' ) ) {
+       die( 'Please download http://www.gutenberg.org/files/23835/23835-0.txt' 
);
+}
+
+if ( !file_exists( __DIR__ . '/ZhConversion.php' ) ) {
+       die( 'You need ZhConversion.php, from http://git.io/vIMst' );
+}
+
+require_once __DIR__ . '/ZhConversion.php';
+
+$text = file_get_contents( __DIR__ . '/23835-0.txt' );
+
+$status = 0;
+$expected = strtr( $text, $zh2Hant );
+
+echo "MultiStringReplacer::searchAndReplace(): ";
+$replacer = new MultiStringReplacer( $zh2Hant );
+if ( $replacer->searchAndReplace( $text ) !== $expected ) {
+       echo "ERROR\n";
+       $status = 1;
+} else {
+       echo "OK\n";
+}
+
+if ( function_exists( 'fss_exec_replace' ) ) {
+       echo "fss_exec_replace(): ";
+       $fss = fss_prep_replace( $zh2Hant );
+       if ( fss_exec_replace( $fss, $text ) !== $expected ) {
+               echo "ERROR\n";
+               $status = 1;
+       } else {
+               echo "OK\n";
+       }
+}
+
+exit( $status );
diff --git a/src/DeterministicMultiStringMatcher.php 
b/src/DeterministicMultiStringMatcher.php
deleted file mode 100644
index 922db6a..0000000
--- a/src/DeterministicMultiStringMatcher.php
+++ /dev/null
@@ -1,88 +0,0 @@
-<?php
-/**
- * AhoCorasick PHP Library
- *
- * Copyright (C) 2015 Ori Livneh <o...@wikimedia.org>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * @file
- * @author Ori Livneh <o...@wikimedia.org>
- */
-
-namespace AhoCorasick;
-
-use AhoCorasick\MultiStringMatcher;
-
-/**
- * Represents a variant of the Aho-Corasick string matching algorithm
- * which uses a deterministic finite automaton.
- *
- * The time it takes to construct the finite state machine is
- * proportional to the sum of the lengths of the search keywords.
- * Once constructed, the machine can locate all occurences of all
- * search keywords in a body of text in a single pass, making exactly
- * one state transition per input character.
- */
-class DeterministicMultiStringMatcher extends MultiStringMatcher {
-
-       /** @var array[] Mapping of all possible state transitions. **/
-       protected $stateMachine = null;
-
-       /**
-        * Constructor.
-        *
-        * @param string[] $searchKeywords The set of keywords to be matched.
-        */
-       public function __construct( array $searchKeywords ) {
-               parent::__construct( $searchKeywords );
-               $this->computeFiniteStateMachine();
-       }
-
-
-       /**
-        * Map the current state and input character to the next state.
-        *
-        * @param int $currentState The current state of the string-matching
-        *  automaton.
-        * @param string $inputChar The character the string-matching
-        *  automaton is currently processing.
-        * @return int The state the automaton should transition to.
-        */
-       public function nextState( $state, $ch ) {
-               return isset( $this->stateMachine[$state][$ch] ) ?
-                       $this->stateMachine[$state][$ch] : 0;
-       }
-
-       /**
-        * Construct the string-matching finite state machine.
-        *
-        * The machine will make one state transition per input symbol.
-        */
-       protected function computeFiniteStateMachine() {
-               for ( $r = 0; $r < $this->numStates; $r++ ) {
-                       foreach ( $this->searchChars as $ch ) {
-                               $state = $r;
-                               while ( $state !== 0 && !isset( 
$this->yesTransitions[$state][$ch] ) ) {
-                                       $state = $this->noTransitions[$state];
-                               }
-                               if ( isset( $this->yesTransitions[$state][$ch] 
) ) {
-                                       $this->stateMachine[$r][$ch] = 
$this->yesTransitions[$state][$ch];
-                               } else {
-                                       $this->stateMachine[$r][$ch] = isset( 
$this->noTransitions[$state] )
-                                               ? $this->noTransitions[$state] 
: 0;
-                               }
-                       }
-               }
-       }
-}
diff --git a/src/MultiStringMatcher.php b/src/MultiStringMatcher.php
index 2083a97..6d3764e 100644
--- a/src/MultiStringMatcher.php
+++ b/src/MultiStringMatcher.php
@@ -47,10 +47,6 @@
  *  an aid to bibliographic search", CACM, 18(6):333-340, June 1975.
  *
  * @link http://xlinux.nist.gov/dads//HTML/ahoCorasick.html
- *
- * @param string $text The input string.
- * @param array $keywords An array of strings to search for.
- * @return array[] An array of (offset, substring) arrays.
  */
 class MultiStringMatcher {
 
@@ -66,8 +62,10 @@
        /** @var array Mapping of states to outputs. **/
        protected $outputs = array();
 
+       /** @var array Mapping of failure transitions. **/
        protected $noTransitions = array();
 
+       /** @var array Mapping of success transitions. **/
        protected $yesTransitions = array();
 
 
@@ -78,8 +76,8 @@
         */
        public function __construct( array $searchKeywords ) {
                foreach ( $searchKeywords as $keyword ) {
-                       if ( $keyword !== '' && !in_array( $keyword, 
$this->searchKeywords ) ) {
-                               $this->searchKeywords[] = $keyword;
+                       if ( $keyword !== '' ) {
+                               $this->searchKeywords[$keyword] = strlen( 
$keyword );
                        }
                }
 
@@ -88,8 +86,8 @@
                        return;
                }
 
-               $this->computeSuccessTransitions();
-               $this->computeFailTransitions();
+               $this->computeYesTransitions();
+               $this->computeNoTransitions();
        }
 
 
@@ -99,7 +97,7 @@
         * @return string[] Search keywords.
         */
        public function getKeywords() {
-               return $this->searchKeywords;
+               return array_keys( $this->searchKeywords );
        }
 
 
@@ -113,14 +111,21 @@
         * @return int The state the automaton should transition to.
         */
        public function nextState( $currentState, $inputChar ) {
-               while (
-                       $currentState !== 0 &&
-                       !isset( 
$this->yesTransitions[$currentState][$inputChar] )
-               ) {
+               $initialState = $currentState;
+               while ( true ) {
+                       if ( isset( 
$this->yesTransitions[$currentState][$inputChar] ) ) {
+                               $nextState = 
$this->yesTransitions[$currentState][$inputChar];
+                               // Avoid failure transitions next time.
+                               if ( $currentState !== $initialState ) {
+                                       
$this->yesTransitions[$initialState][$inputChar] = $nextState;
+                               }
+                               return $nextState;
+                       }
+                       if ( $currentState === 0 ) {
+                               return 0;
+                       }
                        $currentState = $this->noTransitions[$currentState];
                }
-               return isset( $this->yesTransitions[$currentState][$inputChar] 
) ?
-                       $this->yesTransitions[$currentState][$inputChar] : 0;
        }
 
 
@@ -145,14 +150,14 @@
 
                $state = 0;
                $results = array();
-               $length = mb_strlen( $text );
+               $length = strlen( $text );
 
                for ( $i = 0; $i < $length; $i++ ) {
-                       $ch = mb_substr( $text, $i, 1 );
+                       $ch = $text[$i];
                        $state = $this->nextState( $state, $ch );
-                       if ( !empty( $this->outputs[$state] ) ) {
+                       if ( isset( $this->outputs[$state] ) ) {
                                foreach ( $this->outputs[$state] as $match ) {
-                                       $offset = $i - mb_strlen( $match ) + 1;
+                                       $offset = $i - 
$this->searchKeywords[$match] + 1;
                                        $results[] = array( $offset, $match );
                                }
                        }
@@ -169,16 +174,13 @@
         * Constructs a directed tree with a root node which represents the
         * initial state of the string-matching automaton and from which a
         * path exists which spells out each search keyword.
-        *
-        * @return array[]
         */
-       protected function computeSuccessTransitions() {
-               foreach ( $this->searchKeywords as $keyword ) {
+       protected function computeYesTransitions() {
+               foreach ( $this->searchKeywords as $keyword => $length ) {
                        $state = 0;
-                       $length = mb_strlen( $keyword );
 
                        for ( $i = 0; $i < $length; $i++ ) {
-                               $ch = mb_substr( $keyword, $i, 1 );
+                               $ch = $keyword[$i];
                                if ( !in_array( $ch, $this->searchChars ) ) {
                                        $this->searchChars[] = $ch;
                                }
@@ -198,12 +200,8 @@
        /**
         * Get the state transitions which the string-matching automaton
         * shall make when a partial match proves false.
-        *
-        * @param array[] $this->yesTransitions The array created by
-        *  MultiStringMatcher::computeSuccessTransitions.
-        * @return array[]
         */
-       protected function computeFailTransitions() {
+       protected function computeNoTransitions() {
                $queue = array();
                foreach ( $this->yesTransitions[0] as $ch => $toState ) {
                        if ( $toState !== 0 ) {
@@ -212,29 +210,28 @@
                        }
                }
 
-               while ( ( $r = array_shift( $queue ) ) !== null ) {
-                       if ( empty( $this->yesTransitions[$r] ) ) {
+               while ( $queue ) {
+                       $fromState = array_shift( $queue );
+                       if ( !isset( $this->yesTransitions[$fromState] ) ) {
                                continue;
                        }
-                       foreach ( $this->yesTransitions[$r] as $ch => $toState 
) {
+                       foreach ( $this->yesTransitions[$fromState] as $ch => 
$toState ) {
                                $queue[] = $toState;
-                               $state = $this->noTransitions[$r];
+                               $state = $this->noTransitions[$fromState];
 
                                while ( $state !== 0 && empty( 
$this->yesTransitions[$state][$ch] ) ) {
                                        $state = $this->noTransitions[$state];
                                }
 
-                               $failState = isset( 
$this->yesTransitions[$state][$ch] ) ?
+                               $noState = isset( 
$this->yesTransitions[$state][$ch] ) ?
                                        $this->yesTransitions[$state][$ch] : 0;
-                               $this->noTransitions[$toState] = $failState;
-                               if ( isset( $this->outputs[$failState] ) ) {
+                               $this->noTransitions[$toState] = $noState;
+                               if ( isset( $this->outputs[$noState] ) ) {
                                        $this->outputs[$toState] = empty( 
$this->outputs[$toState] )
-                                               ? $this->outputs[$failState]
-                                               : array_merge( 
$this->outputs[$toState], $this->outputs[$failState] );
+                                               ? $this->outputs[$noState]
+                                               : array_merge( 
$this->outputs[$toState], $this->outputs[$noState] );
                                }
                        }
                }
-
-               return $this->noTransitions;
        }
 }
diff --git a/src/MultiStringReplacer.php b/src/MultiStringReplacer.php
index 184d1b9..54d87f9 100644
--- a/src/MultiStringReplacer.php
+++ b/src/MultiStringReplacer.php
@@ -79,10 +79,7 @@
                $matches = array();
                foreach ( $this->searchIn( $text ) as $result ) {
                        list( $offset, $match ) = $result;
-                       if ( !isset( $matches[$offset] ) ||
-                               mb_strlen( $match ) > mb_strlen( 
$matches[$offset] ) ) {
-                               $matches[$offset] = $match;
-                       }
+                       $matches[$offset] = $match;
                }
                ksort( $matches );
 
@@ -90,12 +87,11 @@
                $lastInsert = 0;
                foreach ( $matches as $offset => $match ) {
                        if ( $offset >= $lastInsert ) {
-                               $buf .= mb_substr( $text, $lastInsert, $offset 
- $lastInsert );
-                               $buf .= $this->replacePairs[$match];
-                               $lastInsert = $offset + mb_strlen( $match );
+                               $buf .= substr( $text, $lastInsert, $offset - 
$lastInsert ) . $this->replacePairs[$match];
+                               $lastInsert = $offset + 
$this->searchKeywords[$match];
                        }
                }
-               $buf .= mb_substr( $text, $lastInsert );
+               $buf .= substr( $text, $lastInsert );
 
                return $buf;
        }
diff --git a/tests/AhoCorasickTest.php b/tests/AhoCorasickTest.php
index 0907b88..14b225f 100644
--- a/tests/AhoCorasickTest.php
+++ b/tests/AhoCorasickTest.php
@@ -31,7 +31,6 @@
 namespace AhoCorasick\Test;
 
 use AhoCorasick\MultiStringMatcher;
-use AhoCorasick\DeterministicMultiStringMatcher;
 use AhoCorasick\MultiStringReplacer;
 
 /**
@@ -43,10 +42,9 @@
        /** @param string $text The text to search in. */
        public function searchIn( $text ) {
                $matches = array();
-               foreach ( $this->searchKeywords as $keyword ) {
-                       $length = mb_strlen( $keyword );
+               foreach ( $this->searchKeywords as $keyword => $length ) {
                        $offset = 0;
-                       while ( ( $offset = mb_strpos( $text, $keyword, $offset 
) ) !== false ) {
+                       while ( ( $offset = strpos( $text, $keyword, $offset ) 
) !== false ) {
                                $matches[] = array( $offset, $keyword );
                                $offset = $offset + $length;
                        }
@@ -72,7 +70,7 @@
                // then by search keyword.
                usort( $matches, function ( $a, $b ) {
                        return ( $a[0] - $b[0] )
-                               ?: ( mb_strlen( $a[1] ) - mb_strlen( $b[1] ) )
+                               ?: ( strlen( $a[1] ) - strlen( $b[1] ) )
                                ?: strcmp( $a[1], $b[1] );
                } );
        }
@@ -128,20 +126,6 @@
        }
 
 
-       /** @dataProvider matcherCaseProvider */
-       public function testDeterministicMultiStringMatcher( $inputText, 
$searchKeywords ) {
-               $referenceMatcher = new NaiveMultiStringMatcher( 
$searchKeywords );
-               $referenceResults = $referenceMatcher->searchIn( $inputText );
-               $this->sortMatcherResults( $referenceResults );
-
-               $actualMatcher = new DeterministicMultiStringMatcher( 
$searchKeywords );
-               $actualResults = $actualMatcher->searchIn( $inputText );
-               $this->sortMatcherResults( $actualResults );
-
-               $this->assertEquals( $referenceResults, $actualResults );
-       }
-
-
        public function replacerCaseProvider() {
                return array(
                        array(
@@ -172,5 +156,4 @@
                $expected = strtr( $inputText, $replacePairs );
                $this->assertEquals( $expected, $actual );
        }
-
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/217454
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9b44ffd375716131e0494e0b511845055692a38f
Gerrit-PatchSet: 1
Gerrit-Project: AhoCorasick
Gerrit-Branch: master
Gerrit-Owner: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to