Dominic.sauer has uploaded a new change for review. https://gerrit.wikimedia.org/r/215008
Change subject: Add Levenshtein distance and data cleaning ...................................................................... Add Levenshtein distance and data cleaning Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6 --- M includes/CrossCheck/Comparer/StringComparer.php M tests/phpunit/CrossCheck/Comparer/StringComparerTest.php 2 files changed, 146 insertions(+), 22 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikidataQualityExternalValidation refs/changes/08/215008/1 diff --git a/includes/CrossCheck/Comparer/StringComparer.php b/includes/CrossCheck/Comparer/StringComparer.php index fccf16d..8e5ffb6 100755 --- a/includes/CrossCheck/Comparer/StringComparer.php +++ b/includes/CrossCheck/Comparer/StringComparer.php @@ -15,7 +15,7 @@ class StringComparer { /** - * Threshold for matching compliance in prefix/postfix similarity checks + * Threshold for matching compliance in prefix/suffix similarity checks */ const SIMILARITY_THRESHOLD = 0.8; @@ -29,6 +29,8 @@ public function compare( $value, $comparativeValue ) { $this->assertIsString( $value, '$value' ); $this->assertIsString( $comparativeValue, '$comparativeValue' ); + $value = $this->cleanDataString( $value ); + $comparativeValue = $this->cleanDataString( $comparativeValue ); if ( $value === $comparativeValue ) { return CompareResult::STATUS_MATCH; @@ -49,6 +51,8 @@ public function compareArray( $value, array $comparativeValues ) { $this->assertIsString( $value, '$value' ); $this->assertIsArrayOfStrings( $comparativeValues, '$comparativeValues' ); + $value = $this->cleanDataString($value); + $comparativeValues = $this->cleanDataArray( $comparativeValues ); if ( in_array( $value, $comparativeValues ) ) { return CompareResult::STATUS_MATCH; @@ -73,6 +77,8 @@ public function compareArrays( array $values, array $comparativeValues ) { $this->assertIsArrayOfStrings( $values, '$values' ); $this->assertIsArrayOfStrings( $comparativeValues, '$comparativeValues' ); + $values = 
$this->cleanDataArray( $values ); + $comparativeValues = $this->cleanDataArray( $comparativeValues ); if ( count( array_intersect( $values, $comparativeValues ) ) > 0 ) { return CompareResult::STATUS_MATCH; @@ -114,7 +120,7 @@ } /** - * Checks the similarity of two strings by prefix/postfix check. + * Checks the similarity of two strings by prefix/suffix check. * * @param string $value * @param string $comparativeValue @@ -123,24 +129,52 @@ private function checkSimilarity( $value, $comparativeValue ) { return $this->percentagePrefixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD || - $this->percentagePostfixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD; + $this->percentageSuffixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD || + $this->percentageLevenshteinDistance( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD; + } + + /** + * Returns cleaned (without whitespace at beginning/end and lowercase) string of a given input string. + * + * @param string $value + * + * @return string + */ + private function cleanDataString( $value ) { + $value = trim( $value ); + + return strtolower( $value ); + } + + /** + * Returns cleaned (without whitespace at beginning/end and lowercase) array of strings of a given input array. + * + * @param array $array + * + * @return string[] + */ + private function cleanDataArray( array $array ) { + + return array_map( + array( $this, 'cleanDataString'), + $array ); } /** * Returns percentage of local value prefix-matching the external values. 
* - * @param localValue - value to prefix-match with external value - * @param $externalValue - value to prefix-match with local value + * @param $value - value to prefix-match with external value + * @param $comparativeValue - value to prefix-match with local value * * @return float */ - private function percentagePrefixSimilarity( $localValue, $externalValue ) { + private function percentagePrefixSimilarity( $value, $comparativeValue ) { $prefixLength = 0; // common prefix length - $localLength = strlen( $localValue ); - $externalLength = strlen( $externalValue ); + $localLength = strlen( $value ); + $externalLength = strlen( $comparativeValue ); while ( $prefixLength < $localLength ) { - $c = $localValue[$prefixLength]; - if ( $externalLength > $prefixLength && $externalValue[$prefixLength] !== $c ) { + $c = $value[$prefixLength]; + if ( $externalLength > $prefixLength && $comparativeValue[$prefixLength] !== $c ) { break; } $prefixLength++; @@ -150,25 +184,42 @@ } /** - * Returns percentage of local value postfix-matching the external values. + * Returns percentage of local value suffix-matching the external values. 
* - * @param $localValue - value to postfix-match with local value - * @param $externalValue - value to postfix-match with external value + * @param $value - value to suffix-match with local value + * @param $comparativeValue - value to suffix-match with external value * * @return float */ - private function percentagePostfixSimilarity( $localValue, $externalValue ) { - $postfixLength = 0; // common postfix length - $localLength = strlen( $localValue ); - $externalLength = strlen( $externalValue ); - while ( $postfixLength < $localLength ) { - $c = $localValue[$localLength - 1 - $postfixLength]; - if ( $externalLength > $postfixLength && $externalValue[$externalLength - 1 - $postfixLength] !== $c ) { + private function percentageSuffixSimilarity( $value, $comparativeValue ) { + $suffixLength = 0; // common suffix length + $localLength = strlen( $value ); + $externalLength = strlen( $comparativeValue ); + while ( $suffixLength < $localLength ) { + $c = $value[$localLength - 1 - $suffixLength]; + if ( $externalLength > $suffixLength && $comparativeValue[$externalLength - 1 - $suffixLength] !== $c ) { break; } - $postfixLength++; + $suffixLength++; } - return $postfixLength / $externalLength; + return $suffixLength / $externalLength; } + + /** + * Returns percentage of similarity check using levenshtein distance. 
+ * + * @param $value + * @param $comparativeValue + * + * @return float + */ + private function percentageLevenshteinDistance( $value, $comparativeValue ) { + $distance = levenshtein( $value, $comparativeValue ); + $percentage = 1.0 - $distance/max( strlen( $value ), strlen( $comparativeValue ) ); + + return $percentage; + } + + } \ No newline at end of file diff --git a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php index 18965f8..c70804d 100755 --- a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php +++ b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php @@ -53,6 +53,37 @@ ), array( 'foobar', + 'Foobar', + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + 'foObar', + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + 'FOOBAR', + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + ' foobar', + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + 'foobar ', + CompareResult::STATUS_MATCH + ), + array( + ' foobar', + ' foobar ', + CompareResult::STATUS_MATCH + ), + // prefix/suffix partial match + array( + 'foobar', 'foobaz', CompareResult::STATUS_PARTIAL_MATCH ), @@ -81,6 +112,28 @@ 'oobar', CompareResult::STATUS_PARTIAL_MATCH ), + // levenshtein partial match + array( + 'foobar', + 'fooobar', + CompareResult::STATUS_PARTIAL_MATCH + ), + array( + 'fobar', + 'foobar', + CompareResult::STATUS_PARTIAL_MATCH + ), + array( + 'foubar', + 'foobar', + CompareResult::STATUS_PARTIAL_MATCH + ), + array( + 'Schlossstraße', + 'Schloßstraße', + CompareResult::STATUS_PARTIAL_MATCH + ), + // mismatches array( 'fo', 'foobar', @@ -153,6 +206,16 @@ array( 'foobar', array( 'fo', 'foobar' ), + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + array( 'fo', 'FOOBAR' ), + CompareResult::STATUS_MATCH + ), + array( + 'foobar', + array( 'fo', ' FOOBAR ' ), CompareResult::STATUS_MATCH ), array( @@ -235,6 +298,16 @@ CompareResult::STATUS_PARTIAL_MATCH ), array( + array( 'foobar', 'fubar' 
), + array( 'bar', 'FOOBAR' ), + CompareResult::STATUS_MATCH + ), + array( + array( 'foobar', 'fubar' ), + array( 'bar', ' FOOBAR ' ), + CompareResult::STATUS_MATCH + ), + array( array( 'foobar', 'foo' ), array( 'fubar', 'baz' ), CompareResult::STATUS_MISMATCH -- To view, visit https://gerrit.wikimedia.org/r/215008 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikidataQualityExternalValidation Gerrit-Branch: master Gerrit-Owner: Dominic.sauer <dominic.sa...@yahoo.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits