Dominic.sauer has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/215008

Change subject: Add Levenshtein distance and data cleaning
......................................................................

Add Levenshtein distance and data cleaning

Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6
---
M includes/CrossCheck/Comparer/StringComparer.php
M tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
2 files changed, 146 insertions(+), 22 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikidataQualityExternalValidation refs/changes/08/215008/1

diff --git a/includes/CrossCheck/Comparer/StringComparer.php b/includes/CrossCheck/Comparer/StringComparer.php
index fccf16d..8e5ffb6 100755
--- a/includes/CrossCheck/Comparer/StringComparer.php
+++ b/includes/CrossCheck/Comparer/StringComparer.php
@@ -15,7 +15,7 @@
 class StringComparer {
 
     /**
-     * Threshold for matching compliance in prefix/postfix similarity checks
+     * Threshold for matching compliance in prefix/suffix similarity checks
      */
     const SIMILARITY_THRESHOLD = 0.8;
 
@@ -29,6 +29,8 @@
     public function compare( $value, $comparativeValue ) {
         $this->assertIsString( $value, '$value' );
         $this->assertIsString( $comparativeValue, '$comparativeValue' );
+        $value = $this->cleanDataString( $value );
+        $comparativeValue = $this->cleanDataString( $comparativeValue );
 
         if ( $value === $comparativeValue ) {
             return CompareResult::STATUS_MATCH;
@@ -49,6 +51,8 @@
     public function compareArray( $value, array $comparativeValues ) {
         $this->assertIsString( $value, '$value' );
         $this->assertIsArrayOfStrings( $comparativeValues, '$comparativeValues' );
+        $value = $this->cleanDataString($value);
+        $comparativeValues = $this->cleanDataArray( $comparativeValues );
 
         if ( in_array( $value, $comparativeValues ) ) {
             return CompareResult::STATUS_MATCH;
@@ -73,6 +77,8 @@
     public function compareArrays( array $values, array $comparativeValues ) {
         $this->assertIsArrayOfStrings( $values, '$values' );
         $this->assertIsArrayOfStrings( $comparativeValues, '$comparativeValues' );
+        $values = $this->cleanDataArray( $values );
+        $comparativeValues = $this->cleanDataArray( $comparativeValues );
 
         if ( count( array_intersect( $values, $comparativeValues ) ) > 0 ) {
             return CompareResult::STATUS_MATCH;
@@ -114,7 +120,7 @@
     }
 
     /**
-     * Checks the similarity of two strings by prefix/postfix check.
+     * Checks the similarity of two strings by prefix/suffix check.
      *
      * @param string $value
      * @param string $comparativeValue
@@ -123,24 +129,52 @@
     private function checkSimilarity( $value, $comparativeValue ) {
         return
             $this->percentagePrefixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD ||
-            $this->percentagePostfixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD;
+            $this->percentageSuffixSimilarity( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD ||
+            $this->percentageLevenshteinDistance( $value, $comparativeValue ) > self::SIMILARITY_THRESHOLD;
+    }
+
+    /**
+     * Returns cleaned (without whitespaces at beginning/end and lowercase) string of a given input string.
+     *
+     * @param string $value
+     *
+     * @return string
+     */
+    private function cleanDataString( $value ) {
+        $value = trim( $value );
+
+        return strtolower( $value );
+    }
+
+    /**
+     * Returns cleaned (without whitespaces at beginning/end and lowercase) array of strings of a given input array.
+     *
+     * @param array $array
+     *
+     * @return string[]
+     */
+    private function cleanDataArray( array $array ) {
+
+        return array_map(
+            array( $this, 'cleanDataString'),
+            $array );
     }
 
     /**
      * Returns percentage of local value prefix-matching the external values.
      *
-     * @param localValue - value to prefix-match with external value
-     * @param $externalValue - value to prefix-match with local value
+     * @param $value - value to prefix-match with external value
+     * @param $comparativeValue - value to prefix-match with local value
      *
      * @return float
      */
-    private function percentagePrefixSimilarity( $localValue, $externalValue ) {
+    private function percentagePrefixSimilarity( $value, $comparativeValue ) {
         $prefixLength = 0; // common prefix length
-        $localLength = strlen( $localValue );
-        $externalLength = strlen( $externalValue );
+        $localLength = strlen( $value );
+        $externalLength = strlen( $comparativeValue );
         while ( $prefixLength < $localLength ) {
-            $c = $localValue[$prefixLength];
-            if ( $externalLength > $prefixLength && $externalValue[$prefixLength] !== $c ) {
+            $c = $value[$prefixLength];
+            if ( $externalLength > $prefixLength && $comparativeValue[$prefixLength] !== $c ) {
                 break;
             }
             $prefixLength++;
@@ -150,25 +184,42 @@
     }
 
     /**
-     * Returns percentage of local value postfix-matching the external values.
+     * Returns percentage of local value suffix-matching the external values.
      *
-     * @param $localValue - value to postfix-match with local value
-     * @param $externalValue - value to postfix-match with external value
+     * @param $value - value to suffix-match with external value
+     * @param $comparativeValue - value to suffix-match with local value
      *
      * @return float
      */
-    private function percentagePostfixSimilarity( $localValue, $externalValue ) {
-        $postfixLength = 0; // common postfix length
-        $localLength = strlen( $localValue );
-        $externalLength = strlen( $externalValue );
-        while ( $postfixLength < $localLength ) {
-            $c = $localValue[$localLength - 1 - $postfixLength];
-            if ( $externalLength > $postfixLength && $externalValue[$externalLength - 1 - $postfixLength] !== $c ) {
+    private function percentageSuffixSimilarity( $value, $comparativeValue ) {
+        $suffixLength = 0; // common suffix length
+        $localLength = strlen( $value );
+        $externalLength = strlen( $comparativeValue );
+        while ( $suffixLength < $localLength ) {
+            $c = $value[$localLength - 1 - $suffixLength];
+            if ( $externalLength > $suffixLength && $comparativeValue[$externalLength - 1 - $suffixLength] !== $c ) {
                 break;
             }
-            $postfixLength++;
+            $suffixLength++;
         }
 
-        return $postfixLength / $externalLength;
+        return $suffixLength / $externalLength;
     }
+
+    /**
+     * Returns percentage of similarity check using levenshtein distance.
+     *
+     * @param $value
+     * @param $comparativeValue
+     *
+     * @return float
+     */
+    private function percentageLevenshteinDistance( $value, $comparativeValue ) {
+        $distance = levenshtein( $value, $comparativeValue );
+        $percentage = 1.0 - $distance/max( strlen( $value ), strlen( $comparativeValue ) );
+
+        return $percentage;
+    }
+
+
 }
\ No newline at end of file
diff --git a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
index 18965f8..c70804d 100755
--- a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
+++ b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
@@ -53,6 +53,37 @@
             ),
             array(
                 'foobar',
+                'Foobar',
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                'foObar',
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                'FOOBAR',
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                '    foobar',
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                'foobar    ',
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                '    foobar',
+                '    foobar    ',
+                CompareResult::STATUS_MATCH
+            ),
+            // prefix/suffix partial match
+            array(
+                'foobar',
                 'foobaz',
                 CompareResult::STATUS_PARTIAL_MATCH
             ),
@@ -81,6 +112,28 @@
                 'oobar',
                 CompareResult::STATUS_PARTIAL_MATCH
             ),
+            // levenshtein partial match
+            array(
+                'foobar',
+                'fooobar',
+                CompareResult::STATUS_PARTIAL_MATCH
+            ),
+            array(
+                'fobar',
+                'foobar',
+                CompareResult::STATUS_PARTIAL_MATCH
+            ),
+            array(
+                'foubar',
+                'foobar',
+                CompareResult::STATUS_PARTIAL_MATCH
+            ),
+            array(
+                'Schlossstraße',
+                'Schloßstraße',
+                CompareResult::STATUS_PARTIAL_MATCH
+            ),
+            // mismatches
             array(
                 'fo',
                 'foobar',
@@ -153,6 +206,16 @@
             array(
                 'foobar',
                 array( 'fo', 'foobar' ),
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                array( 'fo', 'FOOBAR' ),
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                'foobar',
+                array( 'fo', '   FOOBAR   ' ),
                 CompareResult::STATUS_MATCH
             ),
             array(
@@ -235,6 +298,16 @@
                 CompareResult::STATUS_PARTIAL_MATCH
             ),
             array(
+                array( 'foobar', 'fubar' ),
+                array( 'bar', 'FOOBAR' ),
+                CompareResult::STATUS_MATCH
+            ),
+            array(
+                array( 'foobar', 'fubar' ),
+                array( 'bar', '   FOOBAR  ' ),
+                CompareResult::STATUS_MATCH
+            ),
+            array(
                 array( 'foobar', 'foo' ),
                 array( 'fubar', 'baz' ),
                 CompareResult::STATUS_MISMATCH

-- 
To view, visit https://gerrit.wikimedia.org/r/215008
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikidataQualityExternalValidation
Gerrit-Branch: master
Gerrit-Owner: Dominic.sauer <dominic.sa...@yahoo.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to