jenkins-bot has submitted this change and it was merged.

Change subject: Make MonthNameUnlocalizer aware of genitive month names
......................................................................


Make MonthNameUnlocalizer aware of genitive month names

See Ic7a5797.

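This includes a major rewrite of the unlocalize method. It now tries
to unlocalize the longest matching month names first, which should
avoid conflicts between names that are substrings of one another
(e.g. the abbreviation "Jul" inside "Juli").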

Change-Id: I232fd3b5433f04396c53eecf3ad49d025a84ee64
---
M lib/includes/parsers/MonthNameUnlocalizer.php
M lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php
2 files changed, 118 insertions(+), 50 deletions(-)

Approvals:
  Daniel Kinzler: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/includes/parsers/MonthNameUnlocalizer.php 
b/lib/includes/parsers/MonthNameUnlocalizer.php
index ef6fd9e..990dd0a 100644
--- a/lib/includes/parsers/MonthNameUnlocalizer.php
+++ b/lib/includes/parsers/MonthNameUnlocalizer.php
@@ -6,7 +6,7 @@
 use ValueParsers\ParserOptions;
 
 /**
- * Class to unlocalise month names using Mediawiki's Language object
+ * Class to unlocalize a month name in a date string using MediaWiki's 
Language object.
  *
  * @since 0.5
  *
@@ -18,51 +18,77 @@
 class MonthNameUnlocalizer {
 
        /**
-        * @see Unlocalizer::unlocalize()
+        * @see NumberUnlocalizer::unlocalizeNumber
         *
-        * @param string $string string to process
-        * @param string $langCode
+        * @param string $date Localized date string.
+        * @param string $languageCode
         * @param ParserOptions $options
-        *
-        * @return string unlocalized string
-        */
-       public function unlocalize( $string, $langCode, ParserOptions $options 
) {
-               if( $langCode === 'en' ) {
-                       return $string;
-               }
-
-               $lang = Language::factory( $langCode );
-               $en = Language::factory( 'en' );
-
-               $string = $this->unlocalizeMonthNames( $lang, $en, $string );
-
-               return $string;
-       }
-
-       /**
-        * Unlocalizes month names in a string, checking both full month names 
and abbreviations
-        * @param Language $from
-        * @param Language $to
-        * @param string $string
         *
         * @return string
         */
-       private function unlocalizeMonthNames( Language $from, Language $to, 
$string ) {
-               $initialString = $string;
-
-               for ( $i = 1; $i <= 12; $i++ ) {
-                       $string = str_replace( $from->getMonthName( $i ), 
$to->getMonthName( $i ), $string );
+       public function unlocalize( $date, $languageCode, ParserOptions 
$options ) {
+               if ( $languageCode === 'en' ) {
+                       return $date;
                }
 
-               if( $string !== $initialString ) {
-                       return $string;
-               }
+               $language = Language::factory( $languageCode );
+               $en = Language::factory( 'en' );
 
-               for ( $i = 1; $i <= 12; $i++ ) {
-                       $string = str_replace( $from->getMonthAbbreviation( $i 
), $to->getMonthName( $i ), $string );
-               }
+               $date = $this->unlocalizeMonthName( $language, $en, $date );
 
-               return $string;
+               return $date;
        }
 
-}
\ No newline at end of file
+       /**
+        * Unlocalizes the longest month name in a date string that could be 
found first.
+        * Tries to avoid doing multiple replacements and returns the localized 
original if in doubt.
+        * Takes full month names, genitive names and abbreviations into 
account.
+        *
+        * @param Language $from
+        * @param Language $to
+        * @param string $date Localized date string.
+        *
+        * @return string
+        */
+       private function unlocalizeMonthName( Language $from, Language $to, 
$date ) {
+               $replacements = array();
+
+               for ( $i = 1; $i <= 12; $i++ ) {
+                       $replace = $to->getMonthName( $i );
+
+                       $replacements[$from->getMonthName( $i )] = $replace;
+                       $replacements[$from->getMonthNameGen( $i )] = $replace;
+                       $replacements[$from->getMonthAbbreviation( $i )] = 
$replace;
+               }
+
+               // Order search strings from longest to shortest
+               uksort( $replacements, function( $a, $b ) {
+                       return strlen( $b ) - strlen( $a );
+               } );
+
+               foreach ( $replacements as $search => $replace ) {
+                       $unlocalized = str_replace( $search, $replace, $date, 
$count );
+
+                       // Nothing happened, try the next.
+                       if ( $count <= 0 ) {
+                               continue;
+                       }
+
+                       // Do not mess with strings that are clearly not a 
valid date.
+                       if ( $count > 1 ) {
+                               break;
+                       }
+
+                       // Do not mess with already unlocalized month names, 
e.g. "Juli" should not become
+                       // "Julyi" when replacing "Jul" with "July". But 
shortening "Julyus" to "July" is ok.
+                       if ( strpos( $date, $replace ) !== false && strlen( 
$replace ) >= strlen( $search ) ) {
+                               break;
+                       }
+
+                       return $unlocalized;
+               }
+
+               return $date;
+       }
+
+}
diff --git a/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php 
b/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php
index 7a47e43..0c5b24b 100644
--- a/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php
+++ b/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php
@@ -20,28 +20,70 @@
 
        public function provideUnlocalize() {
                $testCases = array(
-
-                       //Should unlocalize dates
+                       // Nominative month names.
                        array( '1 Juli 2013', 'de', '1 July 2013' ),
                        array( '1 Januarie 1999', 'af', '1 January 1999' ),
-                       array( '1 Jan 1999', 'af', '1 January 1999' ),
                        array( '16 Jenna 1999', 'bar', '16 January 1999' ),
+                       array( '12 Jänner 2013', 'de-at', '12 January 2013' ),
 
-                       //Shouldn#t do anything if we cant / don#t need to
+                       // Genitive month names.
+                       array( '1 Julis 2013', 'de', '1 July 2013' ),
+                       array( '31 Decembris 2013', 'la', '31 December 2013' ),
+
+                       // Abbreviations.
+                       array( '1 Jan 1999', 'af', '1 January 1999' ),
+                       array( '1 Mär. 1999', 'de', '1 March 1999' ),
+
+                       // Nothing to do in English.
                        array( '1 June 2013', 'en', '1 June 2013' ),
                        array( '1 Jan 2013', 'en', '1 Jan 2013' ),
-                       array( '16 FooBarBarxxx 1999', 'bar', '16 FooBarBarxxx 
1999' ),
+                       array( '1 January 1999', 'en', '1 January 1999' ),
 
+                       // No localized month name found.
+                       array( '16 FooBarBarxxx 1999', 'bar', '16 FooBarBarxxx 
1999' ),
+                       array( '16 Martii 1999', 'de', '16 Martii 1999' ),
+                       array( '16 May 1999', 'de', '16 May 1999' ),
+                       array( '16 Dezember 1999', 'la', '16 Dezember 1999' ),
+
+                       // Replace the longest unlocalized substring first.
+                       array( 'Juli Januar', 'de', 'Juli January' ),
+                       array( 'Juli Mai', 'de', 'July Mai' ),
+                       array( 'Juli December', 'de', 'July December' ),
+                       array( 'July Dezember', 'de', 'July December' ),
+                       array( 'Januar Mär Dez', 'de', 'January Mär Dez' ),
+
+                       // Do not mess with already unlocalized month names.
+                       array( 'January', 'de', 'January' ),
+                       array( 'April', 'la', 'April' ),
+                       array( 'Dec', 'de', 'Dec' ),
+                       array( '15 March 44 BC', 'nrm', '15 March 44 BC' ),
+                       array( 'Juni June', 'de', 'Juni June' ),
+                       array( 'July Jul', 'de', 'July Jul' ),
+
+                       // But shortening is ok even if a substring looks like 
it's already unlocalized.
+                       array( 'Mayo', 'war', 'May' ),
+                       array( 'July Julis', 'de', 'July July' ),
+
+                       // Do not mess with strings that are clearly not a 
valid date.
+                       array( 'Juli Juli', 'de', 'Juli Juli' ),
+
+                       // Word boundaries currently do not prevent 
unlocalization on purpose.
+                       array( 'Mai2013', 'de', 'May2013' ),
+                       array( 'Februarii', 'de', 'Februaryii' ),
+
+                       // Capitalization is currently significant. This may 
need to depend on the languages.
+                       array( '1 juli 2013', 'de', '1 juli 2013' ),
                );
 
-               //Loop through some other languages
-               $someLangs = array( 'war', 'ceb', 'uk', 'ru', 'de' );
+               // Loop through some other languages
+               $languageCodes = array( 'war', 'ceb', 'uk', 'ru', 'de' );
                $en = Language::factory( 'en' );
 
-               foreach( $someLangs as $from ) {
+               foreach ( $languageCodes as $from ) {
                        $fromLang = Language::factory( $from );
                        for ( $i = 1; $i <= 12; $i++ ) {
                                $testCases[] = array( $fromLang->getMonthName( 
$i ), $from, $en->getMonthName( $i ) );
+                               $testCases[] = array( 
$fromLang->getMonthNameGen( $i ), $from, $en->getMonthName( $i ) );
                                $testCases[] = array( 
$fromLang->getMonthAbbreviation( $i ), $from, $en->getMonthName( $i ) );
                        }
                }
@@ -53,16 +95,16 @@
         * @dataProvider provideUnlocalize
         *
         * @param $localized
-        * @param $lang
+        * @param $languageCode
         * @param $expected
         */
-       public function testUnlocalize( $localized, $lang, $expected ) {
+       public function testUnlocalize( $localized, $languageCode, $expected ) {
                $monthUnlocalizer = new MonthNameUnlocalizer();
                $options = new ParserOptions();
 
-               $actual = $monthUnlocalizer->unlocalize( $localized, $lang, 
$options );
+               $actual = $monthUnlocalizer->unlocalize( $localized, 
$languageCode, $options );
 
                $this->assertEquals( $expected, $actual );
        }
 
-}
\ No newline at end of file
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/148113
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I232fd3b5433f04396c53eecf3ad49d025a84ee64
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de>
Gerrit-Reviewer: Addshore <addshorew...@gmail.com>
Gerrit-Reviewer: Aude <aude.w...@gmail.com>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Eranroz <eranro...@gmail.com>
Gerrit-Reviewer: Hoo man <h...@online.de>
Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de>
Gerrit-Reviewer: WikidataJenkins <wikidata-servi...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to