jenkins-bot has submitted this change and it was merged.

Change subject: Make MonthNameUnlocalizer aware of genitive month names
......................................................................
Make MonthNameUnlocalizer aware of genitive month names See Ic7a5797. This includes a major rewrite of the method. Now it tries to unlocalize longer strings first. This should avoid all possible conflicts. Change-Id: I232fd3b5433f04396c53eecf3ad49d025a84ee64 --- M lib/includes/parsers/MonthNameUnlocalizer.php M lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php 2 files changed, 118 insertions(+), 50 deletions(-) Approvals: Daniel Kinzler: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/includes/parsers/MonthNameUnlocalizer.php b/lib/includes/parsers/MonthNameUnlocalizer.php index ef6fd9e..990dd0a 100644 --- a/lib/includes/parsers/MonthNameUnlocalizer.php +++ b/lib/includes/parsers/MonthNameUnlocalizer.php @@ -6,7 +6,7 @@ use ValueParsers\ParserOptions; /** - * Class to unlocalise month names using Mediawiki's Language object + * Class to unlocalize a month name in a date string using MediaWiki's Language object. * * @since 0.5 * @@ -18,51 +18,77 @@ class MonthNameUnlocalizer { /** - * @see Unlocalizer::unlocalize() + * @see NumberUnlocalizer::unlocalizeNumber * - * @param string $string string to process - * @param string $langCode + * @param string $date Localized date string. 
+ * @param string $languageCode * @param ParserOptions $options - * - * @return string unlocalized string - */ - public function unlocalize( $string, $langCode, ParserOptions $options ) { - if( $langCode === 'en' ) { - return $string; - } - - $lang = Language::factory( $langCode ); - $en = Language::factory( 'en' ); - - $string = $this->unlocalizeMonthNames( $lang, $en, $string ); - - return $string; - } - - /** - * Unlocalizes month names in a string, checking both full month names and abbreviations - * @param Language $from - * @param Language $to - * @param string $string * * @return string */ - private function unlocalizeMonthNames( Language $from, Language $to, $string ) { - $initialString = $string; - - for ( $i = 1; $i <= 12; $i++ ) { - $string = str_replace( $from->getMonthName( $i ), $to->getMonthName( $i ), $string ); + public function unlocalize( $date, $languageCode, ParserOptions $options ) { + if ( $languageCode === 'en' ) { + return $date; } - if( $string !== $initialString ) { - return $string; - } + $language = Language::factory( $languageCode ); + $en = Language::factory( 'en' ); - for ( $i = 1; $i <= 12; $i++ ) { - $string = str_replace( $from->getMonthAbbreviation( $i ), $to->getMonthName( $i ), $string ); - } + $date = $this->unlocalizeMonthName( $language, $en, $date ); - return $string; + return $date; } -} \ No newline at end of file + /** + * Unlocalizes the longest month name in a date string that could be found first. + * Tries to avoid doing multiple replacements and returns the localized original if in doubt. + * Takes full month names, genitive names and abbreviations into account. + * + * @param Language $from + * @param Language $to + * @param string $date Localized date string. 
+ * + * @return string + */ + private function unlocalizeMonthName( Language $from, Language $to, $date ) { + $replacements = array(); + + for ( $i = 1; $i <= 12; $i++ ) { + $replace = $to->getMonthName( $i ); + + $replacements[$from->getMonthName( $i )] = $replace; + $replacements[$from->getMonthNameGen( $i )] = $replace; + $replacements[$from->getMonthAbbreviation( $i )] = $replace; + } + + // Order search strings from longest to shortest + uksort( $replacements, function( $a, $b ) { + return strlen( $b ) - strlen( $a ); + } ); + + foreach ( $replacements as $search => $replace ) { + $unlocalized = str_replace( $search, $replace, $date, $count ); + + // Nothing happened, try the next. + if ( $count <= 0 ) { + continue; + } + + // Do not mess with strings that are clearly not a valid date. + if ( $count > 1 ) { + break; + } + + // Do not mess with already unlocalized month names, e.g. "Juli" should not become + // "Julyi" when replacing "Jul" with "July". But shortening "Julyus" to "July" is ok. + if ( strpos( $date, $replace ) !== false && strlen( $replace ) >= strlen( $search ) ) { + break; + } + + return $unlocalized; + } + + return $date; + } + +} diff --git a/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php b/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php index 7a47e43..0c5b24b 100644 --- a/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php +++ b/lib/tests/phpunit/parsers/MonthNameUnlocalizerTest.php @@ -20,28 +20,70 @@ public function provideUnlocalize() { $testCases = array( - - //Should unlocalize dates + // Nominative month names. array( '1 Juli 2013', 'de', '1 July 2013' ), array( '1 Januarie 1999', 'af', '1 January 1999' ), - array( '1 Jan 1999', 'af', '1 January 1999' ), array( '16 Jenna 1999', 'bar', '16 January 1999' ), + array( '12 Jänner 2013', 'de-at', '12 January 2013' ), - //Shouldn#t do anything if we cant / don#t need to + // Genitive month names. 
+ array( '1 Julis 2013', 'de', '1 July 2013' ), + array( '31 Decembris 2013', 'la', '31 December 2013' ), + + // Abbreviations. + array( '1 Jan 1999', 'af', '1 January 1999' ), + array( '1 Mär. 1999', 'de', '1 March 1999' ), + + // Nothing to do in English. array( '1 June 2013', 'en', '1 June 2013' ), array( '1 Jan 2013', 'en', '1 Jan 2013' ), - array( '16 FooBarBarxxx 1999', 'bar', '16 FooBarBarxxx 1999' ), + array( '1 January 1999', 'en', '1 January 1999' ), + // No localized month name found. + array( '16 FooBarBarxxx 1999', 'bar', '16 FooBarBarxxx 1999' ), + array( '16 Martii 1999', 'de', '16 Martii 1999' ), + array( '16 May 1999', 'de', '16 May 1999' ), + array( '16 Dezember 1999', 'la', '16 Dezember 1999' ), + + // Replace the longest unlocalized substring first. + array( 'Juli Januar', 'de', 'Juli January' ), + array( 'Juli Mai', 'de', 'July Mai' ), + array( 'Juli December', 'de', 'July December' ), + array( 'July Dezember', 'de', 'July December' ), + array( 'Januar Mär Dez', 'de', 'January Mär Dez' ), + + // Do not mess with already unlocalized month names. + array( 'January', 'de', 'January' ), + array( 'April', 'la', 'April' ), + array( 'Dec', 'de', 'Dec' ), + array( '15 March 44 BC', 'nrm', '15 March 44 BC' ), + array( 'Juni June', 'de', 'Juni June' ), + array( 'July Jul', 'de', 'July Jul' ), + + // But shortening is ok even if a substring looks like it's already unlocalized. + array( 'Mayo', 'war', 'May' ), + array( 'July Julis', 'de', 'July July' ), + + // Do not mess with strings that are clearly not a valid date. + array( 'Juli Juli', 'de', 'Juli Juli' ), + + // Word boundaries currently do not prevent unlocalization on purpose. + array( 'Mai2013', 'de', 'May2013' ), + array( 'Februarii', 'de', 'Februaryii' ), + + // Capitalization is currently significant. This may need to depend on the languages. 
+ array( '1 juli 2013', 'de', '1 juli 2013' ), ); - //Loop through some other languages - $someLangs = array( 'war', 'ceb', 'uk', 'ru', 'de' ); + // Loop through some other languages + $languageCodes = array( 'war', 'ceb', 'uk', 'ru', 'de' ); $en = Language::factory( 'en' ); - foreach( $someLangs as $from ) { + foreach ( $languageCodes as $from ) { $fromLang = Language::factory( $from ); for ( $i = 1; $i <= 12; $i++ ) { $testCases[] = array( $fromLang->getMonthName( $i ), $from, $en->getMonthName( $i ) ); + $testCases[] = array( $fromLang->getMonthNameGen( $i ), $from, $en->getMonthName( $i ) ); $testCases[] = array( $fromLang->getMonthAbbreviation( $i ), $from, $en->getMonthName( $i ) ); } } @@ -53,16 +95,16 @@ * @dataProvider provideUnlocalize * * @param $localized - * @param $lang + * @param $languageCode * @param $expected */ - public function testUnlocalize( $localized, $lang, $expected ) { + public function testUnlocalize( $localized, $languageCode, $expected ) { $monthUnlocalizer = new MonthNameUnlocalizer(); $options = new ParserOptions(); - $actual = $monthUnlocalizer->unlocalize( $localized, $lang, $options ); + $actual = $monthUnlocalizer->unlocalize( $localized, $languageCode, $options ); $this->assertEquals( $expected, $actual ); } -} \ No newline at end of file +} -- To view, visit https://gerrit.wikimedia.org/r/148113 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I232fd3b5433f04396c53eecf3ad49d025a84ee64 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> Gerrit-Reviewer: Addshore <addshorew...@gmail.com> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Eranroz <eranro...@gmail.com> Gerrit-Reviewer: Hoo man <h...@online.de> Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> Gerrit-Reviewer: 
WikidataJenkins <wikidata-servi...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits