i18nlangtag/source/languagetag/languagetag.cxx | 100 +++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 8 deletions(-)
New commits: commit 0382e86830daa3255feaa0561c678e58ad714126 Author: Eike Rathke <er...@redhat.com> AuthorDate: Fri Feb 17 01:56:09 2023 +0100 Commit: Eike Rathke <er...@redhat.com> CommitDate: Fri Feb 17 02:23:48 2023 +0000 Handle 3-digits UN M.49 region codes in simpleExtract() Necessary for {es-419} and the like that are predefined language tags with known LCID mappings. Change-Id: I46491842fc11836a55cd56327d4f955952c026f0 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/147182 Reviewed-by: Eike Rathke <er...@redhat.com> Tested-by: Jenkins diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx index 1f097a23a840..63462e3e6a3a 100644 --- a/i18nlangtag/source/languagetag/languagetag.cxx +++ b/i18nlangtag/source/languagetag/languagetag.cxx @@ -360,6 +360,7 @@ private: EXTRACTED_NONE, EXTRACTED_LSC, EXTRACTED_LV, + EXTRACTED_LR, EXTRACTED_C_LOCALE, EXTRACTED_X, EXTRACTED_X_JOKER, @@ -374,6 +375,7 @@ private: @return EXTRACTED_LSC if simple tag was detected (i.e. one that would fulfill the isIsoODF() condition), EXTRACTED_LV if a tag with variant was detected, + EXTRACTED_LR if a tag with 3-digit UN M.49 region code was detected EXTRACTED_C_LOCALE if a 'C' locale was detected, EXTRACTED_X if x-... privateuse tag was detected, EXTRACTED_X_JOKER if "*" joker was detected, @@ -384,6 +386,7 @@ private: OUString& rLanguage, OUString& rScript, OUString& rCountry, + OUString& rRegion, OUString& rVariants ); /** Convert Locale to BCP 47 string without resolving system and creating @@ -1120,20 +1123,22 @@ bool LanguageTagImpl::canonicalize() // and want to determine if parsing it would be possible // without using liblangtag just to see if it is a simple known // locale or could fall back to one. - OUString aLanguage, aScript, aCountry, aVariants; - Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); + OUString aLanguage, aScript, aCountry, aRegion, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants); if (eExt != EXTRACTED_NONE) { - if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV) + if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR) { // Rebuild bcp47 with proper casing of tags. OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() + - 1 + aCountry.getLength() + 1 + aVariants.getLength()); + 1 + aCountry.getLength() + 1 + aRegion.getLength() + 1 + aVariants.getLength()); aBuf.append( aLanguage); if (!aScript.isEmpty()) aBuf.append("-" + aScript); if (!aCountry.isEmpty()) aBuf.append("-" + aCountry); + if (!aRegion.isEmpty()) + aBuf.append("-" + aRegion); if (!aVariants.isEmpty()) aBuf.append("-" + aVariants); OUString aStr( aBuf.makeStringAndClear()); @@ -2036,9 +2041,9 @@ void LanguageTag::setScriptType(LanguageTag::ScriptType st) bool LanguageTagImpl::cacheSimpleLSCV() { - OUString aLanguage, aScript, aCountry, aVariants; - Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); - bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV); + OUString aLanguage, aScript, aCountry, aRegion, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants); + bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR); if (bRet) { maCachedLanguage = aLanguage; @@ -2440,7 +2445,7 @@ bool LanguageTag::operator<( const LanguageTag & rLanguageTag ) const // static LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp47, - OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants ) + OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rRegion, OUString& rVariants ) { Extraction eRet = EXTRACTED_NONE; const sal_Int32 nLen = rBcp47.getLength(); @@ -2464,6 +2469,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "C"; rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); } else if (nLen == 2 || nLen == 3) // ll or lll @@ -2473,6 +2479,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } @@ -2484,11 +2491,25 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp { rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rScript.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } } + else if ( (nHyph1 == 2 && nLen == 6) // ll-rrr + || (nHyph1 == 3 && nLen == 7)) // lll-rrr + { + if (nHyph2 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph1 + 1, 3); + rScript.clear(); + rVariants.clear(); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss or ll-vvvv || (nHyph1 == 3 && nLen == 8)) // lll-Ssss or lll-vvvv { @@ -2501,6 +2522,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph1 + 1); eRet = EXTRACTED_LV; } @@ -2510,6 +2532,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } @@ -2523,10 +2546,24 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 11) // ll-Ssss-rrr + || (nHyph1 == 3 && nHyph2 == 8 && nLen == 12)) // lll-Ssss-rrr + { + if (nHyph3 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph2 + 1, 3); + rVariants.clear(); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...] || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...] { @@ -2537,10 +2574,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph3 + 1); eRet = EXTRACTED_LV; } } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 11 && nLen >= 16) // ll-Ssss-rrr-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 12 && nLen >= 17)) // lll-Ssss-rrr-vvvv[vvvv][-...] + { + if (nHyph4 < 0) + nHyph4 = rBcp47.getLength(); + if (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph2 + 1, 3); + rVariants = rBcp47.copy( nHyph3 + 1); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nHyph2 == 5 && nHyph3 == 7) // ll-CC-u-... || (nHyph1 == 3 && nHyph2 == 6 && nHyph3 == 8)) // lll-CC-u-... { @@ -2556,6 +2609,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "es"; rScript.clear(); rCountry = "ES"; + rRegion.clear(); rVariants = "u-co-trad"; // not strictly a variant, but used to reconstruct the tag. eRet = EXTRACTED_LV; } @@ -2571,10 +2625,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph2 + 1); eRet = EXTRACTED_LV; } } + else if ( (nHyph1 == 2 && nHyph2 == 6 && nLen >= 11) // ll-rrr-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 7 && nLen >= 12)) // lll-rrr-vvvv[vvvv][-...] + { + if (nHyph3 < 0) + nHyph3 = rBcp47.getLength(); + if (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript.clear(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph1 + 1, 3); + rVariants = rBcp47.copy( nHyph2 + 1); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...] || (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...] { @@ -2585,6 +2655,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph1 + 1); eRet = EXTRACTED_LV; } @@ -2598,6 +2669,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "en"; rScript.clear(); rCountry = "GB"; + rRegion.clear(); rVariants = "oed"; eRet = EXTRACTED_LV; } @@ -2608,6 +2680,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "es"; rScript.clear(); rCountry = "ES"; + rRegion.clear(); rVariants = "tradnl"; // this is nonsense, but... ignored. eRet = EXTRACTED_KNOWN_BAD; } @@ -2619,8 +2692,19 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage.clear(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); } + else + { + assert(rLanguage.getLength() == 2 || rLanguage.getLength() == 3 + || eRet == EXTRACTED_X_JOKER || eRet == EXTRACTED_X || eRet == EXTRACTED_C_LOCALE); + assert(rScript.isEmpty() || rScript.getLength() == 4); + assert(rCountry.isEmpty() || rRegion.isEmpty()); // [2ALPHA / 3DIGIT] + assert(rCountry.isEmpty() || rCountry.getLength() == 2); + assert(rRegion.isEmpty() || rRegion.getLength() == 3); + assert(rVariants.isEmpty() || rVariants.getLength() >= 4 || rVariants == "oed"); + } return eRet; }