i18nlangtag/source/languagetag/languagetag.cxx |  100 +++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 8 deletions(-)

New commits:
commit 0382e86830daa3255feaa0561c678e58ad714126
Author:     Eike Rathke <er...@redhat.com>
AuthorDate: Fri Feb 17 01:56:09 2023 +0100
Commit:     Eike Rathke <er...@redhat.com>
CommitDate: Fri Feb 17 02:23:48 2023 +0000

    Handle 3-digits UN M.49 region codes in simpleExtract()
    
    Necessary for {es-419} and the like that are predefined language
    tags with known LCID mappings.
    
    Change-Id: I46491842fc11836a55cd56327d4f955952c026f0
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/147182
    Reviewed-by: Eike Rathke <er...@redhat.com>
    Tested-by: Jenkins

diff --git a/i18nlangtag/source/languagetag/languagetag.cxx 
b/i18nlangtag/source/languagetag/languagetag.cxx
index 1f097a23a840..63462e3e6a3a 100644
--- a/i18nlangtag/source/languagetag/languagetag.cxx
+++ b/i18nlangtag/source/languagetag/languagetag.cxx
@@ -360,6 +360,7 @@ private:
         EXTRACTED_NONE,
         EXTRACTED_LSC,
         EXTRACTED_LV,
+        EXTRACTED_LR,
         EXTRACTED_C_LOCALE,
         EXTRACTED_X,
         EXTRACTED_X_JOKER,
@@ -374,6 +375,7 @@ private:
         @return EXTRACTED_LSC if simple tag was detected (i.e. one that
                 would fulfill the isIsoODF() condition),
                 EXTRACTED_LV if a tag with variant was detected,
+                EXTRACTED_LR if a tag with 3-digit UN M.49 region code was 
detected
                 EXTRACTED_C_LOCALE if a 'C' locale was detected,
                 EXTRACTED_X if x-... privateuse tag was detected,
                 EXTRACTED_X_JOKER if "*" joker was detected,
@@ -384,6 +386,7 @@ private:
                                        OUString& rLanguage,
                                        OUString& rScript,
                                        OUString& rCountry,
+                                       OUString& rRegion,
                                        OUString& rVariants );
 
     /** Convert Locale to BCP 47 string without resolving system and creating
@@ -1120,20 +1123,22 @@ bool LanguageTagImpl::canonicalize()
                 // and want to determine if parsing it would be possible
                 // without using liblangtag just to see if it is a simple known
                 // locale or could fall back to one.
-                OUString aLanguage, aScript, aCountry, aVariants;
-                Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, 
aCountry, aVariants);
+                OUString aLanguage, aScript, aCountry, aRegion, aVariants;
+                Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, 
aCountry, aRegion, aVariants);
                 if (eExt != EXTRACTED_NONE)
                 {
-                    if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV)
+                    if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt 
== EXTRACTED_LR)
                     {
                         // Rebuild bcp47 with proper casing of tags.
                         OUStringBuffer aBuf( aLanguage.getLength() + 1 + 
aScript.getLength() +
-                                1 + aCountry.getLength() + 1 + 
aVariants.getLength());
+                                1 + aCountry.getLength() + 1 + 
aRegion.getLength() + 1 + aVariants.getLength());
                         aBuf.append( aLanguage);
                         if (!aScript.isEmpty())
                             aBuf.append("-" + aScript);
                         if (!aCountry.isEmpty())
                             aBuf.append("-" + aCountry);
+                        if (!aRegion.isEmpty())
+                            aBuf.append("-" + aRegion);
                         if (!aVariants.isEmpty())
                             aBuf.append("-" + aVariants);
                         OUString aStr( aBuf.makeStringAndClear());
@@ -2036,9 +2041,9 @@ void LanguageTag::setScriptType(LanguageTag::ScriptType 
st)
 
 bool LanguageTagImpl::cacheSimpleLSCV()
 {
-    OUString aLanguage, aScript, aCountry, aVariants;
-    Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, 
aVariants);
-    bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV);
+    OUString aLanguage, aScript, aCountry, aRegion, aVariants;
+    Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, 
aRegion, aVariants);
+    bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == 
EXTRACTED_LR);
     if (bRet)
     {
         maCachedLanguage = aLanguage;
@@ -2440,7 +2445,7 @@ bool LanguageTag::operator<( const LanguageTag & 
rLanguageTag ) const
 
 // static
 LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& 
rBcp47,
-        OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& 
rVariants )
+        OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& 
rRegion, OUString& rVariants )
 {
     Extraction eRet = EXTRACTED_NONE;
     const sal_Int32 nLen = rBcp47.getLength();
@@ -2464,6 +2469,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
         rLanguage = "C";
         rScript.clear();
         rCountry.clear();
+        rRegion.clear();
         rVariants.clear();
     }
     else if (nLen == 2 || nLen == 3)                // ll or lll
@@ -2473,6 +2479,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
             rLanguage = rBcp47.toAsciiLowerCase();
             rScript.clear();
             rCountry.clear();
+            rRegion.clear();
             rVariants.clear();
             eRet = EXTRACTED_LSC;
         }
@@ -2484,11 +2491,25 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
         {
             rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
             rCountry  = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+            rRegion.clear();
             rScript.clear();
             rVariants.clear();
             eRet = EXTRACTED_LSC;
         }
     }
+    else if (  (nHyph1 == 2 && nLen == 6)           // ll-rrr
+            || (nHyph1 == 3 && nLen == 7))          // lll-rrr
+    {
+        if (nHyph2 < 0)
+        {
+            rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+            rCountry.clear();
+            rRegion  = rBcp47.copy( nHyph1 + 1, 3);
+            rScript.clear();
+            rVariants.clear();
+            eRet = EXTRACTED_LR;
+        }
+    }
     else if (  (nHyph1 == 2 && nLen ==  7)          // ll-Ssss or ll-vvvv
             || (nHyph1 == 3 && nLen ==  8))         // lll-Ssss or lll-vvvv
     {
@@ -2501,6 +2522,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
                 rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
                 rScript.clear();
                 rCountry.clear();
+                rRegion.clear();
                 rVariants = rBcp47.copy( nHyph1 + 1);
                 eRet = EXTRACTED_LV;
             }
@@ -2510,6 +2532,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
                 rScript   = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() +
                             rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
                 rCountry.clear();
+                rRegion.clear();
                 rVariants.clear();
                 eRet = EXTRACTED_LSC;
             }
@@ -2523,10 +2546,24 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
             rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
             rScript   = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + 
rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
             rCountry  = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+            rRegion.clear();
             rVariants.clear();
             eRet = EXTRACTED_LSC;
         }
     }
+    else if (  (nHyph1 == 2 && nHyph2 == 7 && nLen == 11)   // ll-Ssss-rrr
+            || (nHyph1 == 3 && nHyph2 == 8 && nLen == 12))  // lll-Ssss-rrr
+    {
+        if (nHyph3 < 0)
+        {
+            rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+            rScript   = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + 
rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+            rCountry.clear();
+            rRegion  = rBcp47.copy( nHyph2 + 1, 3);
+            rVariants.clear();
+            eRet = EXTRACTED_LR;
+        }
+    }
     else if (  (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15)   // 
ll-Ssss-CC-vvvv[vvvv][-...]
             || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16))  // 
lll-Ssss-CC-vvvv[vvvv][-...]
     {
@@ -2537,10 +2574,26 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
             rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
             rScript   = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + 
rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
             rCountry  = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+            rRegion.clear();
             rVariants = rBcp47.copy( nHyph3 + 1);
             eRet = EXTRACTED_LV;
         }
     }
+    else if (  (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 11 && nLen >= 16)   // 
ll-Ssss-rrr-vvvv[vvvv][-...]
+            || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 12 && nLen >= 17))  // 
lll-Ssss-rrr-vvvv[vvvv][-...]
+    {
+        if (nHyph4 < 0)
+            nHyph4 = rBcp47.getLength();
+        if (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9)
+        {
+            rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+            rScript   = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + 
rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+            rCountry.clear();
+            rRegion   = rBcp47.copy( nHyph2 + 1, 3);
+            rVariants = rBcp47.copy( nHyph3 + 1);
+            eRet = EXTRACTED_LR;
+        }
+    }
     else if (  (nHyph1 == 2 && nHyph2 == 5 && nHyph3 == 7)      // ll-CC-u-...
             || (nHyph1 == 3 && nHyph2 == 6 && nHyph3 == 8))     // lll-CC-u-...
     {
@@ -2556,6 +2609,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
                 rLanguage = "es";
                 rScript.clear();
                 rCountry  = "ES";
+                rRegion.clear();
                 rVariants = "u-co-trad";    // not strictly a variant, but 
used to reconstruct the tag.
                 eRet = EXTRACTED_LV;
             }
@@ -2571,10 +2625,26 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
             rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
             rScript.clear();
             rCountry  = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+            rRegion.clear();
             rVariants = rBcp47.copy( nHyph2 + 1);
             eRet = EXTRACTED_LV;
         }
     }
+    else if (  (nHyph1 == 2 && nHyph2 == 6 && nLen >= 11)   // 
ll-rrr-vvvv[vvvv][-...]
+            || (nHyph1 == 3 && nHyph2 == 7 && nLen >= 12))  // 
lll-rrr-vvvv[vvvv][-...]
+    {
+        if (nHyph3 < 0)
+            nHyph3 = rBcp47.getLength();
+        if (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9)
+        {
+            rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+            rScript.clear();
+            rCountry.clear();
+            rRegion   = rBcp47.copy( nHyph1 + 1, 3);
+            rVariants = rBcp47.copy( nHyph2 + 1);
+            eRet = EXTRACTED_LR;
+        }
+    }
     else if (  (nHyph1 == 2 && nLen >= 8)                   // 
ll-vvvvv[vvv][-...]
             || (nHyph1 == 3 && nLen >= 9))                  // 
lll-vvvvv[vvv][-...]
     {
@@ -2585,6 +2655,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
             rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
             rScript.clear();
             rCountry.clear();
+            rRegion.clear();
             rVariants = rBcp47.copy( nHyph1 + 1);
             eRet = EXTRACTED_LV;
         }
@@ -2598,6 +2669,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
                 rLanguage = "en";
                 rScript.clear();
                 rCountry  = "GB";
+                rRegion.clear();
                 rVariants = "oed";
                 eRet = EXTRACTED_LV;
             }
@@ -2608,6 +2680,7 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
                 rLanguage = "es";
                 rScript.clear();
                 rCountry  = "ES";
+                rRegion.clear();
                 rVariants = "tradnl";   // this is nonsense, but... ignored.
                 eRet = EXTRACTED_KNOWN_BAD;
             }
@@ -2619,8 +2692,19 @@ LanguageTagImpl::Extraction 
LanguageTagImpl::simpleExtract( const OUString& rBcp
         rLanguage.clear();
         rScript.clear();
         rCountry.clear();
+        rRegion.clear();
         rVariants.clear();
     }
+    else
+    {
+        assert(rLanguage.getLength() == 2 || rLanguage.getLength() == 3
+                || eRet == EXTRACTED_X_JOKER || eRet == EXTRACTED_X || eRet == 
EXTRACTED_C_LOCALE);
+        assert(rScript.isEmpty()   || rScript.getLength() == 4);
+        assert(rCountry.isEmpty()  || rRegion.isEmpty());    // [2ALPHA / 
3DIGIT]
+        assert(rCountry.isEmpty()  || rCountry.getLength() == 2);
+        assert(rRegion.isEmpty()   || rRegion.getLength() == 3);
+        assert(rVariants.isEmpty() || rVariants.getLength() >= 4 || rVariants 
== "oed");
+    }
     return eRet;
 }
 

Reply via email to