 i18nlangtag/source/languagetag/languagetag.cxx |  143 ++++++++++++++++++++-----
 include/i18nlangtag/languagetag.hxx            |   25 ++--
 2 files changed, 131 insertions(+), 37 deletions(-)
New commits: commit 49656398d43fa03f8adb70b9be417f2fd65dd9ea Author: Eike Rathke <er...@redhat.com> Date: Mon Sep 2 18:01:13 2013 +0200 simpleExtract() with variants rsc needs to resolve all known defined languages without access to liblangtag because that would need /usr/local/share/liblangtag/language-subtag-registry.xml so we'd end up with a bootstrap problem (or would have to pass the database path). Change-Id: I6b966d45080da26cb89169cdb40cd8a58c04a276 diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx index b5c24ee..22f82fe 100644 --- a/i18nlangtag/source/languagetag/languagetag.cxx +++ b/i18nlangtag/source/languagetag/languagetag.cxx @@ -155,6 +155,7 @@ LanguageTag::LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize mbCachedLanguage( false), mbCachedScript( false), mbCachedCountry( false), + mbCachedVariants( false), mbIsFallback( false) { if (bCanonicalize) @@ -178,6 +179,7 @@ LanguageTag::LanguageTag( const com::sun::star::lang::Locale & rLocale ) mbCachedLanguage( false), mbCachedScript( false), mbCachedCountry( false), + mbCachedVariants( false), mbIsFallback( false) { } @@ -198,6 +200,7 @@ LanguageTag::LanguageTag( LanguageType nLanguage ) mbCachedLanguage( false), mbCachedScript( false), mbCachedCountry( false), + mbCachedVariants( false), mbIsFallback( false) { } @@ -220,6 +223,7 @@ LanguageTag::LanguageTag( const OUString& rBcp47, const OUString& rLanguage, mbCachedLanguage( false), mbCachedScript( false), mbCachedCountry( false), + mbCachedVariants( false), mbIsFallback( false) { if (!mbSystemLocale && !mbInitializedBcp47) @@ -257,6 +261,7 @@ LanguageTag::LanguageTag( const rtl_Locale & rLocale ) mbCachedLanguage( false), mbCachedScript( false), mbCachedCountry( false), + mbCachedVariants( false), mbIsFallback( false) { convertFromRtlLocale(); @@ -284,6 +289,7 @@ LanguageTag::LanguageTag( const LanguageTag & rLanguageTag ) mbCachedLanguage( rLanguageTag.mbCachedLanguage), 
mbCachedScript( rLanguageTag.mbCachedScript), mbCachedCountry( rLanguageTag.mbCachedCountry), + mbCachedVariants( rLanguageTag.mbCachedVariants), mbIsFallback( rLanguageTag.mbIsFallback) { if (mpImplLangtag) @@ -313,6 +319,7 @@ LanguageTag& LanguageTag::operator=( const LanguageTag & rLanguageTag ) mbCachedLanguage = rLanguageTag.mbCachedLanguage; mbCachedScript = rLanguageTag.mbCachedScript; mbCachedCountry = rLanguageTag.mbCachedCountry; + mbCachedVariants = rLanguageTag.mbCachedVariants; mbIsFallback = rLanguageTag.mbIsFallback; if (mpImplLangtag) theDataRef::get().incRef(); @@ -360,6 +367,7 @@ void LanguageTag::resetVars() mbCachedLanguage = false; mbCachedScript = false; mbCachedCountry = false; + mbCachedVariants = false; mbIsFallback = false; } @@ -441,20 +449,22 @@ bool LanguageTag::canonicalize() // and want to determine if parsing it would be possible // without using liblangtag just to see if it is a simple known // locale. - OUString aLanguage, aScript, aCountry; - Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry); + OUString aLanguage, aScript, aCountry, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); if (eExt != EXTRACTED_NONE) { - if (eExt == EXTRACTED_LSC) + if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV) { // Rebuild bcp47 with proper casing of tags. 
OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() + - 1 + aCountry.getLength()); + 1 + aCountry.getLength() + 1 + aVariants.getLength()); aBuf.append( aLanguage); if (!aScript.isEmpty()) aBuf.append("-" + aScript); if (!aCountry.isEmpty()) aBuf.append("-" + aCountry); + if (!aVariants.isEmpty()) + aBuf.append("-" + aVariants); OUString aStr( aBuf.makeStringAndClear()); if (maBcp47 != aStr) @@ -749,7 +759,7 @@ OUString LanguageTag::getLanguageFromLangtag() } else { - if (mbCachedLanguage || cacheSimpleLSC()) + if (mbCachedLanguage || cacheSimpleLSCV()) aLanguage = maCachedLanguage; } return aLanguage; @@ -775,7 +785,7 @@ OUString LanguageTag::getScriptFromLangtag() } else { - if (mbCachedScript || cacheSimpleLSC()) + if (mbCachedScript || cacheSimpleLSCV()) aScript = maCachedScript; } return aScript; @@ -808,7 +818,7 @@ OUString LanguageTag::getRegionFromLangtag() } else { - if (mbCachedCountry || cacheSimpleLSC()) + if (mbCachedCountry || cacheSimpleLSCV()) aRegion = maCachedCountry; } return aRegion; @@ -840,6 +850,11 @@ OUString LanguageTag::getVariantsFromLangtag() } } } + else + { + if (mbCachedVariants || cacheSimpleLSCV()) + aVariants = maCachedVariants; + } return aVariants; } @@ -1020,7 +1035,12 @@ OUString LanguageTag::getRegion() const OUString LanguageTag::getVariants() const { - return const_cast<LanguageTag*>(this)->getVariantsFromLangtag(); + if (!mbCachedVariants) + { + maCachedVariants = const_cast<LanguageTag*>(this)->getVariantsFromLangtag(); + mbCachedVariants = true; + } + return maCachedVariants; } @@ -1055,16 +1075,18 @@ bool LanguageTag::hasScript() const } -bool LanguageTag::cacheSimpleLSC() +bool LanguageTag::cacheSimpleLSCV() { - OUString aLanguage, aScript, aCountry; - bool bRet = (simpleExtract( maBcp47, aLanguage, aScript, aCountry) == EXTRACTED_LSC); + OUString aLanguage, aScript, aCountry, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); + bool bRet = (eExt == 
EXTRACTED_LSC || eExt == EXTRACTED_LV); if (bRet) { maCachedLanguage = aLanguage; maCachedScript = aScript; maCachedCountry = aCountry; - mbCachedLanguage = mbCachedScript = mbCachedCountry = true; + maCachedVariants = aVariants; + mbCachedLanguage = mbCachedScript = mbCachedCountry = mbCachedVariants = true; } return bRet; } @@ -1275,11 +1297,14 @@ bool LanguageTag::operator!=( const LanguageTag & rLanguageTag ) const // static LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47, - OUString& rLanguage, OUString& rScript, OUString& rCountry ) + OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants ) { Extraction eRet = EXTRACTED_NONE; const sal_Int32 nLen = rBcp47.getLength(); const sal_Int32 nHyph1 = rBcp47.indexOf( '-'); + const sal_Int32 nHyph2 = (nHyph1 < 0 ? -1 : rBcp47.indexOf( '-', nHyph1 + 1)); + const sal_Int32 nHyph3 = (nHyph2 < 0 ? -1 : rBcp47.indexOf( '-', nHyph2 + 1)); + const sal_Int32 nHyph4 = (nHyph3 < 0 ? -1 : rBcp47.indexOf( '-', nHyph3 + 1)); if (nLen == 1 && rBcp47[0] == '*') // * the dreaded jolly joker { // It's f*d up but we need to recognize this. @@ -1290,34 +1315,96 @@ LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47, // x-... privateuse tags MUST be known to us by definition. 
eRet = EXTRACTED_X; } - else if ((nLen == 2 || nLen == 3) && nHyph1 < 0) // ll or lll + else if (nLen == 2 || nLen == 3) // ll or lll { - rLanguage = rBcp47.toAsciiLowerCase(); - rScript = rCountry = OUString(); - eRet = EXTRACTED_LSC; + if (nHyph1 < 0) + { + rLanguage = rBcp47.toAsciiLowerCase(); + rScript = rCountry = rVariants = OUString(); + eRet = EXTRACTED_LSC; + } } - else if ( (nLen == 5 && nHyph1 == 2) // ll-CC - || (nLen == 6 && nHyph1 == 3)) // lll-CC + else if ( (nHyph1 == 2 && nLen == 5) // ll-CC + || (nHyph1 == 3 && nLen == 6)) // lll-CC { - rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); - rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); - rScript = OUString(); - eRet = EXTRACTED_LSC; + if (nHyph2 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rScript = rVariants = OUString(); + eRet = EXTRACTED_LSC; + } + } + else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss + || (nHyph1 == 3 && nLen == 8)) // lll-Ssss + { + /* TODO: also accept a (DIGIT 3*ALNUM) vvvv variant instead of Ssss */ + if (nHyph2 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry = rVariants = OUString(); + eRet = EXTRACTED_LSC; + } } - else if ( (nHyph1 == 2 && nLen == 10) // ll-Ssss-CC check - || (nHyph1 == 3 && nLen == 11)) // lll-Ssss-CC check + else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 10) // ll-Ssss-CC + || (nHyph1 == 3 && nHyph2 == 8 && nLen == 11)) // lll-Ssss-CC { - const sal_Int32 nHyph2 = rBcp47.indexOf( '-', nHyph1 + 1); - if (nHyph2 == nHyph1 + 5) + if (nHyph3 < 0) { rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rVariants = OUString(); eRet = EXTRACTED_LSC; } } + 
else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...] + { + if (nHyph4 < 0 || (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9)) + { + rVariants = rBcp47.copy( nHyph3 + 1); + if (nHyph4 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength())) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + eRet = EXTRACTED_LV; + } + } + } + else if ( (nHyph1 == 2 && nHyph2 == 5 && nLen >= 10) // ll-CC-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 6 && nLen >= 11)) // lll-CC-vvvv[vvvv][-...] + { + if (nHyph3 < 0 || (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9)) + { + rVariants = rBcp47.copy( nHyph2 + 1); + if (nHyph3 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength())) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rScript = OUString(); + eRet = EXTRACTED_LV; + } + } + } + else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...] + || (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...] 
+ { + if (nHyph2 < 0 || (nHyph2 - nHyph1 > 5 && nHyph2 - nHyph1 <= 9)) + { + rVariants = rBcp47.copy( nHyph1 + 1); + if (nHyph2 < 0 && (rVariants.getLength() < 5 || 8 < rVariants.getLength())) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rCountry = OUString(); + eRet = EXTRACTED_LV; + } + } + } if (eRet == EXTRACTED_NONE) - rLanguage = rScript = rCountry = OUString(); + rLanguage = rScript = rCountry = rVariants = OUString(); return eRet; } diff --git a/include/i18nlangtag/languagetag.hxx b/include/i18nlangtag/languagetag.hxx index 403221f..d87c5e3 100644 --- a/include/i18nlangtag/languagetag.hxx +++ b/include/i18nlangtag/languagetag.hxx @@ -449,6 +449,7 @@ private: mutable OUString maCachedLanguage; ///< cache getLanguage() mutable OUString maCachedScript; ///< cache getScript() mutable OUString maCachedCountry; ///< cache getCountry() + mutable OUString maCachedVariants; ///< cache getVariants() mutable void* mpImplLangtag; ///< actually lt_tag_t pointer, encapsulated mutable LanguageType mnLangID; mutable Decision meIsValid; @@ -462,6 +463,7 @@ private: mutable bool mbCachedLanguage : 1; mutable bool mbCachedScript : 1; mutable bool mbCachedCountry : 1; + mutable bool mbCachedVariants : 1; bool mbIsFallback : 1; void convertLocaleToBcp47(); @@ -489,12 +491,12 @@ private: void resetVars(); - /** Obtain Language, Script and Country via simpleExtract() and assign them - to the cached variables if successful. + /** Obtain Language, Script, Country and Variants via simpleExtract() and + assign them to the cached variables if successful. @return return of simpleExtract() */ - bool cacheSimpleLSC(); + bool cacheSimpleLSCV(); static bool isIsoLanguage( const OUString& rLanguage ); static bool isIsoScript( const OUString& rScript ); @@ -504,23 +506,28 @@ private: { EXTRACTED_NONE, EXTRACTED_LSC, + EXTRACTED_LV, EXTRACTED_X, EXTRACTED_X_JOKER }; - /** Of a simple language tag of the form lll[-Ssss][-CC] (i.e. 
one that - would fulfill the isIsoODF() condition) extract the portions. + /** Of a language tag of the form lll[-Ssss][-CC][-vvvvvvvv] extract the + portions. Does not check case or content! - @return EXTRACTED_LSC if simple tag was detected, EXTRACTED_X if x-... - privateuse tag was detected, EXTRACTED_X_JOKER if "*" joker was - detected, else EXTRACTED_NONE. + @return EXTRACTED_LSC if simple tag was detected (i.e. one that + would fulfill the isIsoODF() condition), + EXTRACTED_LV if a tag with variant was detected, + EXTRACTED_X if x-... privateuse tag was detected, + EXTRACTED_X_JOKER if "*" joker was detected, + EXTRACTED_NONE else. */ static Extraction simpleExtract( const OUString& rBcp47, OUString& rLanguage, OUString& rScript, - OUString& rCountry ); + OUString& rCountry, + OUString& rVariants ); }; _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits