Author: FabianLange Date: 2010-02-15 22:53:33 +0100 (Mon, 15 Feb 2010) New Revision: 28046
Modified: tools/i18n-icu2dat/icu-converter.php Log: prepared icu converter for icu 4.3 release preparing to fix some bugs (refs #7988, #8000) Modified: tools/i18n-icu2dat/icu-converter.php =================================================================== --- tools/i18n-icu2dat/icu-converter.php 2010-02-15 19:44:47 UTC (rev 28045) +++ tools/i18n-icu2dat/icu-converter.php 2010-02-15 21:53:33 UTC (rev 28046) @@ -1,8 +1,7 @@ <?php /** * Converts the ICU files into Prado style format: - * ICU: http://source.icu-project.org/repos/icu/icu/tags/release-4-2-1/source/data/locales - * ICU Metazones: http://source.icu-project.org/repos/icu/icu/tags/release-4-2-1/source/data/misc/metazoneInfo.txt + * ICU: http://source.icu-project.org/repos/icu/icu/tags/milestone-4-3-4/source/data/ * Prado: http://code.google.com/p/prado3/source/browse/#svn/trunk/framework/I18N/core/data * * Created for http://www.symfony-project.org by Fabian Lange ([email protected]) @@ -12,42 +11,47 @@ */ /** * How to use: - * - put this file and sfYaml classe in a directory - * - use spec version 1.2 for sfYaml to parse 'no' correctly as norwegian (not as false) - * - download ICU files in subdirectory called "icu" - * - download metazone file - * - create subdirectories "data" and "yml" + * - put this file and sfYaml classes in a directory + * - use spec version 1.2 for sfYaml to parse 'no' correctly as norwegian (not as false) (default for up-to-date sfYaml) + * - download ICU files in subdirectory called "data" * - preprocess some ICU files.
*/ /** * Preprocess (take care not to kill utf-8): * due to line wrappings in icu files: - * es.txt line 141 HK{"Región Administrativa Especial de Hong Kong de la República Popular China"} - * he.txt line 1788 fix date time patterns - * it.txt line 144 HK{"Regione Amministrativa Speciale di Hong Kong della Repubblica Popolare Cinese"} - * it.txt line 193 MO{"Regione Amministrativa Speciale di Macao della Repubblica Popolare Cinese"} + * region/es.txt line 141 HK{"Región Administrativa Especial de Hong Kong de la República Popular China"} + * locales/he.txt line 1788 fix date time patterns + * region/it.txt line 140 HK{"Regione Amministrativa Speciale di Hong Kong della Repubblica Popolare Cinese"} + * region/it.txt line 193 MO{"Regione Amministrativa Speciale di Macao della Repubblica Popolare Cinese"} * - * metazonesInfo.txt make sure the first line is a usable yml key (metazoneInfo:table(nofallback) -> metazoneInfo) * Note: feel free to supply a patch that eliminates the need of preprocessing */ -// Include the symfony YAML library (can be obtained from symfony 1.1 +) +// Include the symfony YAML library (can be obtained from symfony 1.3 +) include_once('sfYaml.php'); -function sanitize($string) { - if(substr($string, 0,3) == pack("CCC",0xef,0xbb,0xbf)) { - $string = substr($string, 3); - } - return str_replace(array("\r\n", "\n", "\r"), "\n", $string); +function sanitize($string) +{ + if(substr($string, 0,3) == pack("CCC",0xef,0xbb,0xbf)) + { + $string = substr($string, 3); + } + return str_replace(array("\r\n", "\n", "\r"), "\n", $string); } // some postprocessing might result in empty arrays. 
we should clean them before serializing -function remove_emtpy_arrays($input) { - foreach ($input as $key => $value) { - if (is_array($value)) { - if (empty($value)) { +function remove_emtpy_arrays($input) +{ + foreach ($input as $key => $value) + { + if (is_array($value)) + { + if (empty($value)) + { unset($input[$key]); - } else { + } + else + { $value = remove_emtpy_arrays($value); } } @@ -55,78 +59,125 @@ return $input; } -$files = glob("icu/*.txt"); -foreach ($files as $filename) { - $locale = substr($filename, 4); - $locale = substr($locale, 0, -4); +// dirty hack to avoid special handling of the metazone input data +copy('data/misc/metazoneInfo.txt', 'data/locales/metazoneInfo.txt'); - // Step 1: convert ICU txt into yml: - $icu_data = file_get_contents('icu/'.$locale.'.txt'); - $icu_data = sanitize($icu_data); - // skip Copyright - file starting with $locale{ - $icu_data = preg_replace('/\/\/.*?('.$locale.'\{)/sm','$1', $icu_data); +// since newer icu releases the data is split into multiple files +$types = array ("locales", "curr", "zone", "lang", "region"); - // Remove Duplicated from xxx package - $icu_data = preg_replace ('/\{\s*\/\*\*[^\}]*\}/sm','', $icu_data); +foreach ($types as $type) +{ + $files = glob("data/".$type."/*.txt"); + foreach ($files as $filename) + { + $locale = substr($filename, 6 + strlen($type)); + $locale = substr($locale, 0, -4); - // this should reference the current locale - $icu_data = str_replace('/LOCALE', $locale, $icu_data); - // done for BC with old prado files - $icu_data = str_replace('%%ALIAS','__ALIAS', $icu_data); - - // original prado neither uses this, nor imports this correctly. this is a typemarker.
php manages this on its own - // this enables sensible use of the field if required in future - $icu_data = str_replace(':intvector{','{ ', $icu_data); - $icu_data = str_replace(':int{','{ ', $icu_data); + // Step 1: convert ICU txt into yml: + $icu_data = file_get_contents('data/'.$type.'/'.$locale.'.txt'); + $icu_data = sanitize($icu_data); + // skip Copyright - file starting with $locale{ + $icu_data = preg_replace('/\/\/.*?('.$locale.'\{)/sm','$1', $icu_data); - //hack need to preserve {0} and {1} placeholders from later array conversions: - $icu_data = str_replace('{0}','<0>', $icu_data); - $icu_data = str_replace('{1}','<1>', $icu_data); + // Remove Duplicated from xxx package + $icu_data = preg_replace ('/\{\s*\/\*\*[^\}]*\}/sm','', $icu_data); - // Step 2: make yml out of icu format - $yml = $icu_data; - - // create array structure from csv - // "R$", - // -> - // - "R$" - $yml = preg_replace('/^(\s*)(.*),\s*$/m','$1- $2',$yml); - - // create array structure for name elements - // PT{"Portugal"} - // -> - // PT: ["Portugal"] - $yml = str_replace('"}','"]', $yml); - $yml = str_replace('{"',': ["', $yml); - - // create yml key-value pairs from { array notation - $yml = str_replace('{',':', $yml); - $yml = str_replace('}','', $yml); - - // some example chars are multiline, we will remove them anyway later. 
- // for parsing them lets pretend its a string block - $yml = str_replace('ExemplarCharacters:','ExemplarCharacters: |', $yml); + // this should reference the current locale + $icu_data = str_replace('/LOCALE', $locale, $icu_data); + // done for BC with old prado files + $icu_data = str_replace('%%ALIAS','__ALIAS', $icu_data); - // the original CultureInfo class simplified single element arrays into the element alone - // now we do this already at data creation time and remove the simplify() calles - // this greatly reduces file size and improces runtime performance - // PT: ["Portugal"] - // -> - // PT: "Portugal" - $yml = preg_replace('/\[("[^"]*")\]/','$1', $yml); + // original prado neither uses this, nor imports this correctly. this is a typemarker. php manages this on its own + // this enables sensible use of the field if required in future + $icu_data = str_replace(':intvector{','{ ', $icu_data); + $icu_data = str_replace(':int{','{ ', $icu_data); - //hack need to preserve {0} and {1} placeholders from later array conversions: - $yml = str_replace('<0>','{0}', $yml); - $yml = str_replace('<1>','{1}', $yml); + // hack need to preserve {0} and {1} placeholders from later array conversions: + $icu_data = str_replace('{0}','<0>', $icu_data); + $icu_data = str_replace('{1}','<1>', $icu_data); - // save for manual checks of generated yml - file_put_contents('yml/'.$locale.'.yml', $yml); - - // step 3: Load the YAML file and save serialized - $array = sfYaml::load('yml/'.$locale.'.yml'); + // only valid for metazone, but will remove need for manual preprocessing + $icu_data = str_replace('metazoneInfo:table(nofallback)', 'metazoneInfo', $icu_data); - $dat_data = $array[$locale]; - + // Step 2: make yml out of icu format + $yml = $icu_data; + + // create array structure from csv + // "R$", + // -> + // - "R$" + $yml = preg_replace('/^(\s*)(.*),\s*$/m','$1- $2',$yml); + + // create array structure for name elements + // PT{"Portugal"} + // -> + // PT: ["Portugal"] + 
$yml = str_replace('"}','"]', $yml); + $yml = str_replace('{"',': ["', $yml); + + // create yml key-value pairs from { array notation + $yml = str_replace('{',':', $yml); + $yml = str_replace('}','', $yml); + + // some example chars are multiline, we will remove them anyway later. + // for parsing them lets pretend its a string block + $yml = str_replace('ExemplarCharacters:','ExemplarCharacters: |', $yml); + + // the original CultureInfo class simplified single element arrays into the element alone + // now we do this already at data creation time and remove the simplify() calles + // this greatly reduces file size and improces runtime performance + // PT: ["Portugal"] + // -> + // PT: "Portugal" + $yml = preg_replace('/\[("[^"]*")\]/','$1', $yml); + + // hack need to preserve {0} and {1} placeholders from later array conversions: + $yml = str_replace('<0>','{0}', $yml); + $yml = str_replace('<1>','{1}', $yml); + + // save for manual checks of generated yml + file_put_contents('data/'.$type.'/'.$locale.'.yml', $yml); + } +} + +// we use the locales directory as input for locales, because it contains all files +// 'region' or other may contain less files +$files = glob("data/locales/*.yml"); +foreach ($files as $filename) +{ + $locale = substr($filename, 6 + strlen('locales')); + $locale = substr($locale, 0, -4); + + // step 3: Load and Merge the YAML files and save serialized + $dat_data = array(); + foreach ($types as $type) + { + $array = sfYaml::load('data/'.$type.'/'.$locale.'.yml'); + if (is_array($array)) + { + $type_data = $array[$locale]; + if ($type == 'region' && isset($type_data['Countries'])) + { + foreach ($type_data['Countries'] as $key => $country) + { + // numeric keys are regions, no countries (why are they in the data files?) 
+ if (is_numeric($key)) unset($type_data['Countries'][$key]); + } + // ZZ is the unknown entry + unset($type_data['Countries']['ZZ']); + } + if ($type == 'curr') + { + // XXX is unknown and XTS is testing + unset($type_data['Currencies']['XTS']); + unset($type_data['Currencies']['XXX']); + unset($type_data['CurrencyPlurals']['XTS']); + unset($type_data['CurrencyPlurals']['XXX']); + } + $dat_data = array_merge($dat_data, $type_data); + } + } + // those were not in prado and seem not to make any usable sense for us // we remove them to reduce file size unset($dat_data['codePatterns']); @@ -134,6 +185,8 @@ unset($dat_data['AuxExemplarCharacters']); unset($dat_data['CurrencyUnitPatterns']); + // possibly remove more, but the data is actually useful + // clean any remaining empty arrays $dat_data = remove_emtpy_arrays($dat_data); @@ -141,15 +194,16 @@ file_put_contents('data/'.$locale.'.dat',serialize($dat_data)); } -//postprocess the metazoneInfo into root +// postprocess the metazoneInfo.dat into root.dat $metazoneInfo = unserialize(file_get_contents('data/metazoneInfo.dat')); $zones = array(); -foreach ($metazoneInfo['metazoneMappings'] as $key => $value){ - // for nor only take last valid timezone mapping +foreach ($metazoneInfo['metazoneMappings'] as $key => $value) +{ + // only take last valid timezone mapping $validMetazone = array_pop($value); $zones[str_replace(':', '/', $key)] = $validMetazone[0]; } -//add to root file +// add to root file $rootData = unserialize(file_get_contents('data/root.dat')); $rootData['TimeZones'] = $zones; file_put_contents('data/root.dat',serialize($rootData)); -- You received this message because you are subscribed to the Google Groups "symfony SVN" group. To post to this group, send email to [email protected]. To unsubscribe from this group, send email to [email protected]. For more options, visit this group at http://groups.google.com/group/symfony-svn?hl=en.
