Author: FabianLange
Date: 2010-02-15 22:53:33 +0100 (Mon, 15 Feb 2010)
New Revision: 28046

Modified:
   tools/i18n-icu2dat/icu-converter.php
Log:
Prepared the ICU converter for the ICU 4.3 release, in preparation for fixing 
some bugs (refs #7988, #8000).

Modified: tools/i18n-icu2dat/icu-converter.php
===================================================================
--- tools/i18n-icu2dat/icu-converter.php        2010-02-15 19:44:47 UTC (rev 
28045)
+++ tools/i18n-icu2dat/icu-converter.php        2010-02-15 21:53:33 UTC (rev 
28046)
@@ -1,8 +1,7 @@
 <?php
 /**
  * Converts the ICU files into Prado style format:
- * ICU: 
http://source.icu-project.org/repos/icu/icu/tags/release-4-2-1/source/data/locales
- * ICU Metazones: 
http://source.icu-project.org/repos/icu/icu/tags/release-4-2-1/source/data/misc/metazoneInfo.txt
+ * ICU: 
http://source.icu-project.org/repos/icu/icu/tags/milestone-4-3-4/source/data/
  * Prado: 
http://code.google.com/p/prado3/source/browse/#svn/trunk/framework/I18N/core/data
  *
  * Created for http://www.symfony-project.org by Fabian Lange 
([email protected])
@@ -12,42 +11,47 @@
  */
 /**
  * How to use:
- *  - put this file and sfYaml classe in a directory
- *  - use spec version 1.2 for sfYaml to parse 'no' correctly as norwegian 
(not as false)
- *  - download ICU files in subdirectory called "icu"
- * -  download metazone file
- *  - create subdirectories "data" and "yml"
+ *  - put this file and sfYaml classes in a directory
+ *  - use spec version 1.2 for sfYaml to parse 'no' correctly as Norwegian 
(not as false) (default for up-to-date sfYaml)
+ *  - download ICU files in subdirectory called "data"
  *  - preprocess some ICU files.
  */
 /**
  * Preprocess (take care not to kill utf-8):
  * due to line wrappings in icu files:
- * es.txt line 141 HK{"Región Administrativa Especial de Hong Kong de la 
República Popular China"}
- * he.txt line 1788 fix date time patterns
- * it.txt line 144 HK{"Regione Amministrativa Speciale di Hong Kong della 
Repubblica Popolare Cinese"}
- * it.txt line 193 MO{"Regione Amministrativa Speciale di Macao della 
Repubblica Popolare Cinese"}
+ * region/es.txt line 141 HK{"Región Administrativa Especial de Hong Kong de 
la República Popular China"}
+ * locales/he.txt line 1788 fix date time patterns
+ * region/it.txt line 140 HK{"Regione Amministrativa Speciale di Hong Kong 
della Repubblica Popolare Cinese"}
+ * region/it.txt line 193 MO{"Regione Amministrativa Speciale di Macao della 
Repubblica Popolare Cinese"}
  *
- * metazonesInfo.txt make sure the first line is a usable yml key 
(metazoneInfo:table(nofallback) -> metazoneInfo)
  * Note: feel free to supply a patch that eliminates the need of preprocessing
  */
 
-// Include the symfony YAML library (can be obtained from symfony 1.1 +)
+// Include the symfony YAML library (can be obtained from symfony 1.3 +)
 include_once('sfYaml.php');
 
-function sanitize($string) { 
-    if(substr($string, 0,3) == pack("CCC",0xef,0xbb,0xbf)) { 
-        $string = substr($string, 3); 
-    } 
-    return str_replace(array("\r\n", "\n", "\r"), "\n", $string); 
+function sanitize($string)
+{ 
+    if(substr($string, 0,3) == pack("CCC",0xef,0xbb,0xbf))
+    {
+        $string = substr($string, 3);
+    }
+    return str_replace(array("\r\n", "\n", "\r"), "\n", $string);
 }
 
 // some postprocessing might result in empty arrays. we should clean them 
before serializing
-function remove_emtpy_arrays($input) {
-  foreach ($input as $key => $value) {
-    if (is_array($value)) {
-      if (empty($value)) {
+function remove_emtpy_arrays($input)
+{
+  foreach ($input as $key => $value)
+  {
+    if (is_array($value))
+    {
+      if (empty($value))
+      {
         unset($input[$key]);
-      } else {
+      }
+      else
+      {
         $value = remove_emtpy_arrays($value);
       }
     }
@@ -55,78 +59,125 @@
   return $input;
 } 
 
-$files = glob("icu/*.txt");
-foreach ($files as $filename) {
-  $locale = substr($filename, 4);
-  $locale = substr($locale, 0, -4);
+// dirty hack to avoid special handling of the metazone input data
+copy('data/misc/metazoneInfo.txt', 'data/locales/metazoneInfo.txt');
 
-  // Step 1: convert ICU txt into yml:
-  $icu_data = file_get_contents('icu/'.$locale.'.txt');
-  $icu_data = sanitize($icu_data);
-  // skip Copyright - file starting with $locale{
-  $icu_data = preg_replace('/\/\/.*?('.$locale.'\{)/sm','$1', $icu_data);
+// since newer icu releases the data is split into multiple files
+$types = array ("locales", "curr", "zone", "lang", "region");
 
-  // Remove Duplicated from xxx package
-  $icu_data = preg_replace ('/\{\s*\/\*\*[^\}]*\}/sm','', $icu_data);
+foreach ($types as $type)
+{
+  $files = glob("data/".$type."/*.txt");
+  foreach ($files as $filename)
+  {
+    $locale = substr($filename, 6 + strlen($type));
+    $locale = substr($locale, 0, -4);
 
-  // this should reference the current locale
-  $icu_data = str_replace('/LOCALE', $locale, $icu_data);
-  // done for BC with old prado files
-  $icu_data = str_replace('%%ALIAS','__ALIAS', $icu_data);
-  
-  // original prado neither uses this, nor imports this correctly. this is a 
typemarker. php manages this on its own
-  // this enables sensible use of the field if required in future
-  $icu_data = str_replace(':intvector{','{ ', $icu_data);
-  $icu_data = str_replace(':int{','{ ', $icu_data);
+    // Step 1: convert ICU txt into yml:
+    $icu_data = file_get_contents('data/'.$type.'/'.$locale.'.txt');
+    $icu_data = sanitize($icu_data);
+    // skip Copyright - file starting with $locale{
+    $icu_data = preg_replace('/\/\/.*?('.$locale.'\{)/sm','$1', $icu_data);
 
-  //hack need to preserve {0} and {1} placeholders from later array 
conversions:
-  $icu_data = str_replace('{0}','<0>', $icu_data);
-  $icu_data = str_replace('{1}','<1>', $icu_data);
+    // Remove Duplicated from xxx package
+    $icu_data = preg_replace ('/\{\s*\/\*\*[^\}]*\}/sm','', $icu_data);
 
-  // Step 2: make yml out of icu format
-  $yml = $icu_data;
-  
-  // create array structure from csv
-  //             "R$",
-  // ->
-  //            - "R$"
-  $yml = preg_replace('/^(\s*)(.*),\s*$/m','$1- $2',$yml);
-  
-  // create array structure for name elements
-  //         PT{"Portugal"}
-  // ->
-  //         PT: ["Portugal"]
-  $yml = str_replace('"}','"]', $yml);
-  $yml = str_replace('{"',': ["', $yml);
-  
-  // create yml key-value pairs from { array notation
-  $yml = str_replace('{',':', $yml);
-  $yml = str_replace('}','', $yml);
-  
-  // some example chars are multiline, we will remove them anyway later.
-  // for parsing them lets pretend its a string block
-  $yml = str_replace('ExemplarCharacters:','ExemplarCharacters: |', $yml);
+    // this should reference the current locale
+    $icu_data = str_replace('/LOCALE', $locale, $icu_data);
+    // done for BC with old prado files
+    $icu_data = str_replace('%%ALIAS','__ALIAS', $icu_data);
 
-  // the original CultureInfo class simplified single element arrays into the 
element alone
-  // now we do this already at data creation time and remove the simplify() 
calles
-  // this greatly reduces file size and improces runtime performance
-  //         PT: ["Portugal"]
-  // ->
-  //         PT: "Portugal"
-  $yml = preg_replace('/\[("[^"]*")\]/','$1', $yml);
+    // original prado neither uses this, nor imports this correctly. this is a 
typemarker. php manages this on its own
+    // this enables sensible use of the field if required in future
+    $icu_data = str_replace(':intvector{','{ ', $icu_data);
+    $icu_data = str_replace(':int{','{ ', $icu_data);
 
-  //hack need to preserve {0} and {1} placeholders from later array 
conversions:
-  $yml = str_replace('<0>','{0}', $yml);
-  $yml = str_replace('<1>','{1}', $yml);
+    // hack need to preserve {0} and {1} placeholders from later array 
conversions:
+    $icu_data = str_replace('{0}','<0>', $icu_data);
+    $icu_data = str_replace('{1}','<1>', $icu_data);
 
-  // save for manual checks of generated yml
-  file_put_contents('yml/'.$locale.'.yml', $yml);
-  
-  // step 3: Load the YAML file and save serialized
-  $array = sfYaml::load('yml/'.$locale.'.yml');
+    // only valid for metazone, but will remove need for manual preprocessing
+    $icu_data = str_replace('metazoneInfo:table(nofallback)', 'metazoneInfo', 
$icu_data);
 
-  $dat_data = $array[$locale];
-  
+    // Step 2: make yml out of icu format
+    $yml = $icu_data;
+
+    // create array structure from csv
+    //             "R$",
+    // ->
+    //            - "R$"
+    $yml = preg_replace('/^(\s*)(.*),\s*$/m','$1- $2',$yml);
+
+    // create array structure for name elements
+    //         PT{"Portugal"}
+    // ->
+    //         PT: ["Portugal"]
+    $yml = str_replace('"}','"]', $yml);
+    $yml = str_replace('{"',': ["', $yml);
+
+    // create yml key-value pairs from { array notation
+    $yml = str_replace('{',':', $yml);
+    $yml = str_replace('}','', $yml);
+
+    // some example chars are multiline, we will remove them anyway later.
+    // for parsing them lets pretend its a string block
+    $yml = str_replace('ExemplarCharacters:','ExemplarCharacters: |', $yml);
+
+    // the original CultureInfo class simplified single element arrays into 
the element alone
+    // now we do this already at data creation time and remove the simplify() 
calles
+    // this greatly reduces file size and improces runtime performance
+    //         PT: ["Portugal"]
+    // ->
+    //         PT: "Portugal"
+    $yml = preg_replace('/\[("[^"]*")\]/','$1', $yml);
+
+    // hack need to preserve {0} and {1} placeholders from later array 
conversions:
+    $yml = str_replace('<0>','{0}', $yml);
+    $yml = str_replace('<1>','{1}', $yml);
+
+    // save for manual checks of generated yml
+    file_put_contents('data/'.$type.'/'.$locale.'.yml', $yml);
+  }
+}
+
+// we use the locales directory as input for locales, because it contains all 
files
+// 'region' or other may contain less files
+$files = glob("data/locales/*.yml");
+foreach ($files as $filename)
+{
+  $locale = substr($filename, 6 + strlen('locales'));
+  $locale = substr($locale, 0, -4);
+
+  // step 3: Load and Merge the YAML files and save serialized
+  $dat_data = array();
+  foreach ($types as $type)
+  {
+    $array = sfYaml::load('data/'.$type.'/'.$locale.'.yml');
+    if (is_array($array))
+    {
+      $type_data = $array[$locale];
+      if ($type == 'region' && isset($type_data['Countries']))
+      {
+        foreach ($type_data['Countries'] as $key => $country)
+        {
+          // numeric keys are regions, no countries (why are they in the data 
files?)
+          if (is_numeric($key)) unset($type_data['Countries'][$key]);
+        }
+        // ZZ is the unknown entry
+        unset($type_data['Countries']['ZZ']);
+      }
+      if ($type == 'curr')
+      {
+        // XXX is unknown and XTS is testing
+        unset($type_data['Currencies']['XTS']);
+        unset($type_data['Currencies']['XXX']);
+        unset($type_data['CurrencyPlurals']['XTS']);
+        unset($type_data['CurrencyPlurals']['XXX']);
+      }
+      $dat_data = array_merge($dat_data, $type_data);
+    }
+  }
+
   // those were not in prado and seem not to make any usable sense for us
   // we remove them to reduce file size
   unset($dat_data['codePatterns']);
@@ -134,6 +185,8 @@
   unset($dat_data['AuxExemplarCharacters']);
   unset($dat_data['CurrencyUnitPatterns']);
 
+  // possibly remove more, but the data is actually useful
+
   // clean any remaining empty arrays
   $dat_data = remove_emtpy_arrays($dat_data);
   
@@ -141,15 +194,16 @@
   file_put_contents('data/'.$locale.'.dat',serialize($dat_data));
 }
 
-//postprocess the metazoneInfo into root
+// postprocess the metazoneInfo.dat into root.dat
 $metazoneInfo = unserialize(file_get_contents('data/metazoneInfo.dat'));
 $zones = array();
-foreach ($metazoneInfo['metazoneMappings'] as $key => $value){
-  // for nor only take last valid timezone mapping
+foreach ($metazoneInfo['metazoneMappings'] as $key => $value)
+{
+  // only take last valid timezone mapping
   $validMetazone = array_pop($value);
   $zones[str_replace(':', '/', $key)] = $validMetazone[0];
 }
-//add to root file
+// add to root file
 $rootData = unserialize(file_get_contents('data/root.dat'));
 $rootData['TimeZones'] = $zones;
 file_put_contents('data/root.dat',serialize($rootData));

-- 
You received this message because you are subscribed to the Google Groups 
"symfony SVN" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/symfony-svn?hl=en.

Reply via email to