Hi guys,

Attached is a patch for make_chm.php which solves character 
set issues and enables you to make CHM help files in any of the 
supported languages.

You need to have either iconv or mbstring enabled to make CHM files 
that are encoded in a character set other than ISO-8859-1 or 
Windows-1252. You should also specify the proper font by modifying the 
appropriate part of the code.

Well, I'm aware that the header of the file says
"PLEASE DO NOT MAKE ANY MAJOR MODIFICATIONS TO THIS CODE!" and
I actually made heavy modifications to it, but I hope it helps
the ongoing development.

Anyway, comments are welcome.


Cheers,

Moriyoshi

Index: make_chm.php
===================================================================
RCS file: /repository/phpdoc/chm/make_chm.php,v
retrieving revision 1.5
diff -u -r1.5 make_chm.php
--- make_chm.php        6 Oct 2002 09:35:03 -0000       1.5
+++ make_chm.php        17 Nov 2002 13:45:21 -0000
@@ -14,25 +14,79 @@
 $FANCY_PATH    = getenv("PHP_HELP_COMPILE_FANCYDIR");
 $LANGUAGE      = getenv("PHP_HELP_COMPILE_LANG");
 $INDEX_IN_HTML = "index.html";
+$INTERNAL_CHARSET = "UTF-8";
+$DEFAULT_FONT = "Arial,10,0";
 
 if (empty($FANCY_PATH)) { $FANCY_PATH = $HTML_PATH; }
 
 // Array to manual code -> HTML Help Code conversion
 // Code list: http://www.helpware.net/htmlhelp/hh_info.htm
 $LANGUAGES = array(
-    "tw"    => "0x404 Traditional Chinese",
-    "cs"    => "0x405 Czech",
-    "de"    => "0x407 German (Germany)",
-    "en"    => "0x809 Enlish (United Kingdom)",
-    "es"    => "0xc0a Spanish (International Sort)",
-    "fr"    => "0x40c French (France)",
-    "hu"    => "0x40e Hungarian",
-    "it"    => "0x410 Italian (Italy)",
-    "ja"    => "0x411 Japanese",
-    "kr"    => "0x412 Korean",
-    "nl"    => "0x413 Dutch (Netherlands)",
-    "pt_BR" => "0x416 Portuguese (Brazil)",
-    "zh"    => "0x804 Simplified Chinese"
+    "tw"    => array(
+                   "langcode" => "0x404 Traditional Chinese",
+                   "preferred_charset" => "CP950",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "cs"    => array(
+                   "langcode" => "0x405 Czech",
+                   "preferred_charset" => "Windows-1250",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "de"    => array(
+                   "langcode" => "0x407 German (Germany)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "en"    => array(
+                   "langcode" => "0x809 English (United Kingdom)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "es"    => array(
+                   "langcode" => "0xc0a Spanish (International Sort)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "fr"    => array(
+                   "langcode" => "0x40c French (France)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "hu"    => array(
+                   "langcode" => "0x40e Hungarian",
+                   "preferred_charset" => "Windows-1250",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "it"    => array(
+                   "langcode" => "0x410 Italian (Italy)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "ja"    => array(
+                   "langcode" => "0x411 Japanese",
+                   "preferred_charset" => "CP932",
+                   "preferred_font" => "MS P Gothic,10,0"
+               ),
+    "kr"    => array(
+                   "langcode" => "0x412 Korean",
+                   "preferred_charset" => "CP949",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "nl"    => array(
+                   "langcode" => "0x413 Dutch (Netherlands)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "pt_BR" => array(
+                   "langcode" => "0x416 Portuguese (Brazil)",
+                   "preferred_charset" => "Windows-1252",
+                   "preferred_font" => $DEFAULT_FONT
+               ),
+    "zh"    => array(
+                   "langcode" => "0x804 Simplified Chinese",
+                   "preferred_charset" => "CP936",
+                   "preferred_font" => $DEFAULT_FONT
+               )
 );
 
 // Files on the top level of the TOC
@@ -46,6 +100,18 @@
     "appendixes.html"
 );
 
+// backwards compatibility
+if (!function_exists("file_get_contents")) { // define the fallback only when PHP has no native file_get_contents()
+    function file_get_contents($file)
+    {
+        $cnt = file($file); // array of the file's lines, or false on failure
+        if ($cnt !== false) {
+            return join('', $cnt); // glue the lines back together into one string
+        }
+        return false; // propagate the read failure to the caller
+    }
+}
+
 // Header for index and toc 
 $HEADER = '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
 <html>
@@ -72,8 +138,8 @@
     $index = fopen("php_manual_$LANGUAGE.hhk", "w");
 
     // Write out file headers
-    fputs($toc,   $HEADER);
-    fputs($index, $HEADER);
+    fputs_wrapper($toc,   $HEADER);
+    fputs_wrapper($index, $HEADER);
 
     // Read original index file and drop out newlines
     $indexline = oneLiner("$HTML_PATH/$INDEX_IN_HTML");
@@ -85,7 +151,7 @@
     if ($FIRST_PAGE != $INDEX_IN_HTML) {
 
         // Find the name of the Table of Contents
-        preg_match('|CLASS=\"TOC\" ><DL ><DT ><B >(.*)</B|U', $indexline, $match);
+        preg_match('|CLASS=\"TOC\" *><DL *><DT *><B *>(.*)</B|U', $indexline, $match);
         if (empty($match[1])) { // Fallback
             $match[1] = "Table of Contents";
         }
@@ -94,40 +160,43 @@
     }
 
     // Find the name of the Preface
-    preg_match('|<A HREF="preface.html" >(.*)</A >|U', $indexline, $match);
+    preg_match('|<A +HREF="preface.html" *>([^<]*)</A *>|U', $indexline, $match);
     if (empty($match[1])) { // Fallback
         $match[1] = "Preface";
     }
     mapAndIndex($match[1], "preface.html", "    ", $toc, $index);
 
     // Now autofind the main pages
+
     $MAIN_REGEXP = join("|", $MAIN_FILES);
-    preg_match_all("![IVX]+\. <A HREF=\"($MAIN_REGEXP)\" >(.+)</A >(.+)(?=[IVX]+\. <A HREF=\"($MAIN_REGEXP)|</DT ></DL ></DD ></DL)!U", $indexline, $matches, PREG_SET_ORDER);
+
+    preg_match_all("![IVX]+[^<]*<A\\s+HREF=\"($MAIN_REGEXP)\"\\s*>([^<]+)</A\\s*>(.+)</DT\\s*></DL\\s*></DD\\s*><DT\\s*>!Ui", $indexline, $matches, PREG_SET_ORDER);
     
     // Go through the main files, and link in subpages
     foreach ($matches as $matchinfo) {
         mapAndIndex($matchinfo[2], $matchinfo[1], "    ", $toc, $index);
 
-        fputs($toc, "\n      <ul>\n");
-        preg_match_all("!<A HREF=\"(.+)\" >(.+)</A >!U", $matchinfo[3], $subpages, PREG_SET_ORDER);
+        fputs_wrapper($toc, "\n      <ul>\n");
+        preg_match_all("!<A\\s+HREF=\"([^\"]+)\"\\s*>([^<]*)</A\\s*>!iU", $matchinfo[3], $subpages, PREG_SET_ORDER);
+
         foreach ($subpages as $spinfo) {
             mapAndIndex($spinfo[2], $spinfo[1], "        ", $toc, $index);
             findDeeperLinks($spinfo[1], $toc, $index);
         }
-        fputs($toc, "\n      </ul>\n");
+        fputs_wrapper($toc, "\n      </ul>\n");
     }
 
     // Link in directly the copyright page
     $copyline = oneLiner("$HTML_PATH/copyright.html");
-    preg_match('|<A NAME="copyright" ></A ><P ><B >(.*)</B|U', $copyline, $match);
+    preg_match('|<A\\s+NAME="copyright"\\s*></A\\s*><P\\s*><B\\s*>([^<]*)</B|U', $copyline, $match);
     if (empty($match[1])) { // Fallback
         $match[1] = "Copyright";
     }
     mapAndIndex($match[1], "copyright.html", "    ", $toc, $index, 17);
 
     // Write out closing line, and end files
-    fputs($index, "  </ul>\n</body>\n</html>");
-    fputs($toc,   "  </ul>\n</body>\n</html>");
+    fputs_wrapper($index, "  </ul>\n</body>\n</html>");
+    fputs_wrapper($toc,   "  </ul>\n</body>\n</html>");
     fclose($index);
     fclose($toc);
 } // makeContentfiles() function end
@@ -148,41 +217,41 @@
            
     // Start writing the project file
     $project = fopen("php_manual_$LANGUAGE.hhp", "w");
-    fputs($project, "[OPTIONS]\n");
-    fputs($project, "Compatibility=1.1 or later\n");
-    fputs($project, "Compiled file=php_manual_$LANGUAGE.chm\n");
-    fputs($project, "Contents file=php_manual_$LANGUAGE.hhc\n");
-    fputs($project, "Index file=php_manual_$LANGUAGE.hhk\n");
-    fputs($project, "Default Font=Arial,10,0\n");
-    fputs($project, "Default Window=phpdoc\n");
-    fputs($project, "Default topic=$FANCY_PATH\\$FIRST_PAGE\n");
-    fputs($project, "Display compile progress=Yes\n");
-    fputs($project, "Full-text search=Yes\n");
+    fputs_wrapper($project, "[OPTIONS]\n");
+    fputs_wrapper($project, "Compatibility=1.1 or later\n");
+    fputs_wrapper($project, "Compiled file=php_manual_$LANGUAGE.chm\n");
+    fputs_wrapper($project, "Contents file=php_manual_$LANGUAGE.hhc\n");
+    fputs_wrapper($project, "Index file=php_manual_$LANGUAGE.hhk\n");
+    fputs_wrapper($project, "Default Font={$LANGUAGES[$LANGUAGE]['preferred_font']}\n");
+    fputs_wrapper($project, "Default Window=phpdoc\n");
+    fputs_wrapper($project, "Default topic=$FANCY_PATH\\$FIRST_PAGE\n");
+    fputs_wrapper($project, "Display compile progress=Yes\n");
+    fputs_wrapper($project, "Full-text search=Yes\n");
 
     // Get the proper language code from the array
-    fputs($project, "Language=" . $LANGUAGES[$LANGUAGE] . "\n");
+    fputs_wrapper($project, "Language={$LANGUAGES[$LANGUAGE]["langcode"]}\n");
 
     // Now try to find out how the manual named in the actual language
     // this must be in the index.html file as the title (DSSSL generated)
     $content = oneLiner("$HTML_PATH/$INDEX_IN_HTML");
-    if (preg_match("|<TITLE >(.*)</TITLE >|U", $content, $found)) {
+    if (preg_match("|<TITLE\s*>([^<]*)</TITLE\s*>|U", $content, $found)) {
         $MANUAL_TITLE = $found[1];
     } else { // Fallback
         $MANUAL_TITLE = "PHP Manual";
     }
 
-    fputs($project, "Title=$MANUAL_TITLE\n");
+    fputs_wrapper($project, "Title=$MANUAL_TITLE\n");
 
     // Define the phpdoc window style (adds more functionality)
-    fputs($project, "\n[WINDOWS]\nphpdoc=\"$MANUAL_TITLE\",\"php_manual_$LANGUAGE.hhc\",\"php_manual_$LANGUAGE.hhk\"," .
+    fputs_wrapper($project, "\n[WINDOWS]\nphpdoc=\"$MANUAL_TITLE\",\"php_manual_$LANGUAGE.hhc\",\"php_manual_$LANGUAGE.hhk\"," .
           "\"$FANCY_PATH\\$FIRST_PAGE\",\"$FANCY_PATH\\$FIRST_PAGE\",,,,,0x23520,,0x386e,,,,,,,,0\n");
 
     // Write out all the filenames as in FANCY_PATH
-    fputs($project, "\n[FILES]\n");
+    fputs_wrapper($project, "\n[FILES]\n");
     $handle = opendir($FANCY_PATH);
     while (false !== ($file = readdir($handle))) {
         if ($file != "." && $file != "..") {
-            fputs($project, "$FANCY_PATH\\$file\n");
+            fputs_wrapper($project, "$FANCY_PATH\\$file\n");
         }
     }
     closedir($handle);
@@ -195,18 +264,18 @@
     global $FANCY_PATH;
     $name = str_replace('"', '&quot;', $name);
 
-    fputs($toc, "
+    fputs_wrapper($toc, "
 $tabs<li><object type=\"text/sitemap\">
 $tabs  <param name=\"Name\" value=\"$name\">
 $tabs  <param name=\"Local\" value=\"$FANCY_PATH\\$local\">
 ");
 
     if ($imgnum != "auto") {
-        fputs($toc, "$tabs  <param name=\"ImageNumber\" value=\"$imgnum\">\n");
+        fputs_wrapper($toc, "$tabs  <param name=\"ImageNumber\" value=\"$imgnum\">\n");
     }
-    fputs($toc, "$tabs  </object>\n");
+    fputs_wrapper($toc, "$tabs  </object>\n");
 
-    fputs($index, "
+    fputs_wrapper($index, "
     <li><object type=\"text/sitemap\">
       <param name=\"Local\" value=\"$FANCY_PATH\\$local\">
       <param name=\"Name\" value=\"$name\">
@@ -223,10 +292,10 @@
     $contents = oneLiner("$HTML_PATH/$filename");
     
     // Find all sublinks
-    if (preg_match_all("!<DT\s+><A\s+HREF=\"(([\w\.-]+\.)+html)(\#[\w\.-]+)?\"\s+>(.*)</A\s+>!U", $contents, $matches, PREG_SET_ORDER)) {
+    if (preg_match_all("!<DT\\s*><A\\s+HREF=\"(([\\w\\.-]+\\.)+html)(\\#[\\w\\.-]+)?\"\\s*>([^<]*)</A\\s*>!U", $contents, $matches, PREG_SET_ORDER)) {
         
         // Print out the file informations for all the links
-        fputs($toc, "\n        <ul>");
+        fputs_wrapper($toc, "\n        <ul>");
         foreach ($matches as $onematch) {
             $param["html"] = $onematch[1];
             if (!empty($onematch[3])) {
@@ -235,7 +304,7 @@
             $param["title"] = strip_tags($onematch[4]);
             mapAndIndex($param["title"], $param["html"], "          ", $toc, $index);
         }
-        fputs($toc, "        </ul>\n");
+        fputs_wrapper($toc, "        </ul>\n");
 
     } else {
 
@@ -250,11 +319,70 @@
     
 } // findDeeperLinks() function end
 
+// Write a string to a file handle, converting it from the internal
+// charset to the charset preferred for the selected language first
+function fputs_wrapper($fp, $str)
+{
+    fputs($fp, convertCharset($str));
+}
 
 // Return a file joined on one line
 function oneLiner($filename)
 {
-    return preg_replace("/[\r|\n]{1,2}/", " ", join("", file($filename)));
+    global $INTERNAL_CHARSET;
+
+    // Replace line breaks with spaces. "|" is a literal inside a character
+    // class, so it is left out to keep pipe characters in the text intact
+    $buf = preg_replace("/[\r\n]{1,2}/U", " ", file_get_contents($filename));
+    $charset = detectDocumentCharset($buf);
+
+    // No charset declaration found: treat the document as UTF-8
+    if ($charset === false) $charset = "UTF-8";
+
+    // Convert the document to the internal charset if it differs
+    if ($charset != $INTERNAL_CHARSET) {
+        if (function_exists("iconv")) {
+            $buf = iconv($charset, $INTERNAL_CHARSET, $buf);
+        } elseif (function_exists("mb_convert_encoding")) {
+            $buf = mb_convert_encoding($buf, $INTERNAL_CHARSET, $charset);
+        } elseif (preg_match("/^UTF-?8$/i", $INTERNAL_CHARSET) && preg_match("/^(ISO-8859-1|WINDOWS-1252)$/i", $charset)) {
+            $buf = utf8_encode($buf);
+        } else {
+            die("charset conversion function is not available.");
+        }
+    }
+    return $buf;
 } // oneLiner() function end
 
+// Convert a string from the internal charset to the charset preferred
+// for the selected language (the 'preferred_charset' entry of $LANGUAGES)
+function convertCharset($buf)
+{
+    global $LANGUAGE, $LANGUAGES, $INTERNAL_CHARSET;
+
+    $charset = $LANGUAGES[$LANGUAGE]['preferred_charset'];
+
+    if ($charset != $INTERNAL_CHARSET) {
+        if (function_exists("iconv")) {
+            $buf = iconv($INTERNAL_CHARSET, $charset, $buf);
+        } elseif (function_exists("mb_convert_encoding")) {
+            $buf = mb_convert_encoding($buf, $charset, $INTERNAL_CHARSET);
+        } elseif (preg_match("/^UTF-?8$/i", $INTERNAL_CHARSET) && preg_match("/^(ISO-8859-1|WINDOWS-1252)$/i", $charset)) {
+            $buf = utf8_decode($buf);
+        } else {
+            die("$LANGUAGE locale is not supported.");
+        }
+    }
+    return $buf;
+} // convertCharset() function end
+
+// Returns the name of character set in the given document
+// (read from its Content-Type META tag), or false if none is found
+function detectDocumentCharset($doc)
+{
+    if (preg_match("/<META\\s+HTTP-EQUIV=\"CONTENT-TYPE\"\\s+CONTENT=\"TEXT\\/HTML;\\s*CHARSET=([\\w-]+)\"\\s*>/iU", $doc, $reg)) {
+        return $reg[1];
+    }
+    return false;
+}
 ?>
-- 
PHP Documentation Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to