Hi guys, Attached is a patch for make_chm.php that solves the character set issues and enables you to build CHM help files in any of the supported languages.
You need to have either iconv or mbstring enabled to build CHM files that are encoded in a character set other than ISO-8859-1 or Windows-1252. You should also specify the proper font by modifying the appropriate part of the code (the "preferred_font" entries in the $LANGUAGES array). Hm, well, I'm aware that the header of the file says "PLEASE DO NOT MAKE ANY MAJOR MODIFICATIONS TO THIS CODE!" and I did in fact modify it heavily, but I hope this helps the ongoing development. Anyway, comments are welcome. Cheers, Moriyoshi
Index: make_chm.php =================================================================== RCS file: /repository/phpdoc/chm/make_chm.php,v retrieving revision 1.5 diff -u -r1.5 make_chm.php --- make_chm.php 6 Oct 2002 09:35:03 -0000 1.5 +++ make_chm.php 17 Nov 2002 13:45:21 -0000 @@ -14,25 +14,79 @@ $FANCY_PATH = getenv("PHP_HELP_COMPILE_FANCYDIR"); $LANGUAGE = getenv("PHP_HELP_COMPILE_LANG"); $INDEX_IN_HTML = "index.html"; +$INTERNAL_CHARSET = "UTF-8"; +$DEFAULT_FONT = "Arial,10,0"; if (empty($FANCY_PATH)) { $FANCY_PATH = $HTML_PATH; } // Array to manual code -> HTML Help Code conversion // Code list: http://www.helpware.net/htmlhelp/hh_info.htm $LANGUAGES = array( - "tw" => "0x404 Traditional Chinese", - "cs" => "0x405 Czech", - "de" => "0x407 German (Germany)", - "en" => "0x809 Enlish (United Kingdom)", - "es" => "0xc0a Spanish (International Sort)", - "fr" => "0x40c French (France)", - "hu" => "0x40e Hungarian", - "it" => "0x410 Italian (Italy)", - "ja" => "0x411 Japanese", - "kr" => "0x412 Korean", - "nl" => "0x413 Dutch (Netherlands)", - "pt_BR" => "0x416 Portuguese (Brazil)", - "zh" => "0x804 Simplified Chinese" + "tw" => array( + "langcode" => "0x404 Traditional Chinese", + "preferred_charset" => "CP950", + "preferred_font" => $DEFAULT_FONT + ), + "cs" => array( + "langcode" => "0x405 Czech", + "preferred_charset" => "Windows-1250", + "preferred_font" => $DEFAULT_FONT + ), + "de" => array( + "langcode" => "0x407 German (Germany)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "en" => array( + "langcode" => "0x809 Enlish (United Kingdom)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "es" => array( + "langcode" => "0xc0a Spanish (International Sort)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "fr" => array( + "langcode" => "0x40c Frcsh (France)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "hu" => array( 
+ "langcode" => "0x40e Hungarian", + "preferred_charset" => "Windows-1250", + "preferred_font" => $DEFAULT_FONT + ), + "it" => array( + "langcode" => "0x410 Italian (Italy)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "ja" => array( + "langcode" => "0x411 Japanese", + "preferred_charset" => "CP932", + "preferred_font" => "MS P Gothic,10,0" + ), + "kr" => array( + "langcode" => "0x412 Korean", + "preferred_charset" => "CP949", + "preferred_font" => $DEFAULT_FONT + ), + "nl" => array( + "langcode" => "0x413 Dutch (Netherlands)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "pt_BR" => array( + "langcode" => "0x416 Portuguese (Brazil)", + "preferred_charset" => "Windows-1252", + "preferred_font" => $DEFAULT_FONT + ), + "zh" => array( + "langcode" => "0x804 Simplified Chinese", + "preferred_charset" => "CP936", + "preferred_font" => $DEFAULT_FONT + ) ); // Files on the top level of the TOC @@ -46,6 +100,18 @@ "appendixes.html" ); +// backwards compatibility +if (!function_exists("file_get_contents")) { + function file_get_contents($file) + { + $cnt = file($file); + if ($cnt !== false) { + return join('', $cnt); + } + return false; + } +} + // Header for index and toc $HEADER = '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN"> <html> @@ -72,8 +138,8 @@ $index = fopen("php_manual_$LANGUAGE.hhk", "w"); // Write out file headers - fputs($toc, $HEADER); - fputs($index, $HEADER); + fputs_wrapper($toc, $HEADER); + fputs_wrapper($index, $HEADER); // Read original index file and drop out newlines $indexline = oneLiner("$HTML_PATH/$INDEX_IN_HTML"); @@ -85,7 +151,7 @@ if ($FIRST_PAGE != $INDEX_IN_HTML) { // Find the name of the Table of Contents - preg_match('|CLASS=\"TOC\" ><DL ><DT ><B >(.*)</B|U', $indexline, $match); + preg_match('|CLASS=\"TOC\" *><DL *><DT *><B *>(.*)</B|U', $indexline, $match); if (empty($match[1])) { // Fallback $match[1] = "Table of Contents"; } @@ -94,40 +160,43 @@ } // Find the 
name of the Preface - preg_match('|<A HREF="preface.html" >(.*)</A >|U', $indexline, $match); + preg_match('|<A +HREF="preface.html" *>([^<]*)</A *>|U', $indexline, $match); if (empty($match[1])) { // Fallback $match[1] = "Preface"; } mapAndIndex($match[1], "preface.html", " ", $toc, $index); // Now autofind the main pages + $MAIN_REGEXP = join("|", $MAIN_FILES); - preg_match_all("![IVX]+\. <A HREF=\"($MAIN_REGEXP)\" >(.+)</A >(.+)(?=[IVX]+\. <A HREF=\"($MAIN_REGEXP)|</DT ></DL ></DD ></DL)!U", $indexline, $matches, PREG_SET_ORDER); + + +preg_match_all("![IVX]+[^<]*<A\\s+HREF=\"($MAIN_REGEXP)\"\\s*>([^<]+)</A\\s*>(.+)</DT\\s*></DL\\s*></DD\\s*><DT\\s*>!Ui", + $indexline, $matches, PREG_SET_ORDER); // Go through the main files, and link in subpages foreach ($matches as $matchinfo) { mapAndIndex($matchinfo[2], $matchinfo[1], " ", $toc, $index); - fputs($toc, "\n <ul>\n"); - preg_match_all("!<A HREF=\"(.+)\" >(.+)</A >!U", $matchinfo[3], $subpages, PREG_SET_ORDER); + fputs_wrapper($toc, "\n <ul>\n"); + preg_match_all("!<A\\s+HREF=\"([^\"]+)\"\\s*>([^<]*)</A\\s*>!iU", +$matchinfo[3], $subpages, PREG_SET_ORDER); + foreach ($subpages as $spinfo) { mapAndIndex($spinfo[2], $spinfo[1], " ", $toc, $index); findDeeperLinks($spinfo[1], $toc, $index); } - fputs($toc, "\n </ul>\n"); + fputs_wrapper($toc, "\n </ul>\n"); } // Link in directly the copyright page $copyline = oneLiner("$HTML_PATH/copyright.html"); - preg_match('|<A NAME="copyright" ></A ><P ><B >(.*)</B|U', $copyline, $match); + preg_match('|<A\\s+NAME="copyright"\\s*></A\\s*><P\\s*><B\\s*>([^<]*)</B|U', +$copyline, $match); if (empty($match[1])) { // Fallback $match[1] = "Copyright"; } mapAndIndex($match[1], "copyright.html", " ", $toc, $index, 17); // Write out closing line, and end files - fputs($index, " </ul>\n</body>\n</html>"); - fputs($toc, " </ul>\n</body>\n</html>"); + fputs_wrapper($index, " </ul>\n</body>\n</html>"); + fputs_wrapper($toc, " </ul>\n</body>\n</html>"); fclose($index); fclose($toc); } // 
makeContentfiles() function end @@ -148,41 +217,41 @@ // Start writing the project file $project = fopen("php_manual_$LANGUAGE.hhp", "w"); - fputs($project, "[OPTIONS]\n"); - fputs($project, "Compatibility=1.1 or later\n"); - fputs($project, "Compiled file=php_manual_$LANGUAGE.chm\n"); - fputs($project, "Contents file=php_manual_$LANGUAGE.hhc\n"); - fputs($project, "Index file=php_manual_$LANGUAGE.hhk\n"); - fputs($project, "Default Font=Arial,10,0\n"); - fputs($project, "Default Window=phpdoc\n"); - fputs($project, "Default topic=$FANCY_PATH\\$FIRST_PAGE\n"); - fputs($project, "Display compile progress=Yes\n"); - fputs($project, "Full-text search=Yes\n"); + fputs_wrapper($project, "[OPTIONS]\n"); + fputs_wrapper($project, "Compatibility=1.1 or later\n"); + fputs_wrapper($project, "Compiled file=php_manual_$LANGUAGE.chm\n"); + fputs_wrapper($project, "Contents file=php_manual_$LANGUAGE.hhc\n"); + fputs_wrapper($project, "Index file=php_manual_$LANGUAGE.hhk\n"); + fputs_wrapper($project, "Default +Font={$LANGUAGES[$LANGUAGE]['preferred_font']}\n"); + fputs_wrapper($project, "Default Window=phpdoc\n"); + fputs_wrapper($project, "Default topic=$FANCY_PATH\\$FIRST_PAGE\n"); + fputs_wrapper($project, "Display compile progress=Yes\n"); + fputs_wrapper($project, "Full-text search=Yes\n"); // Get the proper language code from the array - fputs($project, "Language=" . $LANGUAGES[$LANGUAGE] . 
"\n"); + fputs_wrapper($project, "Language={$LANGUAGES[$LANGUAGE]["langcode"]}\n"); // Now try to find out how the manual named in the actual language // this must be in the index.html file as the title (DSSSL generated) $content = oneLiner("$HTML_PATH/$INDEX_IN_HTML"); - if (preg_match("|<TITLE >(.*)</TITLE >|U", $content, $found)) { + if (preg_match("|<TITLE\s*>([^<]*)</TITLE\s*>|U", $content, $found)) { $MANUAL_TITLE = $found[1]; } else { // Fallback $MANUAL_TITLE = "PHP Manual"; } - fputs($project, "Title=$MANUAL_TITLE\n"); + fputs_wrapper($project, "Title=$MANUAL_TITLE\n"); // Define the phpdoc window style (adds more functionality) - fputs($project, "\n[WINDOWS]\nphpdoc=\"$MANUAL_TITLE\",\"php_manual_$LANGUAGE.hhc\",\"php_manual_$LANGUAGE.hhk\"," . + fputs_wrapper($project, +"\n[WINDOWS]\nphpdoc=\"$MANUAL_TITLE\",\"php_manual_$LANGUAGE.hhc\",\"php_manual_$LANGUAGE.hhk\"," + . "\"$FANCY_PATH\\$FIRST_PAGE\",\"$FANCY_PATH\\$FIRST_PAGE\",,,,,0x23520,,0x386e,,,,,,,,0\n"); // Write out all the filenames as in FANCY_PATH - fputs($project, "\n[FILES]\n"); + fputs_wrapper($project, "\n[FILES]\n"); $handle = opendir($FANCY_PATH); while (false !== ($file = readdir($handle))) { if ($file != "." 
&& $file != "..") { - fputs($project, "$FANCY_PATH\\$file\n"); + fputs_wrapper($project, "$FANCY_PATH\\$file\n"); } } closedir($handle); @@ -195,18 +264,18 @@ global $FANCY_PATH; $name = str_replace('"', '"', $name); - fputs($toc, " + fputs_wrapper($toc, " $tabs<li><object type=\"text/sitemap\"> $tabs <param name=\"Name\" value=\"$name\"> $tabs <param name=\"Local\" value=\"$FANCY_PATH\\$local\"> "); if ($imgnum != "auto") { - fputs($toc, "$tabs <param name=\"ImageNumber\" value=\"$imgnum\">\n"); + fputs_wrapper($toc, "$tabs <param name=\"ImageNumber\" +value=\"$imgnum\">\n"); } - fputs($toc, "$tabs </object>\n"); + fputs_wrapper($toc, "$tabs </object>\n"); - fputs($index, " + fputs_wrapper($index, " <li><object type=\"text/sitemap\"> <param name=\"Local\" value=\"$FANCY_PATH\\$local\"> <param name=\"Name\" value=\"$name\"> @@ -223,10 +292,10 @@ $contents = oneLiner("$HTML_PATH/$filename"); // Find all sublinks - if (preg_match_all("!<DT\s+><A\s+HREF=\"(([\w\.-]+\.)+html)(\#[\w\.-]+)?\"\s+>(.*)</A\s+>!U", $contents, $matches, PREG_SET_ORDER)) { + if +(preg_match_all("!<DT\\s*><A\\s+HREF=\"(([\\w\\.-]+\\.)+html)(\\#[\\w\\.-]+)?\"\\s*>([^<]*)</A\\s*>!U", + $contents, $matches, PREG_SET_ORDER)) { // Print out the file informations for all the links - fputs($toc, "\n <ul>"); + fputs_wrapper($toc, "\n <ul>"); foreach ($matches as $onematch) { $param["html"] = $onematch[1]; if (!empty($onematch[3])) { @@ -235,7 +304,7 @@ $param["title"] = strip_tags($onematch[4]); mapAndIndex($param["title"], $param["html"], " ", $toc, $index); } - fputs($toc, " </ul>\n"); + fputs_wrapper($toc, " </ul>\n"); } else { @@ -250,11 +319,61 @@ } // findDeeperLinks() function end +function fputs_wrapper($fp, $str) +{ + fputs($fp, convertCharset($str)); +} // Return a file joined on one line function oneLiner($filename) { - return preg_replace("/[\r|\n]{1,2}/", " ", join("", file($filename))); + global $INTERNAL_CHARSET; + + $buf = preg_replace("/[\r|\n]{1,2}/U", " ", 
file_get_contents($filename)); + $charset = detectDocumentCharset($buf); + + if ($charset === false) $charset = "UTF-8"; + + if ($charset != $INTERNAL_CHARSET) { + if (function_exists("iconv")) { + $buf = iconv($charset, $INTERNAL_CHARSET, $buf); + } elseif (function_exists("mb_convert_encoding")) { + $buf = mb_convert_encoding($buf, $INTERNAL_CHARSET, $charset); + } elseif (preg_match("/^UTF-?8$/i", $INTERNAL_CHARSET) && +preg_match("/^(ISO-8859-1|WINDOWS-1252)$/i", $charset)) { + $buf = utf8_encode($buf); + } else { + die("charset conversion function is not available."); + } + } + return $buf; +} + +function convertCharset($buf) +{ + global $LANGUAGE, $LANGUAGES, $INTERNAL_CHARSET; + + $charset = $LANGUAGES[$LANGUAGE]['preferred_charset']; + + if ($charset != $INTERNAL_CHARSET) { + if (function_exists("iconv")) { + $buf = iconv($INTERNAL_CHARSET, $charset, $buf); + } elseif (function_exists("mb_convert_encoding")) { + $buf = mb_convert_encoding($buf, $charset, $INTERNAL_CHARSET); + } elseif (preg_match("/^UTF-?8$/i", $INTERNAL_CHARSET) && +preg_match("/^(ISO-8859-1|WINDOWS-1252)$/i", $charset)) { + $buf = utf8_decode($buf); + } else { + die("$LANGUAGE locale is not supported."); + } + } + return $buf; } // oneLiner() function end +// Returns the name of character set in the given document +function detectDocumentCharset($doc) +{ + if +(preg_match("/<META\\s+HTTP-EQUIV=\"CONTENT-TYPE\"\\s+CONTENT=\"TEXT\\/HTML;\\s+CHARSET=([\\w\\d-]*)\"\\s*>/iU", + $doc, $reg)) { + return $reg[1]; + } + return false; +} ?>
-- PHP Documentation Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php