fujimoto                Mon Aug 11 01:24:42 2003 EDT

  Added files:                 
    /ZendEngine2        zend_multibyte.h 

  Modified files:              
    /ZendEngine2        Zend.m4 flex.skl zend_compile.c zend_globals.h 
                        zend_highlight.c zend_language_scanner.h 
                        zend_language_scanner.l zend_multibyte.c 
    /php-src/ext/mbstring       mbstring.c 
    /php-src/main       main.c 
  Log:
  - added script encoding support to Zend Engine 2.
    this enables ZE2 to gracefully parse scripts written in UTF-8 (with BOM),
    UTF-16, UTF-32, Shift_JIS, ISO-2022-JP etc... (when configured with
    '--enable-zend-multibyte' and '--enable-mbstring')
  
  
Index: ZendEngine2/Zend.m4
diff -u ZendEngine2/Zend.m4:1.39 ZendEngine2/Zend.m4:1.40
--- ZendEngine2/Zend.m4:1.39    Thu Mar 20 12:41:40 2003
+++ ZendEngine2/Zend.m4 Mon Aug 11 01:24:41 2003
@@ -1,5 +1,5 @@
 dnl
-dnl $Id: Zend.m4,v 1.39 2003/03/20 17:41:40 stas Exp $
+dnl $Id: Zend.m4,v 1.40 2003/08/11 05:24:41 fujimoto Exp $
 dnl
 dnl This file contains Zend specific autoconf functions.
 dnl
@@ -129,6 +129,13 @@
   ZEND_MEMORY_LIMIT=no
 ])
 
+AC_ARG_ENABLE(zend-multibyte,
+[  --enable-zend-multibyte   Compile with zend multibyte support. ], [
+  ZEND_MULTIBYTE=$enableval
+],[
+  ZEND_MULTIBYTE=no
+])
+
 AC_MSG_CHECKING(whether to enable thread-safety)
 AC_MSG_RESULT($ZEND_MAINTAINER_ZTS)
 
@@ -140,6 +147,9 @@
 
 AC_MSG_CHECKING(whether to enable Zend debugging)
 AC_MSG_RESULT($ZEND_DEBUG)
+
+AC_MSG_CHECKING(whether to enable Zend multibyte)
+AC_MSG_RESULT($ZEND_MULTIBYTE)
        
 if test "$ZEND_DEBUG" = "yes"; then
   AC_DEFINE(ZEND_DEBUG,1,[ ])
@@ -168,6 +178,9 @@
   AC_DEFINE(MEMORY_LIMIT, 0, [Memory limit])
 fi
 
+if test "$ZEND_MULTIBYTE" = "yes"; then
+  AC_DEFINE(ZEND_MULTIBYTE, 1, [ ])
+fi
 
 changequote({,})
 if test -n "$GCC" && test "$ZEND_INLINE_OPTIMIZATION" != "yes"; then
Index: ZendEngine2/flex.skl
diff -u ZendEngine2/flex.skl:1.30 ZendEngine2/flex.skl:1.31
--- ZendEngine2/flex.skl:1.30   Tue Feb 18 04:51:21 2003
+++ ZendEngine2/flex.skl        Mon Aug 11 01:24:41 2003
@@ -1,7 +1,7 @@
 /* A Lexical scanner generated by flex */
 
 /* Scanner skeleton version:
- * $Header: /usr/repository/ZendEngine2/flex.skl,v 1.30 2003/02/18 09:51:21 wez Exp $
+ * $Header: /usr/repository/ZendEngine2/flex.skl,v 1.31 2003/08/11 05:24:41 fujimoto 
Exp $
  * vim:ft=lex:
  */
 
@@ -440,12 +440,17 @@
 #define ECHO /* There is no output */
 #endif
 
-#define YY_INPUT(buf, result, max_size) \
+#ifdef ZEND_MULTIBYTE
+# define YY_INPUT(buf, result, max_size) \
+       if ( ((result = zend_multibyte_yyinput(yyin, buf, max_size TSRMLS_CC)) == 0) \
+               && zend_stream_ferror( yyin TSRMLS_CC) ) \
+               YY_FATAL_ERROR( "input in flex scanner failed" );
+#else
+# define YY_INPUT(buf, result, max_size) \
        if ( ((result = zend_stream_read(yyin, buf, max_size TSRMLS_CC)) == 0) \
                  && zend_stream_ferror( yyin TSRMLS_CC) ) \
                YY_FATAL_ERROR( "input in flex scanner failed" );
-
-
+#endif
 
 #ifndef ECHO
 %- Standard (non-C++) definition
Index: ZendEngine2/zend_compile.c
diff -u ZendEngine2/zend_compile.c:1.458 ZendEngine2/zend_compile.c:1.459
--- ZendEngine2/zend_compile.c:1.458    Sun Aug 10 19:59:35 2003
+++ ZendEngine2/zend_compile.c  Mon Aug 11 01:24:41 2003
@@ -17,7 +17,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: zend_compile.c,v 1.458 2003/08/10 23:59:35 helly Exp $ */
+/* $Id: zend_compile.c,v 1.459 2003/08/11 05:24:41 fujimoto Exp $ */
 
 #include "zend_language_parser.h"
 #include "zend.h"
@@ -27,6 +27,10 @@
 #include "zend_API.h"
 #include "zend_fast_cache.h"
 
+#ifdef ZEND_MULTIBYTE
+#include "zend_multibyte.h"
+#endif /* ZEND_MULTIBYTE */
+
 ZEND_API zend_op_array *(*zend_compile_file)(zend_file_handle *file_handle, int type 
TSRMLS_DC);
 
 
@@ -51,7 +55,14 @@
        /* NULL, name length, filename length, line number length */
        result->value.str.len = 1+name_length+strlen(filename)+lineno_len;
        result->value.str.val = (char *) emalloc(result->value.str.len+1);
+#ifdef ZEND_MULTIBYTE
+       /* must be binary safe */
+       result->value.str.val[0] = '\0';
+       memcpy(result->value.str.val+1, name, name_length);
+       sprintf(result->value.str.val+1+name_length, "%s%s", filename, lineno_buf);
+#else
        sprintf(result->value.str.val, "%c%s%s%s", '\0', name, filename, lineno_buf);
+#endif /* ZEND_MULTIBYTE */
        result->type = IS_STRING;
        result->refcount = 1;
 }
@@ -90,6 +101,15 @@
        init_compiler_declarables(TSRMLS_C);
        CG(throw_list) = NULL;
        zend_hash_apply(CG(auto_globals), (apply_func_t) zend_auto_global_arm 
TSRMLS_CC);
+
+#ifdef ZEND_MULTIBYTE
+       CG(script_encoding_list) = NULL;
+       CG(script_encoding_list_size) = 0;
+       CG(internal_encoding) = NULL;
+       CG(encoding_detector) = NULL;
+       CG(encoding_converter) = NULL;
+       CG(encoding_oddlen) = NULL;
+#endif /* ZEND_MULTIBYTE */
 }
 
 
@@ -114,6 +134,12 @@
        zend_stack_destroy(&CG(list_stack));
        zend_hash_destroy(&CG(filenames_table));
        zend_llist_destroy(&CG(open_files));
+
+#ifdef ZEND_MULTIBYTE
+       if (CG(script_encoding_list)) {
+               efree(CG(script_encoding_list));
+       }
+#endif /* ZEND_MULTIBYTE */
 }
 
 
@@ -3064,6 +3090,32 @@
        if (!zend_binary_strcasecmp(var->u.constant.value.str.val, 
var->u.constant.value.str.len, "ticks", sizeof("ticks")-1)) {
                convert_to_long(&val->u.constant);
                CG(declarables).ticks = val->u.constant;
+#ifdef ZEND_MULTIBYTE
+       } else if (!zend_binary_strcasecmp(var->u.constant.value.str.val, 
var->u.constant.value.str.len, "encoding", sizeof("encoding")-1)) {
+               zend_encoding *new_encoding, *old_encoding;
+               zend_encoding_filter old_input_filter;
+
+               if (val->u.constant.type == IS_CONSTANT) {
+                       zend_error(E_COMPILE_ERROR, "Cannot use constants as 
encoding");
+               }
+               convert_to_string(&val->u.constant);
+               new_encoding = 
zend_multibyte_fetch_encoding(val->u.constant.value.str.val);
+               if (!new_encoding) {
+                       zend_error(E_COMPILE_WARNING, "Unsupported encoding [%s]", 
val->u.constant.value.str.val);
+               } else {
+                       old_input_filter = LANG_SCNG(input_filter);
+                       old_encoding = LANG_SCNG(script_encoding);
+                       zend_multibyte_set_filter(new_encoding TSRMLS_CC);
+
+                       /* need to re-scan if input filter changed */
+                       if (old_input_filter != LANG_SCNG(input_filter) ||
+                               ((old_input_filter == 
zend_multibyte_script_encoding_filter) &&
+                                (new_encoding != old_encoding))) {
+                               zend_multibyte_yyinput_again(old_input_filter, 
old_encoding TSRMLS_CC);
+                       }
+               }
+               efree(val->u.constant.value.str.val);
+#endif /* ZEND_MULTIBYTE */
        }
        zval_dtor(&var->u.constant);
 }
Index: ZendEngine2/zend_globals.h
diff -u ZendEngine2/zend_globals.h:1.127 ZendEngine2/zend_globals.h:1.128
--- ZendEngine2/zend_globals.h:1.127    Wed Jul 23 04:58:46 2003
+++ ZendEngine2/zend_globals.h  Mon Aug 11 01:24:41 2003
@@ -17,7 +17,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: zend_globals.h,v 1.127 2003/07/23 08:58:46 stas Exp $ */
+/* $Id: zend_globals.h,v 1.128 2003/08/11 05:24:41 fujimoto Exp $ */
 
 #ifndef ZEND_GLOBALS_H
 #define ZEND_GLOBALS_H
@@ -35,6 +35,10 @@
 #include "zend_objects.h"
 #include "zend_objects_API.h"
 
+#ifdef ZEND_MULTIBYTE
+#include "zend_multibyte.h"
+#endif /* ZEND_MULTIBYTE */
+
 /* Define ZTS if you want a thread-safe Zend */
 /*#undef ZTS*/
 
@@ -127,6 +131,18 @@
 
        char *doc_comment;
        zend_uint doc_comment_len;
+
+#ifdef ZEND_MULTIBYTE
+       zend_encoding **script_encoding_list;
+       int script_encoding_list_size;
+
+       zend_encoding *internal_encoding;
+
+       /* multibyte utility functions */
+       zend_encoding_detector encoding_detector;
+       zend_encoding_converter encoding_converter;
+       zend_encoding_oddlen encoding_oddlen;
+#endif /* ZEND_MULTIBYTE */
 };
 
 
@@ -271,6 +287,22 @@
        int yy_start_stack_ptr;
        int yy_start_stack_depth;
        int *yy_start_stack;
+
+#ifdef ZEND_MULTIBYTE
+       /* original (unfiltered) script */
+       char *script_org;
+       int script_org_size;
+
+       /* filtered script */
+       char *script_filtered;
+       int script_filtered_size;
+
+       /* input/output filters */
+       zend_encoding_filter input_filter;
+       zend_encoding_filter output_filter;
+       zend_encoding *script_encoding;
+       zend_encoding *internal_encoding;
+#endif /* ZEND_MULTIBYTE */
 };
 
 #endif /* ZEND_GLOBALS_H */
Index: ZendEngine2/zend_highlight.c
diff -u ZendEngine2/zend_highlight.c:1.38 ZendEngine2/zend_highlight.c:1.39
--- ZendEngine2/zend_highlight.c:1.38   Tue Jun 10 16:03:25 2003
+++ ZendEngine2/zend_highlight.c        Mon Aug 11 01:24:41 2003
@@ -17,7 +17,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: zend_highlight.c,v 1.38 2003/06/10 20:03:25 imajes Exp $ */
+/* $Id: zend_highlight.c,v 1.39 2003/08/11 05:24:41 fujimoto Exp $ */
 
 #include "zend.h"
 #include "zend_language_parser.h"
@@ -57,6 +57,17 @@
 ZEND_API void zend_html_puts(const char *s, uint len TSRMLS_DC)
 {
        const char *ptr=s, *end=s+len;
+
+#ifdef ZEND_MULTIBYTE
+       char *filtered;
+       int filtered_len;
+
+       if (LANG_SCNG(output_filter)) {
+               LANG_SCNG(output_filter)(&filtered, &filtered_len, s, len TSRMLS_CC);
+               ptr = filtered;
+               end = filtered + filtered_len;
+       }
+#endif /* ZEND_MULTIBYTE */
        
        while (ptr<end) {
                if (*ptr==' ') {
@@ -75,6 +86,12 @@
                        zend_html_putc(*ptr++);
                }
        }
+
+#ifdef ZEND_MULTIBYTE
+       if (LANG_SCNG(output_filter)) {
+               efree(filtered);
+       }
+#endif /* ZEND_MULTIBYTE */
 }
 
 
Index: ZendEngine2/zend_language_scanner.h
diff -u ZendEngine2/zend_language_scanner.h:1.13 
ZendEngine2/zend_language_scanner.h:1.14
--- ZendEngine2/zend_language_scanner.h:1.13    Tue Jun 10 16:03:25 2003
+++ ZendEngine2/zend_language_scanner.h Mon Aug 11 01:24:41 2003
@@ -17,7 +17,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: zend_language_scanner.h,v 1.13 2003/06/10 20:03:25 imajes Exp $ */
+/* $Id: zend_language_scanner.h,v 1.14 2003/08/11 05:24:41 fujimoto Exp $ */
 
 #ifndef ZEND_SCANNER_H
 #define ZEND_SCANNER_H
@@ -28,6 +28,22 @@
        zend_file_handle *in;
        uint lineno;
        char *filename;
+
+#ifdef ZEND_MULTIBYTE
+       /* original (unfiltered) script */
+       char *script_org;
+       int script_org_size;
+
+       /* filtered script */
+       char *script_filtered;
+       int script_filtered_size;
+
+       /* input/output filters */
+       zend_encoding_filter input_filter;
+       zend_encoding_filter output_filter;
+       zend_encoding *script_encoding;
+       zend_encoding *internal_encoding;
+#endif /* ZEND_MULTIBYTE */
 } zend_lex_state;
 
 
Index: ZendEngine2/zend_language_scanner.l
diff -u ZendEngine2/zend_language_scanner.l:1.95 
ZendEngine2/zend_language_scanner.l:1.96
--- ZendEngine2/zend_language_scanner.l:1.95    Sun Jul 27 08:25:50 2003
+++ ZendEngine2/zend_language_scanner.l Mon Aug 11 01:24:41 2003
@@ -19,7 +19,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: zend_language_scanner.l,v 1.95 2003/07/27 12:25:50 stas Exp $ */
+/* $Id: zend_language_scanner.l,v 1.96 2003/08/11 05:24:41 fujimoto Exp $ */
 
 #define yyleng SCNG(yy_leng)
 #define yytext SCNG(yy_text)
@@ -127,6 +127,12 @@
        RESET_DOC_COMMENT();
        SCNG(yy_start_stack_ptr) = 0;
        SCNG(yy_start_stack_depth) = 0;
+#ifdef ZEND_MULTIBYTE
+       SCNG(script_org) = NULL;
+       SCNG(script_org_size) = 0;
+       SCNG(script_filtered) = NULL;
+       SCNG(script_filtered_size) = 0;
+#endif /* ZEND_MULTIBYTE */
 }
 
 
@@ -137,6 +143,17 @@
                CG(heredoc_len)=0;
        }
        RESET_DOC_COMMENT();
+       
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(script_org)) {
+               efree(SCNG(script_org));
+               SCNG(script_org) = NULL;
+       }
+       if (SCNG(script_filtered)) {
+               efree(SCNG(script_filtered));
+               SCNG(script_filtered) = NULL;
+       }
+#endif /* ZEND_MULTIBYTE */
 }
 END_EXTERN_C()
 
@@ -148,6 +165,17 @@
        lex_state->state = YYSTATE;
        lex_state->filename = zend_get_compiled_filename(TSRMLS_C);
        lex_state->lineno = CG(zend_lineno);
+
+#ifdef ZEND_MULTIBYTE
+       lex_state->script_org = SCNG(script_org);
+       lex_state->script_org_size = SCNG(script_org_size);
+       lex_state->script_filtered = SCNG(script_filtered);
+       lex_state->script_filtered_size = SCNG(script_filtered_size);
+       lex_state->input_filter = SCNG(input_filter);
+       lex_state->output_filter = SCNG(output_filter);
+       lex_state->script_encoding = SCNG(script_encoding);
+       lex_state->internal_encoding = SCNG(internal_encoding);
+#endif /* ZEND_MULTIBYTE */
 }
 
 
@@ -166,6 +194,17 @@
        BEGIN(lex_state->state);
        CG(zend_lineno) = lex_state->lineno;
        zend_restore_compiled_filename(lex_state->filename TSRMLS_CC);
+
+#ifdef ZEND_MULTIBYTE
+       SCNG(script_org) = lex_state->script_org;
+       SCNG(script_org_size) = lex_state->script_org_size;
+       SCNG(script_filtered) = lex_state->script_filtered;
+       SCNG(script_filtered_size) = lex_state->script_filtered_size;
+       SCNG(input_filter) = lex_state->input_filter;
+       SCNG(output_filter) = lex_state->output_filter;
+       SCNG(script_encoding) = lex_state->script_encoding;
+       SCNG(internal_encoding) = lex_state->internal_encoding;
+#endif /* ZEND_MULTIBYTE */
 }
 
 
@@ -235,7 +274,40 @@
        
        /* Reset the scanner for scanning the new file */
        SCNG(yy_in) = file_handle;
+
+#ifdef ZEND_MULTIBYTE
+       if (file_handle->handle.stream.interactive == 0) {
+               if (zend_multibyte_read_script(TSRMLS_C) != 0) {
+                       return FAILURE;
+               }
+
+               /* force flex to use buffer only */
+               SCNG(yy_in) = NULL;
+               SCNG(init) = 0;
+               SCNG(start) = 1;
+
+               zend_multibyte_set_filter(NULL TSRMLS_CC);
+
+               if (!SCNG(input_filter)) {
+                       SCNG(script_filtered) = 
(char*)emalloc(SCNG(script_org_size)+1);
+                       memcpy(SCNG(script_filtered), SCNG(script_org), 
SCNG(script_org_size)+1);
+                       SCNG(script_filtered_size) = SCNG(script_org_size);
+               } else {
+                       SCNG(input_filter)(&SCNG(script_filtered), 
&SCNG(script_filtered_size), SCNG(script_org), SCNG(script_org_size) TSRMLS_CC);
+               }
+
+               /* flex requires doubled null */
+               SCNG(script_filtered) = (char*)erealloc(SCNG(script_filtered), 
SCNG(script_filtered_size)+2);
+               *(SCNG(script_filtered)+SCNG(script_filtered_size)) = (char)NULL;
+               *(SCNG(script_filtered)+SCNG(script_filtered_size)+1) = (char)NULL;
+               yy_scan_buffer(SCNG(script_filtered), SCNG(script_filtered_size)+2 
TSRMLS_CC);
+       } else {
+               yy_switch_to_buffer(yy_create_buffer(SCNG(yy_in), YY_BUF_SIZE 
TSRMLS_CC) TSRMLS_CC);
+       }
+#else  /* !ZEND_MULTIBYTE */
        yy_switch_to_buffer(yy_create_buffer(SCNG(yy_in), YY_BUF_SIZE TSRMLS_CC) 
TSRMLS_CC);
+#endif /* ZEND_MULTIBYTE */
+
        BEGIN(INITIAL);
 
        if (file_handle->opened_path) {
@@ -300,6 +372,17 @@
                        retval = NULL;
                }
                compilation_successful=1;
+
+#ifdef ZEND_MULTIBYTE
+               if (SCNG(script_org)) {
+                       efree(SCNG(script_org));
+                       SCNG(script_org) = NULL;
+               }
+               if (SCNG(script_filtered)) {
+                       efree(SCNG(script_filtered));
+                       SCNG(script_filtered) = NULL;
+               }
+#endif /* ZEND_MULTIBYTE */
        }
 
        if (retval) {
@@ -367,7 +450,29 @@
        str->value.str.val[str->value.str.len+1]=0;
 
        SCNG(yy_in)=NULL;
+
+#ifdef ZEND_MULTIBYTE
+       SCNG(script_org) = estrdup(str->value.str.val);
+       SCNG(script_org_size) = str->value.str.len;
+
+       zend_multibyte_set_filter(CG(internal_encoding) TSRMLS_CC);
+
+       if (!SCNG(input_filter)) {
+               SCNG(script_filtered) = (char*)emalloc(SCNG(script_org_size)+1);
+               memcpy(SCNG(script_filtered), SCNG(script_org), 
SCNG(script_org_size)+1);
+               SCNG(script_filtered_size) = SCNG(script_org_size);
+       } else {
+               SCNG(input_filter)(&SCNG(script_filtered), 
&SCNG(script_filtered_size), SCNG(script_org), SCNG(script_org_size) TSRMLS_CC);
+       }
+
+       /* flex requires doubled null */
+       SCNG(script_filtered) = (char*)erealloc(SCNG(script_filtered), 
SCNG(script_filtered_size)+2);
+       *(SCNG(script_filtered)+SCNG(script_filtered_size)) = (char)NULL;
+       *(SCNG(script_filtered)+SCNG(script_filtered_size)+1) = (char)NULL;
+       yy_scan_buffer(SCNG(script_filtered), SCNG(script_filtered_size)+2 TSRMLS_CC);
+#else /* !ZEND_MULTIBYTE */
        yy_scan_buffer(str->value.str.val, str->value.str.len+2 TSRMLS_CC);
+#endif /* ZEND_MULTIBYTE */
 
        zend_set_compiled_filename(filename TSRMLS_CC);
        CG(zend_lineno) = 1;
@@ -408,6 +513,17 @@
                BEGIN(ST_IN_SCRIPTING);
                compiler_result = zendparse(TSRMLS_C);
 
+#ifdef ZEND_MULTIBYTE
+               if (SCNG(script_org)) {
+                       efree(SCNG(script_org));
+                       SCNG(script_org) = NULL;
+               }
+               if (SCNG(script_filtered)) {
+                       efree(SCNG(script_filtered));
+                       SCNG(script_filtered) = NULL;
+               }
+#endif /* ZEND_MULTIBYTE */
+
                if (compiler_result==1) {
                        CG(active_op_array) = original_active_op_array;
                        CG(unclean_shutdown)=1;
@@ -442,6 +558,16 @@
                return FAILURE;
        }
        zend_highlight(syntax_highlighter_ini TSRMLS_CC);
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(script_org)) {
+               efree(SCNG(script_org));
+               SCNG(script_org) = NULL;
+       }
+       if (SCNG(script_filtered)) {
+               efree(SCNG(script_filtered));
+               SCNG(script_filtered) = NULL;
+       }
+#endif /* ZEND_MULTIBYTE */
        zend_destroy_file_handle(&file_handle TSRMLS_CC);
        zend_restore_lexical_state(&original_lex_state TSRMLS_CC);
        return SUCCESS;
@@ -459,12 +585,166 @@
                return FAILURE;
        }
        zend_highlight(syntax_highlighter_ini TSRMLS_CC);
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(script_org)) {
+               efree(SCNG(script_org));
+               SCNG(script_org) = NULL;
+       }
+       if (SCNG(script_filtered)) {
+               efree(SCNG(script_filtered));
+               SCNG(script_filtered) = NULL;
+       }
+#endif /* ZEND_MULTIBYTE */
        zend_restore_lexical_state(&original_lex_state TSRMLS_CC);
        zval_dtor(str);
        return SUCCESS;
 }
 END_EXTERN_C()
 
+#ifdef ZEND_MULTIBYTE
+BEGIN_EXTERN_C()
+ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, zend_encoding *old_encoding TSRMLS_DC)
+{
+       YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+       int offset, original_offset, length, free_flag;
+       char *p;
+       zend_encoding *new_encoding;
+       /* Rebuild the not-yet-scanned tail of the flex buffer from script_org using the current input filter. */
+       /* calculate current position: original_offset indexes the flex buffer, offset indexes script_org */
+       offset = original_offset = yy_c_buf_p - b->yy_ch_buf;
+       if (old_input_filter && original_offset > 0) {
+               new_encoding = SCNG(script_encoding);
+               SCNG(script_encoding) = old_encoding;   /* old filter runs with the old script encoding; restored below */
+               do {    /* move offset until filtering script_org[0..offset) yields exactly original_offset bytes */
+                       (old_input_filter)(&p, &length, SCNG(script_org), offset TSRMLS_CC);
+                       if (!p) {
+                               SCNG(script_encoding) = new_encoding;
+                               return;
+                       }
+                       efree(p);
+                       if (length > original_offset) {
+                               offset--;
+                       } else if (length < original_offset) {
+                               offset++;
+                       }
+               } while (original_offset != length);
+               SCNG(script_encoding) = new_encoding;
+       }
+
+       /* convert the bytes of script_org after that position and set them behind yy_c_buf_p */
+       if (!SCNG(input_filter)) {
+               length = SCNG(script_org_size)-offset-1;
+               p = SCNG(script_org)+offset+1;
+               free_flag = 0;  /* p points into script_org, nothing to release */
+       } else {
+               SCNG(input_filter)(&p, &length, SCNG(script_org)+offset+1, SCNG(script_org_size)-offset-1 TSRMLS_CC);
+               free_flag = 1;  /* p is released with efree() at the end */
+       }
+       if (original_offset+length+1 > (int)b->yy_buf_size) {
+               b->yy_buf_size = original_offset+length+1;
+               b->yy_ch_buf = (char*)erealloc(b->yy_ch_buf, b->yy_buf_size+2); /* +2: room for both end-of-buffer markers */
+               SCNG(script_filtered) = b->yy_ch_buf;
+               SCNG(script_filtered_size) = b->yy_buf_size;
+       }
+       yy_c_buf_p = b->yy_ch_buf + original_offset;
+       memcpy(yy_c_buf_p+1, p, length);        /* binary safe: strncpy() would stop at the first NUL byte and zero-fill the rest */
+       b->yy_n_chars = original_offset + length + 1;
+       SCNG(yy_n_chars) = b->yy_n_chars;
+       b->yy_ch_buf[SCNG(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
+       b->yy_ch_buf[SCNG(yy_n_chars)+1] = YY_END_OF_BUFFER_CHAR;
+
+       if (free_flag) {
+               efree(p);
+       }
+}
+
+
+ZEND_API int zend_multibyte_yyinput(zend_file_handle *file_handle, char *buf, size_t len TSRMLS_DC)
+{
+       int c = '*', n;
+       /* Fills buf with at most len bytes of input and returns the number of bytes stored. */
+       if (file_handle->handle.stream.interactive == 0) {
+               return zend_stream_read(file_handle, buf, len TSRMLS_CC);
+       }
+
+       /* interactive: read a single line and keep an unfiltered copy of it in script_org */
+       if (SCNG(script_org)) {
+               efree(SCNG(script_org));
+       }
+       if (SCNG(script_filtered)) {
+               efree(SCNG(script_filtered));
+       }
+       SCNG(script_org) = NULL;
+       SCNG(script_org_size) = 0;
+       SCNG(script_filtered) = NULL;   /* freed just above; a stale pointer here would be freed again by later cleanup */
+       /* TODO: support widechars */
+
+       for (n = 0; (size_t)n < len && (c = zend_stream_getc(yyin TSRMLS_CC)) != EOF && c != '\n'; ++n) {       /* bound is len: buf is a pointer, so sizeof(buf) is only the pointer size */
+               buf[n] = (char)c;
+       }
+       if (c == '\n') {        /* the loop only stops on '\n' while n < len, so this store stays inside buf */
+               buf[n++] = (char) c;
+       }
+
+       SCNG(script_org_size) = n;
+       SCNG(script_org) = (char*)emalloc(SCNG(script_org_size)+1);
+       memcpy(SCNG(script_org), buf, n);
+       SCNG(script_org)[n] = '\0';     /* terminate the copy; the spare byte was previously left uninitialized */
+       return n;
+}
+
+
+ZEND_API int zend_multibyte_read_script(TSRMLS_D)
+{
+       /* Reads all of yyin into SCNG(script_org); returns 0 on success, -1 when a read reports a negative count. */
+       char buf[8192];
+       int n;
+
+       /* discard whatever a previous call left behind */
+       if (SCNG(script_org)) {
+               efree(SCNG(script_org));
+       }
+       SCNG(script_org) = NULL;
+       SCNG(script_org_size) = 0;
+
+       /* append the stream chunk by chunk; every (re)allocation keeps one spare byte for the terminator */
+       while ((n = zend_stream_read(yyin, buf, sizeof(buf) TSRMLS_CC)) > 0) {
+               SCNG(script_org_size) += n;
+               if (SCNG(script_org)) {
+                       SCNG(script_org) = (char*)erealloc(SCNG(script_org), SCNG(script_org_size)+1);
+               } else {
+                       SCNG(script_org) = (char*)emalloc(SCNG(script_org_size)+1);
+               }
+               memcpy(SCNG(script_org)+SCNG(script_org_size)-n, buf, n);
+       }
+
+       if (n < 0) {
+               return -1;
+       }
+
+       /* nothing was read: still provide a valid, empty buffer */
+       if (!SCNG(script_org)) {
+               SCNG(script_org) = (char*)emalloc(SCNG(script_org_size)+1);
+       }
+       /* '\0' instead of (char)NULL: NULL may be a pointer constant, which must not be cast to char */
+       *(SCNG(script_org)+SCNG(script_org_size)) = '\0';
+
+       return 0;
+}
+
+
+# define zend_copy_value(zendlval, yytext, yyleng) do { /* copy a token into zendlval, through the output filter when one is set */ \
+       if (SCNG(output_filter)) { \
+               SCNG(output_filter)(&((zendlval)->value.str.val), &((zendlval)->value.str.len), (yytext), (yyleng) TSRMLS_CC); \
+       } else { \
+               (zendlval)->value.str.val = (char *) estrndup((yytext), (yyleng)); \
+               (zendlval)->value.str.len = (yyleng); \
+       } } while (0) /* do/while(0): expands to one statement, safe inside an unbraced if/else */
+#else /* ZEND_MULTIBYTE */
+# define zend_copy_value(zendlval, yytext, yyleng) do { /* plain copy of the token into zendlval */ \
+       (zendlval)->value.str.val = (char *)estrndup((yytext), (yyleng)); \
+       (zendlval)->value.str.len = (yyleng); } while (0)
+#endif /* ZEND_MULTIBYTE */
 %}
 
 LNUM   [0-9]+
@@ -631,8 +911,7 @@
 
 <ST_LOOKING_FOR_PROPERTY>{LABEL} {
        yy_pop_state(TSRMLS_C);
-       zendlval->value.str.val = (char *)estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_STRING;
 }
@@ -889,8 +1168,7 @@
 
 
 <ST_LOOKING_FOR_VARNAME>{LABEL} {
-       zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        yy_pop_state(TSRMLS_C);
        yy_push_state(ST_IN_SCRIPTING TSRMLS_CC);
@@ -1025,8 +1303,21 @@
 }
 
 <INITIAL>(([^<]|"<"[^?%s<]){1,400})|"<s"|"<" {
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(output_filter)) {
+               int readsize;
+               readsize = SCNG(output_filter)(&(zendlval->value.str.val), 
&(zendlval->value.str.len), yytext, yyleng TSRMLS_CC);
+               if (readsize < yyleng) {
+                       yyless(readsize);
+               }
+       } else {
+               zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
+               zendlval->value.str.len = yyleng;
+       }
+#else /* !ZEND_MULTIBYTE */
        zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
        zendlval->value.str.len = yyleng;
+#endif /* ZEND_MULTIBYTE */
        zendlval->type = IS_STRING;
        HANDLE_NEWLINES(yytext, yyleng);
        return T_INLINE_HTML;
@@ -1101,22 +1392,19 @@
 }
 
 <ST_IN_SCRIPTING,ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL} {
-       zendlval->value.str.val = (char *)estrndup(yytext+1, yyleng-1);
-       zendlval->value.str.len = yyleng-1;
+       zend_copy_value(zendlval, (yytext+1), (yyleng-1));
        zendlval->type = IS_STRING;
        return T_VARIABLE;
 }
 
 <ST_IN_SCRIPTING>{LABEL} {
-       zendlval->value.str.val = (char *)estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_STRING;
 }
 
 <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>{LABEL} {
-       zendlval->value.str.val = (char *)estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_STRING;
 }
@@ -1302,6 +1590,14 @@
        }
        *t = 0;
 
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(output_filter)) {
+               s = zendlval->value.str.val;
+               SCNG(output_filter)(&(zendlval->value.str.val), 
&(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC);
+               efree(s);
+       }
+#endif /* ZEND_MULTIBYTE */
+
        return T_CONSTANT_ENCAPSED_STRING;
 }
 
@@ -1342,6 +1638,14 @@
        }
        *t = 0;
 
+#ifdef ZEND_MULTIBYTE
+       if (SCNG(output_filter)) {
+               s = zendlval->value.str.val;
+               SCNG(output_filter)(&(zendlval->value.str.val), 
&(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC);
+               efree(s);
+       }
+#endif /* ZEND_MULTIBYTE */
+
        return T_CONSTANT_ENCAPSED_STRING;
 }
 
@@ -1409,8 +1713,7 @@
                BEGIN(ST_IN_SCRIPTING);
                return T_END_HEREDOC;
        } else {
-               zendlval->value.str.val = (char *)estrndup(yytext, yyleng);
-               zendlval->value.str.len = yyleng;
+               zend_copy_value(zendlval, yytext, yyleng);
                zendlval->type = IS_STRING;
                return T_STRING;
        }
@@ -1427,24 +1730,21 @@
 
 <ST_SINGLE_QUOTE>([^'\\]|\\[^'\\])+ {
        HANDLE_NEWLINES(yytext, yyleng);
-       zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_ENCAPSED_AND_WHITESPACE;
 }
 
 
 <ST_DOUBLE_QUOTES>[`]+ {
-       zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_ENCAPSED_AND_WHITESPACE;
 }
 
 
 <ST_BACKQUOTE>["]+ {
-       zendlval->value.str.val = (char *) estrndup(yytext, yyleng);
-       zendlval->value.str.len = yyleng;
+       zend_copy_value(zendlval, yytext, yyleng);
        zendlval->type = IS_STRING;
        return T_ENCAPSED_AND_WHITESPACE;
 }
Index: ZendEngine2/zend_multibyte.c
diff -u ZendEngine2/zend_multibyte.c:1.1 ZendEngine2/zend_multibyte.c:1.2
--- ZendEngine2/zend_multibyte.c:1.1    Sun May 26 15:17:49 2002
+++ ZendEngine2/zend_multibyte.c        Mon Aug 11 01:24:41 2003
@@ -0,0 +1,1133 @@
+/*
+   +----------------------------------------------------------------------+
+   | Zend Engine                                                          |
+   +----------------------------------------------------------------------+
+   | Copyright (c) 1998-2003 Zend Technologies Ltd. (http://www.zend.com) |
+   +----------------------------------------------------------------------+
+   | This source file is subject to version 2.00 of the Zend license,     |
+   | that is bundled with this package in the file LICENSE, and is        | 
+   | available at through the world-wide-web at                           |
+   | http://www.zend.com/license/2_00.txt.                                |
+   | If you did not receive a copy of the Zend license and are unable to  |
+   | obtain it through the world-wide-web, please send a note to          |
+   | [EMAIL PROTECTED] so we can mail you a copy immediately.              |
+   +----------------------------------------------------------------------+
+   | Authors: Masaki Fujimoto <[EMAIL PROTECTED]>                          |
+   |          Rui Hirokawa <[EMAIL PROTECTED]>                             |
+   +----------------------------------------------------------------------+
+*/
+
+/* $Id: zend_multibyte.c,v 1.2 2003/08/11 05:24:41 fujimoto Exp $ */
+
+#include "zend.h"
+#include "zend_compile.h"
+#include "zend_operators.h"
+#include "zend_multibyte.h"
+
+#ifdef ZEND_MULTIBYTE
+static int zend_multibyte_encoding_filter(char **to, int *to_length, const char 
*to_encoding, const char *from, int from_length, const char *from_encoding TSRMLS_DC);
+int sjis_input_filter(char **buf, int *length, const char *sjis, int sjis_length 
TSRMLS_DC);
+int sjis_output_filter(char **buf, int *length, const char *sjis, int sjis_length 
TSRMLS_DC);
+static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, int 
encoding_list_size);
+static int zend_multibyte_parse_encoding_list(const char *encoding_list, int 
encoding_list_size, zend_encoding ***result, int *result_size);
+static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding 
*onetime_encoding TSRMLS_DC);
+static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D);
+static zend_encoding* zend_multibyte_detect_utf_encoding(char *script, int 
script_size TSRMLS_DC);
+
+/*
+ * encodings
+ *
+ * one zend_encoding initializer per supported script/internal encoding.
+ * the five positional values are consumed elsewhere in this file as follows
+ * (field order is presumed from these initializers -- confirm against the
+ * struct declaration in zend_multibyte.h):
+ *   1. input filter  - encoding specific converter; only the Shift_JIS
+ *                      variants below supply one
+ *   2. output filter - the matching converter for the opposite direction
+ *   3. name          - primary name compared by zend_multibyte_fetch_encoding()
+ *   4. aliases       - NULL terminated list of alternative names, or NULL
+ *   5. compatible    - non-zero when the encoding is treated as compatible
+ *                      with the flex scanner (see zend_multibyte_set_filter())
+ */
+const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
+zend_encoding encoding_ucs2 = {
+       NULL,
+       NULL,
+       "UCS-2",
+       (const char *(*)[])&ucs2_aliases,
+       0
+};
+
+zend_encoding encoding_ucs2be = {
+       NULL,
+       NULL,
+       "UCS-2BE",
+       NULL,
+       0
+};
+
+zend_encoding encoding_ucs2le = {
+       NULL,
+       NULL,
+       "UCS-2LE",
+       NULL,
+       0
+};
+
+const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
+zend_encoding encoding_ucs4 = {
+       NULL,
+       NULL,
+       "UCS-4",
+       (const char *(*)[])&ucs4_aliases,
+       0
+};
+
+zend_encoding encoding_ucs4be = {
+       NULL,
+       NULL,
+       "UCS-4BE",
+       NULL,
+       0
+};
+
+zend_encoding encoding_ucs4le = {
+       NULL,
+       NULL,
+       "UCS-4LE",
+       NULL,
+       0
+};
+
+const char *utf32_aliases[] = {"utf32", NULL};
+zend_encoding encoding_utf32 = {
+       NULL,
+       NULL,
+       "UTF-32",
+       (const char *(*)[])&utf32_aliases,
+       0
+};
+
+zend_encoding encoding_utf32be = {
+       NULL,
+       NULL,
+       "UTF-32BE",
+       NULL,
+       0
+};
+
+zend_encoding encoding_utf32le = {
+       NULL,
+       NULL,
+       "UTF-32LE",
+       NULL,
+       0
+};
+
+const char *utf16_aliases[] = {"utf16", NULL};
+zend_encoding encoding_utf16 = {
+       NULL,
+       NULL,
+       "UTF-16",
+       (const char *(*)[])&utf16_aliases,
+       0
+};
+
+zend_encoding encoding_utf16be = {
+       NULL,
+       NULL,
+       "UTF-16BE",
+       NULL,
+       0
+};
+
+zend_encoding encoding_utf16le = {
+       NULL,
+       NULL,
+       "UTF-16LE",
+       NULL,
+       0
+};
+
+const char *utf8_aliases[] = {"utf8", NULL};
+zend_encoding encoding_utf8 = {
+       NULL,
+       NULL,
+       "UTF-8",
+       (const char *(*)[])&utf8_aliases,
+       1
+};
+
+const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", 
"ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
+zend_encoding encoding_ascii = {
+       NULL,
+       NULL,
+       "ASCII",
+       (const char *(*)[])&ascii_aliases,
+       1
+};
+
+const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
+zend_encoding encoding_euc_jp = {
+       NULL,
+       NULL,
+       "EUC-JP",
+       (const char *(*)[])&euc_jp_aliases,
+       1
+};
+
+const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
+zend_encoding encoding_sjis = {
+       sjis_input_filter,
+       sjis_output_filter,
+       "Shift_JIS",
+       (const char *(*)[])&sjis_aliases,
+       0
+};
+
+const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
+zend_encoding encoding_eucjp_win = {
+       NULL,
+       NULL,
+       "eucJP-win",
+       (const char *(*)[])&eucjp_win_aliases,
+       1
+};
+
+const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", 
NULL};
+zend_encoding encoding_sjis_win = {
+       /* sjis-filters does not care about diffs of Shift_JIS and CP932 */
+       sjis_input_filter,
+       sjis_output_filter,
+       "SJIS-win",
+       (const char *(*)[])&sjis_win_aliases,
+       0
+};
+
+const char *jis_aliases[] = {"ISO-2022-JP", NULL};
+zend_encoding encoding_jis = {
+       NULL,
+       NULL,
+       "JIS",
+       (const char *(*)[])&jis_aliases,
+       0
+};
+
+const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", 
NULL};
+zend_encoding encoding_euc_cn = {
+       NULL,
+       NULL,
+       "EUC-CN",
+       (const char *(*)[])&euc_cn_aliases,
+       1
+};
+
+const char *cp936_aliases[] = {"CP-936", NULL};
+zend_encoding encoding_cp936 = {
+       NULL,
+       NULL,
+       "CP936",
+       (const char *(*)[])&cp936_aliases,
+       0
+};
+
+const char *hz_aliases[] = {"HZ-GB-2312", NULL};
+zend_encoding encoding_hz = {
+       NULL,
+       NULL,
+       "HZ",
+       (const char *(*)[])&hz_aliases,
+       0
+};
+
+const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
+zend_encoding encoding_euc_tw = {
+       NULL,
+       NULL,
+       "EUC-TW",
+       (const char *(*)[])&euc_tw_aliases,
+       1
+};
+
+const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", 
NULL};
+zend_encoding encoding_big5 = {
+       NULL,
+       NULL,
+       "BIG-5",
+       (const char *(*)[])&big5_aliases,
+       0
+};
+
+const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
+zend_encoding encoding_euc_kr = {
+       NULL,
+       NULL,
+       "EUC-KR",
+       (const char *(*)[])&euc_kr_aliases,
+       1
+};
+
+const char *uhc_aliases[] = {"CP949", NULL};
+zend_encoding encoding_uhc = {
+       NULL,
+       NULL,
+       "UHC",
+       (const char *(*)[])&uhc_aliases,
+       1
+};
+
+zend_encoding encoding_2022kr = {
+       NULL,
+       NULL,
+       "ISO-2022-KR",
+       NULL,
+       0
+};
+
+const char *cp1252_aliases[] = {"cp1252", NULL};
+zend_encoding encoding_cp1252 = {
+       NULL,
+       NULL,
+       "Windows-1252",
+       (const char *(*)[])&cp1252_aliases,
+       1
+};
+
+const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
+zend_encoding encoding_8859_1 = {
+       NULL,
+       NULL,
+       "ISO-8859-1",
+       (const char *(*)[])&iso_8859_1_aliases,
+       1
+};
+
+const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
+zend_encoding encoding_8859_2 = {
+       NULL,
+       NULL,
+       "ISO-8859-2",
+       (const char *(*)[])&iso_8859_2_aliases,
+       1
+};
+
+const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
+zend_encoding encoding_8859_3 = {
+       NULL,
+       NULL,
+       "ISO-8859-3",
+       (const char *(*)[])&iso_8859_3_aliases,
+       1
+};
+
+const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
+zend_encoding encoding_8859_4 = {
+       NULL,
+       NULL,
+       "ISO-8859-4",
+       (const char *(*)[])&iso_8859_4_aliases,
+       1
+};
+
+const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
+zend_encoding encoding_8859_5 = {
+       NULL,
+       NULL,
+       "ISO-8859-5",
+       (const char *(*)[])&iso_8859_5_aliases,
+       1
+};
+
+const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
+zend_encoding encoding_8859_6 = {
+       NULL,
+       NULL,
+       "ISO-8859-6",
+       (const char *(*)[])&iso_8859_6_aliases,
+       1
+};
+
+const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
+zend_encoding encoding_8859_7 = {
+       NULL,
+       NULL,
+       "ISO-8859-7",
+       (const char *(*)[])&iso_8859_7_aliases,
+       1
+};
+
+const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
+zend_encoding encoding_8859_8 = {
+       NULL,
+       NULL,
+       "ISO-8859-8",
+       (const char *(*)[])&iso_8859_8_aliases,
+       1
+};
+
+const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
+zend_encoding encoding_8859_9 = {
+       NULL,
+       NULL,
+       "ISO-8859-9",
+       (const char *(*)[])&iso_8859_9_aliases,
+       1
+};
+
+const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
+zend_encoding encoding_8859_10 = {
+       NULL,
+       NULL,
+       "ISO-8859-10",
+       (const char *(*)[])&iso_8859_10_aliases,
+       1
+};
+
+const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
+zend_encoding encoding_8859_13 = {
+       NULL,
+       NULL,
+       "ISO-8859-13",
+       (const char *(*)[])&iso_8859_13_aliases,
+       1
+};
+
+const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
+zend_encoding encoding_8859_14 = {
+       NULL,
+       NULL,
+       "ISO-8859-14",
+       (const char *(*)[])&iso_8859_14_aliases,
+       1
+};
+
+const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
+zend_encoding encoding_8859_15 = {
+       NULL,
+       NULL,
+       "ISO-8859-15",
+       (const char *(*)[])&iso_8859_15_aliases,
+       1
+};
+
+const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
+zend_encoding encoding_cp1251 = {
+       NULL,
+       NULL,
+       "Windows-1251",
+       (const char *(*)[])&cp1251_aliases,
+       1
+};
+
+const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
+zend_encoding encoding_cp866 = {
+       NULL,
+       NULL,
+       "CP866",
+       (const char *(*)[])&cp866_aliases,
+       1
+};
+
+const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
+zend_encoding encoding_koi8r = {
+       NULL,
+       NULL,
+       "KOI8-R",
+       (const char *(*)[])&koi8r_aliases,
+       1
+};
+
+zend_encoding *zend_encoding_table[] = { /* walked front to back by zend_multibyte_fetch_encoding() */
+       &encoding_ucs4,
+       &encoding_ucs4be,
+       &encoding_ucs4le,
+       &encoding_ucs2,
+       &encoding_ucs2be,
+       &encoding_ucs2le,
+       &encoding_utf32,
+       &encoding_utf32be,
+       &encoding_utf32le,
+       &encoding_utf16,
+       &encoding_utf16be,
+       &encoding_utf16le,
+       &encoding_utf8,
+       &encoding_ascii,
+       &encoding_euc_jp,
+       &encoding_sjis,
+       &encoding_eucjp_win,
+       &encoding_sjis_win,
+       &encoding_jis,
+       &encoding_cp1252,
+       &encoding_8859_1,
+       &encoding_8859_2,
+       &encoding_8859_3,
+       &encoding_8859_4,
+       &encoding_8859_5,
+       &encoding_8859_6,
+       &encoding_8859_7,
+       &encoding_8859_8,
+       &encoding_8859_9,
+       &encoding_8859_10,
+       &encoding_8859_13,
+       &encoding_8859_14,
+       &encoding_8859_15,
+       &encoding_euc_cn,
+       &encoding_cp936,
+       &encoding_hz,
+       &encoding_euc_tw,
+       &encoding_big5,
+       &encoding_euc_kr,
+       &encoding_uhc,
+       &encoding_2022kr,
+       &encoding_cp1251,
+       &encoding_cp866,
+       &encoding_koi8r,
+       NULL /* terminator: the lookup loops stop at the first NULL entry */
+};
+
+
+
+/*
+ * replaces the configured list of candidate script encodings.
+ *
+ * any previously stored list is released first. encoding_list is a comma
+ * separated string of encoding names (encoding_list_size bytes long); passing
+ * NULL simply clears the setting. the result of parsing is not reported:
+ * this function always returns 0.
+ */
+ZEND_API int zend_multibyte_set_script_encoding(char *encoding_list, int encoding_list_size TSRMLS_DC)
+{
+       /* forget whatever was configured before */
+       if (CG(script_encoding_list) != NULL) {
+               efree(CG(script_encoding_list));
+               CG(script_encoding_list) = NULL;
+       }
+       CG(script_encoding_list_size) = 0;
+
+       if (encoding_list != NULL) {
+               zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
+       }
+
+       return 0;
+}
+
+
+/*
+ * selects the internal encoding by name.
+ *
+ * the name is resolved through zend_multibyte_fetch_encoding(); an unknown
+ * (or NULL) name leaves CG(internal_encoding) set to NULL.
+ * encoding_name_size is not used. always returns 0.
+ */
+ZEND_API int zend_multibyte_set_internal_encoding(char *encoding_name, int encoding_name_size TSRMLS_DC)
+{
+       zend_encoding *resolved;
+
+       resolved = zend_multibyte_fetch_encoding(encoding_name);
+       CG(internal_encoding) = resolved;
+
+       return 0;
+}
+
+/*
+ * registers the callbacks the engine uses for multibyte handling:
+ * encoding detection, encoding conversion and the "odd length" helper
+ * consulted before each conversion. the pointers are stored as given
+ * (NULL included). always returns 0.
+ */
+ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
+{
+       CG(encoding_oddlen) = encoding_oddlen;
+       CG(encoding_converter) = encoding_converter;
+       CG(encoding_detector) = encoding_detector;
+
+       return 0;
+}
+
+
+/*
+ * decides which input/output filters the scanner has to run.
+ *
+ * the script encoding is determined by zend_multibyte_find_script_encoding()
+ * (onetime_encoding, when given, wins); the internal encoding is taken from
+ * CG(internal_encoding). both filters start out as NULL and are only set
+ * when a conversion is required. always returns 0.
+ */
+ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
+{
+       zend_encoding *script, *internal;
+
+       script = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
+       LANG_SCNG(script_encoding) = script;
+       internal = CG(internal_encoding);
+       LANG_SCNG(internal_encoding) = internal;
+
+       /* judge input/output filter */
+       LANG_SCNG(input_filter) = NULL;
+       LANG_SCNG(output_filter) = NULL;
+
+       if (script == NULL) {
+               return 0;
+       }
+
+       if (internal == NULL || script == internal) {
+               if (script->input_filter && script->output_filter) {
+                       /* encoding specific filters exist, so use them */
+                       LANG_SCNG(input_filter) = script->input_filter;
+                       LANG_SCNG(output_filter) = script->output_filter;
+               } else if (!script->compatible) {
+                       /* work around w/ script_encoding -> utf-8 -> script_encoding conversion */
+                       LANG_SCNG(internal_encoding) = script;
+                       LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
+                       LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
+               }
+               /* a compatible script encoding needs no filter at all */
+               return 0;
+       }
+
+       /* internal cannot be NULL here and differs from the script encoding */
+       if (internal->compatible) {
+               /* the scanner can work on the internal encoding directly */
+               LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
+       } else {
+               LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
+               if (!script->compatible) {
+                       /* both script and internal encodings are incompatible w/ flex */
+                       LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+ * looks up an encoding descriptor by name (case-insensitively).
+ *
+ * primary names are tried across the whole table first; aliases are only
+ * considered when no primary name matched. returns NULL for a NULL or
+ * unknown name.
+ */
+ZEND_API zend_encoding* zend_multibyte_fetch_encoding(char *encoding_name)
+{
+       zend_encoding **entry;
+       const char *alias;
+       size_t wanted_len;
+       int k;
+
+       if (encoding_name == NULL) {
+               return NULL;
+       }
+       wanted_len = strlen(encoding_name);
+
+       /* pass 1: primary names */
+       for (entry = zend_encoding_table; *entry != NULL; entry++) {
+               if (zend_binary_strcasecmp((char*)(*entry)->name, strlen((*entry)->name), encoding_name, wanted_len) == 0) {
+                       return *entry;
+               }
+       }
+
+       /* pass 2: aliases */
+       for (entry = zend_encoding_table; *entry != NULL; entry++) {
+               if ((*entry)->aliases == NULL) {
+                       continue;
+               }
+               for (k = 0; (alias = (*(*entry)->aliases)[k]) != NULL; k++) {
+                       if (zend_binary_strcasecmp((char*)alias, strlen(alias), encoding_name, wanted_len) == 0) {
+                               return *entry;
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+
+/*
+ * input filter: converts script text from the script encoding into the
+ * encoding the scanner works on. that target is the internal encoding when
+ * one is set and flagged compatible, and "UTF-8" otherwise.
+ * the result is whatever zend_multibyte_encoding_filter() returns.
+ */
+ZEND_API int zend_multibyte_script_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC)
+{
+       const char *target = "UTF-8";
+
+       if (LANG_SCNG(internal_encoding) != NULL && LANG_SCNG(internal_encoding)->compatible != 0) {
+               target = LANG_SCNG(internal_encoding)->name;
+       }
+
+       return zend_multibyte_encoding_filter(to, to_length, target, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
+}
+
+/*
+ * output filter: converts scanned text into the internal encoding.
+ * the source is the script encoding when that is flagged compatible, and
+ * "UTF-8" otherwise (the form the input filter produced in that case).
+ * the result is whatever zend_multibyte_encoding_filter() returns.
+ */
+ZEND_API int zend_multibyte_internal_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC)
+{
+       const char *source = "UTF-8";
+
+       if (LANG_SCNG(script_encoding)->compatible != 0) {
+               source = LANG_SCNG(script_encoding)->name;
+       }
+
+       return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, source TSRMLS_CC);
+}
+
+/*
+ * common worker for the two generic filters above.
+ *
+ * when an "oddlen" callback is registered, the number of bytes it reports is
+ * cut off the end of the input before converting. the registered converter
+ * then does the actual work.
+ *
+ * returns the number of input bytes handed to the converter, or 0 when no
+ * converter is registered or the converter reported a failure.
+ */
+static int zend_multibyte_encoding_filter(char **to, int *to_length, const char *to_encoding, const char *from, int from_length, const char *from_encoding TSRMLS_DC)
+{
+       if (CG(encoding_converter) == NULL) {
+               return 0;
+       }
+
+       if (CG(encoding_oddlen) != NULL) {
+               int trailing = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);
+
+               if (trailing > 0) {
+                       from_length -= trailing;
+               }
+       }
+
+       if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) == 0) {
+               return from_length;
+       }
+
+       return 0;
+}
+
+
+/*
+ *     Shift_JIS Input/Output Filter
+ *
+ *     table_sjis is indexed by the raw byte value (all 256 entries exist).
+ *     sjis_input_filter() consults it only for bytes with the high bit set:
+ *       1 -> emitted as 0x8e followed by the byte itself (1 byte kana)
+ *       2 -> first byte of a 2 byte kanji code
+ *     every other value is routed to the user defined character branch.
+ */
+static const unsigned char table_sjis[] = { /* value 2 marks 0x80-0x9f,0xE0-0xEF */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
+};
+
+/*
+ * converts Shift_JIS text into an EUC-JP flavoured form for the scanner.
+ *
+ * buf / length receive a newly emalloc()ed, NUL terminated buffer and the
+ * number of bytes stored in it. sjis / sjis_length describe the input, which
+ * is read up to the first NUL byte or sjis_length bytes, whichever comes
+ * first. a lead byte that has no second byte inside the input is copied
+ * through unchanged and ends the conversion.
+ *
+ * returns the length of the converted text, or 0 if the allocation failed.
+ */
+int sjis_input_filter(char **buf, int *length, const char *sjis, int sjis_length TSRMLS_DC)
+{
+       const unsigned char *p, *end;
+       unsigned char *q;
+       unsigned char c1, c2;
+
+       /*
+        * worst case growth is 2x: every 1 byte kana becomes 0x8e + kana.
+        * (2 byte user defined chars grow to 3 bytes, kanji stay at 2 bytes.)
+        * the former 3/2 estimate overflowed the buffer on kana-heavy input.
+        */
+       *buf = (char*)emalloc(sjis_length*2+1);
+       if (!*buf)
+               return 0;
+       *length = 0;
+
+       p = (const unsigned char*)sjis;
+       end = p + sjis_length;
+       q = (unsigned char*)*buf;
+
+       /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */
+       while (p < end && *p) {         /* bound first, so *p is never read past the input */
+               if (!(*p & 0x80)) {
+                       *q++ = *p++;
+                       continue;
+               }
+
+               /* handling 8 bit code */
+               if (table_sjis[*p] == 1) {
+                       /* 1 byte kana */
+                       *q++ = 0x8e;
+                       *q++ = *p++;
+                       continue;
+               }
+
+               /* a lead byte needs a second byte inside the input */
+               if (end - p < 2 || !p[1]) {
+                       *q++ = *p++;
+                       break;
+               }
+
+               c1 = *p++;
+               c2 = *p++;
+
+               if (table_sjis[c1] == 2) {
+                       /* 2 byte kanji code */
+                       c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
+               } else {
+                       /*
+                        * for user defined chars (ATTENTION)
+                        *
+                        * THESE ARE NOT CODE FOR CONVERSION! :-P
+                        * (using *ILLEGALLY* 3byte EUC-JP space)
+                        *
+                        * we cannot perfectly (== 1 to 1) convert these chars to EUC-JP.
+                        * so, these code are for perfect RESTORING in sjis_output_filter()
+                        *
+                        * MAP TO (EUC-JP):
+                        * type A: 0xeba1 - 0xf4fe
+                        * type B: 0xf5a1 - 0xfefe
+                        * type C: 0xa1a1 - 0xa6fe
+                        */
+                       *q++ = 0x8f;
+                       c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
+               }
+
+               /* row/cell arithmetic shared by both 2 byte cases */
+               c1 = (c1 << 1) + 1;
+               if (c2 >= 0x9e) {
+                       c2 -= 0x7e;
+                       c1++;
+               } else if (c2 > 0x7f) {
+                       c2 -= 0x20;
+               } else {
+                       c2 -= 0x1f;
+               }
+
+               c1 |= 0x80;
+               c2 |= 0x80;
+
+               *q++ = c1;
+               *q++ = c2;
+       }
+       *q = '\0';
+       *length = (char*)q - *buf;
+
+       return *length;
+}
+
+static const unsigned char table_eucjp[] = { /* indexed by byte value; 2 marks 0x8e and 0xA1-0xFE, 3 marks 0x8f */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
+};
+
+/*
+ * restores Shift_JIS text from the EUC-JP form produced by sjis_input_filter().
+ *
+ * sjis / sjis_length receive a newly emalloc()ed, NUL terminated buffer and
+ * the number of bytes stored in it. buf / length describe the EUC-JP input.
+ * conversion stops at the first NUL byte or after 'length' bytes, whichever
+ * comes first; since the output never grows, the length+1 byte buffer
+ * cannot be overrun. a multibyte sequence that is cut off at the end of the
+ * input is left unconsumed.
+ *
+ * returns the number of input bytes actually consumed, or 0 when an output
+ * argument is missing or the allocation failed.
+ */
+int sjis_output_filter(char **sjis, int *sjis_length, const char *buf, int length TSRMLS_DC)
+{
+       unsigned char c1, c2;
+       unsigned char *p;
+       const unsigned char *q, *end;
+
+       if (!sjis || !sjis_length) {
+               return 0;
+       }
+
+       /* always Shift_JIS <= EUC-JP */
+       *sjis = (char*)emalloc(length+1);
+       if (!*sjis) {           /* test the allocation, not the out-parameter */
+               return 0;
+       }
+       p = (unsigned char*)*sjis;
+       q = (const unsigned char*)buf;
+       end = q + length;
+
+       /* restore converted strings [EUC-JP -> Shift_JIS] */
+       while (q < end && *q) {
+               if (!(*q & 0x80)) {
+                       *p++ = *q++;
+                       continue;
+               }
+
+               /* hankaku kana */
+               if (*q == 0x8e) {
+                       q++;
+                       if (q < end && *q) {
+                               *p++ = *q++;
+                       }
+                       continue;
+               }
+
+               /* 2 byte kanji code */
+               if (table_eucjp[*q] == 2) {
+                       if (end - q < 2 || !q[1]) {
+                               /* truncated: leave the lead byte unconsumed */
+                               break;
+                       }
+                       c1 = *q++ & 0x7f;
+                       c2 = *q++ & 0x7f;
+
+                       c2 += (c1 & 0x01) ? 0x1f : 0x7d;
+                       if (c2 >= 0x7f) {
+                               c2++;
+                       }
+                       c1 = ((c1 - 0x21) >> 1) + 0x81;
+                       if (c1 > 0x9f) {
+                               c1 += 0x40;
+                       }
+
+                       *p++ = c1;
+                       *p++ = c2;
+                       continue;
+               }
+
+               /* 3 byte form used by sjis_input_filter() for user defined chars */
+               if (*q == 0x8f) {
+                       if (end - q < 3 || !q[1] || !q[2]) {
+                               /* truncated: leave the whole sequence unconsumed */
+                               break;
+                       }
+                       q++;
+                       c1 = *q++ & 0x7f;
+                       c2 = *q++ & 0x7f;
+
+                       c2 += (c1 & 0x01) ? 0x1f : 0x7d;
+                       if (c2 >= 0x7f) {
+                               c2++;
+                       }
+                       c1 = ((c1 - 0x21) >> 1) + 0x81;
+                       if (c1 > 0x9f) {
+                               c1 += 0x40;
+                       }
+
+                       /* undo the lead byte shift applied by sjis_input_filter() */
+                       if (c1 >= 0x81 && c1 <= 0x9f) {
+                               c1 += 0x79;
+                       } else {
+                               c1 += 0x0a;
+                       }
+
+                       *p++ = c1;
+                       *p++ = c2;
+                       continue;
+               }
+
+               /* some other chars (may not happen) */
+               *p++ = *q++;
+       }
+       *p = '\0';
+       *sjis_length = (char*)p - *sjis;
+
+       return (int)((const char*)q - buf);     /* return length we actually read */
+}
+
+
+/*
+ * joins the names of the given encodings into one comma separated string.
+ *
+ * entries whose name is NULL are skipped. returns a newly emalloc()ed string,
+ * or NULL when the list is missing/empty, no entry has a name, or the
+ * allocation failed.
+ */
+static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, int encoding_list_size)
+{
+       int i, total = 0, written = 0;
+       const char *name;
+       char *list, *cursor;
+
+       if (!encoding_list || !encoding_list_size) {
+               return NULL;
+       }
+
+       /* every name needs its own bytes plus one for a ',' or the final NUL */
+       for (i = 0; i < encoding_list_size; i++) {
+               name = encoding_list[i]->name;
+               if (name) {
+                       total += strlen(name) + 1;
+               }
+       }
+       if (total == 0) {
+               return NULL;
+       }
+
+       list = (char*)emalloc(total);
+       if (!list) {
+               return NULL;
+       }
+
+       cursor = list;
+       for (i = 0; i < encoding_list_size; i++) {
+               size_t len;
+
+               name = encoding_list[i]->name;
+               if (!name) {
+                       continue;
+               }
+               if (written++ > 0) {
+                       *cursor++ = ',';
+               }
+               len = strlen(name);
+               memcpy(cursor, name, len);
+               cursor += len;
+       }
+       *cursor = '\0';
+
+       return list;
+}
+
+
+/*
+ * parses a comma separated list of encoding names.
+ *
+ * encoding_list need not be NUL terminated; exactly encoding_list_size bytes
+ * are examined. spaces and tabs around each name are ignored, and names that
+ * zend_multibyte_fetch_encoding() does not recognise are silently dropped.
+ *
+ * on success *result receives a newly ecalloc()ed array of descriptors and
+ * *result_size the number of entries filled in (possibly 0), and 0 is
+ * returned. on bad arguments or allocation failure -1 is returned and the
+ * output parameters are left untouched.
+ */
+static int zend_multibyte_parse_encoding_list(const char *encoding_list, int encoding_list_size, zend_encoding ***result, int *result_size)
+{
+       int n, size;
+       char *tmpstr, *endp, *item, *item_end, *comma;
+       zend_encoding **list, *encoding;
+
+       if (encoding_list == NULL || encoding_list_size <= 0) {
+               return -1;
+       }
+
+       /* copy the encoding_list string for work */
+       tmpstr = (char *)estrndup(encoding_list, encoding_list_size);
+       if (tmpstr == NULL) {
+               return -1;
+       }
+       endp = tmpstr + encoding_list_size;
+
+       /* count the number of listed encoding names */
+       size = 1;
+       for (item = tmpstr; (comma = zend_memnstr(item, ",", 1, endp)) != NULL; item = comma + 1) {
+               size++;
+       }
+
+       /* make list */
+       list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*));
+       if (list == NULL) {
+               efree(tmpstr);          /* do not leak the working copy */
+               return -1;
+       }
+
+       n = 0;
+       item = tmpstr;
+       for (;;) {
+               comma = zend_memnstr(item, ",", 1, endp);
+               item_end = (comma != NULL) ? comma : endp;
+               *item_end = '\0';       /* endp is the terminator estrndup() added */
+
+               /* trim spaces */
+               while (item < item_end && (*item == ' ' || *item == '\t')) {
+                       item++;
+               }
+               while (item_end > item && (item_end[-1] == ' ' || item_end[-1] == '\t')) {
+                       *--item_end = '\0';
+               }
+
+               /* look the name up and keep it if it is known */
+               encoding = zend_multibyte_fetch_encoding(item);
+               if (encoding) {
+                       list[n++] = encoding;
+               }
+
+               if (comma == NULL || n >= size) {
+                       break;
+               }
+               item = comma + 1;
+       }
+
+       *result = list;
+       *result_size = n;
+       efree(tmpstr);
+
+       return 0;
+}
+
+
+/*
+ * works out which encoding the current script is written in.
+ *
+ * precedence: an explicit onetime_encoding, then whatever
+ * zend_multibyte_detect_unicode() reports, then the configured script
+ * encoding list. with more than one configured candidate and a registered
+ * detector, the detector chooses among them; otherwise the first candidate
+ * is used. returns NULL when nothing is configured or the detector could
+ * not decide.
+ */
+static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
+{
+       zend_encoding *found;
+       char *candidates, *detected;
+
+       /* onetime_encoding is prior to everything */
+       if (onetime_encoding != NULL) {
+               return onetime_encoding;
+       }
+
+       /* bom or wchar detection is prior to 'script_encoding' option */
+       found = zend_multibyte_detect_unicode(TSRMLS_C);
+       if (found != NULL) {
+               return found;
+       }
+
+       /* if no script_encoding specified, just leave alone */
+       if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) {
+               return NULL;
+       }
+
+       /* a single candidate, or no detector to ask: take the first entry */
+       if (CG(script_encoding_list_size) <= 1 || !CG(encoding_detector)) {
+               return CG(script_encoding_list)[0];
+       }
+
+       /* if multiple encodings specified, detect automagically */
+       candidates = zend_multibyte_assemble_encoding_list(CG(script_encoding_list), CG(script_encoding_list_size));
+       detected = CG(encoding_detector)(LANG_SCNG(script_org), LANG_SCNG(script_org_size), candidates TSRMLS_CC);
+       if (candidates) {
+               efree(candidates);
+       }
+
+       if (!detected) {
+               return NULL;
+       }
+       found = zend_multibyte_fetch_encoding(detected);
+       efree(detected);
+
+       return found;
+}
+
+
+/*
+ * Look for a Unicode byte order mark at the start of LANG_SCNG(script_org).
+ *
+ * When a BOM is found it is stripped: script_org is replaced by a freshly
+ * allocated copy without the mark, script_org_size is reduced accordingly,
+ * and the matching encoding is returned.
+ *
+ * Without a BOM, a script that contains NUL bytes is handed to
+ * zend_multibyte_detect_utf_encoding() for a best-effort guess; otherwise
+ * NULL is returned.  Scripts shorter than the UTF-32 BOM are not examined.
+ */
+static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
+{
+       /* tried in order: the UTF-32 marks must precede the UTF-16 ones because
+        * the UTF-32LE mark starts with the UTF-16LE mark */
+       static const struct {
+               const char *mark;
+               int length;
+               zend_encoding *encoding;
+       } known_boms[] = {
+               { BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1, &encoding_utf32be },
+               { BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1, &encoding_utf32le },
+               { BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1, &encoding_utf16be },
+               { BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1, &encoding_utf16le },
+               { BOM_UTF8,     sizeof(BOM_UTF8)-1,     &encoding_utf8 }
+       };
+       zend_encoding *found = NULL;
+       int skip = 0;
+       char *stripped;
+       unsigned int i;
+
+       if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
+               return NULL;
+       }
+
+       /* check out BOM */
+       for (i = 0; i < sizeof(known_boms)/sizeof(known_boms[0]); i++) {
+               if (memcmp(LANG_SCNG(script_org), known_boms[i].mark, known_boms[i].length) == 0) {
+                       found = known_boms[i].encoding;
+                       skip = known_boms[i].length;
+                       break;
+               }
+       }
+
+       if (found == NULL) {
+               /* no BOM: NUL bytes in the script still hint at UTF-16/UTF-32 */
+               if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
+                       /* make best effort if BOM is missing */
+                       return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
+               }
+               return NULL;
+       }
+
+       /* remove BOM; the copy takes one byte beyond script_org_size along
+        * (presumably the buffer's terminating NUL -- confirm against the code
+        * that fills script_org) */
+       stripped = (char*)emalloc(LANG_SCNG(script_org_size)+1-skip);
+       memcpy(stripped, LANG_SCNG(script_org)+skip, LANG_SCNG(script_org_size)+1-skip);
+       efree(LANG_SCNG(script_org));
+       LANG_SCNG(script_org) = stripped;
+       LANG_SCNG(script_org_size) -= skip;
+
+       return found;
+}
+
+/*
+ * Best-effort guess of the UTF-16/UTF-32 flavour of a script without a BOM.
+ *
+ * script      - raw script bytes
+ * script_size - number of valid bytes in script
+ *
+ * Step 1 picks the code unit width: a NUL byte immediately followed by two
+ * more NUL bytes is taken as a sign of UTF-32, otherwise UTF-16 is assumed.
+ * Step 2 picks the byte order from the first code unit that has exactly one
+ * of its first/last bytes equal to NUL; big endian is the default when no
+ * such unit exists.
+ *
+ * Always returns one of the UTF-16/UTF-32 LE/BE encodings, never NULL.
+ * All reads stay inside script[0 .. script_size-1].
+ */
+static zend_encoding* zend_multibyte_detect_utf_encoding(char *script, int script_size TSRMLS_DC)
+{
+       int wchar_size = 2;
+       int le = 0;
+       int pos;
+
+       /* utf-16 or utf-32? */
+       pos = 0;
+       /* at least three bytes must remain, both so that hit[1] and hit[2] are
+        * readable and so that the length given to memchr() can never become
+        * negative (it would wrap to a huge size_t) */
+       while (script_size - pos > 2) {
+               char *hit = memchr(script + pos, 0, script_size - pos - 2);
+
+               if (!hit) {
+                       break;
+               }
+               if (hit[1] == '\0' && hit[2] == '\0') {
+                       wchar_size = 4;
+                       break;
+               }
+
+               /* searching for UTF-32 specific byte orders, so this will do */
+               pos = (int)(hit - script) + 4;
+       }
+
+       /* BE or LE? only complete code units are inspected */
+       for (pos = 0; script_size - pos >= wchar_size; pos += wchar_size) {
+               char first = script[pos];
+               char last = script[pos + wchar_size - 1];
+
+               if (first == '\0' && last != '\0') {
+                       /* BE */
+                       le = 0;
+                       break;
+               }
+               if (first != '\0' && last == '\0') {
+                       /* LE */
+                       le = 1;
+                       break;
+               }
+       }
+
+       if (wchar_size == 2) {
+               return le ? &encoding_utf16le : &encoding_utf16be;
+       }
+       return le ? &encoding_utf32le : &encoding_utf32be;
+}
+#endif /* ZEND_MULTIBYTE */
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4 ts=4 tw=78
+ * vim<600: sw=4 ts=4 tw=78
+ */
Index: php-src/ext/mbstring/mbstring.c
diff -u php-src/ext/mbstring/mbstring.c:1.186 php-src/ext/mbstring/mbstring.c:1.187
--- php-src/ext/mbstring/mbstring.c:1.186       Fri Aug  8 05:51:28 2003
+++ php-src/ext/mbstring/mbstring.c     Mon Aug 11 01:24:42 2003
@@ -17,7 +17,7 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: mbstring.c,v 1.186 2003/08/08 09:51:28 moriyoshi Exp $ */
+/* $Id: mbstring.c,v 1.187 2003/08/11 05:24:42 fujimoto Exp $ */
 
 /*
  * PHP4 Multibyte String module "mbstring"
@@ -848,6 +848,9 @@
 #if HAVE_MBREGEX
        PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
 #endif
+#ifdef ZEND_MULTIBYTE
+       php_mb_set_zend_encoding(TSRMLS_C);
+#endif /* ZEND_MULTIBYTE */
 
        return SUCCESS;
 }
@@ -982,7 +985,10 @@
                } else {
                        MBSTRG(current_internal_encoding) = no_encoding;
 #ifdef ZEND_MULTIBYTE
-                       zend_multibyte_set_internal_encoding(Z_STRVAL_PP(arg1), 
Z_STRLEN_PP(arg1) TSRMLS_CC);
+                       /* TODO: make independent from mbstring.encoding_translation? 
*/
+                       if (MBSTRG(encoding_translation)) {
+                               zend_multibyte_set_internal_encoding(name, name_len 
TSRMLS_CC);
+                       }
 #endif /* ZEND_MULTIBYTE */
                        RETURN_TRUE;
                }
@@ -3366,7 +3372,7 @@
        int n, *entry, list_size = 0;
        zend_encoding_detector encoding_detector;
        zend_encoding_converter encoding_converter;
-       zend_multibyte_oddlen multibyte_oddlen;
+       zend_encoding_oddlen encoding_oddlen;
 
        /* notify script encoding to Zend Engine */
        entry = MBSTRG(script_encoding_list);
@@ -3392,19 +3398,17 @@
                efree(list);
        }
        encoding_detector = php_mb_encoding_detector;
-       encoding_converter = NULL;
-       multibyte_oddlen = php_mb_oddlen;
+       encoding_converter = php_mb_encoding_converter;
+       encoding_oddlen = php_mb_oddlen;
 
+       /* TODO: make independent from mbstring.encoding_translation? */
        if (MBSTRG(encoding_translation)) {
                /* notify internal encoding to Zend Engine */
                name = (char*)mbfl_no_encoding2name(MBSTRG(current_internal_encoding));
                zend_multibyte_set_internal_encoding(name, strlen(name) TSRMLS_CC);
-
-               encoding_converter = php_mb_encoding_converter;
        }
 
-       zend_multibyte_set_functions(encoding_detector, encoding_converter,
-                       multibyte_oddlen TSRMLS_CC);
+       zend_multibyte_set_functions(encoding_detector, encoding_converter, 
encoding_oddlen TSRMLS_CC);
 
        return 0;
 }
Index: php-src/main/main.c
diff -u php-src/main/main.c:1.563 php-src/main/main.c:1.564
--- php-src/main/main.c:1.563   Sat Aug  9 19:15:40 2003
+++ php-src/main/main.c Mon Aug 11 01:24:42 2003
@@ -18,7 +18,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: main.c,v 1.563 2003/08/09 23:15:40 iliaa Exp $ */
+/* $Id: main.c,v 1.564 2003/08/11 05:24:42 fujimoto Exp $ */
 
 /* {{{ includes
  */
@@ -90,10 +90,6 @@
 #include "php_logos.h"
 #include "php_streams.h"
 
-#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
-#include "ext/mbstring/mbstring.h"
-#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
-
 #include "SAPI.h"
 #include "rfc1867.h"
 /* }}} */
@@ -1564,9 +1560,6 @@
                } else {
                        append_file_p = NULL;
                }
-#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
-               php_mb_set_zend_encoding(TSRMLS_C);
-#endif /* ZEND_MULTIBYTE && HAVE_MBSTRING */
 #ifdef PHP_WIN32
                zend_unset_timeout(TSRMLS_C);
 #endif

Index: ZendEngine2/zend_multibyte.h
+++ ZendEngine2/zend_multibyte.h
/*
   +----------------------------------------------------------------------+
   | Zend Engine                                                          |
   +----------------------------------------------------------------------+
   | Copyright (c) 1998-2003 Zend Technologies Ltd. (http://www.zend.com) |
   +----------------------------------------------------------------------+
   | This source file is subject to version 2.00 of the Zend license,     |
   | that is bundled with this package in the file LICENSE, and is        | 
   | available at through the world-wide-web at                           |
   | http://www.zend.com/license/2_00.txt.                                |
   | If you did not receive a copy of the Zend license and are unable to  |
   | obtain it through the world-wide-web, please send a note to          |
   | [EMAIL PROTECTED] so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
   | Authors: Masaki Fujimoto <[EMAIL PROTECTED]>                          |
   |          Rui Hirokawa <[EMAIL PROTECTED]>                             |
   +----------------------------------------------------------------------+
*/

/* $Id: zend_multibyte.h,v 1.1 2003/08/11 05:24:41 fujimoto Exp $ */

#ifndef ZEND_MULTIBYTE_H
#define ZEND_MULTIBYTE_H

#ifdef ZEND_MULTIBYTE

#define BOM_UTF32_BE    "\x00\x00\xfe\xff"
#define BOM_UTF32_LE    "\xff\xfe\x00\x00"
#define BOM_UTF16_BE    "\xfe\xff"
#define BOM_UTF16_LE    "\xff\xfe"
#define BOM_UTF8                "\xef\xbb\xbf"

/*
 * Filter callback; zend_encoding stores one as its input filter and one as
 * its output filter.
 * NOTE(review): presumably reads buf/length and hands the result back through
 * *str / *str_length -- confirm against the filter implementations.
 */
typedef int (*zend_encoding_filter)(char **str, int *str_length, const char *buf, int 
length TSRMLS_DC);

/*
 * Encoding detector callback.  zend_multibyte_find_script_encoding() calls it
 * with the script buffer, the script size and an assembled encoding list; the
 * returned encoding name may be NULL and is released with efree() by that
 * caller.
 */
typedef char* (*zend_encoding_detector)(const char *string, int length, char *list 
TSRMLS_DC);

/*
 * Encoding converter callback.
 * NOTE(review): judging by the parameter names it converts from/from_length
 * from encoding_from to encoding_to and returns the result through
 * *to / *to_length -- confirm ownership of *to and the meaning of the int
 * return value.
 */
typedef int (*zend_encoding_converter)(char **to, int *to_length, const char *from, 
int from_length, const char *encoding_to, const char *encoding_from TSRMLS_DC);

/*
 * "Odd length" callback for a string in the given encoding.
 * NOTE(review): exact semantics of the return value are not visible here --
 * confirm against the implementation installed by mbstring (php_mb_oddlen).
 */
typedef int (*zend_encoding_oddlen)(const char *string, int length, const char 
*encoding TSRMLS_DC);

/*
 * Description of one script/internal encoding known to the engine:
 * its name and aliases, the optional escape filters applied to input and
 * output, and whether the flex-generated scanner can handle it.
 */
typedef struct _zend_encoding {
        zend_encoding_filter input_filter;              /* escape input filter */
        zend_encoding_filter output_filter;             /* escape output filter */
        const char *name;                                       /* encoding name */
        const char *(*aliases)[];                       /* encoding name aliases (pointer to an array of names) */
        int compatible;                                         /* flex compatible or not */
} zend_encoding;


/*
 * zend multibyte APIs
 */
BEGIN_EXTERN_C()
/* NOTE(review): presumably takes the script encoding list as text plus its length -- confirm */
ZEND_API int zend_multibyte_set_script_encoding(char *encoding_list, int 
encoding_list_size TSRMLS_DC);
/* mbstring calls this with an encoding name and that name's length */
ZEND_API int zend_multibyte_set_internal_encoding(char *encoding_name, int 
encoding_name_size TSRMLS_DC);
/* mbstring passes its detector, converter and oddlen callbacks through this call */
ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, 
zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen 
TSRMLS_DC);
ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC);
/* looks up an encoding by name; callers in zend_multibyte.c check the result for NULL */
ZEND_API zend_encoding* zend_multibyte_fetch_encoding(char *encoding_name);
ZEND_API int zend_multibyte_script_encoding_filter(char **to, int *to_length, const 
char *from, int from_length TSRMLS_DC);
ZEND_API int zend_multibyte_internal_encoding_filter(char **to, int *to_length, const 
char *from, int from_length TSRMLS_DC);

/* in zend_language_scanner.l */
ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, 
zend_encoding *old_encoding TSRMLS_DC);
ZEND_API int zend_multibyte_yyinput(zend_file_handle *file_handle, char *buf, size_t 
len TSRMLS_DC);
ZEND_API int zend_multibyte_read_script(TSRMLS_D);
END_EXTERN_C()

#endif /* ZEND_MULTIBYTE */

#endif /* ZEND_MULTIBYTE_H */

/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * End:
 * vim600: sw=4 ts=4 tw=78
 * vim<600: sw=4 ts=4 tw=78
 */

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to