Good idea. I assume you want this for WidgetHTML.php? ;-)
- Stig
On Wed, 2002-06-12 at 14:26, Alan Knowles wrote:
> Attached, hopefully, is the re2c source for an HTML tokenizer - I added it
> to tokenizer.c - any thoughts on inclusion?
>
> regards
> alan
>
> ----
>
>
> enum { /* Scanner states of the HTML tokenizer; each one has a matching state_* label in token_html */
> STATE_PLAIN = 0, /* plain text outside of any tag */
> STATE_TAG, /* just after "<", reading the tag name */
> STATE_NEXT_ARG, /* inside a tag, between attributes */
> STATE_ARG, /* reading an attribute name */
> STATE_BEFORE_VAL, /* after an attribute name, looking for "=" */
> STATE_VAL /* reading an attribute value */
> };
>
> /*!re2c
> any = [\000-\377];
> N = (any\[<]);
> alpha = [a-zA-Z];
> alphanumeric = [a-zA-Z0-9];
> */ /* Named patterns defined above: any = any byte (0-255); N = any byte except "<"; alpha = ASCII letter; alphanumeric = ASCII letter or digit */
>
>
>
> #define YYFILL(n) goto stop /* no refilling: jump to the stop label instead of fetching more input */
> #define YYCTYPE unsigned char /* character type used by the generated scanner; NOTE(review): xp/end/q are declared as plain char * in token_html - confirm the signedness mismatch is harmless */
> #define YYCURSOR xp /* current scan position */
> #define YYLIMIT end /* end of the input buffer (source + source_len) */
> #define YYMARKER q /* position saved by the generated scanner for backtracking */
> #define STATE state /* variable holding the current STATE_* value */
>
> PHP_FUNCTION(token_html) /* Tokenizes one string argument as HTML into the array return_value: plain text runs are appended as strings, tags as nested arrays (tag name first, then one array per attribute) */
> {
> char *source = NULL; /* input string received from the caller */
> int argc = ZEND_NUM_ARGS();
> int source_len; /* length of source in bytes, filled in by zend_parse_parameters */
> int state; /* current STATE_* value, accessed through the STATE macro */
> char *end, *q; /* end: one past the last input byte (YYLIMIT); q: used as YYMARKER */
> char *xp; /* scan cursor (YYCURSOR) */
> char *start; /* first byte of the token currently being matched */
> zval *tag, *attribute; /* arrays under construction for the current tag and the current attribute */
>
> if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) ==
>FAILURE)
> return; /* argument parsing failed: nothing is tokenized */
>
> YYCURSOR = source;
> YYLIMIT = source + source_len;
> STATE = STATE_PLAIN;
>
> array_init(return_value);
> switch (STATE) { /* STATE was just set to STATE_PLAIN, so only the first case is taken here; NOTE(review): the other cases look like groundwork for resuming in another state - confirm intent */
> case STATE_PLAIN: goto state_plain;
> case STATE_TAG: goto state_tag;
> case STATE_NEXT_ARG: goto state_next_arg;
> case STATE_ARG: goto state_arg;
> case STATE_BEFORE_VAL: goto state_before_val;
> case STATE_VAL: goto state_val;
> }
>
> /*
>
> I need to split the stuff into:
> array ( "TAG", array("name"=>"value","name"=>"value"))
> or
> string
>
>
> add_next_index_zval(return_value, tag);handle_tag(STD_ARGS);
> */
>
>
>
> state_plain_begin: /* entry used when another state hands control back to plain text */
> STATE = STATE_PLAIN;
>
> state_plain:
> start = YYCURSOR; /* rules below: "<" switches to state_tag; a run of non-"<" bytes is appended to return_value as one string */
> /*!re2c
> "<" { STATE = STATE_TAG; goto state_tag; }
> N+ { add_next_index_stringl(return_value, start , xp -
>start , 1); goto state_plain; }
> */
>
> state_tag: /* reached right after a "<" has been consumed */
> start = YYCURSOR; /* rules below: an optional "/" or "!" followed by alphanumerics creates the tag array holding that name; "!--" creates a tag array and enters the comment states; any other byte emits a literal "<" and is rescanned as plain text */
>
> // start -> xp contains current pos,
> // needs to deal with comments !-- and ?xml or php etc.
> /*!re2c
> [/!]? alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag);
>add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; }
> "!" "-" "-" { MAKE_STD_ZVAL(tag); array_init(tag);
>add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; }
> any { add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR;
>goto state_plain_begin; }
> */
>
>
>
> state_comment_begin: /* entered after "!--" was matched */
> start = YYCURSOR; /* first byte of the comment body */
>
> state_comment_next: /* start is deliberately not reset here, so it keeps pointing at the beginning of the comment body while bytes are consumed one at a time; on "-->" the body (without those 3 bytes) is appended to tag and tag to return_value */
>
> /*!re2c
> "-" "-" ">" { add_next_index_stringl(tag, start, xp - start -3, 1);
>add_next_index_zval(return_value, tag); goto state_plain_begin; }
> any { goto state_comment_next; }
> */
>
> state_next_arg_begin:
> STATE = STATE_NEXT_ARG;
>
> // at first bit after < or just after a name or name='xxxx'
> state_next_arg:
> start = YYCURSOR; /* rules below: ">" appends the finished tag to return_value; whitespace is skipped; a letter is pushed back and handed to state_arg; "/" and stray quoted strings are stored as attribute arrays of their own; any other byte also ends the tag - NOTE(review): that byte is consumed and not pushed back, confirm it is meant to be dropped */
> /*!re2c
> ">" { add_next_index_zval(return_value, tag); goto state_plain_begin; }
> [ \v\t\n]+ { goto state_next_arg; }
> alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; }
> "/" { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag,
>attribute); goto state_next_arg; }
> ["] (any\["])* ["] { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1);
>add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute);
>goto state_next_arg_begin; }
> ['] (any\['])* ['] { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1);
>add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute);
>goto state_next_arg_begin; }
> any { add_next_index_zval(return_value, tag); goto state_plain_begin; }
> */
>
> state_arg: /* reading an attribute name */
> start = YYCURSOR; /* rules below: a run of ASCII letters creates a new attribute array holding the name; any other byte is pushed back and control returns to state_next_arg - NOTE(review): that rule sets STATE to STATE_ARG while jumping to state_next_arg, and names containing digits or "-" are cut at the first such byte; confirm both */
> /*!re2c
> alpha+ { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL;
>goto state_before_val; }
> any { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; }
> */
>
> state_before_val: /* just after an attribute name */
> start = YYCURSOR; /* rules below: "=" with optional surrounding spaces leads to state_val; any other byte closes the attribute with only its name, is pushed back, and scanning continues with the next attribute */
> /*!re2c
> [ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; }
> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto
>state_next_arg_begin; }
> */
>
>
> state_val: /* reading an attribute value */
> start = YYCURSOR; /* rules below: a quoted value is stored without its quotes, followed by a second entry holding the quote character used; a bare value runs up to a space, newline, ">" or quote; any other byte closes the attribute without a value and is pushed back */
> /*!re2c
> ["] (any\["])* ["] { add_next_index_stringl(attribute, start + 1, xp - start -2,
>1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag,
>attribute); goto state_next_arg_begin; }
> ['] (any\['])* ['] { add_next_index_stringl(attribute, start + 1, xp - start -2,
>1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag,
>attribute); goto state_next_arg_begin; }
> (any\[ \n>"'])+ { add_next_index_stringl(attribute, start, xp - start, 1);
>add_next_index_zval(tag, attribute); goto state_next_arg_begin; }
> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto
>state_next_arg_begin; }
> */
>
> stop: /* target of YYFILL, i.e. reached when the scanner runs out of input; NOTE(review): a tag/attribute array still under construction is not added to return_value here, and a label placed directly before the closing brace is rejected by pre-C23 compilers - confirm a statement is added */
> // should do a bit of checking - adding loose attribute or tags to return
>value....
>
>
> }
>
> ----
>
> --
> PHP Development Mailing List <http://www.php.net/>
> To unsubscribe, visit: http://www.php.net/unsub.php
--
Stig Sæther Bakken, Fast Search & Transfer ASA, Trondheim, Norway
http://pear.php.net/wishlist.php/ssb
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php