andrei Sat Feb 11 00:16:43 2006 UTC Modified files: /php-src/ext/unicode unicode_iterators.c Log: Implement character/word/line/sentence iterators and the reverse counterparts. http://cvs.php.net/viewcvs.cgi/php-src/ext/unicode/unicode_iterators.c?r1=1.18&r2=1.19&diff_format=u Index: php-src/ext/unicode/unicode_iterators.c diff -u php-src/ext/unicode/unicode_iterators.c:1.18 php-src/ext/unicode/unicode_iterators.c:1.19 --- php-src/ext/unicode/unicode_iterators.c:1.18 Fri Feb 10 00:23:29 2006 +++ php-src/ext/unicode/unicode_iterators.c Sat Feb 11 00:16:43 2006 @@ -14,7 +14,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: unicode_iterators.c,v 1.18 2006/02/10 00:23:29 andrei Exp $ */ +/* $Id: unicode_iterators.c,v 1.19 2006/02/11 00:16:43 andrei Exp $ */ /* * TODO @@ -28,11 +28,16 @@ #include "php.h" #include "zend_interfaces.h" #include "zend_exceptions.h" +#include <unicode/ubrk.h> typedef enum { ITER_CODE_UNIT, ITER_CODE_POINT, ITER_COMB_SEQUENCE, + ITER_CHARACTER, + ITER_WORD, + ITER_LINE, + ITER_SENTENCE, ITER_TYPE_LAST, } text_iter_type; @@ -60,6 +65,12 @@ int32_t start; int32_t end; } cs; + struct { + UBreakIterator *iter; + int32_t index; + int32_t start; + int32_t end; + } brk; } u; } text_iter_obj; @@ -76,6 +87,13 @@ void (*rewind) (text_iter_obj* object TSRMLS_DC); } text_iter_ops; +enum UBreakIteratorType brk_type_map[] = { + UBRK_CHARACTER, + UBRK_WORD, + UBRK_LINE, + UBRK_SENTENCE, +}; + PHPAPI zend_class_entry* text_iterator_aggregate_ce; PHPAPI zend_class_entry* text_iterator_ce; PHPAPI zend_class_entry* rev_text_iterator_ce; @@ -276,12 +294,95 @@ }; +/* UBreakIterator Character Ops */ + +static int text_iter_brk_char_valid(text_iter_obj* object TSRMLS_DC) +{ + if (object->flags & ITER_REVERSE) { + return (object->u.brk.start != UBRK_DONE); + } else { + return (object->u.brk.end != UBRK_DONE); + } +} + +static void text_iter_brk_char_current(text_iter_obj* object TSRMLS_DC) +{ + uint32_t length; + int32_t start 
= object->u.brk.start; + int32_t end = object->u.brk.end; + + if (object->flags & ITER_REVERSE) { + if (end == UBRK_DONE) { + end = object->text_len; + } + } else { + if (start == UBRK_DONE) { + start = 0; + } + } + length = end - start; + if (length > object->current_alloc-1) { + object->current_alloc = length+1; + Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); + } + u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length); + Z_USTRVAL_P(object->current)[length] = 0; + Z_USTRLEN_P(object->current) = length; +} + +static int text_iter_brk_char_key(text_iter_obj* object TSRMLS_DC) +{ + return object->u.brk.index; +} + +static void text_iter_brk_char_next(text_iter_obj* object TSRMLS_DC) +{ + if (object->flags & ITER_REVERSE) { + if (object->u.brk.start != UBRK_DONE) { + object->u.brk.end = object->u.brk.start; + object->u.brk.start = ubrk_previous(object->u.brk.iter); + object->u.brk.index++; + } + } else { + if (object->u.brk.end != UBRK_DONE) { + object->u.brk.start = object->u.brk.end; + object->u.brk.end = ubrk_next(object->u.brk.iter); + object->u.brk.index++; + } + } +} + +static void text_iter_brk_char_rewind(text_iter_obj *object TSRMLS_DC) +{ + if (object->flags & ITER_REVERSE) { + object->u.brk.end = ubrk_last(object->u.brk.iter); + object->u.brk.start = ubrk_previous(object->u.brk.iter); + } else { + object->u.brk.start = ubrk_first(object->u.brk.iter); + object->u.brk.end = ubrk_next(object->u.brk.iter); + } + object->u.brk.index = 0; +} + +static text_iter_ops text_iter_brk_ops = { + text_iter_brk_char_valid, + text_iter_brk_char_current, + text_iter_brk_char_key, + text_iter_brk_char_next, + text_iter_brk_char_rewind, +}; + + /* Ops array */ static text_iter_ops* iter_ops[] = { &text_iter_cu_ops, &text_iter_cp_ops, &text_iter_cs_ops, + &text_iter_brk_ops, + &text_iter_brk_ops, + &text_iter_brk_ops, + &text_iter_brk_ops, }; /* Iterator Funcs */ @@ -376,6 +477,9 @@ if (intern->text) { 
efree(intern->text); } + if (intern->type >= ITER_CHARACTER && intern->u.brk.iter) { /* a UBreakIterator is opened for every type >= ITER_CHARACTER, so ITER_CHARACTER must be closed here too */ + ubrk_close(intern->u.brk.iter); + } zval_ptr_dtor(&intern->current); efree(object); } @@ -399,6 +503,7 @@ intern->current_alloc = 3; Z_USTRVAL_P(intern->current) = eumalloc(3); Z_USTRVAL_P(intern->current)[0] = 0; + Z_USTRLEN_P(intern->current) = 0; Z_TYPE_P(intern->current) = IS_UNICODE; retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)zend_objects_destroy_object, (zend_objects_free_object_storage_t) text_iterator_free_storage, NULL TSRMLS_CC); @@ -426,11 +531,11 @@ intern->text_len = text_len; if (ZEND_NUM_ARGS() > 1) { ti_type = flags & ITER_TYPE_MASK; - if (ti_type < ITER_TYPE_LAST) { - intern->type = ti_type; - } else { + if (ti_type < 0 || ti_type >= ITER_TYPE_LAST) { php_error(E_WARNING, "Invalid iterator type in TextIterator constructor"); + ti_type = ITER_CODE_POINT; } + intern->type = ti_type; intern->flags = flags; } @@ -438,6 +543,15 @@ intern->flags |= ITER_REVERSE; } + if (ti_type >= ITER_CHARACTER && ti_type < ITER_TYPE_LAST) { + UErrorCode status = U_ZERO_ERROR; + intern->u.brk.iter = ubrk_open(brk_type_map[ti_type - ITER_CHARACTER], UG(default_locale), text, text_len, &status); + if (!U_SUCCESS(status)) { + php_error(E_RECOVERABLE_ERROR, "Could not create UBreakIterator: %s", u_errorName(status)); + return; + } + } + iter_ops[intern->type]->rewind(intern TSRMLS_CC); } @@ -513,6 +627,10 @@ zend_declare_class_constant_long(text_iterator_ce, "CODE_UNIT", sizeof("CODE_UNIT")-1, ITER_CODE_UNIT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "CODE_POINT", sizeof("CODE_POINT")-1, ITER_CODE_POINT TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "COMB_SEQUENCE", sizeof("COMB_SEQUENCE")-1, ITER_COMB_SEQUENCE TSRMLS_CC); + zend_declare_class_constant_long(text_iterator_ce, "CHARACTER", sizeof("CHARACTER")-1, ITER_CHARACTER TSRMLS_CC); + zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD
TSRMLS_CC); + zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC); + zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC); } /*
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php