On Sun, Mar 09, 2014 at 04:11:40PM +0530, rsk1994 wrote: Thanks for this. I'm going to ignore the generated files and new test data, as they're uninteresting!
> diff --git a/build/get-entities/getlist.py b/build/get-entities/getlist.py > new file mode 100644 > index 0000000..a06120b > --- /dev/null > +++ b/build/get-entities/getlist.py What is this file for? It appears to be unused. If it is unnecessary, please remove it. > diff --git a/build/make-entities.pl b/build/make-entities.pl > index 7492052..f9bad7b 100644 > --- a/build/make-entities.pl > +++ b/build/make-entities.pl > @@ -4,8 +4,11 @@ > # http://www.opensource.org/licenses/mit-license.php > # Copyright 2010 Daniel Silverstone <[email protected]> > # John-Mark Bell <[email protected]> > +# Rupinder Singh Khokhar <[email protected]> > > use strict; > +use utf8; > +use open ":std", ":encoding(UTF-8)"; I'm not convinced this is necessary, as we shouldn't be emitting verbatim UTF-8 at any point. > use constant ENTITIES_FILE => 'build/Entities'; > use constant ENTITIES_INC => 'src/tokeniser/entities.inc'; > @@ -21,8 +24,32 @@ while (my $line = <INFILE>) { > next if ($line eq ''); > my @elements = split /\s+/, $line; > my $entity = shift @elements; > - my $code = shift @elements; > - $entities{$entity} = $code; > + my $len = 0; > + my $code = ""; > + while(@elements) { > + my $utf8_val = chr(hex(shift(@elements))); chr() does magical, unexpected, things for values in the range 0x80-0xff. pack("U", hex(shift(@elements)) is almost certainly better. See below, however. > + my $backslash_flag = 0; > + if($utf8_val eq "\n") { > + $utf8_val = "\\n"; > + $backslash_flag = 1; > + } > + if($utf8_val eq "\"" > + || $utf8_val eq "\'" > + || $utf8_val eq "\\") { > + $utf8_val = "\\".$utf8_val; > + $backslash_flag = 1; > + } > + $code = $code.$utf8_val; > + if($backslash_flag) { > + $len-=1; > + } > + } > + > + { > + use Encode; Unless there's good reason, please declare all module use at the start of the file. > + $len += length(Encode::encode_utf8($code)); > + } There are a couple of issues here: 1) This isn't robust to other control characters 2) In the non \n, ", ', \ case, we're emitting verbatim UTF-8, which many C compilers can't cope with in string literals. Thus, I think a different approach is needed. Something like this: while (@elements) { my $ucs4 = hex(shift(@entities)); my $utf8 = Encode::encode_utf8(pack('U', $ucs4)); $len += length($utf8); if ($ucs4 < 0x20 || $ucs4 == 0x22 || $ucs4 == 0x27 || $ucs4 == 0x5C || $ucs4 >= 0x7f) { # Unsafe, or non-ASCII: emit escaped for my $octet (unpack('C*', $utf8)) { $code = $code . sprintf("\\x%02x", $octet); } } else { # Safe: emit verbatim $code = $code . $utf8; } } > + $entities{$entity} = "(uint8_t*)\"$code\",".($len); This wants to be (const uint8_t *) > } > > close(INFILE); > @@ -113,9 +140,9 @@ foreach my $node (@nodelist) { > $split = ord($split) if ($split ne ''); > $split = 0 if ($split eq ''); > > - $value = "0" unless defined($value); > + $value = "0,0" unless defined($value); "NULL, 0". Using 0 where the type is a pointer is evil. > - $output .= "\t{ $split, $lt, $eq, $gt, $value },\n"; > + $output .= "\t{ $split, $lt, $eq, $gt, {$value} },\n"; > } > > $output .= "};\n\n"; > diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c > index ac47d80..4ff8625 100644 > --- a/src/tokeniser/entities.c > +++ b/src/tokeniser/entities.c > @@ -7,15 +7,20 @@ > > #include "utils/utils.h" > #include "tokeniser/entities.h" Can I have a linebreak here, please? > +/** > + * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER > + */ > +static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' }; > +static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) }; > > /** Node in our entity tree */ > typedef struct hubbub_entity_node { > - /* Do not reorder this without fixing make-entities.pl */ > + /* Do not reorder this without fixing make-entities.pl */ > uint8_t split; /**< Data to split on */ > int32_t lt; /**< Subtree for data less than split */ > int32_t eq; /**< Subtree for data equal to split */ > int32_t gt; /**< Subtree for data greater than split */ > - uint32_t value; /**< Data for this node */ > + hubbub_string value; /**< Data for this node */ > } hubbub_entity_node; > > #include "entities.inc" > @@ -38,7 +43,7 @@ typedef struct hubbub_entity_node { > * is found. > */ > static hubbub_error hubbub_entity_tree_search_step(uint8_t c, > - uint32_t *result, int32_t *context) > + hubbub_string *result, int32_t *context) > { > bool match = false; > int32_t p; > @@ -63,7 +68,7 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t > c, > match = true; > *result = dict[dict[p].eq].value; > p = dict[p].eq; > - } else if (dict[p].value != 0) { > + } else if (!(dict[p].value.ptr == NULL && > dict[p].value.len==0)) { In general, our style is not to use ! on its own. Instead, express this as: if (dict[p].value.ptr != NULL || dict[p].value.len != 0) However, surely if the pointer is NULL, the length is irrelevant? In which case, if (dict[p].value.ptr != NULL) will suffice. > match = true; > *result = dict[p].value; > p = dict[p].eq; > @@ -100,13 +105,13 @@ static hubbub_error > hubbub_entity_tree_search_step(uint8_t c, > * The location pointed to by ::result will be set to U+FFFD unless a match > * is found. > */ > -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, > +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result, > int32_t *context) > { > if (result == NULL) > return HUBBUB_BADPARM; > > - *result = 0xFFFD; > - > + *result = u_fffd_str; > + > return hubbub_entity_tree_search_step(c, result, context); > } > diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h > index 0703b37..a8b9bbf 100644 > --- a/src/tokeniser/entities.h > +++ b/src/tokeniser/entities.h > @@ -14,7 +14,7 @@ > #include <hubbub/functypes.h> > > /* Step-wise search for an entity in the dictionary */ > -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, > +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result, > int32_t *context); > > #endif > diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c > index a7e67a1..4ac6f0f 100644 > --- a/src/tokeniser/tokeniser.c > +++ b/src/tokeniser/tokeniser.c > @@ -4,6 +4,7 @@ > * http://www.opensource.org/licenses/mit-license.php > * Copyright 2007 John-Mark Bell <[email protected]> > * Copyright 2008 Andrew Sidwell <[email protected]> > + * Copyright 2014 Rupinder Singh Khokhar <[email protected]> > */ > #include <assert.h> > #include <stdbool.h> > @@ -128,7 +129,8 @@ typedef struct hubbub_tokeniser_context { > struct { > size_t offset; /**< Offset in buffer */ > uint32_t length; /**< Length of entity */ > - uint32_t codepoint; /**< UCS4 codepoint */ > + hubbub_string codepoint; /**< UTF-8 codepoint > + * for Named Entity*/ > bool complete; /**< True if match complete */ > > uint32_t poss_length; /**< Optimistic length > @@ -147,6 +149,8 @@ typedef struct hubbub_tokeniser_context { > * numeric entity value */ > hubbub_tokeniser_state return_state; /**< State we were > * called from */ > + uint32_t ucs4_tmp; /**<UCS-4 value for > + * numeric entities*/ Can we name this better, please? In fact, as you can't have a numeric entity at the same time as a named entity, I'd be inclined to do this: union { hubbub_string named; uint32_t numeric; } codepoint; However, I still dislike the need for more of the tokeniser than strictly necessary to know the difference between a named and numeric entity. I really think we would be better off with: hubbub_string codepoint; union { uint32_t ucs4; uint8_t numeric_buf[6]; } numeric_state; and then have hubbub_tokeniser_handle_numbered_entity() do the work to convert the UCS4 value to UTF8, and then populate the codepoint string values appropriately. This way, the difference between named and numeric entities is only relevant within hubbub_tokeniser_handle_numbered_entity(), which is the only code that knows about numeric entities, anyway. In general, implementation hiding is a good thing as it stops thinks making assumptions. J.
