commit 0e3d5f60213ba55935364c73422b373ac380f574
Author:     Laslo Hunhold <d...@frign.de>
AuthorDate: Wed Dec 8 17:47:58 2021 +0100
Commit:     Laslo Hunhold <d...@frign.de>
CommitDate: Wed Dec 8 17:55:56 2021 +0100

    Refactor data-generation and library structure
    
    What I never liked was the fact that you needed two heisenstates
    in grapheme_boundary() (one for the grapheme-proptable and one for
    the emoji-proptable). This unnecessarily complicated the handling a
    little bit, and there is still room for improvement.
    
    A new folder gen was created to contain the generation tools. The data
    folder from now on only contains data files.
    
    Now gen/util.c contains all the functions necessary to properly
    parse property files (and test files); you merely have to create an
    "order list" (e.g. in gen/grapheme.c and gen/grapheme-test.c) and
    are then good to go. This doesn't remove much code duplication, but
    it will come in handy in the future.
    
    Additionally, src/boundary.c was moved into src/grapheme.c so there's
    only one object file pulling in the data-table. This separation makes
    the structure of the program clearer and helps the linker discard
    unused library elements.
    
    The heisenstate was increased to 64 bits for future use.
    
    Signed-off-by: Laslo Hunhold <d...@frign.de>

diff --git a/LICENSE b/LICENSE
index 936b515..43ddeef 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 ISC-License
 
-Copyright 2019-2020 Laslo Hunhold <d...@frign.de>
+Copyright 2019-2021 Laslo Hunhold <d...@frign.de>
 
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
diff --git a/Makefile b/Makefile
index e76ae0d..a7a6eee 100644
--- a/Makefile
+++ b/Makefile
@@ -4,52 +4,52 @@
 
 include config.mk
 
-LIB = src/boundary src/codepoint src/grapheme src/util
-TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode
-DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test
+DATA =\
+       data/emoji-data.txt\
+       data/GraphemeBreakProperty.txt\
+       data/GraphemeBreakTest.txt
+GEN = gen/grapheme gen/grapheme-test
+LIB = src/codepoint src/grapheme src/util
+TEST = test/grapheme test/utf8-decode test/utf8-encode
 
 MAN3 = man/grapheme_bytelen.3
 MAN7 = man/libgrapheme.7
 
 all: libgrapheme.a libgrapheme.so
 
-data/emoji.h: data/emoji.txt data/emoji
-data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary
-data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt 
data/grapheme_boundary_test
-
-data/emoji.o: data/emoji.c config.mk data/datautil.h
-data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h
-data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk 
data/datautil.h
-data/datautil.o: data/datautil.c config.mk data/datautil.h
-src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h 
grapheme.h
+gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
+gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
+gen/util.o: gen/util.c config.mk gen/util.h
 src/codepoint.o: src/codepoint.c config.mk grapheme.h
-src/grapheme.o: src/grapheme.c config.mk grapheme.h
+src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
 src/util.o: src/util.c config.mk src/util.h
-test/grapheme_boundary.o: test/grapheme_boundary.c config.mk 
data/grapheme_boundary_test.h grapheme.h
+test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h
 test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h
 test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h
 
-data/emoji: data/emoji.o data/datautil.o
-data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o
-data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o
-test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a
+gen/grapheme: gen/grapheme.o gen/util.o
+gen/grapheme-test: gen/grapheme-test.o gen/util.o
+test/grapheme: test/grapheme.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o libgrapheme.a
 
-data/emoji.txt:
+gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme
+gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test
+
+data/emoji-data.txt:
        wget -O $@ 
https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
 
-data/grapheme_boundary.txt:
+data/GraphemeBreakProperty.txt:
        wget -O $@ 
https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
 
-data/grapheme_boundary_test.txt:
+data/GraphemeBreakTest.txt:
        wget -O $@ 
https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
 
-$(DATA:=.h):
-       $(@:.h=) < $(@:.h=.txt) > $@
+$(GEN):
+       $(CC) -o $@ $(LDFLAGS) $@.o gen/util.o
 
-$(DATA):
-       $(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o
+# Run each generator (the target name minus ".h") and capture its
+# standard output as the header. A failed run would otherwise leave a
+# truncated header that is newer than its prerequisites and therefore
+# looks up to date, so remove it and propagate the failure.
+$(GEN:=.h):
+       $(@:.h=) > $@ || { rm -f $@; exit 1; }
 
 $(TEST):
        $(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a
@@ -86,7 +86,7 @@ uninstall:
        rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
 
 clean:
-       rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) 
$(DATA) $(TEST) libgrapheme.a libgrapheme.so
+       rm -f $(GEN:=.h) $(GEN:=.o) $(GEN) gen/util.o $(LIB:=.o) $(TEST:=.o) 
$(TEST) libgrapheme.a libgrapheme.so
 
 clean-data:
-       rm -f $(DATA:=.txt)
+       rm -f $(DATA)
diff --git a/data/grapheme_boundary.txt b/data/GraphemeBreakProperty.txt
similarity index 100%
rename from data/grapheme_boundary.txt
rename to data/GraphemeBreakProperty.txt
diff --git a/data/grapheme_boundary_test.txt b/data/GraphemeBreakTest.txt
similarity index 100%
rename from data/grapheme_boundary_test.txt
rename to data/GraphemeBreakTest.txt
diff --git a/data/datautil.c b/data/datautil.c
deleted file mode 100644
index 84f059e..0000000
--- a/data/datautil.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-void
-parse_input(int (*process_line)(char **, size_t, char *))
-{
-       char *line = NULL, **field = NULL, *comment;
-       size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
-       ssize_t len;
-
-       while ((len = getline(&line, &linebufsize, stdin)) >= 0) {
-               /* remove trailing newline */
-               if (len > 0 && line[len - 1] == '\n') {
-                       line[len - 1] = '\0';
-                       len--;
-               }
-
-               /* skip empty lines and comment lines */
-               if (len == 0 || line[0] == '#') {
-                       continue;
-               }
-
-               /* tokenize line into fields */
-               for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
-                       /* extend field buffer, if necessary */
-                       if (++nfields > fieldbufsize) {
-                               if ((field = realloc(field, nfields *
-                                                    sizeof(*field))) == NULL) {
-                                       fprintf(stderr, "realloc: %s\n", 
strerror(errno));
-                                       exit(1);
-                               }
-                               fieldbufsize = nfields;
-                       }
-
-                       /* skip leading whitespace */
-                       while (line[i] == ' ') {
-                               i++;
-                       }
-
-                       /* set current position as field start */
-                       field[nfields - 1] = &line[i];
-
-                       /* continue until we reach ';' or '#' or end */
-                       while (line[i] != ';' && line[i] != '#' &&
-                              line[i] != '\0') {
-                               i++;
-                       }
-                       if (line [i] == '#') {
-                               /* set comment-variable for later */
-                               comment = &line[i + 1];
-                       }
-
-                       /* go back whitespace and terminate field there */
-                       if (i > 0) {
-                               for (j = i - 1; line[j] == ' '; j--)
-                                       ;
-                               line[j + 1] = '\0';
-                       } else {
-                               line[i] = '\0';
-                       }
-
-                       /* if comment is set, we are done */
-                       if (comment != NULL) {
-                               break;
-                       }
-               }
-
-               /* skip leading whitespace in comment */
-               while (comment != NULL && comment[0] == ' ') {
-                       comment++;
-               }
-
-               /* call line processing function */
-               if (process_line(field, nfields, comment)) {
-                       exit(1);
-               }
-       }
-
-       free(line);
-       free(field);
-}
-
-static int
-valid_hexstring(const char *str)
-{
-       const char *p = str;
-
-       while ((*p >= '0' && *p <= '9') ||
-              (*p >= 'a' && *p <= 'f') ||
-              (*p >= 'A' && *p <= 'F')) {
-               p++;
-       }
-
-       if (*p != '\0') {
-               fprintf(stderr, "invalid code point range '%s'\n", str);
-               return 0;
-       }
-
-       return 1;
-}
-
-int
-cp_parse(const char *str, uint32_t *cp)
-{
-       if (!valid_hexstring(str)) {
-               return 1;
-       }
-       *cp = strtol(str, NULL, 16);
-
-       return 0;
-}
-
-int
-range_parse(const char *str, struct range *range)
-{
-       char *p;
-
-       if ((p = strstr(str, "..")) == NULL) {
-               /* input has the form "XXXXXX" */
-               if (!valid_hexstring(str)) {
-                       return 1;
-               }
-               range->lower = range->upper = strtol(str, NULL, 16);
-       } else {
-               /* input has the form "XXXXXX..XXXXXX" */
-               *p = '\0';
-               p += 2;
-               if (!valid_hexstring(str) || !valid_hexstring(p)) {
-                       return 1;
-               }
-               range->lower = strtol(str, NULL, 16);
-               range->upper = strtol(p, NULL, 16);
-       }
-
-       return 0;
-}
-
-void
-range_list_append(struct range **range, size_t *nranges, const struct range 
*new)
-{
-       if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
-               /* we can merge with previous entry */
-               (*range)[*nranges - 1].upper = new->upper;
-       } else {
-               /* need to append new entry */
-               if ((*range = realloc(*range, (++(*nranges)) * 
sizeof(**range))) == NULL) {
-                       fprintf(stderr, "realloc: %s\n", strerror(errno));
-                       exit(1);
-               }
-               (*range)[*nranges - 1].lower = new->lower;
-               (*range)[*nranges - 1].upper = new->upper;
-       }
-}
diff --git a/data/datautil.h b/data/datautil.h
deleted file mode 100644
index c64e037..0000000
--- a/data/datautil.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#ifndef DATAUTIL_H
-#define DATAUTIL_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define LEN(x) (sizeof (x) / sizeof *(x))
-
-struct range {
-       uint32_t lower;
-       uint32_t upper;
-};
-
-void parse_input(int (*process_line)(char **, size_t, char *));
-int cp_parse(const char *, uint32_t *);
-int range_parse(const char *, struct range *);
-void range_list_append(struct range **, size_t *, const struct range *);
-
-#endif /* DATAUTIL_H */
diff --git a/data/emoji.txt b/data/emoji-data.txt
similarity index 100%
rename from data/emoji.txt
rename to data/emoji-data.txt
diff --git a/data/emoji.c b/data/emoji.c
deleted file mode 100644
index 8c6c3ce..0000000
--- a/data/emoji.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
-       char         *enumname;
-       char         *identifier;
-       struct range *table;
-       size_t        tablelen;
-} properties[] = {
-       {
-               /* extended pictographic */
-               .enumname   = "EMOJI_PROP_EXTPICT",
-               .identifier = "Extended_Pictographic",
-       },
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-       size_t i;
-       struct range r;
-
-       (void)comment;
-
-       if (nfields < 2) {
-               return 1;
-       }
-
-       for (i = 0; i < LEN(properties); i++) {
-               if (!strcmp(field[1], properties[i].identifier)) {
-                       if (range_parse(field[0], &r)) {
-                               return 1;
-                       }
-                       range_list_append(&(properties[i].table),
-                                         &(properties[i].tablelen), &r);
-                       break;
-               }
-       }
-
-       return 0;
-}
-
-int
-main(void)
-{
-       size_t i, j;
-
-       printf("/* Automatically generated by data/emo */\n"
-              "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
-       parse_input(process_line);
-
-       /* output enum */
-       printf("enum emoji_prop {\n");
-       for (i = 0; i < LEN(properties); i++) {
-               printf("\t%s,\n", properties[i].enumname);
-       }
-       printf("};\n\n");
-
-       /* output table */
-       printf("static const struct range_list emoji_prop[] = {\n");
-       for (i = 0; i < LEN(properties); i++) {
-               printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", 
properties[i].enumname);
-               for (j = 0; j < properties[i].tablelen; j++) {
-                       printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) 
},\n",
-                              properties[i].table[j].lower,
-                              properties[i].table[j].upper);
-               }
-               printf("\t\t},\n\t\t.len = %zu,\n\t},\n", 
properties[i].tablelen);
-       }
-       printf("};\n");
-
-       return 0;
-}
diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c
deleted file mode 100644
index 068c350..0000000
--- a/data/grapheme_boundary.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
-       char         *enumname;
-       char         *identifier;
-       struct range *table;
-       size_t        tablelen;
-} properties[] = {
-       {
-               /* carriage return */
-               .enumname   = "GB_PROP_CR",
-               .identifier = "CR",
-       },
-       {
-               /* line feed */
-               .enumname   = "GB_PROP_LF",
-               .identifier = "LF",
-       },
-       {
-               /* control character */
-               .enumname   = "GB_PROP_CONTROL",
-               .identifier = "Control",
-       },
-       {
-               /* grapheme extender */
-               .enumname   = "GB_PROP_EXTEND",
-               .identifier = "Extend",
-       },
-       {
-               /* zero width joiner */
-               .enumname   = "GB_PROP_ZWJ",
-               .identifier = "ZWJ",
-       },
-       {
-               /* regional indicator */
-               .enumname   = "GB_PROP_REGIONAL_INDICATOR",
-               .identifier = "Regional_Indicator",
-       },
-       {
-               /* prepend character */
-               .enumname   = "GB_PROP_PREPEND",
-               .identifier = "Prepend",
-       },
-       {
-               /* spacing mark */
-               .enumname   = "GB_PROP_SPACINGMARK",
-               .identifier = "SpacingMark",
-       },
-       {
-               /* hangul syllable type L */
-               .enumname   = "GB_PROP_L",
-               .identifier = "L",
-       },
-       {
-               /* hangul syllable type V */
-               .enumname   = "GB_PROP_V",
-               .identifier = "V",
-       },
-       {
-               /* hangul syllable type T */
-               .enumname   = "GB_PROP_T",
-               .identifier = "T",
-       },
-       {
-               /* hangul syllable type LV */
-               .enumname   = "GB_PROP_LV",
-               .identifier = "LV",
-       },
-       {
-               /* hangul syllable type LVT */
-               .enumname   = "GB_PROP_LVT",
-               .identifier = "LVT",
-       },
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-       size_t i;
-       struct range r;
-
-       (void)comment;
-
-       if (nfields < 2) {
-               return 1;
-       }
-
-       for (i = 0; i < LEN(properties); i++) {
-               if (!strcmp(field[1], properties[i].identifier)) {
-                       if (range_parse(field[0], &r)) {
-                               return 1;
-                       }
-                       range_list_append(&(properties[i].table),
-                                         &(properties[i].tablelen), &r);
-                       break;
-               }
-       }
-
-       return 0;
-}
-
-int
-main(void)
-{
-       size_t i, j;
-
-       printf("/* Automatically generated by data/gbp */\n"
-              "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
-       parse_input(process_line);
-
-       /* output enum */
-       printf("enum gb_prop {\n");
-       for (i = 0; i < LEN(properties); i++) {
-               printf("\t%s,\n", properties[i].enumname);
-       }
-       printf("};\n\n");
-
-       /* output table */
-       printf("static const struct range_list gb_prop[] = {\n");
-       for (i = 0; i < LEN(properties); i++) {
-               printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", 
properties[i].enumname);
-               for (j = 0; j < properties[i].tablelen; j++) {
-                       printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) 
},\n",
-                              properties[i].table[j].lower,
-                              properties[i].table[j].upper);
-               }
-               printf("\t\t},\n\t\t.len = %zu,\n\t},\n", 
properties[i].tablelen);
-       }
-       printf("};\n");
-
-       return 0;
-}
diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c
deleted file mode 100644
index 2f3d6b4..0000000
--- a/data/grapheme_boundary_test.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-struct break_test {
-       uint32_t *cp;
-       size_t cplen;
-       size_t *len;
-       size_t lenlen;
-       char *descr;
-};
-
-static struct break_test *test = NULL;
-static size_t ntests = 0;
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
-       struct break_test *t;
-       size_t i;
-       char *token;
-
-       if (nfields < 1) {
-               return 1;
-       }
-
-       /* append new testcase and initialize with zeroes */
-       if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) {
-               fprintf(stderr, "realloc: %s\n", strerror(errno));
-               return 1;
-       }
-       t = &test[ntests - 1];
-       memset(t, 0, sizeof(*t));
-
-       /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
-       for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
-            token = strtok(NULL, " ")) {
-               if (i % 2 == 0) {
-                       /* delimiter */
-                       if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
-                               /*
-                                * '÷' indicates a breakpoint,
-                                * the current length is done; allocate
-                                * a new length field and set it to 0
-                                */
-                               if ((t->len = realloc(t->len,
-                                    ++t->lenlen * sizeof(*t->len))) == NULL) {
-                                       fprintf(stderr, "realloc: %s\n",
-                                               strerror(errno));
-                                       return 1;
-                               }
-                               t->len[t->lenlen - 1] = 0;
-                       } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
-                               /*
-                                * '×' indicates a non-breakpoint, do nothing
-                                */
-                       } else {
-                               fprintf(stderr, "malformed delimiter '%s'\n",
-                                       token);
-                               return 1;
-                       }
-               } else {
-                       /* add code point to cp-array */
-                       if ((t->cp = realloc(t->cp, ++t->cplen *
-                                            sizeof(*t->cp))) == NULL) {
-                               fprintf(stderr, "realloc: %s\n", 
strerror(errno));
-                               return 1;
-                       }
-                       if (cp_parse(token, &t->cp[t->cplen - 1])) {
-                               return 1;
-                       }
-                       if (t->lenlen > 0) {
-                               t->len[t->lenlen - 1]++;
-                       }
-               }
-       }
-       if (t->len[t->lenlen - 1] == 0) {
-               /* we allocated one more length than we needed */
-               t->lenlen--;
-       }
-
-       /* store comment */
-       if ((test[ntests - 1].descr = strdup(comment)) == NULL) {
-               fprintf(stderr, "strdup: %s\n", strerror(errno));
-               return 1;
-       }
-
-       return 0;
-}
-
-int
-main(void)
-{
-       size_t i, j;
-
-       printf("/* Automatically generated by data/gbt */\n"
-              "#include <stdint.h>\n#include <stddef.h>\n\n");
-
-       parse_input(process_line);
-
-       printf("static const struct break_test {\n\tuint32_t *cp;\n"
-              "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
-              "\tchar *descr;\n} t[] = {\n");
-       for (i = 0; i < ntests; i++) {
-               printf("\t{\n");
-
-               printf("\t\t.cp     = (uint32_t[]){");
-               for (j = 0; j < test[i].cplen; j++) {
-                       printf(" UINT32_C(0x%06X)", test[i].cp[j]);
-                       if (j + 1 < test[i].cplen) {
-                               putchar(',');
-                       }
-               }
-               printf(" },\n");
-               printf("\t\t.cplen  = %zu,\n", test[i].cplen);
-
-               printf("\t\t.len    = (size_t[]){");
-               for (j = 0; j < test[i].lenlen; j++) {
-                       printf(" %zu", test[i].len[j]);
-                       if (j + 1 < test[i].lenlen) {
-                               putchar(',');
-                       }
-               }
-               printf(" },\n");
-               printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
-
-               printf("\t\t.descr  = \"%s\",\n", test[i].descr);
-
-               printf("\t},\n");
-       }
-       printf("};\n");
-
-       return 0;
-}
diff --git a/gen/grapheme-test.c b/gen/grapheme-test.c
new file mode 100644
index 0000000..e05dae6
--- /dev/null
+++ b/gen/grapheme-test.c
@@ -0,0 +1,18 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+/*
+ * Parse data/GraphemeBreakTest.txt into a list of segment tests and
+ * hand that list to segment_test_list_print() under the identifier
+ * "grapheme_test", together with this program's name (argv[0]).
+ */
+int
+main(int argc, char *argv[])
+{
+       size_t ntests = 0;
+       struct segment_test *tests = NULL;
+
+       /* the argument count is not needed */
+       (void)argc;
+
+       segment_test_list_parse("data/GraphemeBreakTest.txt", &tests,
+                               &ntests);
+       segment_test_list_print(tests, ntests, "grapheme_test", argv[0]);
+
+       return 0;
+}
diff --git a/gen/grapheme.c b/gen/grapheme.c
new file mode 100644
index 0000000..232a156
--- /dev/null
+++ b/gen/grapheme.c
@@ -0,0 +1,92 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+#define FILE_EMOJI    "data/emoji-data.txt"
+#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
+
+/*
+ * The list of properties handed to property_list_parse() and
+ * property_list_print() in main(). Each entry pairs an enum name
+ * (enumname) with a property identifier (identifier) and the data
+ * file (fname) associated with it: Extended_Pictographic refers to
+ * the emoji data file, every other entry to the grapheme break
+ * property file.
+ * NOTE(review): the entry order presumably determines the order of
+ * the generated enum -- confirm against property_list_print().
+ */
+static struct property segment_property[] = {
+       {
+               .enumname   = "GRAPHEME_PROP_CONTROL",
+               .identifier = "Control",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_CR",
+               .identifier = "CR",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_EXTEND",
+               .identifier = "Extend",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC",
+               .identifier = "Extended_Pictographic",
+               .fname      = FILE_EMOJI,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_HANGUL_L",
+               .identifier = "L",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_HANGUL_V",
+               .identifier = "V",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_HANGUL_T",
+               .identifier = "T",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_HANGUL_LV",
+               .identifier = "LV",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_HANGUL_LVT",
+               .identifier = "LVT",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_LF",
+               .identifier = "LF",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_PREPEND",
+               .identifier = "Prepend",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_REGIONAL_INDICATOR",
+               .identifier = "Regional_Indicator",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_SPACINGMARK",
+               .identifier = "SpacingMark",
+               .fname      = FILE_GRAPHEME,
+       },
+       {
+               .enumname   = "GRAPHEME_PROP_ZWJ",
+               .identifier = "ZWJ",
+               .fname      = FILE_GRAPHEME,
+       },
+};
+
+/*
+ * Parse the properties listed in segment_property and print them under
+ * the identifier "grapheme_prop", passing along this program's name
+ * (argv[0]).
+ */
+int
+main(int argc, char *argv[])
+{
+       const size_t numprops = LEN(segment_property);
+
+       /* the argument count is not needed */
+       (void)argc;
+
+       property_list_parse(segment_property, numprops);
+       property_list_print(segment_property, numprops, "grapheme_prop",
+                           argv[0]);
+
+       return 0;
+}
diff --git a/gen/util.c b/gen/util.c
new file mode 100644
index 0000000..ec5afb7
--- /dev/null
+++ b/gen/util.c
@@ -0,0 +1,384 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+/* payload handed to property_list_callback() via parse_file_with_callback() */
+struct property_list_payload
+{
+       struct property *prop; /* property list the parsed ranges are appended to */
+       size_t numprops;       /* number of entries in prop */
+};
+
+/* payload handed to segment_test_callback() via parse_file_with_callback() */
+struct segment_test_payload
+{
+       struct segment_test **st; /* test-case array, grown by one entry per parsed line */
+       size_t *numsegtests;      /* number of entries in *st */
+};
+
+/*
+ * Check that str is a non-empty string made up exclusively of
+ * hexadecimal digits (0-9, a-f, A-F).
+ *
+ * Returns 1 if so; otherwise prints a diagnostic to stderr and
+ * returns 0.
+ */
+static int
+valid_hexstring(const char *str)
+{
+       const char *p = str;
+
+       while ((*p >= '0' && *p <= '9') ||
+              (*p >= 'a' && *p <= 'f') ||
+              (*p >= 'A' && *p <= 'F')) {
+               p++;
+       }
+
+       /*
+        * reject trailing garbage and the empty string, which
+        * strtol()/strtoul() would otherwise silently turn into 0
+        */
+       if (*p != '\0' || p == str) {
+               fprintf(stderr, "valid_hexstring: Invalid code point range '%s'\n", str);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Parse the hexadecimal string str into the code point *cp.
+ *
+ * Returns 0 on success and 1 if str is not a valid hexadecimal
+ * string or lies beyond U+10FFFF, the last Unicode code point.
+ */
+static int
+cp_parse(const char *str, uint32_t *cp)
+{
+       unsigned long val;
+
+       if (!valid_hexstring(str)) {
+               return 1;
+       }
+
+       /* strtoul() rather than strtol(), as the target type is unsigned */
+       val = strtoul(str, NULL, 16);
+       if (val > 0x10FFFF) {
+               fprintf(stderr, "cp_parse: Code point '%s' out of range\n", str);
+               return 1;
+       }
+       *cp = (uint32_t)val;
+
+       return 0;
+}
+
+/*
+ * Parse str, which has the form "XXXXXX" or "XXXXXX..XXXXXX", into
+ * range. Note that the ".." separator is overwritten in place, so str
+ * must point to writable memory despite its const qualifier.
+ *
+ * Returns 0 on success and 1 on malformed input.
+ */
+static int
+range_parse(const char *str, struct range *range)
+{
+       char *p;
+
+       if ((p = strstr(str, "..")) == NULL) {
+               /* input has the form "XXXXXX" */
+               if (!valid_hexstring(str)) {
+                       return 1;
+               }
+               /* strtoul() rather than strtol(), as the bounds are unsigned */
+               range->lower = range->upper = strtoul(str, NULL, 16);
+       } else {
+               /* input has the form "XXXXXX..XXXXXX" */
+               *p = '\0';
+               p += 2;
+               if (!valid_hexstring(str) || !valid_hexstring(p)) {
+                       return 1;
+               }
+               range->lower = strtoul(str, NULL, 16);
+               range->upper = strtoul(p, NULL, 16);
+
+               /* an inverted range can never match any code point */
+               if (range->lower > range->upper) {
+                       fprintf(stderr, "range_parse: Inverted range '%s..%s'\n", str, p);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Append *new to the dynamically grown list *range holding *nranges
+ * entries. If the previous entry's upper bound equals new's lower
+ * bound, that entry is extended instead of a new one being added.
+ * Terminates the program if memory runs out.
+ */
+void
+range_list_append(struct range **range, size_t *nranges, const struct range *new)
+{
+       struct range *last;
+
+       if (*nranges > 0) {
+               last = &(*range)[*nranges - 1];
+               if (last->upper == new->lower) {
+                       /* the previous entry can absorb the new one */
+                       last->upper = new->upper;
+                       return;
+               }
+       }
+
+       /* no merge possible, grow the list by one entry */
+       (*nranges)++;
+       *range = realloc(*range, *nranges * sizeof(**range));
+       if (*range == NULL) {
+               fprintf(stderr, "realloc: %s\n", strerror(errno));
+               exit(1);
+       }
+       (*range)[*nranges - 1] = *new;
+}
+
+/*
+ * Read the file fname line by line, skipping empty lines and lines
+ * starting with '#'. Every other line is split at ';' into fields
+ * stripped of surrounding spaces; text following a '#' is passed on
+ * as the comment (NULL if there is none). callback is invoked once per
+ * line with fname, the field array, the number of fields, the comment
+ * and payload. The field and comment pointers refer to a line buffer
+ * that is reused for the next line and freed on return.
+ *
+ * Terminates the program if the file cannot be opened or read, memory
+ * runs out or callback returns non-zero.
+ */
+void
+parse_file_with_callback(char *fname, int (*callback)(char *, char **, size_t, char *, void *), void *payload)
+{
+       FILE *fp;
+       char *line = NULL, **field = NULL, *comment;
+       size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, start;
+       ssize_t len;
+
+       /* open file */
+       if (!(fp = fopen(fname, "r"))) {
+               fprintf(stderr, "fopen '%s': %s\n", fname,
+                       strerror(errno));
+               exit(1);
+       }
+
+       while ((len = getline(&line, &linebufsize, fp)) >= 0) {
+               /* remove trailing newline */
+               if (len > 0 && line[len - 1] == '\n') {
+                       line[len - 1] = '\0';
+                       len--;
+               }
+
+               /* skip empty lines and comment lines */
+               if (len == 0 || line[0] == '#') {
+                       continue;
+               }
+
+               /* tokenize line into fields */
+               for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
+                       /* extend field buffer, if necessary */
+                       if (++nfields > fieldbufsize) {
+                               if ((field = realloc(field, nfields *
+                                     sizeof(*field))) == NULL) {
+                                       fprintf(stderr, "realloc: %s\n", strerror(errno));
+                                       exit(1);
+                               }
+                               fieldbufsize = nfields;
+                       }
+
+                       /* skip leading whitespace */
+                       while (line[i] == ' ') {
+                               i++;
+                       }
+
+                       /* set current position as field start */
+                       start = i;
+                       field[nfields - 1] = &line[start];
+
+                       /* continue until we reach ';' or '#' or end */
+                       while (line[i] != ';' && line[i] != '#' &&
+                              line[i] != '\0') {
+                               i++;
+                       }
+                       if (line[i] == '#') {
+                               /* set comment-variable for later */
+                               comment = &line[i + 1];
+                       }
+
+                       /*
+                        * go back over trailing whitespace, but never
+                        * beyond the field start (which would leave an
+                        * all-blank field unterminated or index below
+                        * zero), and terminate the field there
+                        */
+                       for (j = i; j > start && line[j - 1] == ' '; j--)
+                               ;
+                       line[j] = '\0';
+
+                       /* if comment is set, we are done */
+                       if (comment != NULL) {
+                               break;
+                       }
+               }
+
+               /* skip leading whitespace in comment */
+               while (comment != NULL && comment[0] == ' ') {
+                       comment++;
+               }
+
+               /* call callback function */
+               if (callback(fname, field, nfields, comment, payload)) {
+                       fprintf(stderr, "parse_file_with_callback: Malformed input.\n");
+                       exit(1);
+               }
+       }
+
+       /* getline() returns -1 on both end-of-file and read errors */
+       if (ferror(fp)) {
+               fprintf(stderr, "getline '%s': %s\n", fname, strerror(errno));
+               exit(1);
+       }
+
+       /* release the line and field buffers and the file handle */
+       free(line);
+       free(field);
+       fclose(fp);
+}
+
+/*
+ * Callback for parse_file_with_callback(): if the line's second field
+ * names a property in the list referenced by payload that is read from
+ * fname, parse the first field as a code point range and append it to
+ * the first such property's table.
+ *
+ * Returns 0 on success (including lines naming no listed property)
+ * and 1 if there are fewer than two fields or the range is malformed.
+ */
+int
+property_list_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+       const struct property_list_payload *pl = payload;
+       struct property *cand;
+       struct range r;
+       size_t idx;
+
+       (void)comment;
+
+       if (nfields < 2) {
+               return 1;
+       }
+
+       for (idx = 0; idx < pl->numprops; idx++) {
+               cand = &pl->prop[idx];
+
+               /* not the property (or not the file) we are looking for */
+               if (strcmp(field[1], cand->identifier) != 0 ||
+                   strcmp(fname, cand->fname) != 0) {
+                       continue;
+               }
+
+               /* only the first matching property receives the range */
+               if (range_parse(field[0], &r)) {
+                       return 1;
+               }
+               range_list_append(&cand->table, &cand->tablelen, &r);
+               return 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Fill the tables of all numprops properties in prop by parsing the
+ * data file each of them names. Properties sharing a file are filled
+ * in a single pass over that file.
+ */
+void
+property_list_parse(struct property *prop, size_t numprops)
+{
+       struct property_list_payload pl = { .prop = prop, .numprops = numprops };
+       size_t i, j;
+
+       /* make sure to parse each file only once */
+       for (i = 0; i < numprops; i++) {
+               /*
+                * A file has been parsed if an earlier property names
+                * it. Comparing file names instead of checking for a
+                * non-empty table keeps a property without any matches
+                * from triggering a second pass, which would duplicate
+                * the ranges of every other property in that file.
+                */
+               for (j = 0; j < i; j++) {
+                       if (!strcmp(prop[i].fname, prop[j].fname)) {
+                               break;
+                       }
+               }
+               if (j < i) {
+                       /* property's file was already parsed */
+                       continue;
+               }
+
+               parse_file_with_callback(prop[i].fname, property_list_callback, &pl);
+       }
+}
+
+/*
+ * Print a C header to stdout that declares "enum <identifier>" with
+ * one enumerator per property and a table "<identifier>[]" of range
+ * lists indexed by those enumerators. progname is named in the
+ * leading comment.
+ */
+void
+property_list_print(const struct property *prop, size_t numprops,
+                    const char *identifier, const char *progname)
+{
+       const struct property *p;
+       const struct range *r;
+       size_t n, k;
+
+       printf("/* Automatically generated by %s */\n"
+              "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n",
+              progname);
+
+       /* enum with one enumerator per property */
+       printf("enum %s {\n", identifier);
+       for (n = 0; n < numprops; n++) {
+               p = &prop[n];
+               printf("\t%s,\n", p->enumname);
+       }
+       printf("};\n\n");
+
+       /* table with one range list per property */
+       printf("static const struct range_list %s[] = {\n", identifier);
+       for (n = 0; n < numprops; n++) {
+               p = &prop[n];
+
+               printf("\t[%s] = {\n\t\t.data = (struct range[]){\n",
+                      p->enumname);
+               for (k = 0; k < p->tablelen; k++) {
+                       r = &p->table[k];
+                       printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+                              r->lower, r->upper);
+               }
+               printf("\t\t},\n\t\t.len = %zu,\n\t},\n", p->tablelen);
+       }
+       printf("};\n");
+}
+
+/*
+ * Callback for parse_file_with_callback() that turns one line of a
+ * segmentation test file into a new entry appended to the test list
+ * referenced by payload (a struct segment_test_payload).
+ *
+ * field[0] has the form "<÷|×> <cp> <÷|×> ... <cp> <÷|×>", where '÷'
+ * marks a break and '×' a non-break. The entry receives the code
+ * points, the lengths of the segments between breaks and a copy of
+ * comment (or an empty string if there is none) as its description.
+ *
+ * Returns 0 on success and 1 on malformed input or if memory runs out.
+ */
+int
+segment_test_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+       struct segment_test *t, **test = ((struct segment_test_payload *)payload)->st;
+       size_t i, *ntests = ((struct segment_test_payload *)payload)->numsegtests;
+       char *token;
+
+       (void)fname;
+
+       if (nfields < 1) {
+               return 1;
+       }
+
+       /* append new testcase and initialize with zeroes */
+       if ((*test = realloc(*test, ++(*ntests) * sizeof(**test))) == NULL) {
+               fprintf(stderr, "realloc: %s\n", strerror(errno));
+               return 1;
+       }
+       t = &(*test)[*ntests - 1];
+       memset(t, 0, sizeof(*t));
+
+       /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
+       for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
+            token = strtok(NULL, " ")) {
+               if (i % 2 == 0) {
+                       /* delimiter */
+                       if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
+                               /*
+                                * '÷' indicates a breakpoint,
+                                * the current length is done; allocate
+                                * a new length field and set it to 0
+                                */
+                               if ((t->len = realloc(t->len,
+                                    ++t->lenlen * sizeof(*t->len))) == NULL) {
+                                       fprintf(stderr, "realloc: %s\n",
+                                               strerror(errno));
+                                       return 1;
+                               }
+                               t->len[t->lenlen - 1] = 0;
+                       } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
+                               /*
+                                * '×' indicates a non-breakpoint, do nothing
+                                */
+                       } else {
+                               fprintf(stderr, "malformed delimiter '%s'\n",
+                                       token);
+                               return 1;
+                       }
+               } else {
+                       /* add code point to cp-array */
+                       if ((t->cp = realloc(t->cp, ++t->cplen *
+                                            sizeof(*t->cp))) == NULL) {
+                               fprintf(stderr, "realloc: %s\n", strerror(errno));
+                               return 1;
+                       }
+                       if (cp_parse(token, &t->cp[t->cplen - 1])) {
+                               return 1;
+                       }
+                       if (t->lenlen > 0) {
+                               t->len[t->lenlen - 1]++;
+                       }
+               }
+       }
+
+       /*
+        * the trailing '÷' allocated one more length than we needed;
+        * t->len is NULL if the line contained no '÷' at all, so it
+        * must not be accessed unless lenlen is positive
+        */
+       if (t->lenlen > 0 && t->len[t->lenlen - 1] == 0) {
+               t->lenlen--;
+       }
+
+       /* store comment, which is NULL for lines without a '#' */
+       if ((t->descr = strdup((comment != NULL) ? comment : "")) == NULL) {
+               fprintf(stderr, "strdup: %s\n", strerror(errno));
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Parse the segmentation test file fname into an array stored in
+ * *st, with the number of test cases stored in *numsegtests.
+ */
+void
+segment_test_list_parse(char *fname, struct segment_test **st, size_t *numsegtests)
+{
+       struct segment_test_payload pl;
+
+       /* start out with an empty list */
+       *st = NULL;
+       *numsegtests = 0;
+
+       pl.st = st;
+       pl.numsegtests = numsegtests;
+
+       parse_file_with_callback(fname, segment_test_callback, &pl);
+}
+
+/*
+ * Print a C header to stdout that defines the array "<identifier>[]"
+ * holding the numsegtests test cases in st. progname is named in the
+ * leading comment.
+ */
+void
+segment_test_list_print(struct segment_test *st, size_t numsegtests,
+                        const char *identifier, const char *progname)
+{
+       const struct segment_test *t;
+       size_t n, k;
+
+       printf("/* Automatically generated by %s */\n"
+              "#include <stdint.h>\n#include <stddef.h>\n\n", progname);
+
+       printf("static const struct {\n\tuint32_t *cp;\n"
+              "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
+              "\tchar *descr;\n} %s[] = {\n", identifier);
+       for (n = 0; n < numsegtests; n++) {
+               t = &st[n];
+
+               printf("\t{\n");
+
+               /* code points, separated by commas */
+               printf("\t\t.cp     = (uint32_t[]){");
+               for (k = 0; k < t->cplen; k++) {
+                       if (k > 0) {
+                               putchar(',');
+                       }
+                       printf(" UINT32_C(0x%06X)", t->cp[k]);
+               }
+               printf(" },\n");
+               printf("\t\t.cplen  = %zu,\n", t->cplen);
+
+               /* segment lengths, separated by commas */
+               printf("\t\t.len    = (size_t[]){");
+               for (k = 0; k < t->lenlen; k++) {
+                       if (k > 0) {
+                               putchar(',');
+                       }
+                       printf(" %zu", t->len[k]);
+               }
+               printf(" },\n");
+               printf("\t\t.lenlen = %zu,\n", t->lenlen);
+
+               printf("\t\t.descr  = \"%s\",\n", t->descr);
+
+               printf("\t},\n");
+       }
+       printf("};\n");
+}
+
+
diff --git a/gen/util.h b/gen/util.h
new file mode 100644
index 0000000..9461416
--- /dev/null
+++ b/gen/util.h
@@ -0,0 +1,37 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* number of elements of the array x (x must be an array, not a pointer) */
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+/* range of code points; a single code point has lower equal to upper */
+struct range {
+       uint32_t lower; /* first code point of the range */
+       uint32_t upper; /* last code point of the range */
+};
+
+/* a property to extract from a data file, along with its parsed ranges */
+struct property {
+       char         *enumname;   /* enumerator name printed for this property */
+       char         *identifier; /* name matched against the second field of a line */
+       char         *fname;      /* data file the property is read from */
+       struct range *table;      /* parsed ranges, filled while parsing */
+       size_t        tablelen;   /* number of entries in table */
+};
+
+/* a single parsed segmentation test case */
+struct segment_test {
+       uint32_t *cp;   /* code points of the test string */
+       size_t cplen;   /* number of entries in cp */
+       size_t *len;    /* segment lengths, counted in code points */
+       size_t lenlen;  /* number of entries in len */
+       char *descr;    /* copy of the test line's comment */
+};
+
+/* parse the data files of a property list and print the resulting C header */
+void property_list_parse(struct property *, size_t);
+void property_list_print(const struct property *, size_t, const char *, const char *);
+
+/* parse a segmentation test file and print the resulting C header */
+void segment_test_list_parse(char *, struct segment_test **, size_t *);
+void segment_test_list_print(struct segment_test *, size_t, const char *, const char *);
+
+#endif /* UTIL_H */
diff --git a/src/boundary.c b/src/boundary.c
deleted file mode 100644
index f1c03d2..0000000
--- a/src/boundary.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "../data/emoji.h"
-#include "../data/grapheme_boundary.h"
-
-enum {
-       GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
-       GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj 
sequence */
-};
-
-static int
-cp_cmp(const void *a, const void *b)
-{
-       uint32_t cp = *(uint32_t *)a;
-       uint32_t *range = (uint32_t *)b;
-
-       return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
-}
-
-static int
-has_property(uint32_t cp, struct heisenstate *cpstate,
-             const struct range_list *proptable, int property)
-{
-       if (heisenstate_get(cpstate, property) == -1) {
-               /* state undetermined, make a lookup and set it */
-               heisenstate_set(cpstate, property, bsearch(&cp,
-                               proptable[property].data,
-                               proptable[property].len,
-                               sizeof(*proptable[property].data),
-                               cp_cmp) ? 1 : 0);
-       }
-
-       return heisenstate_get(cpstate, property);
-}
-
-int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
-{
-       struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 };
-       int s;
-
-       /* skip printable ASCII */
-       if ((a >= 0x20 && a <= 0x7E) &&
-           (b >= 0x20 && b <= 0x7E)) {
-               return 1;
-       }
-
-       /* set internal state based on given state-pointer */
-       s = (state != NULL) ? *state : 0;
-
-       /*
-        * Apply grapheme cluster breaking algorithm (UAX #29), see
-        * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
-        */
-
-       /*
-        * update state
-        */
-       if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
-               if (has_property(a, &gb[0], gb_prop, 
GB_PROP_REGIONAL_INDICATOR)) {
-                       /* one more RI is on the left side of the seam */
-                       s ^= GRAPHEME_STATE_RI_ODD;
-               } else {
-                       /* an RI appeared on the right side but the left
-                          side is not an RI, reset state (0 is even) */
-                       s &= ~GRAPHEME_STATE_RI_ODD;
-               }
-       }
-       if (!(*state & GRAPHEME_STATE_EMOJI) &&
-           ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-             has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-             (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
-             has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
-               s |= GRAPHEME_STATE_EMOJI;
-       } else if ((*state & GRAPHEME_STATE_EMOJI) &&
-                  ((has_property(a, &gb[0],    gb_prop,    GB_PROP_ZWJ) &&
-                    has_property(b, &emoji[1], emoji_prop, 
EMOJI_PROP_EXTPICT)) ||
-                   (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
-                    has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)) ||
-                   (has_property(a, &gb[0],    gb_prop,    GB_PROP_EXTEND) &&
-                    has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-                   (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) 
&&
-                    has_property(b, &gb[1],    gb_prop,    GB_PROP_ZWJ)) ||
-                   (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) 
&&
-                    has_property(b, &gb[1],    gb_prop,    GB_PROP_EXTEND)))) {
-               /* GRAPHEME_STATE_EMOJI remains */
-       } else {
-               s &= ~GRAPHEME_STATE_EMOJI;
-       }
-
-       /* write updated state to state-pointer, if given */
-       if (state != NULL) {
-               *state = s;
-       }
-
-       /*
-        * apply rules
-        */
-
-       /* skip GB1 and GB2, as they are never satisfied here */
-
-       /* GB3 */
-       if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) &&
-           has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
-               return 0;
-       }
-
-       /* GB4 */
-       if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) ||
-           has_property(a, &gb[0], gb_prop, GB_PROP_CR) ||
-           has_property(a, &gb[0], gb_prop, GB_PROP_LF)) {
-               return 1;
-       }
-
-       /* GB5 */
-       if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) ||
-           has_property(b, &gb[1], gb_prop, GB_PROP_CR) ||
-           has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
-               return 1;
-       }
-
-       /* GB6 */
-       if (has_property(a, &gb[0], gb_prop, GB_PROP_L) &&
-           (has_property(b, &gb[1], gb_prop, GB_PROP_L) ||
-            has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
-            has_property(b, &gb[1], gb_prop, GB_PROP_LV) ||
-            has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) {
-               return 0;
-       }
-
-       /* GB7 */
-       if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) ||
-            has_property(a, &gb[0], gb_prop, GB_PROP_V)) &&
-           (has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
-            has_property(b, &gb[1], gb_prop, GB_PROP_T))) {
-               return 0;
-       }
-
-       /* GB8 */
-       if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) ||
-            has_property(a, &gb[0], gb_prop, GB_PROP_T)) &&
-           has_property(b, &gb[1], gb_prop, GB_PROP_T)) {
-               return 0;
-       }
-
-       /* GB9 */
-       if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) ||
-           has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) {
-               return 0;
-       }
-
-       /* GB9a */
-       if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) {
-               return 0;
-       }
-
-       /* GB9b */
-       if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) {
-               return 0;
-       }
-
-       /* GB11 */
-       if ((s & GRAPHEME_STATE_EMOJI) &&
-           has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
-           has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) {
-               return 0;
-       }
-
-       /* GB12/GB13 */
-       if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
-           has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
-           (s & GRAPHEME_STATE_RI_ODD)) {
-               return 0;
-       }
-
-       /* GB999 */
-       return 1;
-}
diff --git a/src/grapheme.c b/src/grapheme.c
index 8577038..068f91b 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
@@ -2,8 +2,158 @@
 #include <stddef.h>
 #include <stdlib.h>
 
+#include "../gen/grapheme.h"
 #include "../grapheme.h"
 
+/* flags kept in the state value passed between grapheme_boundary() calls */
+enum {
+       GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
+       GRAPHEME_STATE_EMOJI  = 1 << 1, /* within emoji modifier or zwj sequence */
+};
+
+/*
+ * Determine whether there is a grapheme cluster break between the
+ * adjacent code points a and b.
+ *
+ * state may be NULL. If it is not, it carries the GRAPHEME_STATE flags
+ * between successive calls and is updated in place.
+ *
+ * Returns 1 if there is a break between a and b and 0 otherwise.
+ */
+int
+grapheme_boundary(uint32_t a, uint32_t b, int *state)
+{
+       struct heisenstate prop[2] = { 0 };
+       int s;
+
+       /* skip printable ASCII */
+       if ((a >= 0x20 && a <= 0x7E) &&
+           (b >= 0x20 && b <= 0x7E)) {
+               return 1;
+       }
+
+       /* set internal state based on given state-pointer */
+       s = (state != NULL) ? *state : 0;
+
+       /*
+        * Apply grapheme cluster breaking algorithm (UAX #29), see
+        * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+        */
+
+       /*
+        * update state
+        */
+       if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+               if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+                       /* one more RI is on the left side of the seam */
+                       s ^= GRAPHEME_STATE_RI_ODD;
+               } else {
+                       /* an RI appeared on the right side but the left
+                          side is not an RI, reset state (0 is even) */
+                       s &= ~GRAPHEME_STATE_RI_ODD;
+               }
+       }
+
+       /*
+        * The emoji flag is read from s rather than *state, as state
+        * may be NULL. Only the RI flag of s has been touched so far,
+        * so its emoji flag still equals the one passed in.
+        */
+       if (!(s & GRAPHEME_STATE_EMOJI) &&
+           ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+             (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+               s |= GRAPHEME_STATE_EMOJI;
+       } else if ((s & GRAPHEME_STATE_EMOJI) &&
+                  ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+                    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
+                   (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+                    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
+                   (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+                    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+                   (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+                    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+                   (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+                    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+               /* GRAPHEME_STATE_EMOJI remains */
+       } else {
+               s &= ~GRAPHEME_STATE_EMOJI;
+       }
+
+       /* write updated state to state-pointer, if given */
+       if (state != NULL) {
+               *state = s;
+       }
+
+       /*
+        * apply rules
+        */
+
+       /* skip GB1 and GB2, as they are never satisfied here */
+
+       /* GB3 */
+       if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) &&
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+               return 0;
+       }
+
+       /* GB4 */
+       if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+           has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) ||
+           has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) {
+               return 1;
+       }
+
+       /* GB5 */
+       if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) ||
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+               return 1;
+       }
+
+       /* GB6 */
+       if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
+           (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
+            has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+            has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+            has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
+               return 0;
+       }
+
+       /* GB7 */
+       if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+            has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
+           (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+            has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
+               return 0;
+       }
+
+       /* GB8 */
+       if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
+            has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
+               return 0;
+       }
+
+       /* GB9 */
+       if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
+               return 0;
+       }
+
+       /* GB9a */
+       if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
+               return 0;
+       }
+
+       /* GB9b */
+       if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
+               return 0;
+       }
+
+       /* GB11 */
+       if ((s & GRAPHEME_STATE_EMOJI) &&
+           has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
+               return 0;
+       }
+
+       /* GB12/GB13 */
+       if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+           has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+           (s & GRAPHEME_STATE_RI_ODD)) {
+               return 0;
+       }
+
+       /* GB999 */
+       return 1;
+}
+
 size_t
 grapheme_bytelen(const char *str)
 {
diff --git a/src/util.c b/src/util.c
index 662ea98..955cdad 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1,10 +1,13 @@
 /* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+
 #include "util.h"
 
 int
 heisenstate_get(struct heisenstate *h, int slot)
 {
-       if (h == NULL || slot >= 16 || slot < 0 ||
+       if (h == NULL || slot >= 64 || slot < 0 ||
            !(h->determined & (1 << slot))) {
                /* no state given, slot out of range or undetermined */
                return -1;
@@ -17,7 +20,7 @@ heisenstate_get(struct heisenstate *h, int slot)
 int
 heisenstate_set(struct heisenstate *h, int slot, int state)
 {
-       if (h == NULL || slot >= 16 || slot < 0) {
+       if (h == NULL || slot >= 64 || slot < 0) {
                /* no state given or slot out of range */
                return 1;
        } else {
@@ -31,3 +34,28 @@ heisenstate_set(struct heisenstate *h, int slot, int state)
 
        return 0;
 }
+
+/*
+ * bsearch() comparison function: a points to the code point searched
+ * for and b to a range, accessed as a pair of uint32_t holding its
+ * lower and upper bound.
+ *
+ * Returns 0 if the code point lies within the range, a negative value
+ * if it lies below and a positive value if it lies above it.
+ */
+static int
+cp_cmp(const void *a, const void *b)
+{
+       uint32_t cp = *(const uint32_t *)a;
+       const uint32_t *range = (const uint32_t *)b;
+
+       /*
+        * compare explicitly instead of returning the unsigned
+        * difference, which yields the wrong sign when converted to
+        * int once it exceeds INT_MAX
+        */
+       if (cp < range[0]) {
+               return -1;
+       } else if (cp > range[1]) {
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Report whether the code point cp has the given property, using
+ * cpstate as a cache: if the property's slot is still undetermined,
+ * cp is looked up in proptable[property] and the result is stored.
+ *
+ * Returns whatever heisenstate_get() reports for the slot afterwards.
+ */
+int
+has_property(uint32_t cp, struct heisenstate *cpstate,
+             const struct range_list *proptable, int property)
+{
+       const struct range_list *list;
+       int found;
+
+       if (heisenstate_get(cpstate, property) != -1) {
+               /* slot already determined, no lookup necessary */
+               return heisenstate_get(cpstate, property);
+       }
+
+       /* state undetermined, make a lookup and set it */
+       list = &proptable[property];
+       found = (bsearch(&cp, list->data, list->len, sizeof(*list->data),
+                        cp_cmp) != NULL) ? 1 : 0;
+       heisenstate_set(cpstate, property, found);
+
+       return heisenstate_get(cpstate, property);
+}
diff --git a/src/util.h b/src/util.h
index e213428..e480da0 100644
--- a/src/util.h
+++ b/src/util.h
@@ -17,13 +17,16 @@ struct range_list {
        size_t len;
 };
 
-/* 16-slot (0,...,15) optionally undetermined binary state */
+/* 64-slot (0,...,63) optionally undetermined binary state */
 struct heisenstate {
-       uint_least16_t determined;
-       uint_least16_t state;
+       uint_least64_t determined;
+       uint_least64_t state;
 };
 
 int heisenstate_get(struct heisenstate *, int);
 int heisenstate_set(struct heisenstate *, int, int);
 
+int has_property(uint32_t, struct heisenstate *,
+                 const struct range_list *, int);
+
 #endif /* UTIL_H */
diff --git a/test/grapheme_boundary.c b/test/grapheme.c
similarity index 54%
rename from test/grapheme_boundary.c
rename to test/grapheme.c
index 09f5971..ff4d1f4 100644
--- a/test/grapheme_boundary.c
+++ b/test/grapheme.c
@@ -5,7 +5,7 @@
 #include <string.h>
 
 #include "../grapheme.h"
-#include "../data/grapheme_boundary_test.h"
+#include "../gen/grapheme-test.h"
 
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
@@ -16,15 +16,17 @@ main(void)
        size_t i, j, k, len, failed;
 
        /* grapheme break test */
-       for (i = 0, failed = 0; i < LEN(t); i++) {
-               for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
-                       if ((j + 1) == t[i].cplen ||
-                           grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
+       for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
+               for (j = 0, k = 0, state = 0, len = 1; j < 
grapheme_test[i].cplen; j++) {
+                       if ((j + 1) == grapheme_test[i].cplen ||
+                           grapheme_boundary(grapheme_test[i].cp[j],
+                                             grapheme_test[i].cp[j + 1],
                                              &state)) {
                                /* check if our resulting length matches */
-                               if (k == t[i].lenlen || len != t[i].len[k++]) {
+                               if (k == grapheme_test[i].lenlen ||
+                                   len != grapheme_test[i].len[k++]) {
                                        fprintf(stderr, "Failed \"%s\"\n",
-                                               t[i].descr);
+                                               grapheme_test[i].descr);
                                        failed++;
                                        break;
                                }
@@ -35,7 +37,7 @@ main(void)
                }
        }
        printf("Grapheme break test: Passed %zu out of %zu tests.\n",
-              LEN(t) - failed, LEN(t));
+              LEN(grapheme_test) - failed, LEN(grapheme_test));
 
        return (failed > 0) ? 1 : 0;
 }

Reply via email to