commit 1a67a7fc7df73df260f8b7a18effd3b3e2b161c7
Author: Laslo Hunhold
AuthorDate: Mon Jun 6 22:16:46 2022 +0200
Commit: Laslo Hunhold
CommitDate: Mon Jun 6 22:40:22 2022 +0200
Implement word-segmentation
This was a tough nut to crack and took a lot of hours and multiple
rewrites to get right.
The first issue was that some codepoints could be in multiple classes
at the same time, requiring the implementation of a "conflict-handler"
in the data parser.
The segmentation algorithm itself was then highly complicated, as it
parses the data on two levels, which involves ignoring certain
character property classes, and has to do so gracefully and
simultaneously.
Now it works, though, and passes the 1800+ tests provided by the
Unicode Consortium. The LUTs are highly compressed and the complete
library still only weighs in at around 92K, which is lightweight given
what it does. If you link statically, the linker will cut away most of
it as well.
What needed to be rethought was the general API structure. It is
impossible to do word-segmentation on a 2-codepoint-comparison-with-
state basis, and the only workable form is a function taking the
entire array and returning the offset to the next break. The API was
adapted accordingly.
Signed-off-by: Laslo Hunhold
diff --git a/Makefile b/Makefile
index 7bb10d9..74f3352 100644
--- a/Makefile
+++ b/Makefile
@@ -7,25 +7,32 @@ include config.mk
BENCHMARK =\
benchmark/character\
benchmark/utf8-decode\
+ benchmark/word\
DATA =\
data/emoji-data.txt\
data/GraphemeBreakProperty.txt\
data/GraphemeBreakTest.txt\
+ data/WordBreakProperty.txt\
+ data/WordBreakTest.txt\
GEN =\
gen/character\
gen/character-test\
+ gen/word\
+ gen/word-test\
SRC =\
src/character\
src/utf8\
src/util\
+ src/word\
TEST =\
test/character\
test/utf8-decode\
test/utf8-encode\
+ test/word\
MAN3 =\
man/grapheme_decode_utf8.3\
@@ -40,27 +47,38 @@ all: libgrapheme.a libgrapheme.so
benchmark/character.o: benchmark/character.c config.mk gen/character-test.h
grapheme.h benchmark/util.h
benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk
gen/character-test.h grapheme.h benchmark/util.h
benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
+benchmark/word.o: benchmark/word.c config.mk gen/word-test.h grapheme.h
benchmark/util.h
gen/character.o: gen/character.c config.mk gen/util.h
gen/character-test.o: gen/character-test.c config.mk gen/util.h
+gen/word.o: gen/word.c config.mk gen/util.h
+gen/word-test.o: gen/word-test.c config.mk gen/util.h
gen/util.o: gen/util.c config.mk gen/util.h
src/character.o: src/character.c config.mk gen/character.h grapheme.h
src/util.h
src/utf8.o: src/utf8.c config.mk grapheme.h
src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
+src/word.o: src/word.c config.mk gen/word.h grapheme.h src/util.h
test/character.o: test/character.c config.mk gen/character-test.h grapheme.h
test/util.h
test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
test/util.o: test/util.c config.mk test/util.h
+test/word.o: test/word.c config.mk gen/word-test.h grapheme.h test/util.h
benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
benchmark/utf8-decode: benchmark/utf8-decode.o benchmark/util.o libgrapheme.a
+benchmark/word: benchmark/word.o benchmark/util.o libgrapheme.a
gen/character: gen/character.o gen/util.o
gen/character-test: gen/character-test.o gen/util.o
+gen/word: gen/word.o gen/util.o
+gen/word-test: gen/word-test.o gen/util.o
test/character: test/character.o test/util.o libgrapheme.a
test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a
test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a
+test/word: test/word.o test/util.o libgrapheme.a
gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt
gen/character
gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test
+gen/word.h: data/WordBreakProperty.txt gen/word
+gen/word-test.h: data/WordBreakTest.txt gen/word-test
data/emoji-data.txt:
wget -O $@
https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
@@ -71,6 +89,12 @@ data/GraphemeBreakProperty.txt:
data/GraphemeBreakTest.txt:
wget -O $@
https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
+data/WordBreakProperty.txt:
+ wget -O $@
https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
+
+data/WordBreakTest.txt:
+ wget -O $@
https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt
+
$(BENCHMARK):
$(CC) -o $@ $(LDFLAGS) $@.o benchmark/util.o libgrapheme.a -lutf8proc
diff --git a/benchmark/character.c b/benchmark/character.c
index 2