After the detection of line break possibilities, I'm also adding functions for determining the word breaks in Unicode strings. As always, there is also a function ulc_wordbreaks() that accepts a string in locale encoding.
2009-02-08 Bruno Haible <[email protected]> New module 'uniwbrk/ulc-wordbreaks'. * modules/uniwbrk/ulc-wordbreaks: New file. * lib/uniwbrk/ulc-wordbreaks.c: New file. New module 'uniwbrk/u32-wordbreaks'. * modules/uniwbrk/u32-wordbreaks: New file. * lib/uniwbrk/u32-wordbreaks.c: New file. New module 'uniwbrk/u16-wordbreaks'. * modules/uniwbrk/u16-wordbreaks: New file. * lib/uniwbrk/u16-wordbreaks.c: New file. New module 'uniwbrk/u8-wordbreaks'. * modules/uniwbrk/u8-wordbreaks: New file. * lib/uniwbrk/u8-wordbreaks.c: New file. * lib/uniwbrk/u-wordbreaks.h: New file. New module 'uniwbrk/table'. * modules/uniwbrk/table: New file. * lib/uniwbrk/wbrktable.h: New file. * lib/uniwbrk/wbrktable.c: New file. New module 'uniwbrk/wordbreak-property'. * modules/uniwbrk/wordbreak-property: New file. * lib/uniwbrk/wordbreak-property.c: New file. * lib/gen-uni-tables.c (WBP_*): New enum items. (get_wbp, debug_output_wbp, debug_output_wbrk_tables): New functions. (unicode_org_wbp): New variable. (fill_org_wbp, debug_output_org_wbp, debug_output_org_wbrk_tables): New functions. (wbp_table): New structure. (output_wbp, output_wbrk_tables): New functions. (main): Accept additional argument. Invoke fill_org_wbp, debug_output_wbrk_tables, debug_output_org_wbrk_tables, output_wbrk_tables. * modules/gen-uni-tables (Description): Update. * lib/uniwbrk/wbrkprop.h: New file, automatically generated by gen-uni-tables. New module 'uniwbrk/base'. * modules/uniwbrk/base: New file. * lib/uniwbrk.h: New file. ================================ lib/uniwbrk.h ================================ /* Word breaks in Unicode strings. Copyright (C) 2001-2003, 2005-2009 Free Software Foundation, Inc. Written by Bruno Haible <[email protected]>, 2009. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Include guard for the public word-break interface.  */
#ifndef _UNIWBRK_H
#define _UNIWBRK_H

/* Get size_t, used for the string lengths below.  */
#include <stddef.h>

/* Project header; presumably the source of ucs4_t and the uintN_t types
   used in the prototypes below.  */
#include "unitypes.h"

/* Give the declarations C linkage when this header is included from C++.  */
#ifdef __cplusplus
extern "C" {
#endif

/* ========================================================================= */

/* Property defined in Unicode Standard Annex #29, section "Word Boundaries"
   <http://www.unicode.org/reports/tr29/#Word_Boundaries> */

/* Possible values of the Word_Break property.
   This enumeration may be extended in the future, so callers should not
   assume that the set of values listed here is exhaustive.

   The numeric values are assigned explicitly and do not follow the order
   in which the constants are listed.  Always refer to them by name.
   NOTE(review): the numbering presumably matches indices used by the
   generated property and break tables; confirm before renumbering.  */
enum
{
  WBP_OTHER        = 0,
  WBP_CR           = 11,
  WBP_LF           = 12,
  WBP_NEWLINE      = 10,
  WBP_EXTEND       = 8,
  WBP_FORMAT       = 9,
  WBP_KATAKANA     = 1,
  WBP_ALETTER      = 2,
  WBP_MIDNUMLET    = 3,
  WBP_MIDLETTER    = 4,
  WBP_MIDNUM       = 5,
  WBP_NUMERIC      = 6,
  WBP_EXTENDNUMLET = 7
};

/* Return the Word_Break property of a Unicode character.
   UC is the character to classify.
   The result is presumably one of the WBP_* values above.  */
extern int uc_wordbreak_property (ucs4_t uc);

/* ========================================================================= */

/* Word breaks.  */

/* Determine the word break points in S, and store the result at p[0..n-1].
   p[i] = 1 means that there is a word boundary between s[i-1] and s[i].
   p[i] = 0 means that s[i-1] and s[i] must not be separated.

   S is an array of N units: uint8_t, uint16_t or uint32_t units for the
   u8_, u16_ and u32_ variants, and char units for the ulc_ variant.
   P is written by the function and must have room for N elements; its
   indices run parallel to those of S.
   NOTE(review): for i = 0 there is no s[i-1]; what is stored in p[0] is
   not stated here -- check the implementation before relying on it.  */
extern void u8_wordbreaks (const uint8_t *s, size_t n, char *p);
extern void u16_wordbreaks (const uint16_t *s, size_t n, char *p);
extern void u32_wordbreaks (const uint32_t *s, size_t n, char *p);
extern void ulc_wordbreaks (const char *s, size_t n, char *p);

/* ========================================================================= */

/* Close the extern "C" block opened above.  */
#ifdef __cplusplus
}
#endif

#endif /* _UNIWBRK_H */
