Hi Marc,

Thanks for the additional information; it helped me locate the bug. Please apply the attached patch.

Regards,
Sergey
From aa2df3934c858977cd9033af2345f3566d136bd7 Mon Sep 17 00:00:00 2001
From: Sergey Poznyakoff <g...@gnu.org.ua>
Date: Sun, 23 May 2010 14:50:12 +0300
Subject: [PATCH] Fix improper handling of conversion errors in levenshtein.c (debian #582692)

* include/dico/utf8.h (utf8_mbstr_to_wc)
(utf8_mbstr_to_norm_wc): Change signature.
* lib/utf8.c (utf8_mbstr_to_wc)
(utf8_mbstr_to_norm_wc): Take additional return argument.
Return error code. All callers updated.
* lib/levenshtein.c (dico_levenshtein_distance): conv returns
non-zero (not necessarily negative) value on errors.
---
 include/dico/utf8.h |    4 ++--
 lib/levenshtein.c   |    6 +++---
 lib/utf8.c          |   44 +++++++++++++++++++++++---------------------
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/include/dico/utf8.h b/include/dico/utf8.h
index 1f24b78..3462443 100644
--- a/include/dico/utf8.h
+++ b/include/dico/utf8.h
@@ -54,8 +54,8 @@ size_t utf8_wc_hash_string (const unsigned *ws, size_t n_buckets);
 int utf8_wc_strcmp (const unsigned *a, const unsigned *b);
 int utf8_wc_to_mbstr(const unsigned *wordbuf, size_t wordlen, char **sptr);
 
-int utf8_mbstr_to_wc(const char *str, unsigned **wptr);
-int utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr);
+int utf8_mbstr_to_wc(const char *str, unsigned **wptr, size_t *plen);
+int utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr, size_t *plen);
 
 int utf8_quote (const char *str, char **sptr);
 unsigned *utf8_wc_quote (const unsigned *s);
diff --git a/lib/levenshtein.c b/lib/levenshtein.c
index 366df04..125756f 100644
--- a/lib/levenshtein.c
+++ b/lib/levenshtein.c
@@ -42,12 +42,12 @@ dico_levenshtein_distance(const char *astr, const char *bstr, int flags)
     unsigned *row[3];
     int i, j, idx, nrows;
     int dist;
-    int (*conv) (const char *, unsigned **) =
+    int (*conv) (const char *, unsigned **, size_t *) =
         (flags & DICO_LEV_NORM) ?
         utf8_mbstr_to_norm_wc : utf8_mbstr_to_wc;
-    if (conv(astr, &a) < 0)
+    if (conv(astr, &a, NULL))
         return -1;
-    if (conv(bstr, &b) < 0) {
+    if (conv(bstr, &b, NULL)) {
         free(a);
         return -1;
     }
diff --git a/lib/utf8.c b/lib/utf8.c
index 105cef6..bfe9951 100644
--- a/lib/utf8.c
+++ b/lib/utf8.c
@@ -1992,33 +1992,33 @@ utf8_wc_to_mbstr(const unsigned *wordbuf, size_t wordlen, char **sptr)
 }
 
 int
-utf8_mbstr_to_wc(const char *str, unsigned **wptr)
+utf8_mbstr_to_wc(const char *str, unsigned **wptr, size_t *plen)
 {
-  size_t sc = strlen(str);
-  size_t len, i;
-  unsigned *w = calloc(sizeof(w[0]), sc+1);
+    ssize_t sc = strlen(str);
+    size_t len, i;
+    unsigned *w = calloc(sizeof(w[0]), sc+1);
 
-  if (!w)
-    return -1;
-  for (i = 0, len = strlen(str); len; i++)
-    {
-      int rc = utf8_mbtowc (w + i, (unsigned char *)str, len);
-      if (rc <= 0)
-        {
-          free(w);
-          return -1;
+    if (!w)
+        return -1;
+    for (i = 0, len = strlen(str); len; i++) {
+        int rc = utf8_mbtowc (w + i, (unsigned char *)str, len);
+        if (rc <= 0) {
+            free(w);
+            return -1;
         }
-      str += rc;
-      len -= rc;
+        str += rc;
+        len -= rc;
     }
-  *wptr = w;
-  return sc;
+    *wptr = w;
+    if (plen)
+        *plen = sc;
+    return 0;
 }
 
 #define ISWS(c) ((c)==' '||(c)=='\t'||(c)=='\n')
 
 int
-utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr)
+utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr, size_t *plen)
 {
     int inws = 0;
     size_t len = strlen(str);
@@ -2032,7 +2032,7 @@ utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr)
         unsigned wc;
         int rc = utf8_mbtowc(&wc, (unsigned char *)str, len);
         if (rc <= 0)
-            return 1;
+            return -1;
         str += rc;
         len -= rc;
         if (rc == 1 && ISWS(wc)) {
@@ -2047,6 +2047,8 @@ utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr)
     }
     base[i++] = 0;
     *nptr = realloc(base, i * sizeof(base[0]));
+    if (plen)
+        *plen = i;
     return 0;
 }
 
@@ -2056,8 +2058,8 @@ utf8_quote (const char *str, char **sptr)
     int rc;
     unsigned *ws, *ret;
 
-    rc = utf8_mbstr_to_wc (str, &ws);
-    if (rc < 0)
+    rc = utf8_mbstr_to_wc (str, &ws, NULL);
+    if (rc)
         return rc;
     ret = utf8_wc_quote (ws);
     if (ret)
-- 
1.6.0.3