Module Name: src Committed By: joerg Date: Thu Nov 24 18:44:25 UTC 2011
Modified Files: src/lib/libc/string: wcscspn.c wcspbrk.c Added Files: src/lib/libc/string: wcscspn_bloom.h Log Message: In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly. For larger sets, use a bloom filter to avoid the inner loop for most of the input. The current implementation uses a simple modular hash as first function (well suited for input e.g. in ISO Latin character sets) and a more complex multiplicative hash as second function with a filter size of 512 Bit. This reduces the typical run time to O(n+m). To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libc/string/wcscspn.c diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4 --- src/lib/libc/string/wcscspn.c:1.3 Mon Nov 21 15:02:48 2011 +++ src/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011 @@ -1,7 +1,8 @@ -/* $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $ */ +/* $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $ */ /*- - * Copyright (c)1999 Citrus Project, + * Copyright (c) 1999 Citrus Project, + * Copyright (c) 2011 Joerg Sonnenberger, * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,25 +30,45 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $"); +__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $"); #include <assert.h> +#include <inttypes.h> +#include <string.h> #include <wchar.h> +#include "wcscspn_bloom.h" + size_t wcscspn(const wchar_t *s, const wchar_t *set) { + size_t bloom[BLOOM_ARRAY_SIZE]; const wchar_t *p; const wchar_t *q; _DIAGASSERT(s != NULL); _DIAGASSERT(set != NULL); + if (set[0] == '\0') + return wcslen(s); + if (set[1] == '\0') { + for (p = s; *p; ++p) + if (*p == set[0]) + break; + return p - s; + } + + wcsspn_bloom_init(bloom, set); + for (p = s; *p; ++p) { - for (q = set; *q; ++q) { + if (!wcsspn_in_bloom(bloom, *p)) + continue; + + q = set; + do { if (*p == *q) goto done; - } + } while (*++q); } done: Index: src/lib/libc/string/wcspbrk.c diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5 --- src/lib/libc/string/wcspbrk.c:1.4 Mon Nov 21 15:02:48 2011 +++ src/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011 @@ -1,7 +1,8 @@ -/* $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $ */ +/* $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $ */ /*- - * Copyright (c)1999 Citrus Project, + * Copyright (c) 1999 Citrus Project, + * Copyright (c) 2011 Joerg Sonnenberger, * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,25 +30,41 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $"); +__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $"); #include <assert.h> +#include <inttypes.h> +#include <string.h> #include <wchar.h> +#include "wcscspn_bloom.h" + wchar_t * wcspbrk(const wchar_t *s, const wchar_t *set) { + size_t bloom[BLOOM_ARRAY_SIZE]; const wchar_t *p; const wchar_t *q; _DIAGASSERT(s != NULL); _DIAGASSERT(set != NULL); + if (set[0] == '\0') + return NULL; + if (set[1] == '\0') + return wcschr(s, set[0]); + + wcsspn_bloom_init(bloom, set); + for (p = s; *p; ++p) { - for (q = set; *q; ++q) { + if (!wcsspn_in_bloom(bloom, *p)) + continue; + + q = set; + do { if (*p == *q) return __UNCONST(p); - } + } while (*++q); } return NULL; } Added files: Index: src/lib/libc/string/wcscspn_bloom.h diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1 --- /dev/null Thu Nov 24 18:44:25 2011 +++ src/lib/libc/string/wcscspn_bloom.h Thu Nov 24 18:44:25 2011 @@ -0,0 +1,84 @@ +/* $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $ */ + +/*- + * Copyright (c) 2011 Joerg Sonnenberger, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Bloom filter for fast test if a given character is part of the reject set. + * The test may have false positives, but doesn't have false negatives. + * The first hash function is designed to be very fast to evaluate. + * It is collision free if the input is part of the same European language + * and shouldn't be too bad even other input. The second hash function + * tries to provide a much better mixing, but involves the slower + * multiplication. + */ + +#define BLOOM_SIZE 64 +#define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t)) +#define BLOOM_MASK (BLOOM_SIZE * 8 - 1) +#define BLOOM_DIV (sizeof(size_t) * 8) + +static inline size_t +wcscspn_bloom1(size_t x) +{ + return x & BLOOM_MASK; +} + +static inline size_t +wcscspn_bloom2(size_t x) +{ + return (uint32_t)(x * 2654435761U) / + (0x100000000ULL / (BLOOM_MASK + 1)); +} + +static inline void +wcsspn_bloom_init(size_t *bloom, const wchar_t *charset) +{ + size_t val; + + memset(bloom, 0, BLOOM_SIZE); + do { + val = wcscspn_bloom1((size_t)*charset); + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV); + val = wcscspn_bloom2((size_t)*charset); + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV); + } + while (*++charset); +} + +static inline int +wcsspn_in_bloom(const size_t *bloom, wchar_t ch) +{ + size_t val; + + val = wcscspn_bloom1((size_t)ch); + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV))) + return 1; + val = wcscspn_bloom2((size_t)ch); + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV))) + return 1; + return 0; +}