Am 24.11.11 19:44, schrieb Joerg Sonnenberger: > Module Name: src > Committed By: joerg > Date: Thu Nov 24 18:44:25 UTC 2011 > > Modified Files: > src/lib/libc/string: wcscspn.c wcspbrk.c > Added Files: > src/lib/libc/string: wcscspn_bloom.h
This breaks the build on i386, at least. > > Log Message: > In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly. > For larger sets, use a bloom filter to avoid the inner loop for most of > the input. The current implementation uses a simple modular hash as > first function (well suited for input e.g. in ISO Latin character sets) > and a more complex multiplicative hash as second function with a filter > size of 512 Bit. This reduces the typical run time to O(n+m). > > > To generate a diff of this commit: > cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c > cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h > cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c > > Please note that diffs are not public domain; they are subject to the > copyright notices on the relevant files. > > > > > Modified files: > > Index: src/lib/libc/string/wcscspn.c > diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4 > --- src/lib/libc/string/wcscspn.c:1.3 Mon Nov 21 15:02:48 2011 > +++ src/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011 > @@ -1,7 +1,8 @@ > -/* $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $ */ > +/* $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $ */ > > /*- > - * Copyright (c)1999 Citrus Project, > + * Copyright (c) 1999 Citrus Project, > + * Copyright (c) 2011 Joerg Sonnenberger, > * All rights reserved. > * > * Redistribution and use in source and binary forms, with or without > @@ -29,25 +30,45 @@ > */ > > #include <sys/cdefs.h> > -__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $"); > +__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $"); > > #include <assert.h> > +#include <inttypes.h> > +#include <string.h> > #include <wchar.h> > > +#include "wcscspn_bloom.h" > + > size_t > wcscspn(const wchar_t *s, const wchar_t *set) > { > + size_t bloom[BLOOM_ARRAY_SIZE]; > const wchar_t *p; > const wchar_t *q; > > _DIAGASSERT(s != NULL); > _DIAGASSERT(set != NULL); > > + if (set[0] == '\0') > + return wcslen(s); > + if (set[1] == '\0') { > + for (p = s; *p; ++p) > + if (*p == set[0]) > + break; > + return p - s; > + } > + > + wcsspn_bloom_init(bloom, set); > + > for (p = s; *p; ++p) { > - for (q = set; *q; ++q) { > + if (!wcsspn_in_bloom(bloom, *p)) > + continue; > + > + q = set; > + do { > if (*p == *q) > goto done; > - } > + } while (*++q); > } > > done: > > Index: src/lib/libc/string/wcspbrk.c > diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5 > --- src/lib/libc/string/wcspbrk.c:1.4 Mon Nov 21 15:02:48 2011 > +++ src/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011 > @@ -1,7 +1,8 @@ > -/* $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $ */ > +/* $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $ */ > > /*- > - * Copyright (c)1999 Citrus Project, > + * Copyright (c) 1999 Citrus Project, > + * Copyright (c) 2011 Joerg Sonnenberger, > * All rights reserved. > * > * Redistribution and use in source and binary forms, with or without > @@ -29,25 +30,41 @@ > */ > > #include <sys/cdefs.h> > -__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $"); > +__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $"); > > #include <assert.h> > +#include <inttypes.h> > +#include <string.h> > #include <wchar.h> > > +#include "wcscspn_bloom.h" > + > wchar_t * > wcspbrk(const wchar_t *s, const wchar_t *set) > { > + size_t bloom[BLOOM_ARRAY_SIZE]; > const wchar_t *p; > const wchar_t *q; > > _DIAGASSERT(s != NULL); > _DIAGASSERT(set != NULL); > > + if (set[0] == '\0') > + return NULL; > + if (set[1] == '\0') > + return wcschr(s, set[0]); > + > + wcsspn_bloom_init(bloom, set); > + > for (p = s; *p; ++p) { > - for (q = set; *q; ++q) { > + if (!wcsspn_in_bloom(bloom, *p)) > + continue; > + > + q = set; > + do { > if (*p == *q) > return __UNCONST(p); > - } > + } while (*++q); > } > return NULL; > } > > Added files: > > Index: src/lib/libc/string/wcscspn_bloom.h > diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1 > --- /dev/null Thu Nov 24 18:44:25 2011 > +++ src/lib/libc/string/wcscspn_bloom.h Thu Nov 24 18:44:25 2011 > @@ -0,0 +1,84 @@ > +/* $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $ */ > + > +/*- > + * Copyright (c) 2011 Joerg Sonnenberger, > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * Bloom filter for fast test if a given character is part of the reject set. > + * The test may have false positives, but doesn't have false negatives. > + * The first hash function is designed to be very fast to evaluate. > + * It is collision free if the input is part of the same European language > + * and shouldn't be too bad even other input. The second hash function > + * tries to provide a much better mixing, but involves the slower > + * multiplication. > + */ > + > +#define BLOOM_SIZE 64 > +#define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t)) > +#define BLOOM_MASK (BLOOM_SIZE * 8 - 1) > +#define BLOOM_DIV (sizeof(size_t) * 8) > + > +static inline size_t > +wcscspn_bloom1(size_t x) > +{ > + return x & BLOOM_MASK; > +} > + > +static inline size_t > +wcscspn_bloom2(size_t x) > +{ > + return (uint32_t)(x * 2654435761U) / > + (0x100000000ULL / (BLOOM_MASK + 1)); > +} > + > +static inline void > +wcsspn_bloom_init(size_t *bloom, const wchar_t *charset) > +{ > + size_t val; > + > + memset(bloom, 0, BLOOM_SIZE); > + do { > + val = wcscspn_bloom1((size_t)*charset); > + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV); > + val = wcscspn_bloom2((size_t)*charset); > + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV); > + } > + while (*++charset); > +} > + > +static inline int > +wcsspn_in_bloom(const size_t *bloom, wchar_t ch) > +{ > + size_t val; > + > + val = wcscspn_bloom1((size_t)ch); > + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV))) > + return 1; > + val = wcscspn_bloom2((size_t)ch); > + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV))) > + return 1; > + return 0; > +} > -- \~~~~~. The NetBSD Foundation \~~~~~' Marc Balmer, Developer / Marketing NetBSD \ mbal...@netbsd.org http://www.NetBSD.org/