Module Name: src
Committed By: joerg
Date: Thu Nov 24 18:44:25 UTC 2011
Modified Files:
src/lib/libc/string: wcscspn.c wcspbrk.c
Added Files:
src/lib/libc/string: wcscspn_bloom.h
Log Message:
In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
For larger sets, use a bloom filter to avoid the inner loop for most of
the input. The current implementation uses a simple modular hash as
first function (well suited for input e.g. in ISO Latin character sets)
and a more complex multiplicative hash as second function with a filter
size of 512 Bit. This reduces the typical run time to O(n+m).
To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c
cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h
cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/lib/libc/string/wcscspn.c
diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4
--- src/lib/libc/string/wcscspn.c:1.3 Mon Nov 21 15:02:48 2011
+++ src/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011
@@ -1,7 +1,8 @@
-/* $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $ */
+/* $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $ */
/*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,45 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
#include <wchar.h>
+#include "wcscspn_bloom.h"
+
size_t
wcscspn(const wchar_t *s, const wchar_t *set)
{
+ size_t bloom[BLOOM_ARRAY_SIZE];
const wchar_t *p;
const wchar_t *q;
_DIAGASSERT(s != NULL);
_DIAGASSERT(set != NULL);
+ if (set[0] == '\0')
+ return wcslen(s);
+ if (set[1] == '\0') {
+ for (p = s; *p; ++p)
+ if (*p == set[0])
+ break;
+ return p - s;
+ }
+
+ wcsspn_bloom_init(bloom, set);
+
for (p = s; *p; ++p) {
- for (q = set; *q; ++q) {
+ if (!wcsspn_in_bloom(bloom, *p))
+ continue;
+
+ q = set;
+ do {
if (*p == *q)
goto done;
- }
+ } while (*++q);
}
done:
Index: src/lib/libc/string/wcspbrk.c
diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5
--- src/lib/libc/string/wcspbrk.c:1.4 Mon Nov 21 15:02:48 2011
+++ src/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011
@@ -1,7 +1,8 @@
-/* $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $ */
+/* $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $ */
/*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,41 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
#include <wchar.h>
+#include "wcscspn_bloom.h"
+
wchar_t *
wcspbrk(const wchar_t *s, const wchar_t *set)
{
+ size_t bloom[BLOOM_ARRAY_SIZE];
const wchar_t *p;
const wchar_t *q;
_DIAGASSERT(s != NULL);
_DIAGASSERT(set != NULL);
+ if (set[0] == '\0')
+ return NULL;
+ if (set[1] == '\0')
+ return wcschr(s, set[0]);
+
+ wcsspn_bloom_init(bloom, set);
+
for (p = s; *p; ++p) {
- for (q = set; *q; ++q) {
+ if (!wcsspn_in_bloom(bloom, *p))
+ continue;
+
+ q = set;
+ do {
if (*p == *q)
return __UNCONST(p);
- }
+ } while (*++q);
}
return NULL;
}
Added files:
Index: src/lib/libc/string/wcscspn_bloom.h
diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1
--- /dev/null Thu Nov 24 18:44:25 2011
+++ src/lib/libc/string/wcscspn_bloom.h Thu Nov 24 18:44:25 2011
@@ -0,0 +1,84 @@
+/* $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $ */
+
+/*-
+ * Copyright (c) 2011 Joerg Sonnenberger,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Bloom filter for fast test if a given character is part of the reject set.
+ * The test may have false positives, but doesn't have false negatives.
+ * The first hash function is designed to be very fast to evaluate.
+ * It is collision free if the input is part of the same European language
+ * and shouldn't be too bad even other input. The second hash function
+ * tries to provide a much better mixing, but involves the slower
+ * multiplication.
+ */
+
+#define BLOOM_SIZE 64
+#define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t))
+#define BLOOM_MASK (BLOOM_SIZE * 8 - 1)
+#define BLOOM_DIV (sizeof(size_t) * 8)
+
+static inline size_t
+wcscspn_bloom1(size_t x)
+{
+ return x & BLOOM_MASK;
+}
+
+static inline size_t
+wcscspn_bloom2(size_t x)
+{
+ return (uint32_t)(x * 2654435761U) /
+ (0x100000000ULL / (BLOOM_MASK + 1));
+}
+
+static inline void
+wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
+{
+ size_t val;
+
+ memset(bloom, 0, BLOOM_SIZE);
+ do {
+ val = wcscspn_bloom1((size_t)*charset);
+ bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+ val = wcscspn_bloom2((size_t)*charset);
+ bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+ }
+ while (*++charset);
+}
+
+static inline int
+wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
+{
+ size_t val;
+
+ val = wcscspn_bloom1((size_t)ch);
+ if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+ return 1;
+ val = wcscspn_bloom2((size_t)ch);
+ if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+ return 1;
+ return 0;
+}