Module Name:    src
Committed By:   joerg
Date:           Thu Nov 24 18:44:25 UTC 2011

Modified Files:
        src/lib/libc/string: wcscspn.c wcspbrk.c
Added Files:
        src/lib/libc/string: wcscspn_bloom.h

Log Message:
In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
For larger sets, use a bloom filter to avoid the inner loop for most of
the input. The current implementation uses a simple modular hash as
first function (well suited for input e.g. in ISO Latin character sets)
and a more complex multiplicative hash as second function with a filter
size of 512 Bit. This reduces the typical run time to O(n+m).


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c
cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h
cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libc/string/wcscspn.c
diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4
--- src/lib/libc/string/wcscspn.c:1.3	Mon Nov 21 15:02:48 2011
+++ src/lib/libc/string/wcscspn.c	Thu Nov 24 18:44:25 2011
@@ -1,7 +1,8 @@
-/*	$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $	*/
+/*	$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $	*/
 
 /*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,45 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
 
 #include <assert.h>
+#include <inttypes.h>
+#include <string.h>
 #include <wchar.h>
 
+#include "wcscspn_bloom.h"
+
 size_t
 wcscspn(const wchar_t *s, const wchar_t *set)
 {
+	size_t bloom[BLOOM_ARRAY_SIZE];
 	const wchar_t *p;
 	const wchar_t *q;
 
 	_DIAGASSERT(s != NULL);
 	_DIAGASSERT(set != NULL);
 
+	if (set[0] == '\0')
+		return wcslen(s);
+	if (set[1] == '\0') {
+		for (p = s; *p; ++p)
+			if (*p == set[0])
+				break;
+		return p - s;
+	}
+
+	wcsspn_bloom_init(bloom, set);
+
 	for (p = s; *p; ++p) {
-		for (q = set; *q; ++q) {
+		if (!wcsspn_in_bloom(bloom, *p))
+			continue;
+
+		q = set;
+		do {
 			if (*p == *q)
 				goto done;
-		}
+		} while (*++q);
 	}
 
 done:

Index: src/lib/libc/string/wcspbrk.c
diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5
--- src/lib/libc/string/wcspbrk.c:1.4	Mon Nov 21 15:02:48 2011
+++ src/lib/libc/string/wcspbrk.c	Thu Nov 24 18:44:25 2011
@@ -1,7 +1,8 @@
-/*	$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $	*/
+/*	$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $	*/
 
 /*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,41 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
 
 #include <assert.h>
+#include <inttypes.h>
+#include <string.h>
 #include <wchar.h>
 
+#include "wcscspn_bloom.h"
+
 wchar_t *
 wcspbrk(const wchar_t *s, const wchar_t *set)
 {
+	size_t bloom[BLOOM_ARRAY_SIZE];
 	const wchar_t *p;
 	const wchar_t *q;
 
 	_DIAGASSERT(s != NULL);
 	_DIAGASSERT(set != NULL);
 
+	if (set[0] == '\0')
+		return NULL;
+	if (set[1] == '\0')
+		return wcschr(s, set[0]);
+
+	wcsspn_bloom_init(bloom, set);
+
 	for (p = s; *p; ++p) {
-		for (q = set; *q; ++q) {
+		if (!wcsspn_in_bloom(bloom, *p))
+			continue;
+
+		q = set;
+		do {
 			if (*p == *q)
 				return __UNCONST(p);
-		}
+		} while (*++q);
 	}
 	return NULL;
 }

Added files:

Index: src/lib/libc/string/wcscspn_bloom.h
diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1
--- /dev/null	Thu Nov 24 18:44:25 2011
+++ src/lib/libc/string/wcscspn_bloom.h	Thu Nov 24 18:44:25 2011
@@ -0,0 +1,84 @@
+/*	$NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $	*/
+
+/*-
+ * Copyright (c) 2011 Joerg Sonnenberger,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Bloom filter for fast test if a given character is part of the reject set.
+ * The test may have false positives, but doesn't have false negatives.
+ * The first hash function is designed to be very fast to evaluate.
+ * It is collision free if the input is part of the same European language
+ * and shouldn't be too bad even other input.  The second hash function
+ * tries to provide a much better mixing, but involves the slower
+ * multiplication.
+ */
+
+#define	BLOOM_SIZE		64
+#define	BLOOM_ARRAY_SIZE	(BLOOM_SIZE / sizeof(size_t))
+#define	BLOOM_MASK		(BLOOM_SIZE * 8 - 1)
+#define	BLOOM_DIV		(sizeof(size_t) * 8)
+
+static inline size_t
+wcscspn_bloom1(size_t x)
+{
+	return x & BLOOM_MASK;
+}
+
+static inline size_t
+wcscspn_bloom2(size_t x)
+{
+	return (uint32_t)(x * 2654435761U) /
+	    (0x100000000ULL / (BLOOM_MASK + 1));
+}
+
+static inline void
+wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
+{
+	size_t val;
+
+	memset(bloom, 0, BLOOM_SIZE);
+	do {
+		val = wcscspn_bloom1((size_t)*charset);
+		bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+		val = wcscspn_bloom2((size_t)*charset);
+		bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+	}
+	while (*++charset);
+}
+
+static inline int
+wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
+{
+	size_t val;
+
+	val = wcscspn_bloom1((size_t)ch);
+	if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+		return 1;
+	val = wcscspn_bloom2((size_t)ch);
+	if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+		return 1;
+	return 0;
+}

Reply via email to