On Mon, Oct 14, 2024 at 01:25:31PM +0100, Gavin Smith wrote:
> These numbers do not seem so high that we need an ultra-optimised
> algorithm (developed by C++ boffins) to deal with them - anything faster
> than a linear search would probably good enough.
>
> I have some spare time over the next couple of days so may be able
> to come up with something.
>
> We do not need the full level of generality that a general purpose library
> in C would likely provide as the hashmap is from std::string (char * from
> the C code) and maps to an int, and moreover keys only take the value 1 or
> are undefined.
I may as well post what I've been able to come up with. The new
code is about 150 lines of C, in the file convert/hashmap.c (patch
below). Probably not perfect but hopefully simple enough to be
maintainable.
I've tested this with three manuals with teximakehtml:
(seconds)
manual C++ C linear search
elisp 1.26 1.26 1.62
libc 1.50 1.51 1.82
texinfo 0.39 0.40 0.47
There was no detectable difference in memory usage. (I thought I might
have made a mistake as the numbers for C and C++ implementations seemed
so close.)
I've checked with valgrind that all memory is freed correctly at
the end. I used block allocation to save time on cleanup.
If this approach looks ok I can try to tidy it up and commit it.
We could also experiment with different numbers of hash bins or the
memory allocator. I'll read a bit more about hash tables as I don't
know a lot about this subject.
It could also be possible to use a completely different
algorithm like a binary tree.
diff --git a/tp/Texinfo/XS/Makefile.am b/tp/Texinfo/XS/Makefile.am
index 101038877b..9a9fcc622e 100644
--- a/tp/Texinfo/XS/Makefile.am
+++ b/tp/Texinfo/XS/Makefile.am
@@ -404,6 +404,8 @@ C_libtexinfo_convert_sources = \
convert/convert_html.c \
convert/format_html.h \
convert/format_html.c \
+ convert/hashmap.c \
+ convert/hashmap.h \
convert/html_converter_types.h \
convert/html_converter_init_options.c \
convert/html_converter_finish.c \
diff --git a/tp/Texinfo/XS/convert/converter.c
b/tp/Texinfo/XS/convert/converter.c
index e5d38c25ea..fe96a53b75 100644
--- a/tp/Texinfo/XS/convert/converter.c
+++ b/tp/Texinfo/XS/convert/converter.c
@@ -283,6 +283,8 @@ new_converter (enum converter_format format, unsigned long
flags)
/* set low level data representations options */
if (flags & CONVF_string_list)
converter->ids_data_type = IDT_string_list;
+ else if (flags & CONVF_hashmap)
+ converter->ids_data_type = IDT_hashmap;
#ifdef HAVE_CXX_HASHMAP
else if (flags & CONVF_cxx_hashmap)
converter->ids_data_type = IDT_cxx_hashmap;
@@ -454,10 +456,9 @@ converter_converter (enum converter_format format,
CONVERTER_INITIALIZATION_INFO *format_defaults;
unsigned long flags;
- /* NOTE if HAVE_CXX_HASHMAP is not set, even with CONVF_cxx_hashmap
- string lists will be used */
if (!converter_flags)
- flags = CONVF_cxx_hashmap;
+ //flags = CONVF_cxx_hashmap;
+ flags = CONVF_hashmap;
/*
To use a string list. Slower.
flags = CONVF_string_list;
diff --git a/tp/Texinfo/XS/convert/get_converter_perl_info.c
b/tp/Texinfo/XS/convert/get_converter_perl_info.c
index 364bcb2af2..bece2fff3b 100644
--- a/tp/Texinfo/XS/convert/get_converter_perl_info.c
+++ b/tp/Texinfo/XS/convert/get_converter_perl_info.c
@@ -105,7 +105,8 @@ get_or_create_sv_converter (SV *converter_in, const char
*input_class)
}
converter_descriptor = new_converter (converter_format,
- CONVF_perl_hashmap);
+ //CONVF_perl_hashmap);
+ CONVF_hashmap);
/*
CONVF_string_list);
*/
diff --git a/tp/Texinfo/XS/convert/hashmap.c b/tp/Texinfo/XS/convert/hashmap.c
new file mode 100644
index 0000000000..dd55230e6b
--- /dev/null
+++ b/tp/Texinfo/XS/convert/hashmap.c
@@ -0,0 +1,164 @@
+/* Copyright 2024 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "converter_types.h"
+
+#include "hashmap.h"
+
+typedef struct BUCKET {
+ /* Linked list of strings. */
+ char *string;
+ struct BUCKET *next;
+} BUCKET;
+
+/* Allocator for bucket object. */
+#define BUCKETS_PER_ARENA 64
+
+typedef struct BUCKET_ARENA {
+ BUCKET buckets[BUCKETS_PER_ARENA];
+ int used;
+ struct BUCKET_ARENA *next;
+} BUCKET_ARENA;
+
+#define NBUCKETS 256
+typedef struct C_HASHMAP {
+ BUCKET *bucket[NBUCKETS];
+ long count;
+
+ BUCKET_ARENA *arena;
+} C_HASHMAP;
+
+static BUCKET *
+new_bucket (C_HASHMAP *H)
+{
+ if (H->arena->used < BUCKETS_PER_ARENA)
+ return &H->arena->buckets[H->arena->used++];
+
+ BUCKET_ARENA *new_arena = malloc (sizeof (BUCKET_ARENA));
+ memset (new_arena, 0, sizeof (BUCKET_ARENA));
+
+ /* Add to front of list. */
+ new_arena->next = H->arena;
+ H->arena = new_arena;
+
+ return &H->arena->buckets[H->arena->used++];
+}
+
+
+static unsigned int
+hash_string (const char *string)
+{
+ unsigned int hash = 0;
+
+ char c;
+ const char *pc = string;
+
+ while ((c = *pc))
+ {
+ hash *= 127; /* prime */
+ hash += c;
+ pc++;
+ }
+
+ hash %= NBUCKETS;
+
+ return hash;
+}
+
+void
+init_registered_ids_c_hashmap (CONVERTER *self)
+{
+ C_HASHMAP *H = malloc (sizeof (C_HASHMAP));
+ memset (H, 0, sizeof (C_HASHMAP));
+
+ H->arena = malloc (sizeof (BUCKET_ARENA));
+ memset (H->arena, 0, sizeof (BUCKET_ARENA));
+
+ self->registered_ids_c_hashmap = H;
+}
+
+int
+is_c_hashmap_registered_id (CONVERTER *self, const char *in_string)
+{
+ C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap;
+ unsigned int hash = hash_string(in_string);
+ BUCKET *B = H->bucket[hash];
+
+ while (B)
+ {
+ if (!strcmp(B->string, in_string))
+ return 1;
+ B = B->next;
+ }
+
+ return 0;
+}
+
+void
+c_hashmap_register_id (CONVERTER *self, const char *in_string)
+{
+ C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap;
+
+ BUCKET *new = new_bucket(H);
+ new->string = strdup (in_string);
+ unsigned int hash = hash_string(in_string);
+
+ /* Add to front of linked list. */
+ new->next = H->bucket[hash];
+ H->bucket[hash] = new;
+
+ H->count++;
+}
+
+void
+clear_registered_ids_c_hashmap (CONVERTER *self)
+{
+ C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap;
+ int i;
+
+ BUCKET_ARENA *arena, *next;
+ /* Free chain. */
+ next = H->arena;
+ while (next)
+ {
+ arena = next;
+ next = arena->next;
+
+ /* TODO: also need to loop through until arena->used and
+ free strings. */
+ for (i = 0; i < arena->used; i++)
+ {
+ free (arena->buckets[i].string);
+ }
+ free (arena);
+ }
+
+ memset (H, 0, sizeof (C_HASHMAP));
+}
+
+void
+free_registered_ids_c_hashmap (CONVERTER *self)
+{
+ C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap;
+ clear_registered_ids_c_hashmap (self);
+ free (H);
+}
+
diff --git a/tp/Texinfo/XS/convert/hashmap.h b/tp/Texinfo/XS/convert/hashmap.h
new file mode 100644
index 0000000000..3f7c67396a
--- /dev/null
+++ b/tp/Texinfo/XS/convert/hashmap.h
@@ -0,0 +1,27 @@
+/* hashmap.h - declarations for hashmap.c */
+#ifndef HASHMAP_H
+#define HASHMAP_H
+
+/* Copyright 2024 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+void init_registered_ids_c_hashmap (CONVERTER *self);
+int is_c_hashmap_registered_id (CONVERTER *self, const char *in_string);
+void c_hashmap_register_id (CONVERTER *self, const char *in_string);
+void clear_registered_ids_c_hashmap (CONVERTER *self);
+void free_registered_ids_c_hashmap (CONVERTER *self);
+
+
+#endif
diff --git a/tp/Texinfo/XS/convert/html_converter_finish.c
b/tp/Texinfo/XS/convert/html_converter_finish.c
index 9ecbf80e63..d5bdf7eb33 100644
--- a/tp/Texinfo/XS/convert/html_converter_finish.c
+++ b/tp/Texinfo/XS/convert/html_converter_finish.c
@@ -31,6 +31,7 @@
#include "api_to_perl.h"
#include "call_html_perl_function.h"
#include "call_html_cxx_function.h"
+#include "hashmap.h"
/* html_reset_translated_special_unit_info_tree
html_clear_direction_string_type */
#include "convert_html.h"
@@ -161,6 +162,8 @@ html_reset_converter (CONVERTER *self)
if (self->ids_data_type == IDT_perl_hashmap)
clear_registered_ids_hv (self);
+ else if (self->ids_data_type == IDT_hashmap)
+ clear_registered_ids_c_hashmap (self);
#ifdef HAVE_CXX_HASHMAP
else if (self->ids_data_type == IDT_cxx_hashmap)
clear_registered_ids_hashmap (self);
@@ -342,6 +345,8 @@ html_free_converter (CONVERTER *self)
if (self->ids_data_type == IDT_perl_hashmap)
free_registered_ids_hv (self);
+ else if (self->ids_data_type == IDT_hashmap)
+ free_registered_ids_c_hashmap (self);
#ifdef HAVE_CXX_HASHMAP
else if (self->ids_data_type == IDT_cxx_hashmap)
free_registered_ids_hashmap (self);
diff --git a/tp/Texinfo/XS/convert/html_prepare_converter.c
b/tp/Texinfo/XS/convert/html_prepare_converter.c
index 3d786e1c1f..5034489fc2 100644
--- a/tp/Texinfo/XS/convert/html_prepare_converter.c
+++ b/tp/Texinfo/XS/convert/html_prepare_converter.c
@@ -55,6 +55,7 @@
#include "converter.h"
#include "call_html_perl_function.h"
#include "call_html_cxx_function.h"
+#include "hashmap.h"
#include "format_html.h"
/* html_complete_no_arg_commands_formatting html_run_stage_handlers
html_add_to_files_source_info html_find_file_source_info
@@ -1721,6 +1722,8 @@ html_converter_customize (CONVERTER *self)
if (self->ids_data_type == IDT_perl_hashmap)
init_registered_ids_hv (self);
+ else if (self->ids_data_type == IDT_hashmap)
+ init_registered_ids_c_hashmap (self);
#ifdef HAVE_CXX_HASHMAP
else if (self->ids_data_type == IDT_cxx_hashmap)
init_registered_ids_hashmap (self);
@@ -3768,6 +3771,8 @@ html_id_is_registered (CONVERTER *self, const char
*string)
{
if (self->ids_data_type == IDT_perl_hashmap)
return is_hv_registered_id (self, string);
+ else if (self->ids_data_type == IDT_hashmap)
+ return is_c_hashmap_registered_id (self, string);
#ifdef HAVE_CXX_HASHMAP
else if (self->ids_data_type == IDT_cxx_hashmap)
return is_hashmap_registered_id (self, string);
@@ -3781,6 +3786,8 @@ html_register_id (CONVERTER *self, const char *string)
{
if (self->ids_data_type == IDT_perl_hashmap)
hv_register_id (self, string);
+ else if (self->ids_data_type == IDT_hashmap)
+ c_hashmap_register_id (self, string);
#ifdef HAVE_CXX_HASHMAP
else if (self->ids_data_type == IDT_cxx_hashmap)
hashmap_register_id (self, string);
diff --git a/tp/Texinfo/XS/main/converter_types.h
b/tp/Texinfo/XS/main/converter_types.h
index 16a14328ee..cca8cf20b9 100644
--- a/tp/Texinfo/XS/main/converter_types.h
+++ b/tp/Texinfo/XS/main/converter_types.h
@@ -42,12 +42,14 @@ enum ids_data_type {
IDT_perl_hashmap,
IDT_cxx_hashmap,
IDT_string_list,
+ IDT_hashmap,
};
/* converter low level customization */
#define CONVF_perl_hashmap 0x0001
#define CONVF_string_list 0x0002
#define CONVF_cxx_hashmap 0x0004
+#define CONVF_hashmap 0x0008
/* for string information passing to/from perl */
enum sv_string_type {
@@ -883,6 +885,7 @@ typedef struct CONVERTER {
STRING_LIST *registered_ids;
/* actually HV * but we do not want to drag in Perl headers */
void *registered_ids_hv;
+ void *registered_ids_c_hashmap;
#ifdef HAVE_CXX_HASHMAP
/* a pointer on C++ data */
void *registered_ids_hashmap;
diff --git a/tp/Texinfo/XS/teximakehtml.c b/tp/Texinfo/XS/teximakehtml.c
index 16029d4bd8..ecb353215e 100644
--- a/tp/Texinfo/XS/teximakehtml.c
+++ b/tp/Texinfo/XS/teximakehtml.c
@@ -313,10 +313,11 @@ main (int argc, char *argv[])
&converter_texinfo_language_config_dirs,
&convert_options,
/* default, use C++ hashmap if available */
- 0);
- /* to test linear search
- CONVF_string_list);
- */
+ //0);
+ /* to test linear search */
+ //CONVF_string_list);
+ /* to test C implementation of hashmap */
+ CONVF_hashmap);
free_strings_list (&converter_texinfo_language_config_dirs);
free_strings_list (&texinfo_language_config_dirs);