A 58 character encoding that:
- avoids visually ambiguous 0OIl characters
- uses only alphanumeric characters
Described at:
- https://tools.ietf.org/html/draft-msporny-base58-03
This implementation uses GMP (or gnulib's gmp fallback).
Performance is good in comparison to other implementations.
For example when using libgmp, encoding is 6 times faster,
and decoding 28 times faster than the implementation
using arbitrary precision ints in cypthon 3.13.
Memory use is proportional to the size of input.
Encoding benchmarks:
$ time yes | head -c65535 | src/basenc --base58 -w0 >file.enc
real 0m1.533s
$ ./configure --without-libgmp && make # gnulib gmp
$ time yes | head -c65535 | src/basenc --base58 -w0 >file.enc
real 0m3.587s
# dnf install python3-base58
$ time yes | head -c65535 | base58 >file.enc # cpython 3.13
real 0m9.700s
Decoding benchmarks:
$ time src/basenc --base58 -d <file.enc >/dev/null
real 0m0.299s
$ ./configure --without-libgmp && make # gnulib gmp
$ time src/basenc --base58 -d <file.enc >/dev/null
real 0m1.469s
$ time base58 -d <file.enc >/dev/null # cpython 3.13
real 0m8.302s
* src/basenc.c (base_decode_ctx_finalize, base_encode_ctx_init,
base_encode_ctx, base_encode_ctx_finalize): New functions to
provide more general processing functionality.
(base58_{de,en}code_ctx{_init,,_finalize}): New functions to
accumulate all input before calling ...
(base58_{de,en}code): ... the GMP based encoding/decoding routines.
(do_encode, do_decode): Call the ctx variants if enabled.
* doc/coreutils.texi (basenc invocation): Describe the new option,
and indicate the main use case being interactive user use.
* src/local.mk: Link basenc with GMP.
* tests/basenc/basenc.pl: Add test cases.
* NEWS: Mention the new feature.
---
NEWS | 5 +
doc/coreutils.texi | 9 +
src/basenc.c | 361 ++++++++++++++++++++++++++++++++++++++++-
src/local.mk | 1 +
tests/basenc/basenc.pl | 42 +++++
5 files changed, 413 insertions(+), 5 deletions(-)
diff --git a/NEWS b/NEWS
index aa04c1884..fa2919421 100644
--- a/NEWS
+++ b/NEWS
@@ -68,6 +68,11 @@ GNU coreutils NEWS -*-
outline -*-
pretended that standard input was not a tty.
[This bug was present in "the beginning".]
+** New Features
+
+ basenc supports the --base58 option to encode and decode
+ the visually unambiguous Base58 encoding.
+
** Improvements
cp, install and mv now avoid possible data corruption on
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 34ea70085..19de577e5 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -2370,6 +2370,15 @@ base64 form (using @samp{_} and @samp{-} instead of
@samp{+} and @samp{/}).
The format conforms to
@uref{https://datatracker.ietf.org/doc/html/rfc4648#section-5, RFC 4648#5}.
+@item --base58
+@opindex --base58
+Encode into (or decode from with @option{-d/--decode}) base58 form.
+The format conforms to
+@uref{https://datatracker.ietf.org/doc/html/draft-msporny-base58-03,
+Base58 draft}.
+This encoding is useful for transcription as the output avoids
+visually similar characters. It's best suited to smaller amounts of data.
+
@item --base32
@opindex --base32
Encode into (or decode from with @option{-d/--decode}) base32 form.
diff --git a/src/basenc.c b/src/basenc.c
index 3f550a71f..5556bc82d 100644
--- a/src/basenc.c
+++ b/src/basenc.c
@@ -21,6 +21,7 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+#include <gmp.h>
#include "system.h"
#include "assure.h"
@@ -61,6 +62,7 @@ enum
{
BASE64_OPTION = CHAR_MAX + 1,
BASE64URL_OPTION,
+ BASE58_OPTION,
BASE32_OPTION,
BASE32HEX_OPTION,
BASE16_OPTION,
@@ -78,6 +80,7 @@ static struct option const long_options[] =
#if BASE_TYPE == 42
{"base64", no_argument, 0, BASE64_OPTION},
{"base64url", no_argument, 0, BASE64URL_OPTION},
+ {"base58", no_argument, 0, BASE58_OPTION},
{"base32", no_argument, 0, BASE32_OPTION},
{"base32hex", no_argument, 0, BASE32HEX_OPTION},
{"base16", no_argument, 0, BASE16_OPTION},
@@ -119,6 +122,9 @@ Base%d encode or decode FILE, or standard input, to
standard output.\n\
"), stdout);
fputs (_("\
--base64url file- and url-safe base64 (RFC4648 section 5)\n\
+"), stdout);
+ fputs (_("\
+ --base58 visually unambiguous base58 encoding\n\
"), stdout);
fputs (_("\
--base32 same as 'base32' program (RFC4648 section 6)\n\
@@ -263,6 +269,13 @@ struct z85_decode_context
unsigned char octets[5];
};
+struct base58_context
+{
+ unsigned char *buf;
+ idx_t size;
+ idx_t capacity;
+};
+
struct base2_decode_context
{
unsigned char octet;
@@ -277,6 +290,7 @@ struct base_decode_context
struct base16_decode_context base16;
struct base2_decode_context base2;
struct z85_decode_context z85;
+ struct base58_context base58;
} ctx;
char *inbuf;
idx_t bufsize;
@@ -285,6 +299,23 @@ static void (*base_decode_ctx_init) (struct
base_decode_context *ctx);
static bool (*base_decode_ctx) (struct base_decode_context *ctx,
char const *restrict in, idx_t inlen,
char *restrict out, idx_t *outlen);
+static bool (*base_decode_ctx_finalize) (struct base_decode_context *ctx,
+ char *restrict *out, idx_t *outlen);
+
+struct base_encode_context
+{
+ union {
+ struct base58_context base58;
+ } ctx;
+};
+
+static void (*base_encode_ctx_init) (struct base_encode_context *ctx);
+static bool (*base_encode_ctx) (struct base_encode_context *ctx,
+ char const *restrict in, idx_t inlen,
+ char *restrict out, idx_t *outlen);
+static bool (*base_encode_ctx_finalize) (struct base_encode_context *ctx,
+ char *restrict *out, idx_t *outlen);
+
#endif
@@ -1036,6 +1067,269 @@ base2msbf_decode_ctx (struct base_decode_context *ctx,
return true;
}
+static char const base58_alphabet[58] _GL_ATTRIBUTE_NONSTRING =
+ "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
+
+static signed char const base58_to_int[256] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1,
+ -1, 9, 10, 11, 12, 13, 14, 15, 16, -1, 17, 18, 19, 20, 21, -1,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1,
+ -1, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+static bool
+isubase58 (unsigned char ch)
+{
+ return ch < sizeof base58_to_int && 0 <= base58_to_int[ch];
+}
+
+
+static int
+base58_length (int len)
+{
+ /* Base58 output length is approximately log(256)/log(58). */
+ return (len * 138) / 100 + 1;
+}
+
+
+static void
+base58_encode_ctx_init (struct base_encode_context *ctx)
+{
+ ctx->ctx.base58.buf = nullptr;
+ ctx->ctx.base58.size = 0;
+ ctx->ctx.base58.capacity = 0;
+}
+
+
+static bool
+base58_encode_ctx (struct base_encode_context *ctx,
+ char const *restrict in, idx_t inlen,
+ MAYBE_UNUSED char *restrict out, idx_t *outlen)
+{
+ *outlen = 0; /* Only accumulate input in this function. */
+
+ if (inlen == 0)
+ return true;
+
+ idx_t free_space = ctx->ctx.base58.capacity - ctx->ctx.base58.size;
+ if (free_space < inlen)
+ {
+ ctx->ctx.base58.buf = xpalloc (ctx->ctx.base58.buf,
+ &ctx->ctx.base58.capacity,
+ inlen - free_space,
+ -1, sizeof *ctx->ctx.base58.buf);
+ }
+
+ memcpy (ctx->ctx.base58.buf + ctx->ctx.base58.size, in, inlen);
+ ctx->ctx.base58.size += inlen;
+
+ return true;
+}
+
+static void
+base58_encode (char const* data, size_t data_len,
+ char *out, idx_t *outlen)
+{
+ affirm (base_length (data_len) <= *outlen);
+
+ size_t leading_zeros = 0;
+ while (leading_zeros < data_len && data[leading_zeros] == 0)
+ leading_zeros++;
+
+ /* Init GMP integer from binary (base 256) data. */
+ mpz_t num;
+ mpz_init (num);
+ mpz_import (num, data_len, 1, 1, 0, 0, data);
+
+ char *ptr = out + *outlen; /* Start just beyond end. */
+
+ /* Convert to base 58 by repeatedly dividing by 58. */
+ mpz_t quotient, remainder;
+ mpz_init (quotient);
+ mpz_init (remainder);
+ while (mpz_cmp_ui (num, 0) > 0)
+ {
+ mpz_fdiv_qr_ui (quotient, remainder, num, 58);
+ unsigned long rem_val = mpz_get_ui (remainder);
+ *(--ptr) = base58_alphabet[rem_val];
+ mpz_set (num, quotient);
+ }
+
+ /* Account for leading zeros. */
+ ptr -= leading_zeros;
+ memset (ptr, '1', leading_zeros);
+
+ /* Prepare return. */
+ *outlen -= (ptr - out);
+ memmove (out, ptr, *outlen);
+
+ mpz_clear (num);
+ mpz_clear (quotient);
+ mpz_clear (remainder);
+
+ return;
+}
+
+
+static bool
+base58_encode_ctx_finalize (struct base_encode_context *ctx,
+ char *restrict *out, idx_t *outlen)
+{
+ /* Ensure output buffer is large enough. */
+ idx_t max_outlen = base_length (ctx->ctx.base58.size);
+ if (max_outlen > *outlen)
+ {
+ *out = xrealloc (*out, max_outlen);
+ *outlen = max_outlen;
+ }
+
+ base58_encode ((char *)ctx->ctx.base58.buf, ctx->ctx.base58.size,
+ *out, outlen);
+
+ free (ctx->ctx.base58.buf);
+ ctx->ctx.base58.buf = nullptr;
+
+ return true;
+}
+
+
+static void
+base58_decode_ctx_init (struct base_decode_context *ctx)
+{
+ ctx->ctx.base58.size = 0;
+ ctx->ctx.base58.capacity = 0;
+ ctx->ctx.base58.buf = nullptr;
+ ctx->i = 0;
+}
+
+static bool
+base58_decode_ctx (struct base_decode_context *ctx,
+ char const *restrict in, idx_t inlen,
+ MAYBE_UNUSED char *restrict out, idx_t *outlen)
+{
+ bool ignore_lines = true; /* for now, always ignore them */
+
+ *outlen = 0; /* Only accumulate input in this function. */
+
+ if (inlen == 0)
+ return true;
+
+ idx_t free_space = ctx->ctx.base58.capacity - ctx->ctx.base58.size;
+ if (free_space < inlen)
+ {
+ ctx->ctx.base58.buf = xpalloc (ctx->ctx.base58.buf,
+ &ctx->ctx.base58.capacity,
+ inlen - free_space,
+ -1, sizeof *ctx->ctx.base58.buf);
+ }
+
+ /* Accumulate all valid input characters in our buffer */
+ for (idx_t i = 0; i < inlen; i++)
+ {
+ unsigned char c = in[i];
+
+ if (ignore_lines && c == '\n')
+ continue;
+
+ if (c == '\0')
+ continue;
+
+ if (!isubase58 (c))
+ return false;
+
+ ctx->ctx.base58.buf[ctx->ctx.base58.size++] = c;
+ }
+
+ return true;
+}
+
+
+static bool
+base58_decode (char const *data, size_t data_len,
+ char *restrict out, idx_t *outlen)
+{
+ affirm (data_len <= *outlen);
+
+ /* Count leading '1's in the accumulated buffer */
+ size_t leading_ones = 0;
+ while (leading_ones < data_len && data[leading_ones] == '1')
+ leading_ones++;
+
+ /* Initialize GMP integer */
+ mpz_t num;
+ mpz_init (num);
+
+ /* Convert base58 string to GMP integer. */
+ unsigned char const *b58_digits = (unsigned char *)data + leading_ones;
+ size_t b58_len = data_len - leading_ones;
+
+ for (size_t i = 0; i < b58_len; i++)
+ {
+ int val = base58_to_int[b58_digits[i]];
+ if (val == -1)
+ {
+ mpz_clear (num);
+ return false;
+ }
+
+ mpz_mul_ui (num, num, 58);
+ mpz_add_ui (num, num, val);
+ }
+
+ /* Add leading zeros for leading '1's */
+ memset (out, 0, leading_ones);
+
+ /* Export GMP integer to binary (big-endian) */
+ size_t exported_size = 0;
+ if (mpz_cmp_ui (num, 0) > 0)
+ {
+ size_t binary_size = (mpz_sizeinbase (num, 2) + 7) / 8;
+ assert (*outlen - leading_ones >= binary_size);
+ mpz_export (out + leading_ones, &exported_size, 1, 1, 1, 0, num);
+ }
+
+ *outlen = leading_ones + exported_size;
+
+ mpz_clear (num);
+
+ return true;
+}
+
+
+static bool
+base58_decode_ctx_finalize (struct base_decode_context *ctx,
+ char *restrict *out, idx_t *outlen)
+{
+ /* Ensure output buffer is large enough.
+ Worst case is input is all '1's. */
+ idx_t max_outlen = ctx->ctx.base58.size;
+ if (max_outlen > *outlen)
+ {
+ *out = xrealloc (*out, max_outlen);
+ *outlen = max_outlen;
+ }
+
+ bool ret = base58_decode ((char *)ctx->ctx.base58.buf, ctx->ctx.base58.size,
+ *out, outlen);
+
+ free (ctx->ctx.base58.buf);
+ ctx->ctx.base58.buf = nullptr;
+
+ return ret;
+}
+
#endif /* BASE_TYPE == 42, i.e., "basenc"*/
@@ -1095,6 +1389,14 @@ do_encode (FILE *in, char const *infile, FILE *out,
idx_t wrap_column)
inbuf = xmalloc (ENC_BLOCKSIZE);
outbuf = xmalloc (BASE_LENGTH (ENC_BLOCKSIZE));
+#if BASE_TYPE == 42
+ /* Initialize encoding context if needed (for base58) */
+ struct base_encode_context encode_ctx;
+ bool use_ctx = (base_encode_ctx_init != nullptr);
+ if (use_ctx)
+ base_encode_ctx_init (&encode_ctx);
+#endif
+
do
{
idx_t n;
@@ -1109,16 +1411,38 @@ do_encode (FILE *in, char const *infile, FILE *out,
idx_t wrap_column)
if (sum > 0)
{
- /* Process input one block at a time. Note that ENC_BLOCKSIZE
- is sized so that no pad chars will appear in output. */
- base_encode (inbuf, sum, outbuf, BASE_LENGTH (sum));
+#if BASE_TYPE == 42
+ if (use_ctx)
+ {
+ idx_t outlen = 0;
+ base_encode_ctx (&encode_ctx, inbuf, sum, outbuf, &outlen);
+
+ wrap_write (outbuf, outlen, wrap_column, ¤t_column, out);
+ }
+ else
+#endif
+ {
+ /* Process input one block at a time. Note that ENC_BLOCKSIZE
+ is sized so that no pad chars will appear in output. */
+ base_encode (inbuf, sum, outbuf, BASE_LENGTH (sum));
- wrap_write (outbuf, BASE_LENGTH (sum), wrap_column,
- ¤t_column, out);
+ wrap_write (outbuf, BASE_LENGTH (sum), wrap_column,
+ ¤t_column, out);
+ }
}
}
while (!feof (in) && !ferror (in) && sum == ENC_BLOCKSIZE);
+#if BASE_TYPE == 42
+ if (use_ctx && base_encode_ctx_finalize)
+ {
+ idx_t outlen = BASE_LENGTH (ENC_BLOCKSIZE);
+ base_encode_ctx_finalize (&encode_ctx, &outbuf, &outlen);
+
+ wrap_write (outbuf, outlen, wrap_column, ¤t_column, out);
+ }
+#endif
+
/* When wrapping, terminate last line. */
if (wrap_column && current_column > 0 && fputc ('\n', out) == EOF)
write_error ();
@@ -1208,6 +1532,20 @@ do_decode (FILE *in, char const *infile, FILE *out, bool
ignore_garbage)
}
while (!feof (in));
+#if BASE_TYPE == 42
+ if (base_decode_ctx_finalize)
+ {
+ idx_t outlen = DEC_BLOCKSIZE;
+ bool ok = base_decode_ctx_finalize (&ctx, &outbuf, &outlen);
+
+ if (fwrite (outbuf, 1, outlen, out) < outlen)
+ write_error ();
+
+ if (!ok)
+ error (EXIT_FAILURE, 0, _("invalid input"));
+ }
+#endif
+
finish_and_exit (in, infile);
}
@@ -1268,6 +1606,7 @@ main (int argc, char **argv)
case BASE2MSBF_OPTION:
case BASE2LSBF_OPTION:
case Z85_OPTION:
+ case BASE58_OPTION:
base_type = opt;
break;
#endif
@@ -1356,6 +1695,18 @@ main (int argc, char **argv)
base_decode_ctx = z85_decode_ctx;
break;
+ case BASE58_OPTION:
+ base_length = base58_length;
+ required_padding = no_required_padding;
+ isubase = isubase58;
+ base_encode_ctx_init = base58_encode_ctx_init;
+ base_encode_ctx = base58_encode_ctx;
+ base_encode_ctx_finalize = base58_encode_ctx_finalize;
+ base_decode_ctx_init = base58_decode_ctx_init;
+ base_decode_ctx = base58_decode_ctx;
+ base_decode_ctx_finalize = base58_decode_ctx_finalize;
+ break;
+
default:
error (0, 0, _("missing encoding type"));
usage (EXIT_FAILURE);
diff --git a/src/local.mk b/src/local.mk
index ef1035bcb..c7c77a7c9 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -286,6 +286,7 @@ src_sort_LDADD += $(NANOSLEEP_LIB)
src_tail_LDADD += $(NANOSLEEP_LIB)
# for various GMP functions
+src_basenc_LDADD += $(LIBGMP)
src_expr_LDADD += $(LIBGMP)
src_factor_LDADD += $(LIBGMP)
diff --git a/tests/basenc/basenc.pl b/tests/basenc/basenc.pl
index c598d29e5..1d50a8121 100755
--- a/tests/basenc/basenc.pl
+++ b/tests/basenc/basenc.pl
@@ -72,6 +72,18 @@ my $base2msbf_ab = "0110000101100010";
my $base2msbf_ab_nl = $base2msbf_ab;
$base2msbf_ab_nl =~ s/(...)/$1\n/g; # Add newline every 3 characters
+# Base58 test vectors
+my $base58_in = "Hello World!";
+my $base58_out = "2NEpo7TZRRrLZSi2U";
+my $base58_in2 = "\x00\x00\x28\x7f\xb4\xcd";
+my $base58_out2 = "11233QC4";
+my $base58_in3 = "\x00";
+my $base58_out3 = "1";
+my $base58_in4 = "1\x00";
+my $base58_out4 = "4jH";
+my $base58_large_ones = "1" x 32768;
+my $base58_large_NULs = "\x00" x 32768;
+
my $try_help = "Try '$prog --help' for more information.\n";
my @Tests =
@@ -100,6 +112,7 @@ my @Tests =
['empty6', '--base2msbf', {IN=>''}, {OUT=>""}],
['empty7', '--base2lsbf', {IN=>''}, {OUT=>""}],
['empty8', '--z85', {IN=>''}, {OUT=>""}],
+ ['empty9', '--base58', {IN=>''}, {OUT=>""}],
@@ -266,6 +279,35 @@ my @Tests =
{ERR=>"$prog: invalid input\n"}],
['z85_47', '--z85 -d', {IN=>'#0000'}, {EXIT=>1},
{ERR=>"$prog: invalid input\n"}],
+
+
+
+
+ ['b58_1', '--base58', {IN=>$base58_in}, {OUT=>$base58_out}],
+ ['b58_2', '--base58 -d', {IN=>$base58_out}, {OUT=>$base58_in}],
+ ['b58_3', '--base58 -d -i', {IN=>'&'. $base58_out}, {OUT=>$base58_in}],
+ ['b58_4', '--base58', {IN=>$base58_in2}, {OUT=>$base58_out2}],
+ ['b58_5', '--base58 -d', {IN=>$base58_out2}, {OUT=>$base58_in2}],
+ ['b58_6', '--base58', {IN=>$base58_in3}, {OUT=>$base58_out3}],
+ ['b58_7', '--base58 -d', {IN=>$base58_out3}, {OUT=>$base58_in3}],
+ ['b58_8', '--base58 -d', {IN=>$base58_out."\n"}, {OUT=>$base58_in}],
+ ['b58_9', '--base58 -d -i', {IN=>$base58_out."\n"}, {OUT=>$base58_in}],
+ ['b58_10', '--base58', {IN=>$base58_in4}, {OUT=>$base58_out4}],
+ ['b58_11', '--base58 -d', {IN=>$base58_out4}, {OUT=>$base58_in4}],
+ ['b58_buf1', '--base58', {IN=>$base58_large_NULs},
+ {OUT=>$base58_large_ones}],
+ ['b58_buf2', '--base58 -d', {IN=>$base58_large_ones},
+ {OUT=>$base58_large_NULs}],
+
+ # Invalid base58 characters (0, O, I, l)
+ ['b58_inval_1', '--base58 -d', {IN=>'0'}, {EXIT=>1},
+ {ERR=>"$prog: invalid input\n"}],
+ ['b58_inval_2', '--base58 -d', {IN=>'O'}, {EXIT=>1},
+ {ERR=>"$prog: invalid input\n"}],
+ ['b58_inval_3', '--base58 -d', {IN=>'I'}, {EXIT=>1},
+ {ERR=>"$prog: invalid input\n"}],
+ ['b58_inval_4', '--base58 -d', {IN=>'l'}, {EXIT=>1},
+ {ERR=>"$prog: invalid input\n"}],
);
# Prepend the command line argument and append a newline to end
--
2.50.0