RPM Package Manager, CVS Repository http://rpm5.org/cvs/ ____________________________________________________________________________
Server: rpm5.org Name: Jeff Johnson Root: /v/rpm/cvs Email: j...@rpm5.org Module: rpm Date: 25-Mar-2012 18:37:25 Branch: rpm-5_4 Handle: 2012032516372400 Added files: (Branch: rpm-5_4) rpm/rpmio set.c set.h Modified files: (Branch: rpm-5_4) rpm CHANGES rpm/rpmio Makefile.am Log: - set: swipe the dependency set encoding from Alt. Summary: Revision Changes Path 1.3501.2.233+1 -0 rpm/CHANGES 1.293.2.11 +1 -0 rpm/rpmio/Makefile.am 1.1.2.1 +1408 -0 rpm/rpmio/set.c 1.1.2.1 +32 -0 rpm/rpmio/set.h ____________________________________________________________________________ patch -p0 <<'@@ .' Index: rpm/CHANGES ============================================================================ $ cvs diff -u -r1.3501.2.232 -r1.3501.2.233 CHANGES --- rpm/CHANGES 22 Mar 2012 00:04:04 -0000 1.3501.2.232 +++ rpm/CHANGES 25 Mar 2012 16:37:24 -0000 1.3501.2.233 @@ -1,4 +1,5 @@ 5.4.7 -> 5.4.8: + - jbj: set: swipe the dependency set encoding from Alt. - jbj: tests: dink with perl module load path to load from within tree. - jbj: tests: dink with python module load path to load from within tree. - jbj: tests: extend the unixodbc test case. @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/Makefile.am ============================================================================ $ cvs diff -u -r1.293.2.10 -r1.293.2.11 Makefile.am --- rpm/rpmio/Makefile.am 21 Mar 2012 19:08:20 -0000 1.293.2.10 +++ rpm/rpmio/Makefile.am 25 Mar 2012 16:37:24 -0000 1.293.2.11 @@ -11,6 +11,7 @@ fnmatch_loop.c getdate.y rpmcpio.c rpmcpio.h \ rpmgenbasedir.c rpmgenpkglist.c rpmgensrclist.c \ rpmjsio.msg rpmtar.c rpmtar.h \ + set.c set.h \ tdir.c tfts.c tget.c tgit.c tglob.c thkp.c thtml.c tinv.c tkey.c \ tmire.c todbc.c tput.c trpmio.c tsexp.c tsvn.c tsw.c lookup3.c tpw.c \ librpmio.vers testit.sh @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/set.c ============================================================================ $ cvs diff -u -r0 -r1.1.2.1 set.c --- /dev/null 2012-03-25 18:35:29.000000000 +0200 +++ set.c 2012-03-25 18:37:25.109854210 +0200 @@ -0,0 +1,1408 @@ +/* + * set.c - base62, golomb and set-string routines + * + * Copyright (C) 2010, 2011, 2012 Alexey Tourbin <a...@altlinux.org> + * + * License: GPLv2+ or LGPL, see RPM COPYING + */ + +#ifdef SELF_TEST +#undef NDEBUG +#include <stdio.h> +#endif + +#include <string.h> +#include <stdlib.h> +#include <assert.h> + +/* + * Base62 routines - encode bits with alnum characters. + * + * This is a base64-based base62 implementation. Values 0..61 are encoded + * with '0'..'9', 'a'..'z', and 'A'..'Z'. However, 'Z' is special: it will + * also encode 62 and 63. To achieve this, 'Z' will occupy two high bits in + * the next character. Thus 'Z' can be interpreted as an escape character + * (which indicates that the next character must be handled specially). + * Note that setting high bits to "00", "01" or "10" cannot contribute + * to another 'Z' (which would require high bits set to "11"). This is + * how multiple escapes are avoided. + */ + +// Estimate base62 buffer size required to encode a given number of bits. +static inline +int encode_base62_size(int bitc) +{ + // In the worst case, which is ZxZxZx..., five bits can make a character; + // the remaining bits can make a character, too. And the string must be + // null-terminated. + return bitc / 5 + 2; +} + +// Main base62 encoding routine: pack bitv into base62 string. +static +int encode_base62(int bitc, const char *bitv, char *base62) +{ + char *base62_start = base62; + void put_digit(int c) + { + assert(c >= 0 && c <= 61); + if (c < 10) + *base62++ = c + '0'; + else if (c < 36) + *base62++ = c - 10 + 'a'; + else if (c < 62) + *base62++ = c - 36 + 'A'; + } + int bits2 = 0; // number of high bits set + int bits6 = 0; // number of regular bits set + int num6b = 0; // pending 6-bit number + while (bitc-- > 0) { + num6b |= (*bitv++ << bits6++); + if (bits6 + bits2 < 6) + continue; + switch (num6b) { + case 61: + // escape + put_digit(61); + // extra "00...." high bits (in the next character) + bits2 = 2; + bits6 = 0; + num6b = 0; + break; + case 62: + put_digit(61); + // extra "01...." high bits + bits2 = 2; + bits6 = 0; + num6b = 16; + break; + case 63: + put_digit(61); + // extra "10...." high bits + bits2 = 2; + bits6 = 0; + num6b = 32; + break; + default: + assert(num6b < 61); + put_digit(num6b); + bits2 = 0; + bits6 = 0; + num6b = 0; + break; + } + } + if (bits6 + bits2) { + assert(num6b < 61); + put_digit(num6b); + } + *base62 = '\0'; + return base62 - base62_start; +} + +// Estimate how many bits will result from decoding a base62 string. +static inline +int decode_base62_size(int len) +{ + // Each character will fill at most 6 bits. + return len * 6; +} + +// This table maps alnum characters to their numeric values. +static +const int char_to_num[256] = { + [0 ... 255] = 0xee, + [0] = 0xff, +#define C1(c, b) [c] = c - b +#define C2(c, b) C1(c, b), C1(c + 1, b) +#define C5(c, b) C1(c, b), C2(c + 1, b), C2(c + 3, b) +#define C10(c, b) C5(c, b), C5(c + 5, b) + C10('0', '0'), +#define C26(c, b) C1(c, b), C5(c + 1, b), C10(c + 6, b), C10(c + 16, b) + C26('a', 'a' + 10), + C26('A', 'A' + 36), +}; + +// Main base62 decoding routine: unpack base62 string into bitv[]. +static +int decode_base62(const char *base62, char *bitv) +{ + char *bitv_start = bitv; + inline + void put6bits(int c) + { + *bitv++ = (c >> 0) & 1; + *bitv++ = (c >> 1) & 1; + *bitv++ = (c >> 2) & 1; + *bitv++ = (c >> 3) & 1; + *bitv++ = (c >> 4) & 1; + *bitv++ = (c >> 5) & 1; + } + inline + void put4bits(int c) + { + *bitv++ = (c >> 0) & 1; + *bitv++ = (c >> 1) & 1; + *bitv++ = (c >> 2) & 1; + *bitv++ = (c >> 3) & 1; + } + while (1) { + long c = (unsigned char) *base62++; + int num6b = char_to_num[c]; + while (num6b < 61) { + put6bits(num6b); + c = (unsigned char) *base62++; + num6b = char_to_num[c]; + } + if (num6b == 0xff) + break; + if (num6b == 0xee) + return -1; + assert(num6b == 61); + c = (unsigned char) *base62++; + int num4b = char_to_num[c]; + if (num4b == 0xff) + return -2; + if (num4b == 0xee) + return -3; + switch (num4b & (16 + 32)) { + case 0: + break; + case 16: + num6b = 62; + num4b &= ~16; + break; + case 32: + num6b = 63; + num4b &= ~32; + break; + default: + return -4; + } + put6bits(num6b); + put4bits(num4b); + } + return bitv - bitv_start; +} + +#ifdef SELF_TEST +static +void test_base62() +{ + const char rnd_bitv[] = { + 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, + 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, + // trigger some 'Z' + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }; + const int rnd_bitc = sizeof rnd_bitv; + // encode + char base62[encode_base62_size(rnd_bitc)]; + int len = encode_base62(rnd_bitc, rnd_bitv, base62); + assert(len > 0); + assert(len == (int)strlen(base62)); + fprintf(stderr, "len=%d base62=%s\n", len, base62); + // The length cannot be shorter than 6 bits per symbol. + assert(len >= rnd_bitc / 6); + // Neither too long: each second character must fill at least 4 bits. + assert(len <= rnd_bitc / 2 / 4 + rnd_bitc / 2 / 6 + 1); + // decode + char bitv[decode_base62_size(len)]; + int bitc = decode_base62(base62, bitv); + fprintf(stderr, "rnd_bitc=%d bitc=%d\n", rnd_bitc, bitc); + assert(bitc >= rnd_bitc); + // Decoded bits must match. + int i; + for (i = 0; i < rnd_bitc; i++) + assert(rnd_bitv[i] == bitv[i]); + // The remaining bits must be zero bits. + for (i = rnd_bitc; i < bitc; i++) + assert(bitv[i] == 0); + fprintf(stderr, "%s: base62 test OK\n", __FILE__); +} +#endif + +/* + * Golomb-Rice routines - compress integer values into bits. + * + * The idea is as follows. Input values are assumed to be small integers. + * Each value is split into two parts: an integer resulting from its higher + * bits and an integer resulting from its lower bits (with the number of lower + * bits specified by the Mshift parameter). The frist integer is then stored + * in unary coding (which is a variable-length sequence of '0' followed by a + * terminating '1'); the second part is stored in normal binary coding (using + * Mshift bits). + * + * The method is justified by the fact that, since most of the values are + * small, their first parts will be short (typically 1..3 bits). In particular, + * the method is known to be optimal for uniformly distributed hash values, + * after the values are sorted and delta-encoded. See e.g. + * Putze, F.; Sanders, P.; Singler, J. (2007), + * "Cache-, Hash- and Space-Efficient Bloom Filters", + * http://algo2.iti.uni-karlsruhe.de/singler/publications/cacheefficientbloomfilters-wea2007.pdf + */ + +// Calculate Mshift paramter for encoding. +static +int encode_golomb_Mshift(int c, int bpp) +{ + int log2i(int n) + { + int m = 0; + while (n >>= 1) + m++; + return m; + } + // XXX Slightly better Mshift estimations are probably possible. + // Recheck "Compression and coding algorithms" by Moffat & Turpin. + int Mshift = bpp - log2i(c) - 1; + // Adjust out-of-range values. + if (Mshift < 7) + Mshift = 7; + if (Mshift > 31) + Mshift = 31; + assert(Mshift < bpp); + return Mshift; +} + +// Estimate how many bits can be filled up. +static inline +int encode_golomb_size(int c, int Mshift) +{ + // XXX No precise estimation. However, we do not expect unary-encoded bits + // to take more than binary-encoded Mshift bits. + return Mshift * 2 * c + 16; +} + +// Main golomb encoding routine: package integers into bits. +static +int encode_golomb(int c, const unsigned *v, int Mshift, char *bitv) +{ + char *bitv_start = bitv; + const unsigned mask = (1 << Mshift) - 1; + while (c > 0) { + c--; + unsigned v0 = *v++; + int i; + // first part: variable-length sequence + unsigned q = v0 >> Mshift; + for (i = 0; i < (int)q; i++) + *bitv++ = 0; + *bitv++ = 1; + // second part: lower Mshift bits + unsigned r = v0 & mask; + for (i = 0; i < Mshift; i++) + *bitv++ = (r >> i) & 1; + } + return bitv - bitv_start; +} + +// Estimate how many values will emerge. +static inline +int decode_golomb_size(int bitc, int Mshift) +{ + // Each (Mshift + 1) bits can make a value. + // The remaining bits cannot make a value, though. + return bitc / (Mshift + 1); +} + +// Main golomb decoding routine: unpackage bits into values. +static +int decode_golomb(int bitc, const char *bitv, int Mshift, unsigned *v) +{ + unsigned *v_start = v; + // next value + while (bitc > 0) { + // first part + unsigned q = 0; + char bit = 0; + while (bitc > 0) { + bitc--; + bit = *bitv++; + if (bit == 0) + q++; + else + break; + } + // trailing zero bits in the input are okay + if (bitc == 0 && bit == 0) { + // up to 5 bits can be used to complete last character + if (q > 5) + return -10; + break; + } + // otherwise, incomplete value is not okay + if (bitc < Mshift) + return -11; + // second part + unsigned r = 0; + int i; + for (i = 0; i < Mshift; i++) { + bitc--; + if (*bitv++) + r |= (1 << i); + } + // the value + *v++ = (q << Mshift) | r; + } + return v - v_start; +} + +#ifdef SELF_TEST +static +void test_golomb() +{ + const unsigned rnd_v[] = { + // do re mi fa sol la si + 1, 2, 3, 4, 5, 6, 7, + // koshka sela na taksi + 7, 6, 5, 4, 3, 2, 1, + }; + const int rnd_c = sizeof rnd_v / sizeof *rnd_v; + int bpp = 10; + int Mshift = encode_golomb_Mshift(rnd_c, bpp); + fprintf(stderr, "rnd_c=%d bpp=%d Mshift=%d\n", rnd_c, bpp, Mshift); + assert(Mshift > 0); + assert(Mshift < bpp); + // encode + int alloc_bitc = encode_golomb_size(rnd_c, Mshift); + assert(alloc_bitc > rnd_c); + char bitv[alloc_bitc]; + int bitc = encode_golomb(rnd_c, rnd_v, Mshift, bitv); + fprintf(stderr, "alloc_bitc=%d bitc=%d\n", alloc_bitc, bitc); + assert(bitc > rnd_c); + assert(bitc <= alloc_bitc); + // decode + int alloc_c = decode_golomb_size(bitc, Mshift); + assert(alloc_c >= rnd_c); + unsigned v[alloc_c]; + int c = decode_golomb(bitc, bitv, Mshift, v); + fprintf(stderr, "rnd_c=%d alloc_c=%d c=%d\n", rnd_c, alloc_c, c); + assert(alloc_c >= c); + // Decoded values must match. + assert(rnd_c == c); + int i; + for (i = 0; i < c; i++) + assert(rnd_v[i] == v[i]); + // At the end of the day, did it save your money? + int golomb_bpp = bitc / c; + fprintf(stderr, "bpp=%d golomb_bpp=%d\n", bpp, golomb_bpp); + assert(golomb_bpp < bpp); + fprintf(stderr, "%s: golomb test OK\n", __FILE__); +} +#endif + +/* + * Combined base62+gololb decoding routine - implemented for efficiency. + * + * As Dmitry V. Levin once noticed, when it comes to speed, very few objections + * can be made against complicating the code. Which reminds me of Karl Marx, + * who said that there is not a crime at which a capitalist will scruple for + * the sake of 300 per cent profit, even at the chance of being hanged. Anyway, + * here Alexey Tourbin demonstrates that by using sophisticated - or should he + * say "ridiculously complicated" - techniques it is indeed possible to gain + * some profit, albeit of another kind. + */ + +// Word types (when two bytes from base62 string cast to unsigned short). +enum { + W_AA = 0x0000, + W_AZ = 0x1000, + W_ZA = 0x2000, + W_A0 = 0x3000, + W_0X = 0x4000, + W_EE = 0xeeee, +}; + +// Combine two characters into array index (with respect to endianness). +#include <sys/types.h> +#if BYTE_ORDER && BYTE_ORDER == LITTLE_ENDIAN +#define CCI(c1, c2) ((c1) | ((c2) << 8)) +#elif BYTE_ORDER && BYTE_ORDER == BIG_ENDIAN +#define CCI(c1, c2) ((c2) | ((c1) << 8)) +#else +#error "unknown byte order" +#endif + +// Maps base62 word into numeric value (decoded bits) ORed with word type. +static +const unsigned short word_to_num[65536] = { + [0 ... 65535] = W_EE, +#define AA1(c1, c2, b1, b2) [CCI(c1, c2)] = (c1 - b1) | ((c2 - b2) << 6) +#define AA1x2(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1(c1, c2 + 1, b1, b2) +#define AA1x3(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1x2(c1, c2 + 1, b1, b2) +#define AA1x5(c1, c2, b1, b2) AA1x2(c1, c2, b1, b2), AA1x3(c1, c2 + 2, b1, b2) +#define AA1x10(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x5(c1, c2 + 5, b1, b2) +#define AA1x20(c1, c2, b1, b2) AA1x10(c1, c2, b1, b2), AA1x10(c1, c2 + 10, b1, b2) +#define AA1x25(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x20(c1, c2 + 5, b1, b2) +#define AA2x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1(c1 + 1, c2, b1, b2) +#define AA3x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA2x1(c1 + 1, c2, b1, b2) +#define AA5x1(c1, c2, b1, b2) AA2x1(c1, c2, b1, b2), AA3x1(c1 + 2, c2, b1, b2) +#define AA10x1(c1, c2, b1, b2) AA5x1(c1, c2, b1, b2), AA5x1(c1 + 5, c2, b1, b2) +#define AA20x1(c1, c2, b1, b2) AA10x1(c1, c2, b1, b2), AA10x1(c1 + 10, c2, b1, b2) +#define AA25x1(c1, c2, b1, b2) AA5x1(c1, c2, b1, b2), AA20x1(c1 + 5, c2, b1, b2) +#define AA26x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA25x1(c1 + 1, c2, b1, b2) +#define AA2x5(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x5(c1 + 1, c2, b1, b2) +#define AA3x5(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA2x5(c1 + 1, c2, b1, b2) +#define AA5x5(c1, c2, b1, b2) AA2x5(c1, c2, b1, b2), AA3x5(c1 + 2, c2, b1, b2) +#define AA5x10(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA5x5(c1, c2 + 5, b1, b2) +#define AA10x5(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA5x5(c1 + 5, c2, b1, b2) +#define AA20x5(c1, c2, b1, b2) AA10x5(c1, c2, b1, b2), AA10x5(c1 + 10, c2, b1, b2) +#define AA25x5(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA20x5(c1 + 5, c2, b1, b2) +#define AA10x10(c1, c2, b1, b2) AA5x10(c1, c2, b1, b2), AA5x10(c1 + 5, c2, b1, b2) +#define AA10x20(c1, c2, b1, b2) AA10x10(c1, c2, b1, b2), AA10x10(c1, c2 + 10, b1, b2) +#define AA10x25(c1, c2, b1, b2) AA10x5(c1, c2, b1, b2), AA10x20(c1, c2 + 5, b1, b2) +#define AA10x26(c1, c2, b1, b2) AA10x1(c1, c2, b1, b2), AA10x25(c1, c2 + 1, b1, b2) +#define AA20x10(c1, c2, b1, b2) AA10x10(c1, c2, b1, b2), AA10x10(c1 + 10, c2, b1, b2) +#define AA25x10(c1, c2, b1, b2) AA5x10(c1, c2, b1, b2), AA20x10(c1 + 5, c2, b1, b2) +#define AA26x10(c1, c2, b1, b2) AA1x10(c1, c2, b1, b2), AA25x10(c1 + 1, c2, b1, b2) +#define AA25x20(c1, c2, b1, b2) AA25x10(c1, c2, b1, b2), AA25x10(c1, c2 + 10, b1, b2) +#define AA25x25(c1, c2, b1, b2) AA25x5(c1, c2, b1, b2), AA25x20(c1, c2 + 5, b1, b2) +#define AA25x26(c1, c2, b1, b2) AA25x1(c1, c2, b1, b2), AA25x25(c1, c2 + 1, b1, b2) +#define AA26x25(c1, c2, b1, b2) AA1x25(c1, c2, b1, b2), AA25x25(c1 + 1, c2, b1, b2) +#define AA26x26(c1, c2, b1, b2) AA26x1(c1, c2, b1, b2), AA26x25(c1, c2 + 1, b1, b2) + AA10x10('0', '0', '0', '0'), + AA10x26('0', 'a', '0', 'a' + 10), + AA10x25('0', 'A', '0', 'A' + 36), + AA26x10('a', '0', 'a' + 10, '0'), + AA25x10('A', '0', 'A' + 36, '0'), + AA26x26('a', 'a', 'a' + 10, 'a' + 10), + AA26x25('a', 'A', 'a' + 10, 'A' + 36), + AA25x26('A', 'a', 'A' + 36, 'a' + 10), + AA25x25('A', 'A', 'A' + 36, 'A' + 36), +#define AZ1(c, b) [CCI(c, 'Z')] = (c - b) | W_AZ +#define AZ2(c, b) AZ1(c, b), AZ1(c + 1, b) +#define AZ5(c, b) AZ1(c, b), AZ2(c + 1, b), AZ2(c + 3, b) +#define AZ10(c, b) AZ5(c, b), AZ5(c + 5, b) +#define AZ25(c, b) AZ5(c, b), AZ10(c + 5, b), AZ10(c + 15, b) +#define AZ26(c, b) AZ1(c, b), AZ25(c + 1, b) + AZ10('0', '0'), + AZ26('a', 'a' + 10), + AZ25('A', 'A' + 36), +#define ZA1(c, b) [CCI('Z', c)] = (61 + ((c - b) >> 4)) | (((c - b) & 0xf) << 6) | W_ZA +#define ZA2(c, b) ZA1(c, b), ZA1(c + 1, b) +#define ZA5(c, b) ZA1(c, b), ZA2(c + 1, b), ZA2(c + 3, b) +#define ZA10(c, b) ZA5(c, b), ZA5(c + 5, b) +#define ZA25(c, b) ZA5(c, b), ZA10(c + 5, b), ZA10(c + 15, b) +#define ZA26(c, b) ZA1(c, b), ZA25(c + 1, b) + ZA10('0', '0'), + ZA26('a', 'a' + 10), + ZA25('A', 'A' + 36), +#define A01(c, b) [CCI(c, 0)] = (c - b) | W_A0 +#define A02(c, b) A01(c, b), A01(c + 1, b) +#define A05(c, b) A01(c, b), A02(c + 1, b), A02(c + 3, b) +#define A010(c, b) A05(c, b), A05(c + 5, b) +#define A025(c, b) A05(c, b), A010(c + 5, b), A010(c + 15, b) +#define A026(c, b) A01(c, b), A025(c + 1, b) + A010('0', '0'), + A026('a', 'a' + 10), + A025('A', 'A' + 36), +#define OX(c) [CCI(0, c)] = W_0X +#define OX4(c) OX(c), OX(c + 1), OX(c + 2), OX(c + 3) +#define OX16(c) OX4(c), OX4(c + 4), OX4(c + 8), OX4(c + 12) +#define OX64(c) OX16(c), OX16(c + 16), OX16(c + 32), OX16(c + 48) +#define OX256(c) OX64(c), OX64(c + 64), OX64(c + 128), OX64(c + 192) + OX256('\0'), +}; + +// Combined base62+golomb decoding routine. +static +int decode_base62_golomb(const char *base62, int Mshift, unsigned *v) +{ + unsigned *v_start = v; + unsigned mask = (1 << Mshift) - 1; + unsigned q = 0; + unsigned r = 0; + int rfill = 0; + long c, w; + int n, vbits, left; + unsigned bits, morebits; + // need align + if (1 & (long) base62) { + c = (unsigned char) *base62++; + bits = char_to_num[c]; + if (bits < 61) + goto put6q_align; + else { + if (bits == 0xff) + goto eolq; + if (bits == 0xee) + return -1; + assert(bits == 61); + goto esc1q; + } + } + // regular mode, process two-byte words +#define Get24(X) \ + w = *(unsigned short *) base62; \ + base62 += 2; \ + bits = word_to_num[w]; \ + if (bits >= 0x1000) \ + goto gotNN ## X; \ + w = *(unsigned short *) base62; \ + base62 += 2; \ + morebits = word_to_num[w]; \ + if (morebits >= 0x1000) \ + goto put12 ## X; \ + bits |= (morebits << 12); \ + goto put24 ## X +#define Get12(X) \ + bits = morebits +#define GotNN(X) \ + switch (bits & 0xf000) { \ + case W_AZ: \ + bits &= 0x0fff; \ + goto put6 ## X ## _AZ; \ + case W_ZA: \ + bits &= 0x0fff; \ + goto put10 ## X ## _ZA; \ + case W_A0: \ + bits &= 0x0fff; \ + goto put6 ## X ## _A0; \ + case W_0X: \ + goto eol ## X; \ + default: \ + return -2; \ + } + // make coroutines + get24q: Get24(q); + get24r: Get24(r); + get12q: Get12(q); + gotNNq: GotNN(q); + get12r: Get12(r); + gotNNr: GotNN(r); + // escape mode, handle 2 bytes one by one +#define Esc1(X) \ + bits = 61; \ + c = (unsigned char) *base62++; \ + morebits = char_to_num[c]; \ + if (morebits == 0xff) \ + return -3; \ + if (morebits == 0xee) \ + return -4; \ + switch (morebits & (16 + 32)) { \ + case 0: \ + break; \ + case 16: \ + bits = 62; \ + morebits &= ~16; \ + break; \ + case 32: \ + bits = 63; \ + morebits &= ~32; \ + break; \ + default: \ + return -5; \ + } \ + bits |= (morebits << 6); \ + goto put10 ## X ## _esc1 +#define Esc2(X) \ + c = (unsigned char) *base62++; \ + bits = char_to_num[c]; \ + if (bits < 61) \ + goto put6 ## X ## _esc2; \ + else { \ + if (bits == 0xff) \ + goto eol ## X; \ + if (bits == 0xee) \ + return -6; \ + goto esc1 ## X; \ + } + // make coroutines + esc1q: Esc1(q); + esc2q: Esc2(q); + esc1r: Esc1(r); + esc2r: Esc2(r); + // golomb pieces +#define QInit(N) \ + n = N +#define RInit(N) \ + n = N; \ + r |= (bits << rfill); \ + rfill += n +#define RMake(Get) \ + left = rfill - Mshift; \ + if (left < 0) \ + goto Get ## r; \ + r &= mask; \ + *v++ = (q << Mshift) | r; \ + q = 0; \ + bits >>= n - left; \ + n = left +#define QMake(Get) \ + if (bits == 0) { \ + q += n; \ + goto Get ## q; \ + } \ + vbits = __builtin_ffs(bits); \ + n -= vbits; \ + bits >>= vbits; \ + q += vbits - 1; \ + r = bits; \ + rfill = n + // this assumes that minumum Mshift value is 7 +#define Put24Q(Get) \ + QInit(24); \ + QMake(Get); RMake(Get); \ + QMake(Get); RMake(Get); \ + QMake(Get); RMake(Get); \ + goto Get ## q +#define Put24R(Get) \ + RInit(24); \ + RMake(Get); \ + QMake(Get); RMake(Get); \ + QMake(Get); RMake(Get); \ + QMake(Get); goto Get ## r +#define Put12Q(Get) \ + QInit(12); \ + QMake(Get); RMake(Get); \ + QMake(Get); goto Get ## r +#define Put12R(Get) \ + RInit(12); \ + RMake(Get); \ + QMake(Get); RMake(Get); \ + QMake(Get); goto Get ## r +#define Put10Q(Get) \ + QInit(10); \ + QMake(Get); RMake(Get); \ + QMake(Get); goto Get ## r +#define Put10R(Get) \ + RInit(10); \ + RMake(Get); \ + QMake(Get); RMake(Get); \ + QMake(Get); goto Get ## r +#define Put6Q(Get) \ + QInit(6); \ + QMake(Get); goto Get ## r +#define Put6R(Get) \ + RInit(6); \ + RMake(Get); \ + QMake(Get); goto Get ## r + // make coroutines + put24q: Put24Q(get24); + put24r: Put24R(get24); + put12q: Put12Q(get12); + put12r: Put12R(get12); + put6q_align: + put6q_esc2: Put6Q(get24); + put6r_esc2: Put6R(get24); + put6q_AZ: Put6Q(esc1); + put6r_AZ: Put6R(esc1); + put10q_esc1: Put10Q(esc2); + put10r_esc1: Put10R(esc2); + put10q_ZA: Put10Q(get24); + put10r_ZA: Put10R(get24); + put6q_A0: Put6Q(eol); + put6r_A0: Put6R(eol); + // handle end of line and return + eolq: + if (q > 5) + return -10; + return v - v_start; + eolr: + return -11; +} + +#ifdef SELF_TEST +static +void test_word_table() +{ + int i, j; + for (i = 0; i < 256; i++) + for (j = 0; j < 256; j++) { + unsigned char u[2] __attribute__((aligned(2))) = { i, j }; + unsigned short ix = *(unsigned short *) u; + int w = word_to_num[ix]; + if (w < 0x1000) + assert(w == (char_to_num[i] | (char_to_num[j] << 6))); + else + assert(char_to_num[i] >= 61 || char_to_num[j] >= 61); + } + fprintf(stderr, "%s: word table test OK\n", __FILE__); +} + +static +void test_base62_golomb() +{ + const char str[] = "set:hdf7q2P5VZwtLGr9TKxhrEM1"; + const char *base62 = str + 4 + 2; + int Mshift = 10; + char bitv[256]; + int bitc = decode_base62(base62, bitv); + assert(bitc > 0); + unsigned v1[32], v2[32]; + int c1 = decode_golomb(bitc, bitv, Mshift, v1); + assert(c1 > 0); + int c2 = decode_base62_golomb(base62, Mshift, v2); + assert(c2 > 0); + assert(c1 == c2); + int i; + for (i = 0; i < c1; i++) + assert(v1[i] == v2[i]); + fprintf(stderr, "%s: base62_golomb test OK\n", __FILE__); +} +#endif + +/* + * Delta encoding routines - replace an increasing sequence of integer values + * by the sequence of their differences. + */ + +static +void encode_delta(int c, unsigned *v) +{ + assert(c > 0); + unsigned *v_end = v + c; + unsigned v0 = *v++; + while (v < v_end) { + *v -= v0; + v0 += *v++; + } +} + +static +void decode_delta(int c, unsigned *v) +{ + assert(c > 0); + unsigned *v_end = v + c; + unsigned v0 = *v++; + while (v < v_end) { + *v += v0; + v0 = *v++; + } +} + +#ifdef SELF_TEST +static +void test_delta() +{ + unsigned v[] = { + 1, 3, 7, 0 + }; + int c = 3; + encode_delta(c, v); + assert(v[0] == 1); + assert(v[1] == 2); + assert(v[2] == 4); + assert(v[3] == 0); + decode_delta(c, v); + assert(v[0] == 1); + assert(v[1] == 3); + assert(v[2] == 7); + assert(v[3] == 0); + fprintf(stderr, "%s: delta test OK\n", __FILE__); +} +#endif + +/* + * Higher-level set-string routines - serialize integers into set-string. + * + * A set-string looks like this: "set:bMxyz..." + * + * The "set:" prefix marks set-versions in rpm (to distinguish them between + * regular rpm versions). It is assumed to be stripped here. + * + * The next two characters (denoted 'b' and 'M') encode two small integers + * in the range 7..32 using 'a'..'z'. The first character encodes bpp. + * Valid bpp range is 10..32. The second character encodes Mshift. Valid + * Mshift range is 7..31. Also, valid Mshift must be less than bpp. + * + * The rest ("xyz...") is a variable-length sequence of alnum characters. + * It encodes a (sorted) set of (non-negative) integer values, as follows: + * integers are delta-encoded, golomb-compressed and base62-serialized. + */ + +static +int encode_set_size(int c, int bpp) +{ + int Mshift = encode_golomb_Mshift(c, bpp); + int bitc = encode_golomb_size(c, Mshift); + // two leading characters are special + return 2 + encode_base62_size(bitc); +} + +static +int encode_set(int c, unsigned *v, int bpp, char *base62) +{ + // XXX v is non-const due to encode_delta + int Mshift = encode_golomb_Mshift(c, bpp); + int bitc = encode_golomb_size(c, Mshift); + char bitv[bitc]; + // bpp + if (bpp < 10 || bpp > 32) + return -1; + *base62++ = bpp - 7 + 'a'; + // golomb parameter + if (Mshift < 7 || Mshift > 31) + return -2; + *base62++ = Mshift - 7 + 'a'; + // delta + encode_delta(c, v); + // golomb + bitc = encode_golomb(c, v, Mshift, bitv); +#ifdef SELF_TEST + decode_delta(c, v); +#endif + if (bitc < 0) + return -3; + // base62 + int len = encode_base62(bitc, bitv, base62); + if (len < 0) + return -4; + return 2 + len; +} + +static +int decode_set_init(const char *str, int *pbpp, int *pMshift) +{ + // 7..32 values encoded with 'a'..'z' + int bpp = *str++ + 7 - 'a'; + if (bpp < 10 || bpp > 32) + return -1; + // golomb parameter + int Mshift = *str++ + 7 - 'a'; + if (Mshift < 7 || Mshift > 31) + return -2; + if (Mshift >= bpp) + return -3; + // no empty sets for now + if (*str == '\0') + return -4; + *pbpp = bpp; + *pMshift = Mshift; + return 0; +} + +static inline +int decode_set_size(int len, int Mshift) +{ + int bitc = decode_base62_size(len - 2); + return decode_golomb_size(bitc, Mshift); +} + +static +int decode_set(const char *str, int Mshift, unsigned *v) +{ + const char *base62 = str + 2; + // separate base62+golomb stages, for reference + if (0) { + // base62 + int len = strlen(base62); + char bitv[decode_base62_size(len)]; + int bitc = decode_base62(base62, bitv); + if (bitc < 0) + return bitc; + // golomb + int c = decode_golomb(bitc, bitv, Mshift, v); + if (c < 0) + return c; + // delta + decode_delta(c, v); + return c; + } + // combined base62+golomb stage + int c = decode_base62_golomb(base62, Mshift, v); + if (c < 0) + return c; + // delta + decode_delta(c, v); + return c; +} + +// Special decode_set version with LRU caching. +static +int cache_decode_set(const char *str, int Mshift, const unsigned **pv) +{ + struct cache_ent { + char *str; + int len; + int c; + unsigned v[]; + }; +#define CACHE_SIZE 256 +#define PIVOT_SIZE 243 + static int hc; + static unsigned hv[CACHE_SIZE]; + static struct cache_ent *ev[CACHE_SIZE]; + // look up in the cache + int i; + unsigned *hp; + struct cache_ent *ent; + unsigned hash = str[0] | (str[2] << 8) | (str[3] << 16); + for (hp = hv; hp < hv + hc; hp++) { + if (hash == *hp) { + i = hp - hv; + ent = ev[i]; + if (memcmp(str, ent->str, ent->len + 1) == 0) { + // hit, move to front + if (i) { + memmove(hv + 1, hv, i * sizeof(hv[0])); + memmove(ev + 1, ev, i * sizeof(ev[0])); + hv[0] = hash; + ev[0] = ent; + } + *pv = ent->v; + return ent->c; + } + } + } + // decode + int len = strlen(str); + int c = decode_set_size(len, Mshift); +#define SENTINELS 8 + ent = malloc(sizeof(*ent) + len + 1 + (c + SENTINELS) * sizeof(unsigned)); + assert(ent); + c = ent->c = decode_set(str, Mshift, ent->v); + if (c <= 0) { + free(ent); + return c; + } + for (i = 0; i < SENTINELS; i++) + ent->v[c + i] = ~0u; + ent->str = (char *)(ent->v + c + SENTINELS); + memcpy(ent->str, str, len + 1); + ent->len = len; + // insert + if (hc < CACHE_SIZE) + i = hc++; + else { + // free last entry + free(ev[CACHE_SIZE - 1]); + // position at midpoint + i = PIVOT_SIZE; + memmove(hv + i + 1, hv + i, (CACHE_SIZE - i - 1) * sizeof(hv[0])); + memmove(ev + i + 1, ev + i, (CACHE_SIZE - i - 1) * sizeof(ev[0])); + } + hv[i] = hash; + ev[i] = ent; + *pv = ent->v; + return c; +} + +// Reduce a set of (bpp + 1) values to a set of bpp values. +static +int downsample_set(int c, const unsigned *v, unsigned *w, int bpp) +{ + unsigned mask = (1 << bpp) - 1; + // find the first element with high bit set + int l = 0; + int u = c; + while (l < u) { + int i = (l + u) / 2; + if (v[i] <= mask) + l = i + 1; + else + u = i; + } + // initialize parts + const unsigned *w_start = w; + const unsigned *v1 = v + 0, *v1end = v + u; + const unsigned *v2 = v + u, *v2end = v + c; + // merge v1 and v2 into w + if (v1 < v1end && v2 < v2end) { + unsigned v1val = *v1; + unsigned v2val = *v2 & mask; + while (1) { + if (v1val < v2val) { + *w++ = v1val; + v1++; + if (v1 == v1end) + break; + v1val = *v1; + } + else if (v2val < v1val) { + *w++ = v2val; + v2++; + if (v2 == v2end) + break; + v2val = *v2 & mask; + } + else { + *w++ = v1val; + v1++; + v2++; + if (v1 == v1end) + break; + if (v2 == v2end) + break; + v1val = *v1; + v2val = *v2 & mask; + } + } + } + // append what's left + while (v1 < v1end) + *w++ = *v1++; + while (v2 < v2end) + *w++ = *v2++ & mask; + return w - w_start; +} + +#ifdef SELF_TEST +static +void test_set() +{ + unsigned rnd_v[] = { + 0x020a, 0x07e5, 0x3305, 0x35f5, + 0x4980, 0x4c4f, 0x74ef, 0x7739, + 0x82ae, 0x8415, 0xa3e7, 0xb07e, + 0xb584, 0xb89f, 0xbb40, 0xf39e, + }; + int rnd_c = sizeof rnd_v / sizeof *rnd_v; + // encode + int bpp = 16; + char base62[encode_set_size(rnd_c, bpp)]; + int len = encode_set(rnd_c, rnd_v, bpp, base62); + assert(len > 0); + fprintf(stderr, "len=%d set=%s\n", len, base62); + // decode + int Mshift = bpp; + int rc = decode_set_init(base62, &bpp, &Mshift); + assert(rc == 0); + assert(bpp == 16); + assert(Mshift < bpp); + int c = decode_set_size(len, Mshift); + assert(c >= rnd_c); + unsigned vbuf[c]; + const unsigned *v = vbuf; + c = decode_set(base62, Mshift, vbuf); + // Decoded values must match. + assert(c == rnd_c); + int i; + for (i = 0; i < c; i++) + assert(v[i] == rnd_v[i]); + // Cached version. + c = cache_decode_set(base62, Mshift, &v); + assert(c == rnd_c); + for (i = 0; i < c; i++) + assert(v[i] == rnd_v[i]); + fprintf(stderr, "%s: set test OK\n", __FILE__); +} +#endif + +/* + * API routines start here. + */ + +#include "set.h" + +// main API routine +int rpmsetcmp(const char *str1, const char *str2) +{ + if (strncmp(str1, "set:", 4) == 0) + str1 += 4; + if (strncmp(str2, "set:", 4) == 0) + str2 += 4; + // initialize decoding + int bpp1, Mshift1; + int bpp2, Mshift2; + if (decode_set_init(str1, &bpp1, &Mshift1) < 0) + return -3; + if (decode_set_init(str2, &bpp2, &Mshift2) < 0) + return -4; + // decode set1 (comes on behalf of provides) + const unsigned *v1 = NULL; + int c1 = cache_decode_set(str1, Mshift1, &v1); + if (c1 < 0) + return -3; + unsigned v1bufA[c1 + 1]; + unsigned v1bufB[c1 + 1]; + // decode set2 (on the stack) + int len2 = strlen(str2); + int c2 = decode_set_size(len2, Mshift2); + unsigned v2bufA[c2]; + unsigned v2bufB[c2]; + const unsigned *v2 = v2bufA; + c2 = decode_set(str2, Mshift2, v2bufA); + if (c2 < 0) + return -4; + // adjust for comparison + int i; + while (bpp1 > bpp2) { + unsigned *v1buf = v1bufA; + if (v1 == v1buf) + v1buf = v1bufB; + bpp1--; + c1 = downsample_set(c1, v1, v1buf, bpp1); + for (i = 0; i < SENTINELS; i++) + v1buf[c1 + i] = ~0u; + v1 = v1buf; + } + while (bpp2 > bpp1) { + unsigned *v2buf = v2bufA; + if (v2 == v2buf) + v2buf = v2bufB; + bpp2--; + c2 = downsample_set(c2, v2, v2buf, bpp2); + v2 = v2buf; + } + // compare + int ge = 1; + int le = 1; + const unsigned *v1end = v1 + c1; + const unsigned *v2end = v2 + c2; + for (i = 0; i < SENTINELS; i++) + assert(v1end[i] == ~0u); + unsigned v2val = *v2; + // loop pieces +#define IFLT4 \ + if (*v1 < v2val) { \ + le = 0; \ + v1 += 4; \ + while (*v1 < v2val) \ + v1 += 4; \ + v1 -= 2; \ + if (*v1 < v2val) \ + v1++; \ + else \ + v1--; \ + if (*v1 < v2val) \ + v1++; \ + if (v1 == v1end) \ + break; \ + } +#define IFLT8 \ + if (*v1 < v2val) { \ + le = 0; \ + v1 += 8; \ + while (*v1 < v2val) \ + v1 += 8; \ + v1 -= 4; \ + if (*v1 < v2val) \ + v1 += 2; \ + else \ + v1 -= 2; \ + if (*v1 < v2val) \ + v1++; \ + else \ + v1--; \ + if (*v1 < v2val) \ + v1++; \ + if (v1 == v1end) \ + break; \ + } +#define IFGE \ + if (*v1 == v2val) { \ + v1++; \ + v2++; \ + if (v1 == v1end) \ + break; \ + if (v2 == v2end) \ + break; \ + v2val = *v2; \ + } \ + else { \ + ge = 0; \ + v2++; \ + if (v2 == v2end) \ + break; \ + v2val = *v2; \ + } + // choose the right stepper + if (c1 >= 16 * c2) { + while (1) { + IFLT8; + IFGE; + } + } + else { + while (1) { + IFLT4; + IFGE; + } + } + // return + if (v1 < v1end) + le = 0; + if (v2 < v2end) + ge = 0; + if (le && ge) + return 0; + if (ge) + return 1; + if (le) + return -1; + return -2; +} + +/* + * Simple API for creating set-versions. + */ + +#include "system.h" +#include "rpmlib.h" + +// Internally, "struct set" is just a bag of strings and their hash values. +struct set { + int c; + struct sv { + const char *s; + unsigned v; + } *sv; +}; + +struct set *set_new() +{ + struct set *set = xmalloc(sizeof *set); + set->c = 0; + set->sv = NULL; + return set; +} + +void set_add(struct set *set, const char *sym) +{ + const int delta = 1024; + if ((set->c & (delta - 1)) == 0) + set->sv = xrealloc(set->sv, sizeof(*set->sv) * (set->c + delta)); + set->sv[set->c].s = xstrdup(sym); + set->sv[set->c].v = 0; + set->c++; +} + +struct set *set_free(struct set *set) +{ + if (set) { + int i; + for (i = 0; i < set->c; i++) + set->sv[i].s = _free(set->sv[i].s); + set->sv = _free(set->sv); + } + return NULL; +} + +// This routine does the whole job. +const char *set_fini(struct set *set, int bpp) +{ + if (set->c < 1) + return NULL; + if (bpp < 10) + return NULL; + if (bpp > 32) + return NULL; + unsigned mask = (bpp < 32) ? (1u << bpp) - 1 : ~0u; + // Jenkins' one-at-a-time hash + unsigned int hash(const char *str) + { + unsigned int hash = 0x9e3779b9; + const unsigned char *p = (const unsigned char *) str; + while (*p) { + hash += *p++; + hash += (hash << 10); + hash ^= (hash >> 6); + } + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + return hash; + } + // hash sv strings + int i; + for (i = 0; i < set->c; i++) + set->sv[i].v = hash(set->sv[i].s) & mask; + // sort by hash value + int cmp(const void *arg1, const void *arg2) + { + struct sv *sv1 = (struct sv *) arg1; + struct sv *sv2 = (struct sv *) arg2; + if (sv1->v > sv2->v) + return 1; + if (sv2->v > sv1->v) + return -1; + return 0; + } + qsort(set->sv, set->c, sizeof *set->sv, cmp); + // warn on hash collisions + for (i = 0; i < set->c - 1; i++) { + if (set->sv[i].v != set->sv[i+1].v) + continue; + if (strcmp(set->sv[i].s, set->sv[i+1].s) == 0) + continue; + fprintf(stderr, "warning: hash collision: %s %s\n", + set->sv[i].s, set->sv[i+1].s); + } + // encode + unsigned v[set->c]; + for (i = 0; i < set->c; i++) + v[i] = set->sv[i].v; + int uniqv(int c, unsigned *v) + { + int i, j; + for (i = 0, j = 0; i < c; i++) { + while (i + 1 < c && v[i] == v[i+1]) + i++; + v[j++] = v[i]; + } + return j; + } + int c = uniqv(set->c, v); + char base62[encode_set_size(c, bpp)]; + int len = encode_set(c, v, bpp, base62); + if (len < 0) + return NULL; + return xstrdup(base62); +} + +#ifdef SELF_TEST +static +void test_api() +{ + struct set *set1 = set_new(); + set_add(set1, "mama"); + set_add(set1, "myla"); + set_add(set1, "ramu"); + const char *str10 = set_fini(set1, 16); + fprintf(stderr, "set10=%s\n", str10); + + int cmp; + struct set *set2 = set_new(); + set_add(set2, "myla"); + set_add(set2, "mama"); + const char *str20 = set_fini(set2, 16); + fprintf(stderr, "set20=%s\n", str20); + cmp = rpmsetcmp(str10, str20); + assert(cmp == 1); + + set_add(set2, "ramu"); + const char *str21 = set_fini(set2, 16); + fprintf(stderr, "set21=%s\n", str21); + cmp = rpmsetcmp(str10, str21); + assert(cmp == 0); + + set_add(set2, "baba"); + const char *str22 = set_fini(set2, 16); + cmp = rpmsetcmp(str10, str22); + assert(cmp == -1); + + set_add(set1, "deda"); + const char *str11 = set_fini(set1, 16); + cmp = rpmsetcmp(str11, str22); + assert(cmp == -2); + + set1 = set_free(set1); + set2 = set_free(set2); + str10 = _free(str10); + str11 = _free(str11); + str20 = _free(str20); + str21 = _free(str21); + str22 = _free(str22); + + fprintf(stderr, "%s: api test OK\n", __FILE__); +} +#endif + +#ifdef SELF_TEST +int main() +{ + test_base62(); + test_golomb(); + test_word_table(); + test_base62_golomb(); + test_delta(); + test_set(); + test_api(); + return 0; +} +#endif +// ex: set ts=8 sts=4 sw=4 noet: @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/set.h ============================================================================ $ cvs diff -u -r0 -r1.1.2.1 set.h --- /dev/null 2012-03-25 18:35:29.000000000 +0200 +++ set.h 2012-03-25 18:37:25.249863211 +0200 @@ -0,0 +1,32 @@ +#ifndef SET_H +#define SET_H + +/* Compare two set-versions. + * Return value: + * 1: set1 > set2 + * 0: set1 == set2 + * -1: set1 < set2 (aka set1 \subset set2) + * -2: set1 != set2 + * -3: set1 decoder error + * -4: set2 decoder error + * For performance reasons, set1 should come on behalf of Provides. + */ +int rpmsetcmp(const char *set1, const char *set2); + +/* + * API for creating set versions. + */ + +// initialize new set +struct set *set_new(void); + +// add new symbol to set +void set_add(struct set *set, const char *sym); + +// make set-version +const char *set_fini(struct set *set, int bpp); + +// free set +struct set *set_free(struct set *set); + +#endif @@ . ______________________________________________________________________ RPM Package Manager http://rpm5.org CVS Sources Repository rpm-cvs@rpm5.org