This check-in likely needs some explanation, because it essentially commits rpm to "UTF-8" in --queryformats.
I tried several ways (and spent several days) trying to devise some way to translate the encoding issues that I see, particularly in ancient %changelog's. The rude-and-crude brute force solution (and all I could figure) is to attempt an idempotent "UTF-8" -> "UTF-8//IGNORE" transform to drop any improperly (i.e. non-UTF-8) encoded characters on systems that use glibc's iconv_open(3). The other brain-fart (knew it was there, quite painful finding) is that escaping rules for JSON are almost but not quite the same as for SQL. Basically every changelog that included "don't" failed to import. With those two changes encoding is approaching 95+% on the tricky %changelogs, and filelists/primary repo-md like analogues are approaching 99.9% reliable. But that should be good enough to deploy populated MongoDB's at http://mongohq using Mandriva cooker 2011 beta2 as a *ahem* guinea pig, and then I can set about writing up client tools to "blueprint" the data contents and figure out some more useful schema than the repo-md analogue I'm currently using simply because it exists. I have no intention of sticking with anything resembling repo-md going "forward": the poorly designed markup was designed for mirroring, not a NoDIST! distributed database, and there are too many issues (including that any change in repo-md takes years to achieve, and any patches from @rpm5.org are publicly/loudly not acceptable). All the gory details (except user/password) that I'm using to access the 5 MongoDB's I now have deployed can be found in scripts/mgo on the rpm-5_4 branch. 
Anyone who wishes (there were 2 people, that's 2 more than I expected ;-) to access these stores sooner rather than later (this is very much active work in progress) should mail me privately, and I'll either send you the existing MongoDB passwords, or (just as likely) just wire you up in the "rpm5.org" account at mongohq.com. mongohq.com has a pretty nice admin interface; I will also be looking at other MongoDB hosting, but MongoHQ is more than enough functionality to get "production" quality fire-and-forget imports using rpm --query --qf *.rpm | mongo, sufficient to start client-side tool development of RPM+MONGODB. hth 73 de Jeff On Apr 11, 2011, at 7:05 PM, Jeff Johnson wrote: > RPM Package Manager, CVS Repository > http://rpm5.org/cvs/ > ____________________________________________________________________________ > > Server: rpm5.org Name: Jeff Johnson > Root: /v/rpm/cvs Email: j...@rpm5.org > Module: rpm Date: 12-Apr-2011 01:05:26 > Branch: HEAD Handle: 2011041123052103 > > Modified files: > rpm CHANGES > rpm/rpmdb hdrfmt.c > > Log: > - mongo: wire-up a per-spewage macro expansion, more todo++. > - mongo: fix: iconv_open("UTF-8", "UTF-8//IGNORE") zero tolerance > transform for encoding randomness, particularly in ancient %changelog's. > - mongo: fix: JSON has no escaped single quote or vertical tab. > > Summary: > Revision Changes Path > 1.3614 +4 -0 rpm/CHANGES > 1.154 +62 -43 rpm/rpmdb/hdrfmt.c > ____________________________________________________________________________ > > patch -p0 <<'@@ .' > Index: rpm/CHANGES > ============================================================================ > $ cvs diff -u -r1.3613 -r1.3614 CHANGES > --- rpm/CHANGES 11 Apr 2011 14:41:35 -0000 1.3613 > +++ rpm/CHANGES 11 Apr 2011 23:05:21 -0000 1.3614 > @@ -1,4 +1,8 @@ > 5.4.0 -> 5.4.1: > + - jbj: mongo: wire-up a per-spewage macro expansion, more todo++. 
> + - jbj: mongo: fix: iconv_open("UTF-8", "UTF-8//IGNORE") zero tolerance > + transform for encoding randomness, particularly in ancient %changelog's. > + - jbj: mongo: fix: JSON has no escaped single quote or vertical tab. > - devzero2000: add more gpg keys to thkp.c test program > - jbj: move multiarch* to mandriva.in. > - jbj: put "devel(libfoo)" deps under RPM_VENDOR_MANDRIVA. > @@ . > patch -p0 <<'@@ .' > Index: rpm/rpmdb/hdrfmt.c > ============================================================================ > $ cvs diff -u -r1.153 -r1.154 hdrfmt.c > --- rpm/rpmdb/hdrfmt.c 7 Apr 2011 22:02:10 -0000 1.153 > +++ rpm/rpmdb/hdrfmt.c 11 Apr 2011 23:05:24 -0000 1.154 > @@ -482,12 +482,12 @@ > case '\b': > case '\t': > case '\n': > - case '\v': > case '\f': > case '\r': > - case '\"': > - case '\'': len += 1; /*@fallthrough@*/ > + case '"': > + case '\\': len += 1; /*@fallthrough@*/ > default: len += 1; /*@switchbreak@*/ break; > + /* XXX todo: emit \u1234 here somehow */ > } > } > return len; > @@ -512,12 +512,12 @@ > case '\b': *te++ = '\\'; *te++ = 'b'; /*@switchbreak@*/ break; > case '\t': *te++ = '\\'; *te++ = 't'; /*@switchbreak@*/ break; > case '\n': *te++ = '\\'; *te++ = 'n'; /*@switchbreak@*/ break; > - case '\v': *te++ = '\\'; *te++ = 'v'; /*@switchbreak@*/ break; > case '\f': *te++ = '\\'; *te++ = 'f'; /*@switchbreak@*/ break; > case '\r': *te++ = '\\'; *te++ = 'r'; /*@switchbreak@*/ break; > - case '\"': *te++ = '\\'; *te++ = '"'; /*@switchbreak@*/ break; > - case '\'': *te++ = '\\'; *te++ = '\''; /*@switchbreak@*/ break; > + case '"': *te++ = '\\'; *te++ = '"'; /*@switchbreak@*/ break; > + case '\\': *te++ = '\\'; *te++ = '\\'; /*@switchbreak@*/ break; > default: *te++ = (char) c; /*@switchbreak@*/ break; > + /* XXX todo: emit \u1234 here somehow */ > } > } > *te = '\0'; > @@ -858,21 +858,36 @@ > > /*====================================================================*/ > > +#if defined(__GLIBC__) /* XXX todo: find where iconv(3) was > implemented. 
*/ > +/* XXX using "//TRANSLIT" instead assumes known fromcode? */ > +/*@unchecked@*/ > +static const char * _iconv_tocode = "UTF-8//IGNORE"; > +/*@unchecked@*/ > +static const char * _iconv_fromcode = "UTF-8"; > +#else > +/*@unchecked@*/ > +static const char * _iconv_tocode = "UTF-8"; > +/*@unchecked@*/ > +static const char * _iconv_fromcode = NULL; > +#endif > + > static /*@only@*/ /*@null@*/ char * > strdup_locale_convert (/*@null@*/ const char * buffer, > /*@null@*/ const char * tocode) > /*@*/ > { > - char *dest_str; > + char *dest_str = NULL; > #if defined(HAVE_ICONV) > - char *fromcode = NULL; > + char *fromcode = _iconv_fromcode; > iconv_t fd; > + int is_error = 0; > + int done = 0; > > if (buffer == NULL) > - return NULL; > + goto exit; > > if (tocode == NULL) > - tocode = "UTF-8"; > + tocode = _iconv_tocode; > > #ifdef HAVE_LANGINFO_H > fromcode = nl_langinfo (CODESET); > @@ -884,8 +899,6 @@ > const char *pin = buffer; > char *pout = NULL; > size_t ib, ob, dest_size; > - int done; > - int is_error; > size_t err; > const char *shift_pin = NULL; > int xx; > @@ -895,7 +908,6 @@ > dest_str = pout = malloc((dest_size + 1) * sizeof(*dest_str)); > if (dest_str) > *dest_str = '\0'; > - done = is_error = 0; > if (pout != NULL) > while (done == 0 && is_error == 0) { > err = iconv(fd, (char **)&pin, &ib, &pout, &ob); > @@ -944,6 +956,7 @@ > dest_str = xstrdup((buffer ? buffer : "")); > } > > +exit: > return dest_str; > } > > @@ -1031,20 +1044,12 @@ > assert(he->t == RPM_STRING_TYPE || he->t == RPM_UINT64_TYPE || he->t == > RPM_BIN_TYPE); > switch (he->t) { > case RPM_STRING_ARRAY_TYPE: /* XXX currently never happens */ > - s = he->p.argv[ix]; > - xtag = "string"; > - /* XXX Force utf8 strings. */ > - s = xstrdup(s); > - s = xstrtolocale(s); > - freeit = 1; > - break; > case RPM_I18NSTRING_TYPE: /* XXX currently never happens */ > +assert(0); > case RPM_STRING_TYPE: > - s = he->p.str; > xtag = "string"; > /* XXX Force utf8 strings. 
*/ > - s = xstrdup(s); > - s = xstrtolocale(s); > + s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL)); > freeit = 1; > break; > case RPM_BIN_TYPE: > @@ -1191,8 +1196,7 @@ > } > > /* XXX Force utf8 strings. */ > - s = xstrdup(he->p.str); > - s = xstrtolocale(s); > + s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL)); > freeit = 1; > break; > case RPM_BIN_TYPE: > @@ -1301,11 +1305,10 @@ > switch (he->t) { > case RPM_STRING_ARRAY_TYPE: /* XXX currently never happens */ > case RPM_I18NSTRING_TYPE: /* XXX currently never happens */ > +assert(0); > case RPM_STRING_TYPE: > - s = (he->t == RPM_STRING_ARRAY_TYPE ? he->p.argv[ix] : he->p.str); > /* XXX Force utf8 strings. */ > - s = xstrdup(he->p.str); > - s = xstrtolocale(s); > + s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL)); > freeit = 1; > break; > case RPM_BIN_TYPE: > @@ -1346,7 +1349,7 @@ > s = t; > c = '\0'; > } else > - c = '\''; > + c = '"'; > > nb = spew->spew_strlen(s, lvl); > if (c != '\0') > @@ -3516,6 +3519,7 @@ > /*@globals internalState @*/ > /*@modifies he, internalState @*/ > { > + static char q = '"'; > rpmTag tag = he->tag; > rpmTagData N = { .ptr = NULL }; > rpmTagData EVR = { .ptr = NULL }; > @@ -3601,9 +3605,8 @@ > /*@=nullstate@*/ > he->p.argv[ac++] = te; > te = stpcpy(te, instance); > - te = stpcpy(te, ", '"); > - te = stpcpy(te, N.argv[i]); > - te = stpcpy(te, "'"); > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = stpcpy(te, N.argv[i]); *te++ = q; > /*@-readonlytrans@*/ > if (EVR.argv != NULL && EVR.argv[i] != NULL && *EVR.argv[i] != '\0') { > static const char *Fstr[] = { > "?0","LT","GT","?3","EQ","LE","GE","?7" }; > @@ -3617,16 +3620,28 @@ > const char * D = Revr->F[RPMEVR_D]; > #endif > xx = xx; > - te = stpcpy( stpcpy( stpcpy(te, ", '"), Fstr[Fx]), "'"); > - te = stpcpy( stpcpy( stpcpy(te, ", '"), E), "'"); > - te = stpcpy( stpcpy( stpcpy(te, ", '"), V), "'"); > - te = stpcpy( stpcpy( stpcpy(te, ", '"), R), "'"); > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = 
stpcpy(te, Fstr[Fx]); *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = stpcpy(te, E); *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = stpcpy(te, V); *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = stpcpy(te, R); *te++ = q; > #ifdef NOTYET /* XXX turning this on breaks rpmrepo */ > - te = stpcpy( stpcpy( stpcpy(te, ", '"), D), "'"); > + *te++ = ','; *te++ = ' '; > + *te++ = q; te = stpcpy(te, D); *te++ = q; > #endif > Revr = rpmEVRfree(Revr); > } else { > - te = stpcpy(te, ", '', '', '', ''"); > + *te++ = ','; *te++ = ' '; > + *te++ = q; *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; *te++ = q; > + *te++ = ','; *te++ = ' '; > + *te++ = q; *te++ = q; > } > /*@=readonlytrans@*/ > #ifdef NOTNOW > @@ -6597,7 +6612,7 @@ > sprintfToken nextfmt; > sprintfTag tag; > char * t, * te; > - int need; > + size_t need; > spew_t spew = NULL; > > /*@-modfilesys@*/ > @@ -6649,10 +6664,12 @@ > spew = &_json_spew; > > if (spew && spew->spew_init && spew->spew_init[0]) { > - need = strlen(spew->spew_init); > + char * spew_init = rpmExpand(spew->spew_init, NULL); > + need = strlen(spew_init); > t = hsaReserve(hsa, need); > - te = stpcpy(t, spew->spew_init); > + te = stpcpy(t, spew_init); > hsa->vallen += (te - t); > + spew_init = _free(spew_init); > } > > hsa = hsaInit(hsa); > @@ -6668,10 +6685,12 @@ > hsa = hsaFini(hsa); > > if (spew && spew->spew_fini && spew->spew_fini[0]) { > - need = strlen(spew->spew_fini); > + char * spew_fini = rpmExpand(spew->spew_fini, NULL); > + need = strlen(spew_fini); > t = hsaReserve(hsa, need); > - te = stpcpy(t, spew->spew_fini); > + te = stpcpy(t, spew_fini); > hsa->vallen += (te - t); > + spew_fini = _free(spew_fini); > } > > if (hsa->val != NULL && hsa->vallen < hsa->alloced) > @@ . > ______________________________________________________________________ > RPM Package Manager http://rpm5.org > CVS Sources Repository rpm-...@rpm5.org
smime.p7s
Description: S/MIME cryptographic signature