This check-in likely needs some explanation because
it essentially commits rpm to "UTF-8" in --queryformats.

I tried several ways (and spent several days) trying to devise some
way to deal with the encoding issues that I see, particularly in
ancient %changelog's.

The rude-and-crude brute-force solution (and all I could
figure out) is to attempt an idempotent
        "UTF-8" -> "UTF-8//IGNORE"
transform to drop any improperly (i.e. non-UTF-8) encoded characters
on systems that use glibc's iconv_open(3).

The other brain-fart (knew it was there, quite painful finding) is that
escaping rules for JSON are almost but not quite the same as for SQL.

Basically every changelog that included "don't" failed to import, because
the single quote was being emitted as an escaped \' and JSON has no such escape.

With those two changes encoding is approaching 95+% on the tricky %changelogs,
and filelists/primary repo-md like analogues are approaching 99.9% reliable.

But that should be good enough to deploy populated MongoDB's at http://mongohq.com
using Mandriva cooker 2011 beta2 as a *ahem* guinea pig, and then I can set
about writing up client tools to "blueprint" the data contents and figure out a
more useful schema than the repo-md analogue I'm currently using simply
because it exists.

I have no intention of sticking with anything resembling repo-md going "forward":
the poorly designed markup was designed for mirroring, not a NoDIST! distributed
database, and there are too many issues (including that any change in repo-md takes
years to achieve, and any patches from @rpm5.org are publicly/loudly not acceptable).

All the gory details (except user/password) that I'm using to access the 5 MongoDB's
I now have deployed can be found in scripts/mgo on the rpm-5_4 branch.

Anyone who wishes (there were 2 people, that's 2 more than I expected ;-) to
access these stores sooner rather than later (this is very much active work in progress)
should mail me privately, and I'll either send you the existing MongoDB passwords,
or (just as likely) just wire you up in the "rpm5.org" account at mongohq.com.

mongohq.com has a pretty nice admin interface; I will also be looking at other MongoDB hosting,
but MongoHQ has more than enough functionality to get "production" quality fire-and-forget
imports using

        rpm --query --qf *.rpm | mongo

which is sufficient to start client-side tool development of RPM+MONGODB.

hth

73 de Jeff
On Apr 11, 2011, at 7:05 PM, Jeff Johnson wrote:

>  RPM Package Manager, CVS Repository
>  http://rpm5.org/cvs/
>  ____________________________________________________________________________
> 
>  Server: rpm5.org                         Name:   Jeff Johnson
>  Root:   /v/rpm/cvs                       Email:  j...@rpm5.org
>  Module: rpm                              Date:   12-Apr-2011 01:05:26
>  Branch: HEAD                             Handle: 2011041123052103
> 
>  Modified files:
>    rpm                     CHANGES
>    rpm/rpmdb               hdrfmt.c
> 
>  Log:
>    - mongo: wire-up a per-spewage macro expansion, more todo++.
>    - mongo: fix: iconv_open("UTF-8", "UTF-8//IGNORE") zero tolerance
>       transform for encoding randomness, particularly in ancient %changelog's.
>    - mongo: fix: JSON has no escaped single quote or vertical tab.
> 
>  Summary:
>    Revision    Changes     Path
>    1.3614      +4  -0      rpm/CHANGES
>    1.154       +62 -43     rpm/rpmdb/hdrfmt.c
>  ____________________________________________________________________________
> 
>  patch -p0 <<'@@ .'
>  Index: rpm/CHANGES
>  ============================================================================
>  $ cvs diff -u -r1.3613 -r1.3614 CHANGES
>  --- rpm/CHANGES      11 Apr 2011 14:41:35 -0000      1.3613
>  +++ rpm/CHANGES      11 Apr 2011 23:05:21 -0000      1.3614
>  @@ -1,4 +1,8 @@
>   5.4.0 -> 5.4.1:
>  +    - jbj: mongo: wire-up a per-spewage macro expansion, more todo++.
>  +    - jbj: mongo: fix: iconv_open("UTF-8", "UTF-8//IGNORE") zero tolerance
>  +    transform for encoding randomness, particularly in ancient %changelog's.
>  +    - jbj: mongo: fix: JSON has no escaped single quote or vertical tab.
>       - devzero2000: add more gpg keys to thkp.c test program
>       - jbj: move multiarch* to mandriva.in.
>       - jbj: put "devel(libfoo)" deps under RPM_VENDOR_MANDRIVA.
>  @@ .
>  patch -p0 <<'@@ .'
>  Index: rpm/rpmdb/hdrfmt.c
>  ============================================================================
>  $ cvs diff -u -r1.153 -r1.154 hdrfmt.c
>  --- rpm/rpmdb/hdrfmt.c       7 Apr 2011 22:02:10 -0000       1.153
>  +++ rpm/rpmdb/hdrfmt.c       11 Apr 2011 23:05:24 -0000      1.154
>  @@ -482,12 +482,12 @@
>       case '\b':
>       case '\t':
>       case '\n':
>  -    case '\v':
>       case '\f':
>       case '\r':
>  -    case '\"':
>  -    case '\'':      len += 1;                       /*@fallthrough@*/
>  +    case '"':
>  +    case '\\':      len += 1;                       /*@fallthrough@*/
>       default:        len += 1;                       /*@switchbreak@*/ break;
>  +    /* XXX todo: emit \u1234 here somehow */
>       }
>       }
>       return len;
>  @@ -512,12 +512,12 @@
>       case '\b':      *te++ = '\\'; *te++ = 'b';      /*@switchbreak@*/ break;
>       case '\t':      *te++ = '\\'; *te++ = 't';      /*@switchbreak@*/ break;
>       case '\n':      *te++ = '\\'; *te++ = 'n';      /*@switchbreak@*/ break;
>  -    case '\v':      *te++ = '\\'; *te++ = 'v';      /*@switchbreak@*/ break;
>       case '\f':      *te++ = '\\'; *te++ = 'f';      /*@switchbreak@*/ break;
>       case '\r':      *te++ = '\\'; *te++ = 'r';      /*@switchbreak@*/ break;
>  -    case '\"':      *te++ = '\\'; *te++ = '"';      /*@switchbreak@*/ break;
>  -    case '\'':      *te++ = '\\'; *te++ = '\'';     /*@switchbreak@*/ break;
>  +    case '"':       *te++ = '\\'; *te++ = '"';      /*@switchbreak@*/ break;
>  +    case '\\':      *te++ = '\\'; *te++ = '\\';     /*@switchbreak@*/ break;
>       default:        *te++ = (char) c;               /*@switchbreak@*/ break;
>  +    /* XXX todo: emit \u1234 here somehow */
>       }
>       }
>       *te = '\0';
>  @@ -858,21 +858,36 @@
> 
>   /*====================================================================*/
> 
>  +#if defined(__GLIBC__)      /* XXX todo: find where iconv(3) was 
> implemented. */
>  +/* XXX using "//TRANSLIT" instead assumes known fromcode? */
>  +/*@unchecked@*/
>  +static const char * _iconv_tocode = "UTF-8//IGNORE";
>  +/*@unchecked@*/
>  +static const char * _iconv_fromcode = "UTF-8";
>  +#else
>  +/*@unchecked@*/
>  +static const char * _iconv_tocode = "UTF-8";
>  +/*@unchecked@*/
>  +static const char * _iconv_fromcode = NULL;
>  +#endif
>  +
>   static /*@only@*/ /*@null@*/ char *
>   strdup_locale_convert (/*@null@*/ const char * buffer,
>               /*@null@*/ const char * tocode)
>       /*@*/
>   {
>  -    char *dest_str;
>  +    char *dest_str = NULL;
>   #if defined(HAVE_ICONV)
>  -    char *fromcode = NULL;
>  +    char *fromcode = _iconv_fromcode;
>       iconv_t fd;
>  +    int is_error = 0;
>  +    int done = 0;
> 
>       if (buffer == NULL)
>  -    return NULL;
>  +    goto exit;
> 
>       if (tocode == NULL)
>  -    tocode = "UTF-8";
>  +    tocode = _iconv_tocode;
> 
>   #ifdef HAVE_LANGINFO_H
>       fromcode = nl_langinfo (CODESET);
>  @@ -884,8 +899,6 @@
>       const char *pin = buffer;
>       char *pout = NULL;
>       size_t ib, ob, dest_size;
>  -    int done;
>  -    int is_error;
>       size_t err;
>       const char *shift_pin = NULL;
>       int xx;
>  @@ -895,7 +908,6 @@
>       dest_str = pout = malloc((dest_size + 1) * sizeof(*dest_str));
>       if (dest_str)
>           *dest_str = '\0';
>  -    done = is_error = 0;
>       if (pout != NULL)
>       while (done == 0 && is_error == 0) {
>           err = iconv(fd, (char **)&pin, &ib, &pout, &ob);
>  @@ -944,6 +956,7 @@
>       dest_str = xstrdup((buffer ? buffer : ""));
>       }
> 
>  +exit:
>       return dest_str;
>   }
> 
>  @@ -1031,20 +1044,12 @@
>   assert(he->t == RPM_STRING_TYPE || he->t == RPM_UINT64_TYPE || he->t == 
> RPM_BIN_TYPE);
>       switch (he->t) {
>       case RPM_STRING_ARRAY_TYPE:     /* XXX currently never happens */
>  -    s = he->p.argv[ix];
>  -    xtag = "string";
>  -    /* XXX Force utf8 strings. */
>  -    s = xstrdup(s);
>  -    s = xstrtolocale(s);
>  -    freeit = 1;
>  -    break;
>       case RPM_I18NSTRING_TYPE:       /* XXX currently never happens */
>  +assert(0);
>       case RPM_STRING_TYPE:
>  -    s = he->p.str;
>       xtag = "string";
>       /* XXX Force utf8 strings. */
>  -    s = xstrdup(s);
>  -    s = xstrtolocale(s);
>  +    s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL));
>       freeit = 1;
>       break;
>       case RPM_BIN_TYPE:
>  @@ -1191,8 +1196,7 @@
>       }
> 
>       /* XXX Force utf8 strings. */
>  -    s = xstrdup(he->p.str);
>  -    s = xstrtolocale(s);
>  +    s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL));
>       freeit = 1;
>       break;
>       case RPM_BIN_TYPE:
>  @@ -1301,11 +1305,10 @@
>       switch (he->t) {
>       case RPM_STRING_ARRAY_TYPE:     /* XXX currently never happens */
>       case RPM_I18NSTRING_TYPE:       /* XXX currently never happens */
>  +assert(0);
>       case RPM_STRING_TYPE:
>  -    s = (he->t == RPM_STRING_ARRAY_TYPE ? he->p.argv[ix] : he->p.str);
>       /* XXX Force utf8 strings. */
>  -    s = xstrdup(he->p.str);
>  -    s = xstrtolocale(s);
>  +    s = strdup_locale_convert(he->p.str, (av ? av[0] : NULL));
>       freeit = 1;
>       break;
>       case RPM_BIN_TYPE:
>  @@ -1346,7 +1349,7 @@
>       s = t;
>       c = '\0';
>       } else
>  -    c = '\'';
>  +    c = '"';
> 
>       nb = spew->spew_strlen(s, lvl);
>       if (c != '\0')
>  @@ -3516,6 +3519,7 @@
>       /*@globals internalState @*/
>       /*@modifies he, internalState @*/
>   {
>  +    static char q = '"';
>       rpmTag tag = he->tag;
>       rpmTagData N = { .ptr = NULL };
>       rpmTagData EVR = { .ptr = NULL };
>  @@ -3601,9 +3605,8 @@
>   /*@=nullstate@*/
>       he->p.argv[ac++] = te;
>       te = stpcpy(te, instance);
>  -    te = stpcpy(te, ", '");
>  -    te = stpcpy(te, N.argv[i]);
>  -    te = stpcpy(te, "'");
>  +    *te++ = ',';    *te++ = ' ';
>  +    *te++ = q;      te = stpcpy(te, N.argv[i]);     *te++ = q;
>   /*@-readonlytrans@*/
>       if (EVR.argv != NULL && EVR.argv[i] != NULL && *EVR.argv[i] != '\0') {
>           static const char *Fstr[] = { 
> "?0","LT","GT","?3","EQ","LE","GE","?7" };
>  @@ -3617,16 +3620,28 @@
>           const char * D = Revr->F[RPMEVR_D];
>   #endif
>           xx = xx;
>  -        te = stpcpy( stpcpy( stpcpy(te, ", '"), Fstr[Fx]), "'");
>  -        te = stpcpy( stpcpy( stpcpy(te, ", '"), E), "'");
>  -        te = stpcpy( stpcpy( stpcpy(te, ", '"), V), "'");
>  -        te = stpcpy( stpcpy( stpcpy(te, ", '"), R), "'");
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;  te = stpcpy(te, Fstr[Fx]);      *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;  te = stpcpy(te, E);     *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;  te = stpcpy(te, V);     *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;  te = stpcpy(te, R);     *te++ = q;
>   #ifdef      NOTYET  /* XXX turning this on breaks rpmrepo */
>  -        te = stpcpy( stpcpy( stpcpy(te, ", '"), D), "'");
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;  te = stpcpy(te, D);     *te++ = q;
>   #endif
>           Revr = rpmEVRfree(Revr);
>       } else {
>  -        te = stpcpy(te, ", '', '', '', ''");
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;          *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;          *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;          *te++ = q;
>  +        *te++ = ',';        *te++ = ' ';
>  +        *te++ = q;          *te++ = q;
>       }
>   /*@=readonlytrans@*/
>   #ifdef      NOTNOW
>  @@ -6597,7 +6612,7 @@
>       sprintfToken nextfmt;
>       sprintfTag tag;
>       char * t, * te;
>  -    int need;
>  +    size_t need;
>   spew_t spew = NULL;
> 
>   /*@-modfilesys@*/
>  @@ -6649,10 +6664,12 @@
>       spew = &_json_spew;
> 
>       if (spew && spew->spew_init && spew->spew_init[0]) {
>  -    need = strlen(spew->spew_init);
>  +    char * spew_init = rpmExpand(spew->spew_init, NULL);
>  +    need = strlen(spew_init);
>       t = hsaReserve(hsa, need);
>  -    te = stpcpy(t, spew->spew_init);
>  +    te = stpcpy(t, spew_init);
>       hsa->vallen += (te - t);
>  +    spew_init = _free(spew_init);
>       }
> 
>       hsa = hsaInit(hsa);
>  @@ -6668,10 +6685,12 @@
>       hsa = hsaFini(hsa);
> 
>       if (spew && spew->spew_fini && spew->spew_fini[0]) {
>  -    need = strlen(spew->spew_fini);
>  +    char * spew_fini = rpmExpand(spew->spew_fini, NULL);
>  +    need = strlen(spew_fini);
>       t = hsaReserve(hsa, need);
>  -    te = stpcpy(t, spew->spew_fini);
>  +    te = stpcpy(t, spew_fini);
>       hsa->vallen += (te - t);
>  +    spew_fini = _free(spew_fini);
>       }
> 
>       if (hsa->val != NULL && hsa->vallen < hsa->alloced)
>  @@ .
> ______________________________________________________________________
> RPM Package Manager                                    http://rpm5.org
> CVS Sources Repository                                rpm-...@rpm5.org

Attachment: smime.p7s
Description: S/MIME cryptographic signature

Reply via email to