RPM Package Manager, CVS Repository
  http://rpm5.org/cvs/
  ____________________________________________________________________________

  Server: rpm5.org                         Name:   Jeff Johnson
  Root:   /v/rpm/cvs                       Email:  [EMAIL PROTECTED]
  Module: rpm                              Date:   12-Jul-2008 00:00:25
  Branch: HEAD                             Handle: 2008071122002300

  Modified files:
    rpm/rpmio               tget.c

  Log:
    - jbj: save the html parser development.

  Summary:
    Revision    Changes     Path
    1.18        +229 -35    rpm/rpmio/tget.c
  ____________________________________________________________________________

  patch -p0 <<'@@ .'
  Index: rpm/rpmio/tget.c
  ============================================================================
  $ cvs diff -u -r1.17 -r1.18 tget.c
  --- rpm/rpmio/tget.c  24 Feb 2008 06:50:23 -0000      1.17
  +++ rpm/rpmio/tget.c  11 Jul 2008 22:00:23 -0000      1.18
  @@ -2,9 +2,30 @@
   
   #include <rpmio_internal.h>
   #include <poptIO.h>
  +#include <mire.h>
   
   #include "debug.h"
   
  +typedef struct rpmtget_s * rpmtget;
  +
  +struct rpmtget_s {
  +    const char * pattern;
  +    miRE mires;
  +    int nmires;
  +
  +    rpmop sop;
  +    rpmop gop;
  +
  +    const char * uri;
  +    struct stat sb;
  +    FD_t fd;
  +    char * buf;
  +    size_t nbuf;
  +    char * b;
  +    size_t nb;
  +    ARGV_t  av;
  +};
  +
   static char * rpmPermsString(mode_t st_mode)
   {
       char *perms = xstrdup("----------");
  @@ -52,8 +73,9 @@
       return perms;
   }
   
  -static void printStat(const char * path, struct stat * st)
  +static void printStat(rpmtget tget)
   {
  +    struct stat * st = &tget->sb;
       size_t nt = 100;
       char * t = alloca(nt);
       time_t when = st->st_mtime;
  @@ -67,54 +89,219 @@
        (unsigned) st->st_ino, (unsigned)st->st_blocks/2,
        perms, (unsigned)st->st_nlink,
        (unsigned)st->st_uid, (unsigned)st->st_gid,
  -     (unsigned)st->st_size, t, path);
  +     (unsigned)st->st_size, t, tget->uri);
       fprintf(stderr, "\n");
       perms = _free(perms);
   }
   
  -static int readFile(const char * path)
  +static int tgetFini(rpmtget tget)
  +{
  +    int rc = 0;
  +
  +    if (tget->sop) {
  +     rpmswPrint("stat:", tget->sop);
  +     tget->sop = _free(tget->sop);
  +    }
  +    if (tget->gop) {
  +     rpmswPrint(" get:", tget->gop);
  +     tget->gop = _free(tget->gop);
  +    }
  +
  +    tget->buf = _free(tget->buf);
  +    tget->nbuf = 0;
  +    if (tget->fd) (void) Fclose(tget->fd);
  +    tget->fd = NULL;
  +
  +argvPrint(tget->uri, tget->av, NULL);
  +    tget->av = argvFree(tget->av);
  +
  +    return rc;
  +}
  +
  +static int tgetInit(rpmtget tget, size_t nbuf)
   {
  -    rpmop sop = memset(alloca(sizeof(*sop)), 0, sizeof(*sop));
  -    rpmop gop = memset(alloca(sizeof(*gop)), 0, sizeof(*gop));
  -    FD_t fd;
  -    struct stat sb;
  -    size_t len = 0;
       int rc;
       int xx;
   
  -fprintf(stderr, "===== %s\n", path);
  -    xx = rpmswEnter(sop, 0);
  -    rc = Stat(path, &sb);
  -    xx = rpmswExit(sop, 1);
  +    if (_rpmsw_stats) {
  +     tget->sop = xcalloc(1, sizeof(*tget->sop));
  +     tget->gop = xcalloc(1, sizeof(*tget->gop));
  +    }
  +
  +fprintf(stderr, "===== %s\n", tget->uri);
  +    xx = rpmswEnter(tget->sop, 0);
  +    rc = Stat(tget->uri, &tget->sb);
  +    xx = rpmswExit(tget->sop, 1);
       if (rc < 0)
        goto exit;
   
  -    printStat(path, &sb);
  -    if (!S_ISREG(sb.st_mode))
  +    printStat(tget);
  +
  +    if (nbuf == 0 && tget->sb.st_size > 0)
  +     nbuf = tget->sb.st_size;
  +
  +    tget->fd = Fopen(tget->uri, "r.ufdio");
  +    if (tget->fd == NULL || Ferror(tget->fd)) {
  +     rc = -1;
        goto exit;
  +    }
  +    tget->nbuf = nbuf;
  +    tget->buf = xmalloc(tget->nbuf + 2);
  +    tget->buf[0] = '\0';
  +
  +    tget->b = NULL;
  +    tget->nb = 0;
  +
  +    rc = 0;
  +
  +exit:
  +    if (rc)
  +     (void) tgetFini(tget);
  +    return rc;
  +}
  +
  +static ssize_t tgetFill(rpmtget tget)
  +{
  +    char * b = tget->buf;
  +    size_t nb = tget->nbuf;
  +    ssize_t rc;
  +
  +    if (tget->b != NULL && tget->nb > 0 && tget->b > tget->buf) {
  +     memmove(tget->buf, tget->b, tget->nb);
  +     b += tget->nb;
  +     nb -= tget->nb;
  +    }
   
  -    xx = rpmswEnter(gop, 0);
  -    fd = Fopen(path, "r.ufdio");
  -    if (fd != NULL) {
  -     size_t nb = 8 * BUFSIZ;
  -     char * buf = alloca(nb);
  -     *buf = '\0';
  -     len = Fread(buf, 1, nb-2, fd);
  -     buf[BUFSIZ-1] = '\0';
  -        xx = Fclose(fd);
  -     if (rpmIsVerbose() && len >= 0) {
  -         buf[len] = '\n';
  -         buf[len+1] = '\0';
  -         fwrite(buf, 1, len+1, stderr);
  +    rc = Fread(b, 1, nb, tget->fd);
  +    if (Ferror(tget->fd))
  +     rc = -1;
  +    else if (rc > 0) {
  +     tget->nb += rc;
  +     if (rpmIsVerbose())
  +         fwrite(b, 1, rc, stderr);
  +    }
  +    tget->b = tget->buf;
  +
  +    return rc;
  +}
  +
  +static const char * hrefpat = "(?i)<a(?:\\s+[a-z][a-z0-9_]*(?:=(?:\"[^\"]*\"|\\S+))?)*?\\s+href=(?:\"([^\"]*)\"|(\\S+))";
  +
  +static int parseHTML(rpmtget tget)
  +{
  +    miRE mire = tget->mires;
  +    int noffsets = 3;
  +    int offsets[3];
  +    ssize_t nr = (tget->b != NULL ? (ssize_t)tget->nb : tgetFill(tget));
  +    int xx;
  +
  +    xx = mireSetEOptions(mire, offsets, noffsets);
  +
  +    while (tget->nb > 0) {
  +     char * gbn, * hbn;
  +     char * f, * fe;
  +     char * g, * ge;
  +     char * h, * he;
  +     char * t;
  +     mode_t mode;
  +     size_t nb;
  +
  +     offsets[0] = offsets[1] = -1;
  +     xx = mireRegexec(mire, tget->b, tget->nb);
  +     if (xx == 0 && offsets[0] != -1 && offsets[1] != -1) {
  +
  +         /* [f:fe) contains |<a href="..."| match. */
  +         f = tget->b + offsets[0];
  +         fe = tget->b + offsets[1];
  +
  +         /* [h:he) contains the href basename. */
  +         he = fe;
  +         if (he[-1] == '"') he--;
  +         if (he[-1] == '/') {
  +             mode = S_IFDIR | 0755;
  +             he--;
  +         } else
  +             mode = S_IFREG | 0644;
  +         h = he;
  +         while (h > f && (h[-1] != '"' && h[-1] != '/'))
  +             h--;
  +         nb = (size_t)(he - h);
  +         hbn = t = xmalloc(nb + 1 + 1);
  +         while (h < he)
  +             *t++ = *h++;
  +         if (S_ISDIR(mode))
  +             *t++ = '/';
  +         *t = '\0';
  +
  +         /* [g:ge) contains the URI basename. */
  +         g = fe;
  +         while (*g != '>')
  +             g++;
  +         ge = ++g;
  +         while (*ge != '<')
  +             ge++;
  +         nb = (size_t)(ge - g);
  +         gbn = t = xmalloc(nb + 1 + 1);
  +         while (g < ge)
  +             *t++ = *g++;
  +         if (S_ISDIR(mode))
  +             *t++ = '/';
  +         *t = '\0';
  +
  +         /* Filter out weirdos and "." and "..". */
  +         if (!strcmp(gbn, hbn) && strcmp(hbn, "./") && strcmp(hbn, "../")) {
  +             fprintf(stderr, "\t%s\n", gbn);
  +             xx = argvAdd(&tget->av, gbn);
  +         }
  +
  +         gbn = _free(gbn);
  +         hbn = _free(hbn);
  +
  +         offsets[1] += (ge - fe);
  +         tget->b += offsets[1];
  +         tget->nb -= offsets[1];
  +     } else {
  +         size_t nb = tget->nb;
  +         if (nr > 0) nb -= 1024;     /* XXX overlap a bit if filling. */
  +         tget->b += nb;
  +         tget->nb -= nb;
        }
  +
  +     if (nr > 0)
  +         nr = tgetFill(tget);
       }
  -    xx = rpmswExit(gop, len);
  +
  +    xx = mireSetEOptions(mire, NULL, 0);
  +
  +    return 0;
  +}
  +
  +static int readFile(rpmtget tget)
  +{
  +    int rc;
  +    int xx;
  +
  +    xx = tgetInit(tget, 8 * BUFSIZ);
  +
  +    xx = rpmswEnter(tget->gop, 0);
  +
  +    if (S_ISDIR(tget->sb.st_mode)) {
  +     rc = tgetFill(tget);
  +    } else
  +    if (S_ISREG(tget->sb.st_mode)) {
  +     rc = tgetFill(tget);
  +    } else
  +     rc = -1;
  +
  +    xx = rpmswExit(tget->gop, tget->nbuf);
  +
  +    if (rc < 0)
  +     goto exit;
  +
  +    rc = parseHTML(tget);
   
   exit:
  -    if (_rpmsw_stats) {
  -     rpmswPrint("stat:", sop);
  -     rpmswPrint(" get:", gop);
  -    }
  +    xx = tgetFini(tget);
       return rc;
   }
   
  @@ -132,10 +319,11 @@
   main(int argc, char *argv[])
   {
       poptContext optCon = rpmioInit(argc, argv, optionsTable);
  +    rpmtget tget = xcalloc(1, sizeof(*tget));
       ARGV_t av = NULL;
       int ac;
  -    const char * fn;
       int rc;
  +    int xx;
   
       if (__debug) {
   _av_debug = -1;
  @@ -152,9 +340,15 @@
        goto exit;
       }
   
  +    tget->pattern = hrefpat;
  +    xx = mireAppend(RPMMIRE_PCRE, 0, tget->pattern, NULL, &tget->mires, &tget->nmires);
  +
       rc = 0;
  -    while (rc == 0 && (fn = *av++) != NULL)
  -     rc = readFile(fn);
  +    while (rc == 0 && (tget->uri = *av++) != NULL)
  +     rc = readFile(tget);
  +
  +    tget->mires = mireFreeAll(tget->mires, tget->nmires);
  +    tget->nmires = 0;
   
   exit:
   
  @@ .
______________________________________________________________________
RPM Package Manager                                    http://rpm5.org
CVS Sources Repository                                rpm-cvs@rpm5.org

Reply via email to