RPM Package Manager, CVS Repository http://rpm5.org/cvs/ ____________________________________________________________________________
Server: rpm5.org Name: Jeff Johnson Root: /v/rpm/cvs Email: [EMAIL PROTECTED] Module: rpm Date: 12-Jul-2008 00:00:25 Branch: HEAD Handle: 2008071122002300 Modified files: rpm/rpmio tget.c Log: - jbj: save the html parser development. Summary: Revision Changes Path 1.18 +229 -35 rpm/rpmio/tget.c ____________________________________________________________________________ patch -p0 <<'@@ .' Index: rpm/rpmio/tget.c ============================================================================ $ cvs diff -u -r1.17 -r1.18 tget.c --- rpm/rpmio/tget.c 24 Feb 2008 06:50:23 -0000 1.17 +++ rpm/rpmio/tget.c 11 Jul 2008 22:00:23 -0000 1.18 @@ -2,9 +2,30 @@ #include <rpmio_internal.h> #include <poptIO.h> +#include <mire.h> #include "debug.h" +typedef struct rpmtget_s * rpmtget; + +struct rpmtget_s { + const char * pattern; + miRE mires; + int nmires; + + rpmop sop; + rpmop gop; + + const char * uri; + struct stat sb; + FD_t fd; + char * buf; + size_t nbuf; + char * b; + size_t nb; + ARGV_t av; +}; + static char * rpmPermsString(mode_t st_mode) { char *perms = xstrdup("----------"); @@ -52,8 +73,9 @@ return perms; } -static void printStat(const char * path, struct stat * st) +static void printStat(rpmtget tget) { + struct stat * st = &tget->sb; size_t nt = 100; char * t = alloca(nt); time_t when = st->st_mtime; @@ -67,54 +89,219 @@ (unsigned) st->st_ino, (unsigned)st->st_blocks/2, perms, (unsigned)st->st_nlink, (unsigned)st->st_uid, (unsigned)st->st_gid, - (unsigned)st->st_size, t, path); + (unsigned)st->st_size, t, tget->uri); fprintf(stderr, "\n"); perms = _free(perms); } -static int readFile(const char * path) +static int tgetFini(rpmtget tget) +{ + int rc = 0; + + if (tget->sop) { + rpmswPrint("stat:", tget->sop); + tget->sop = _free(tget->sop); + } + if (tget->gop) { + rpmswPrint(" get:", tget->gop); + tget->gop = _free(tget->gop); + } + + tget->buf = _free(tget->buf); + tget->nbuf = 0; + if (tget->fd) (void) Fclose(tget->fd); + tget->fd = NULL; + +argvPrint(tget->uri, tget->av, NULL); + tget->av = argvFree(tget->av); + + return rc; +} + +static int tgetInit(rpmtget tget, size_t nbuf) { - rpmop sop = memset(alloca(sizeof(*sop)), 0, sizeof(*sop)); - rpmop gop = memset(alloca(sizeof(*gop)), 0, sizeof(*gop)); - FD_t fd; - struct stat sb; - size_t len = 0; int rc; int xx; -fprintf(stderr, "===== %s\n", path); - xx = rpmswEnter(sop, 0); - rc = Stat(path, &sb); - xx = rpmswExit(sop, 1); + if (_rpmsw_stats) { + tget->sop = xcalloc(1, sizeof(*tget->sop)); + tget->gop = xcalloc(1, sizeof(*tget->gop)); + } + +fprintf(stderr, "===== %s\n", tget->uri); + xx = rpmswEnter(tget->sop, 0); + rc = Stat(tget->uri, &tget->sb); + xx = rpmswExit(tget->sop, 1); if (rc < 0) goto exit; - printStat(path, &sb); - if (!S_ISREG(sb.st_mode)) + printStat(tget); + + if (nbuf == 0 && tget->sb.st_size > 0) + nbuf = tget->sb.st_size; + + tget->fd = Fopen(tget->uri, "r.ufdio"); + if (tget->fd == NULL || Ferror(tget->fd)) { + rc = -1; goto exit; + } + tget->nbuf = nbuf; + tget->buf = xmalloc(tget->nbuf + 2); + tget->buf[0] = '\0'; + + tget->b = NULL; + tget->nb = 0; + + rc = 0; + +exit: + if (rc) + (void) tgetFini(tget); + return rc; +} + +static ssize_t tgetFill(rpmtget tget) +{ + char * b = tget->buf; + size_t nb = tget->nbuf; + ssize_t rc; + + if (tget->b != NULL && tget->nb > 0 && tget->b > tget->buf) { + memmove(tget->buf, tget->b, tget->nb); + b += tget->nb; + nb -= tget->nb; + } - xx = rpmswEnter(gop, 0); - fd = Fopen(path, "r.ufdio"); - if (fd != NULL) { - size_t nb = 8 * BUFSIZ; - char * buf = alloca(nb); - *buf = '\0'; - len = Fread(buf, 1, nb-2, fd); - buf[BUFSIZ-1] = '\0'; - xx = Fclose(fd); - if (rpmIsVerbose() && len >= 0) { - buf[len] = '\n'; - buf[len+1] = '\0'; - fwrite(buf, 1, len+1, stderr); + rc = Fread(b, 1, nb, tget->fd); + if (Ferror(tget->fd)) + rc = -1; + else if (rc > 0) { + tget->nb += rc; + if (rpmIsVerbose()) + fwrite(b, 1, rc, stderr); + } + tget->b = tget->buf; + + return rc; +} + +static const char * hrefpat = "(?i)<a(?:\\s+[a-z][a-z0-9_]*(?:=(?:\"[^\"]*\"|\\S+))?)*?\\s+href=(?:\"([^\"]*)\"|(\\S+))"; + +static int parseHTML(rpmtget tget) +{ + miRE mire = tget->mires; + int noffsets = 3; + int offsets[3]; + ssize_t nr = (tget->b != NULL ? (ssize_t)tget->nb : tgetFill(tget)); + int xx; + + xx = mireSetEOptions(mire, offsets, noffsets); + + while (tget->nb > 0) { + char * gbn, * hbn; + char * f, * fe; + char * g, * ge; + char * h, * he; + char * t; + mode_t mode; + size_t nb; + + offsets[0] = offsets[1] = -1; + xx = mireRegexec(mire, tget->b, tget->nb); + if (xx == 0 && offsets[0] != -1 && offsets[1] != -1) { + + /* [f:fe) contains |<a href="..."| match. */ + f = tget->b + offsets[0]; + fe = tget->b + offsets[1]; + + /* [h:he) contains the href basename. */ + he = fe; + if (he[-1] == '"') he--; + if (he[-1] == '/') { + mode = S_IFDIR | 0755; + he--; + } else + mode = S_IFREG | 0644; + h = he; + while (h > f && (h[-1] != '"' && h[-1] != '/')) + h--; + nb = (size_t)(he - h); + hbn = t = xmalloc(nb + 1 + 1); + while (h < he) + *t++ = *h++; + if (S_ISDIR(mode)) + *t++ = '/'; + *t = '\0'; + + /* [g:ge) contains the URI basename. */ + g = fe; + while (*g != '>') + g++; + ge = ++g; + while (*ge != '<') + ge++; + nb = (size_t)(ge - g); + gbn = t = xmalloc(nb + 1 + 1); + while (g < ge) + *t++ = *g++; + if (S_ISDIR(mode)) + *t++ = '/'; + *t = '\0'; + + /* Filter out weirdos and "." and "..". */ + if (!strcmp(gbn, hbn) && strcmp(hbn, "./") && strcmp(hbn, "../")) { + fprintf(stderr, "\t%s\n", gbn); + xx = argvAdd(&tget->av, gbn); + } + + gbn = _free(gbn); + hbn = _free(hbn); + + offsets[1] += (ge - fe); + tget->b += offsets[1]; + tget->nb -= offsets[1]; + } else { + size_t nb = tget->nb; + if (nr > 0) nb -= 1024; /* XXX overlap a bit if filling. */ + tget->b += nb; + tget->nb -= nb; } + + if (nr > 0) + nr = tgetFill(tget); } - xx = rpmswExit(gop, len); + + xx = mireSetEOptions(mire, NULL, 0); + + return 0; +} + +static int readFile(rpmtget tget) +{ + int rc; + int xx; + + xx = tgetInit(tget, 8 * BUFSIZ); + + xx = rpmswEnter(tget->gop, 0); + + if (S_ISDIR(tget->sb.st_mode)) { + rc = tgetFill(tget); + } else + if (S_ISREG(tget->sb.st_mode)) { + rc = tgetFill(tget); + } else + rc = -1; + + xx = rpmswExit(tget->gop, tget->nbuf); + + if (rc < 0) + goto exit; + + rc = parseHTML(tget); exit: - if (_rpmsw_stats) { - rpmswPrint("stat:", sop); - rpmswPrint(" get:", gop); - } + xx = tgetFini(tget); return rc; } @@ -132,10 +319,11 @@ main(int argc, char *argv[]) { poptContext optCon = rpmioInit(argc, argv, optionsTable); + rpmtget tget = xcalloc(1, sizeof(*tget)); ARGV_t av = NULL; int ac; - const char * fn; int rc; + int xx; if (__debug) { _av_debug = -1; @@ -152,9 +340,15 @@ goto exit; } + tget->pattern = hrefpat; + xx = mireAppend(RPMMIRE_PCRE, 0, tget->pattern, NULL, &tget->mires, &tget->nmires); + rc = 0; - while (rc == 0 && (fn = *av++) != NULL) - rc = readFile(fn); + while (rc == 0 && (tget->uri = *av++) != NULL) + rc = readFile(tget); + + tget->mires = mireFreeAll(tget->mires, tget->nmires); + tget->nmires = 0; exit: @@ . ______________________________________________________________________ RPM Package Manager http://rpm5.org CVS Sources Repository rpm-cvs@rpm5.org