http://www.mediawiki.org/wiki/Special:Code/MediaWiki/81438

Revision: 81438
Author:   ariel
Date:     2011-02-03 03:20:10 +0000 (Thu, 03 Feb 2011)
Log Message:
-----------
filter for use in cleaning up half-written history xml files

Added Paths:
-----------
    branches/ariel/xmldumps-backup/writeuptopageid.c

Added: branches/ariel/xmldumps-backup/writeuptopageid.c
===================================================================
--- branches/ariel/xmldumps-backup/writeuptopageid.c    (rev 0)
+++ branches/ariel/xmldumps-backup/writeuptopageid.c    2011-02-03 03:20:10 UTC (rev 81438)
@@ -0,0 +1,141 @@
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef enum { None, StartHeader, StartPage, AtPageID, WriteMem, Write, 
EndPage, AtLastPageID } States; /* States of the line-by-line scanner:
+   None         - initial value in main(), nothing recognized yet
+   StartHeader  - a line starting with <mediawiki was seen; lines are copied out
+   StartPage    - a <page> line was seen; lines are held in memory, not written
+   AtPageID     - not assigned anywhere in the code visible here
+   WriteMem     - first <id> after <page> differs from the cutoff id; held lines are flushed
+   Write        - follows WriteMem; lines are copied out
+   EndPage      - a line starting with </page was seen; the line is copied out
+   AtLastPageID - first <id> after <page> equals the cutoff id; main() stops reading */
+
+/* Size in bytes of the buffer that holds the lines of a page seen
+   before its page id is known (see saveInMemIfNeeded): 512 KiB of
+   text plus one byte for the terminating NUL.  The sizing assumption
+   is that such a header is never going to be longer than 1000 x 80
+   4-byte characters... how many namespaces will one project want? */
+#define MAXHEADERLEN 524289
+
+/* Print a short usage message to stderr.
+
+   me: the name this program was invoked as (main passes argv[0]);
+       it is only used to fill in the "Usage:" line. */
+void usage(char *me) {
+  fprintf(stderr,"Usage: %s pageID\n",me);
+  /* each message is kept on one source line: a string literal may
+     not contain a raw line break */
+  fprintf(stderr,"Copies the contents of an XML file up to but not including\n");
+  fprintf(stderr,"the specified pageID. This program is used in processing XML\n");
+  fprintf(stderr,"dump files that were only partially written.\n");
+}
+
+/* Decide which state the scanner is in after seeing one line (or one
+   buffer-sized piece of a line) of the dump.
+
+   line:         the text just read; main skips leading whitespace
+                 before calling
+   currentState: the state before this text was read
+   endPageID:    id of the first page that must not be copied
+
+   Note that even if we have only read a partial line of text from
+   the body of the page (because the text is longer than our buffer),
+   it's fine, since the <> delimiters only mark xml, they can't
+   appear in the page text.
+
+   returns new state */
+States setState (char *line, States currentState, int endPageID) {
+  long pageID = 0;
+
+  if (!strncmp(line,"<mediawiki",10)) {
+    return(StartHeader);
+  }
+  else if (!strncmp(line,"<page>",6)) {
+    return(StartPage);
+  }
+  /* there are also user ids, revision ids, etc... pageid will be the first one */
+  else if (currentState == StartPage && (!strncmp(line, "<id>", 4))) {
+    /* dig the id out, format is <id>num</id>; strtol stops at the
+       '<' and, unlike atoi, is well defined for out-of-range input */
+    pageID = strtol(line+4, NULL, 10);
+    if (pageID == (long) endPageID) {
+      return(AtLastPageID);
+    }
+    else {
+      return(WriteMem);
+    }
+  }
+  else if (currentState == WriteMem) {
+    /* the held lines were flushed on the previous line; from here
+       on the page is copied straight through */
+    return(Write);
+  }
+  /* compare all 7 characters of the closing tag */
+  else if (!strncmp(line, "</page>", 7)) {
+    return(EndPage);
+  }
+  return(currentState);
+}
+
+/* Flush the held page lines to stdout once the page id has been
+   seen and the page is known to be wanted (state WriteMem), then
+   empty the holding buffer.  In any other state nothing is done.
+
+   mem:   NUL-terminated holding buffer; reset to "" after the flush
+   state: current scanner state
+
+   returns 1 on success (including when nothing had to be written),
+   0 on error */
+int writeMemoryIfNeeded(char *mem, States state) {
+  size_t len;
+
+  if (state != WriteMem) {
+    /* every path must return a value: the caller treats 0 as fatal */
+    return(1);
+  }
+  len = strlen(mem);
+  /* count bytes rather than one item of len bytes, so that an empty
+     buffer is not reported as a failure */
+  if (fwrite(mem,1,len,stdout) != len) {
+    return(0);
+  }
+  mem[0]='\0';
+  return(1);
+}
+
+/* Copy the current line to stdout when the state calls for output:
+   StartHeader, WriteMem, Write or EndPage.  In any other state the
+   line is dropped.
+
+   line:  NUL-terminated text as read from the input
+   state: current scanner state
+
+   returns 1 on success (including when nothing had to be written),
+   0 on error */
+int writeIfNeeded(char *line, States state) {
+  size_t len;
+
+  if (state == StartHeader || state == WriteMem || state == Write || state == EndPage) {
+    len = strlen(line);
+    /* count bytes rather than one item of len bytes, so that an
+       empty line is not reported as a failure */
+    if (fwrite(line,1,len,stdout) != len) {
+      return(0);
+    }
+  }
+  /* every path must return a value: the caller treats 0 as fatal */
+  return(1);
+}
+
+/* Append the current line to the holding buffer while the scanner is
+   in state StartPage; in any other state the buffer is left alone.
+
+   mem:   NUL-terminated holding buffer of MAXHEADERLEN bytes
+   line:  NUL-terminated text to append
+   state: current scanner state
+
+   returns 1 on success (including when nothing had to be saved),
+   0 if the line does not fit */
+int saveInMemIfNeeded(char *mem, char *line, States state) {
+  size_t used;
+  size_t extra;
+
+  if (state != StartPage) {
+    return(1);
+  }
+  used = strlen(mem);
+  extra = strlen(line);
+  if (used + extra >= MAXHEADERLEN) {
+    /* we actually ran out of room, who knew */
+    return(0);
+  }
+  /* the + 1 brings the terminating NUL along with the text */
+  memcpy(mem + used, line, extra + 1);
+  return(1);
+}
+
+/* Filter: copy the XML dump on stdin to stdout, stopping just before
+   the page whose id is given as the only command line argument, and
+   finish the output with a closing </mediawiki> tag.
+
+   Exits with 0 on success; prints a message to stderr and exits with
+   -1 on a bad argument, a read or write failure, or when the held
+   page lines do not fit in memory. */
+int main(int argc,char **argv) {
+  long int pageID = 0;
+  char *nonNumeric = 0;
+  States state = None;
+  char *text;
+  char line[4097];
+  /* holds the lines of the current page until its id has been seen.
+     static so that it starts out as an empty string (the helpers
+     call strlen on it) and so that half a megabyte does not sit on
+     the stack. */
+  static char mem[MAXHEADERLEN];
+
+  if (argc != 2) {
+    usage(argv[0]);
+    exit(-1);
+  }
+
+  errno = 0;
+  pageID = strtol(argv[1], &nonNumeric, 10);
+  /* reject: conversion errors, no digits at all, trailing junk,
+     zero or negative ids, and ids that would be truncated when
+     handed to setState as an int */
+  if (errno != 0 ||
+      nonNumeric == argv[1] ||
+      *nonNumeric != 0 ||
+      pageID <= 0 ||
+      pageID > INT_MAX) {
+    fprintf (stderr,"The value you entered for pageID must be a positive integer.\n");
+    usage(argv[0]);
+    exit(-1);
+  }
+
+  /* fgets already leaves room for the terminating NUL */
+  while (fgets(line, sizeof(line), stdin) != NULL) {
+    text=line;
+    /* the ctype functions need an unsigned char value */
+    while (*text && isspace((unsigned char) *text)) {
+      text++;
+    }
+    state = setState(text, state, (int) pageID);
+    if (!saveInMemIfNeeded(mem,line,state)) {
+      fprintf(stderr,"failed to save text in temp memory, bailing\n");
+      exit(-1);
+    }
+    if (!writeMemoryIfNeeded(mem,state)) {
+      fprintf(stderr,"failed to write text from memory, bailing\n");
+      exit(-1);
+    }
+    if (!writeIfNeeded(line,state)) {
+      fprintf(stderr,"failed to write text, bailing\n");
+      exit(-1);
+    }
+    if (state == AtLastPageID) {
+      /* we are done. */
+      break;
+    }
+  }
+  if (ferror(stdin)) {
+    fprintf(stderr,"failed to read input, bailing\n");
+    exit(-1);
+  }
+  /* buffered output can still fail at flush time, so check both */
+  if (fwrite("</mediawiki>\n",13,1,stdout) != 1 || fflush(stdout) != 0) {
+    fprintf(stderr,"failed to write closing tag, bailing\n");
+    exit(-1);
+  }
+  exit(0);
+}
+


Property changes on: branches/ariel/xmldumps-backup/writeuptopageid.c
___________________________________________________________________
Added: svn:eol-style
   + native


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to