The Python text indexer I posted here last weekend (and, I think,
again yesterday) is too slow.  Friday, after work, I hacked out
something in C to see if I could get it to be faster.  It's about 10
times faster than the Python version, but it lacks some features:
- it doesn't yet do the case-insensitive thing
- it doesn't yet do the indexing-by-chunks thing
- it doesn't yet do merging

Leaving the indexer case-sensitive was a deliberate design decision ---
I thought it
would be better to do it as an index postprocessing step, touching
each indexed term only once, rather than individually downcasing every
letter in the corpus.  But I forgot about it Saturday afternoon, when
I added output sorting.  (I thought it would probably be quicker to
qsort the output in RAM rather than sorting it after it was already in
text files.)

I've tested it against the Python indexer; if I downcase the entire
input text, then capitalize 'from ' at the beginnings of lines to
recover messagehood, both programs produce identical output --- except
that the Python indexer doesn't index the 'From 's at the beginnings
of messages.

This would be an interesting program to write with the STL.

Since merging takes a substantial fraction of the indexing time, it
probably could benefit from some optimization as well.  Rewriting it
in C wouldn't be too hard, I don't think.

It felt really good to write this in C.  Maintaining C code can be the
pits, but the language really has a good, clean feeling.

/*
 * indexmail: find words, index by mail message byte offset
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <ctype.h>
#include <string.h>

/* file stuff */

/* Open fname read-only, store its size in *length, and return the fd.
 * On any failure (open or fstat) prints a perror diagnostic and exits
 * with status 1, so callers never see an error. */
int open_or_die(char *fname, int *length) {
  int rv = open(fname, O_RDONLY);
  struct stat st;
  if (rv < 0) {
    perror(fname);
    exit(1);
  }
  /* fstat can fail too (e.g. EIO); the original ignored its return
   * value and would have used an uninitialized st.st_size */
  if (fstat(rv, &st) < 0) {
    perror(fname);
    exit(1);
  }
  *length = st.st_size;
  return rv;
}

/* word finding stuff */

/* Scan s[0..len) for maximal runs of alphanumeric characters and call
 * cb(start, run_length) once per run.  A run that reaches the end of
 * the buffer is reported as well.
 * hot spot: takes up 23% of program's run time */
void find_words(char *s, int len, void (*cb)(char *s, int len)) {
  char *t = s;
  char *last_word_start = s;
  int inword = 0;
  while (t < s + len) {
    /* cast to unsigned char: passing a negative char (bytes >= 0x80
     * when char is signed) to isalnum() is undefined behavior */
    int alnum = isalnum((unsigned char)*t);
    if (!inword && alnum) {
      last_word_start = t;
      inword = 1;
    } else if (inword && !alnum) {
      cb(last_word_start, t - last_word_start);
      inword = 0;
    }
    t++;
  }
  if (inword) cb(last_word_start, t - last_word_start);
}

/* word indexing stuff */

/* One document occurrence of a word.  Postings form a singly linked
 * list with the most recently seen document at the head.
 * NOTE(review): tags beginning with '_' at file scope are reserved
 * identifiers; harmless here but worth renaming eventually. */
typedef struct _posting {
  struct _posting *next;
  int doc_id;            /* byte offset of the containing mail message */
} posting;

/* One distinct word in the index.  `word` points into the mmap'd
 * corpus (it is stored, not copied, by add_posting_to_wordentry_list),
 * so it is NOT NUL-terminated; wordlen gives its length. */
typedef struct _wordentry {
  struct _wordentry *next;   /* hash-bucket chain / result-list link */
  char *word;
  int wordlen;
  int last_doc_found;        /* suppresses duplicate postings per doc */
  posting *postings;
} wordentry;

/* Prepend a posting for doc_id to wep's posting list, unless doc_id is
 * the document we most recently recorded for this word (so repeated
 * occurrences within one message produce a single posting).
 * for some reason this routine takes 1/4 the CPU time of
 * add_posting_to_wordentry_list */
void add_posting_to_wordentry(wordentry *wep, int doc_id) {
  if (doc_id != wep->last_doc_found) {
    posting *new = malloc(sizeof(*new));
    /* the original never checked this malloc; use the same OOM policy
     * as add_posting_to_wordentry_list */
    if (!new) perror("malloc"), exit(3);
    new->doc_id = doc_id;
    new->next = wep->postings;
    wep->postings = new;
    wep->last_doc_found = doc_id;
  }
}

/* hot spot: takes 15% of program's run time */
/* Record an occurrence of `word` (wordlen bytes, not NUL-terminated)
 * in document doc_id, creating a wordentry if this is a new word.
 * The list is searched linearly and the matching (or new) entry is
 * spliced to the front — move-to-front keeps recently seen words cheap
 * to find.  `word` is stored by pointer, not copied, so the underlying
 * buffer (the mmap'd corpus) must outlive the index. */
void add_posting_to_wordentry_list(wordentry **list, char *word, int wordlen, 
                                   int doc_id) {
  wordentry **wepp;
  wordentry *new;

  /* walk link pointers, not nodes, so we can unlink the hit below */
  for (wepp = list; *wepp; wepp = &((*wepp)->next))
    if ((*wepp)->wordlen == wordlen && !memcmp((*wepp)->word, word, wordlen)) 
      break;

  new = *wepp;
  if (!new) {
    new = malloc(sizeof(*new));
    if (!new) perror("malloc"), exit(3);
    new->word = word;
    new->wordlen = wordlen;
    new->last_doc_found = -1; /* invalid doc_id */
    new->postings = NULL;
  }

  add_posting_to_wordentry(new, doc_id);
  
  /* delete from old position in list */
  if (*wepp) *wepp = (*wepp)->next;
  /* move to front */
  new->next = *list;
  *list = new;
}

/* Print each entry as "word: id id id\n" on stdout, doc ids separated
 * by single spaces.  The word is written with fwrite because it is not
 * NUL-terminated. */
void dump_wordentry_list(wordentry *list) {
  wordentry *we;
  for (we = list; we; we = we->next) {
    posting *p = we->postings;
    fwrite(we->word, we->wordlen, 1, stdout);
    fputs(": ", stdout);
    while (p) {
      printf("%d", p->doc_id);
      p = p->next;
      if (p) putchar(' ');
    }
    putchar('\n');
  }
}

/* Smoke test: index a small fixed word list (doc id = word position),
 * inserting each word twice to check that duplicate postings for the
 * same document are suppressed, then dump the resulting index. */
void test_word_indexing(void) {
  wordentry *list = NULL;
  char *words[] = { "fuzzy", "bunny", "bin", "packing", "bunny", "bunny", "bin", 0 };
  int i;
  for (i = 0; words[i]; i++) {
    int n = strlen(words[i]);
    add_posting_to_wordentry_list(&list, words[i], n, i);
    add_posting_to_wordentry_list(&list, words[i], n, i);
  }
  dump_wordentry_list(list);
}

/* hash table */

/* Since we have move-to-front, this size is only about 10%-15% better
 * than 1021 for a sample corpus with 14 million words, 112K distinct
 * words, and 6M or so postings.  */
#define HASHSIZE 32749            /* prime bucket count */
wordentry *index_hash[HASHSIZE];  /* heads of move-to-front bucket chains */

/* Hash the wordlen-byte string s into a bucket index in [0, HASHSIZE).
 * thanks P. J. Weinberger and Michael Neumann. ick, this takes 20% of
 * the program's run time, at 130 ns per call. */
int hashpjw(char *s, int len) {
  char *t;
  unsigned h = 0, g;
  for (t = s; t < s + len; ++t) {
    /* unsigned char: with plain (signed) char, bytes >= 0x80 would be
     * sign-extended and smear garbage across the accumulator */
    h = (h << 4) + (unsigned char)*t;
    if ((g = h & 0xf0000000)) {   /* extra parens: assignment intended */
      h ^= g >> 24;
      h ^= g;
    }
  }
  return h % HASHSIZE;
}

/* Doc id (byte offset of the current message) being indexed; written
 * by index_by_mail_messages, read by the callback below. */
int current_doc_id;

/* find_words callback: file the word under its hash bucket for the
 * message currently being indexed. */
void index_word_hash(char *s, int len) {
  add_posting_to_wordentry_list(index_hash + hashpjw(s, len),
                                s, len, current_doc_id);
}

/* Find the first occurrence of the NUL-terminated needle within the
 * first haystack_size bytes of haystack; return a pointer to it, or
 * NULL if absent.
 * dumb approach, could probably run this search 5x faster with a
 * better algorithm, which might speed up whole indexing process by
 * about 20%, at 8 us per call.  unless my profiler lies, which it
 * might.  */
char *memstr(char *haystack, char *needle, int haystack_size) {
  char *t;
  /* last legal starting position: needle must fit entirely before
   * haystack + haystack_size */
  char *lim = haystack + haystack_size - strlen(needle);
  int i;
  /* t <= lim, not t < lim: the original off-by-one missed a needle
   * sitting exactly at the end of the haystack */
  for (t = haystack; t <= lim; t++) {
    for (i = 0; ; i++) {
      if (needle[i] == '\0') return t;
      if (needle[i] != t[i]) break;
    }
  }
  return NULL;
}

/* Split the buffer s[0..len) into mail messages at "\nFrom " boundaries
 * and run cb over the words of each message.  The doc id seen by the
 * indexing callback (via the current_doc_id global) is the byte offset
 * of the message start — this is what "index by mail message byte
 * offset" means. */
void index_by_mail_messages(char *s, int len, void (*cb)(char *s, int len)) {
  char *new_msg_start;
  /* in this function, this is the offset of the message start */
  current_doc_id = 0;
  for (;;) {
    new_msg_start = memstr(s + current_doc_id, "\nFrom ", len - current_doc_id);
    if (!new_msg_start) new_msg_start = s + len; /* last message runs to EOF */

    find_words(s + current_doc_id, new_msg_start - s - current_doc_id, cb);

    if (new_msg_start == s + len) break;
    current_doc_id = new_msg_start - s + 1; /* +1 skips the '\n'; next doc starts at "From " */
  }
}

/* Convenience wrapper: index every word of the mbox s[0..len) into the
 * global hash table, one doc per mail message. */
void hash_index_by_mail_messages(char *s, int len) { 
  index_by_mail_messages(s, len, index_word_hash);
}

/* sorting the output by term */

/* Prepend item to the list headed by *stack. */
void push(wordentry **stack, wordentry *item) {
  wordentry *old_top = *stack;
  item->next = old_top;
  *stack = item;
}

/* Detach and return the head of the list at *stack.  Note the returned
 * entry's next pointer still references the rest of the list; callers
 * here always overwrite it via push(). */
wordentry *pop(wordentry **stack) {
  wordentry *top = *stack;
  *stack = top->next;
  return top;
}

/* Destructively append longlist onto the tail of shortlist and return
 * the combined list (Lisp-style nconc).  O(len(shortlist)). */
wordentry *nconc(wordentry *shortlist, wordentry *longlist) {
  wordentry *tail;
  if (shortlist == NULL) return longlist;
  tail = shortlist;
  while (tail->next) tail = tail->next;
  tail->next = longlist;
  return shortlist;
}

/* Drain every hash bucket into one flat list (emptying the table) and
 * return it, ready for sorting. */
wordentry *all_entries() {
  wordentry *result = NULL;
  int bucket;
  for (bucket = 0; bucket < HASHSIZE; bucket++) {
    result = nconc(index_hash[bucket], result);
    index_hash[bucket] = 0;
  }
  return result;
}

#define min(a, b) ((a) < (b) ? (a) : (b))

/* sort linked lists by moving entries to different lists.  No
 * robustness against quicksort's O(N^2) worst case, because the input
 * data is sorted by hash value, then by recentness of occurrence, so
 * is very unlikely to be nearly sorted already. */
/* Returns the list sorted by ascending byte order of word.  Pivot is
 * the first element; words are compared by memcmp over their common
 * prefix, and a word that is a proper prefix of another sorts first. */
wordentry *quicksort_wordentries(wordentry *list) {
  wordentry *first, *before = NULL, *after = NULL, *wep;
  int cmp;
  if (!list) return list;
  first = pop(&list); /* pivot */
  while (list) {
    wep = pop(&list);
    cmp = memcmp(wep->word, first->word, min(wep->wordlen, first->wordlen));
    if (!cmp && wep->wordlen < first->wordlen) cmp = -1; /* shorter prefix sorts first */
    push((cmp < 0 ? &before : &after), wep);
  }
  /* ties (cmp == 0) land in `after`, keeping the sort well-defined */
  first->next = quicksort_wordentries(after);
  return nconc(quicksort_wordentries(before), first);
}

/* main */

/* mmap the mbox named on the command line, index it, and print the
 * sorted term -> message-offsets table on stdout. */
int main(int argc, char **argv) {
  int length;
  int fh;
  char *mm;

  /* the original dereferenced argv[1] unconditionally and crashed
   * when run with no arguments */
  if (argc < 2) {
    fprintf(stderr, "usage: %s mailbox-file\n", argv[0]);
    return 2;
  }

  fh = open_or_die(argv[1], &length);
  mm = mmap(0, length, PROT_READ, MAP_SHARED, fh, 0);
  if (mm == MAP_FAILED) {
    perror("mmap");
    return 2;
  }

  hash_index_by_mail_messages(mm, length);
  dump_wordentry_list(quicksort_wordentries(all_entries()));
  return 0;
}


Reply via email to