On 2/27/2007, "Sjoerd Mullender" <[EMAIL PROTECTED]> wrote:

>On 2007-02-27 16:43, Jan Flokstra wrote:
>> Update of /cvsroot/monetdb/pathfinder/modules/pftijah
>> In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686
>> 
>> Modified Files:
>>      nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx 
>> Log Message:
>> * repair BBP refcount bug for BAT
>
>Is this a fix which also applies to the stable branch?

I'm not sure yet. The bug only shows up in the HEAD branch and does not
occur in the release branch. The problem was that I did:

bat b = BBPindex(......);
if ( b != bat_nil )
    return BBPdescriptor(b)

The refcount assert crash occurs in BBPdescriptor(). I have used this
construction before and never had any problem. The bug made the
"Current" branch useless, so I decided to (un)fix(:) it quickly with a
BBPfix() / BBPunfix() pair. I will try to figure out what to do next in
the near future. Maybe I will even try to consult the CWI people :-)

>
>> * reimplement the direct bat access methods in pftijah serialization for more
>>   speed (and clarity).
>> 
>> * Start optimizing the pftijah tokenizer. The flex functions are called once
>>   per handle_character() call. This leads to 2 malloc's per call. I tried to
>>   do without the malloc's but this caused a lot of strange results :-)
>>   I am now planning to craft the flexer by hand. The first small experiment
>>   shows there is a lot to gain there (25% speedup in indexing time).
>> 
>> 
>> 
>> Index: serialize_pftijah.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
>> retrieving revision 1.41
>> retrieving revision 1.42
>> diff -u -d -r1.41 -r1.42
>> --- serialize_pftijah.mx     23 Feb 2007 15:11:07 -0000      1.41
>> +++ serialize_pftijah.mx     27 Feb 2007 15:43:37 -0000      1.42
>> @@ -31,8 +31,8 @@
>>  
>>  extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
>>  
>> -extern int useFlexScanner(char* buf, int len, struct tijahContextStruct* 
>> tjCtx); /* FLEX */
>> -extern char* flexScanOneTerm(char* buf, int len);
>> +extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /* 
>> FLEX */
>> +extern char* flexScanOneTerm(char* buf);
>>  
>>  extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
>>  
>> @@ -70,15 +70,10 @@
>>  typedef struct dbat_struct {
>>      const char*     name;
>>      BAT*            bat;
>> -    int             oid_mark;
>> -    int             max_i;
>> -    int             max_sz;
>> -    bit             dflt;     /* fill with default value during extend */
>> -    int             dflt_int; /* the default int value */
>> -    chr             dflt_chr; /* the default chr value */
>> -    oid             dflt_oid; /* the default oid value */
>> -    /* */
>> -        union { /* cast to perform direct indexex insert in [void,any] BATs 
>> */
>> +    oid             raw_max;
>> +    oid             seqbase;
>> +    oid             seq_max;
>> +        union { /* cast to perform direct indexe insert in [void,any] BATs 
>> */
>>              void* voidCAST; /* the basecast */
>>              chr*  chrCAST;  /* cast for [void,chr] BAT */
>>              int*  intCAST;  /* cast for [void,int] BAT */
>> @@ -89,7 +84,6 @@
>>  int dbat_init(const char* name, dbat* dbat, BAT* b) {
>>      dbat->name = name;
>>      dbat->bat  = b;
>> -    dbat->dflt = FALSE;
>>      if ( dbat->bat->htype != TYPE_void ) {
>>          stream_printf(GDKerr,"ERROR: dbat_init(%s) non void 
>> BAT\n",dbat->name);
>>          return 0;
>> @@ -98,31 +92,25 @@
>>          stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown 
>> ttype(%d)\n",dbat->name,dbat->bat->ttype);
>>          return 0;
>>      }
>> -        dbat->oid_mark = b->hseqbase;
>> -    dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
>> +        dbat->seqbase = (oid)b->hseqbase;
>> +    dbat->raw_max = (oid)BATcount(dbat->bat);
>> +    dbat->seq_max = dbat->raw_max + dbat->seqbase;
>>      dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>>      /* */
>>      return 1;
>>  }
>>  
>> -int dbat_finalize(dbat* dbat) {
>> -        BAT* b = dbat->bat;
>>  
>> +int dbat_finalize(dbat* dbat, int topidx) {
>>          void* top;
>> +        BAT* b = dbat->bat;
>>          
>> -        int bottomTop = dbat->max_i;
>> -    if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
>> +    topidx -= (int)dbat->seqbase;
>> +        int bottomTop = topidx;
>>          switch( b->ttype ) {
>>           case TYPE_int :
>>                  top = &dbat->cast.intCAST[bottomTop];
>>                  break;
>> -         case TYPE_chr: {
>> -                b->batBuns->free = dbat->max_i; 
>> -                BATsetcount(b, dbat->max_i);
>> -                b->tsorted = 0;
>> -            b->batDirty = TRUE; /* VERY important this one */
>> -                return 1;
>> -                }
>>           case TYPE_oid:
>>                  top = &dbat->cast.oidCAST[bottomTop];
>>                  break;
>> @@ -137,7 +125,7 @@
>>      /* */
>>      dbat->name  = NULL;
>>      dbat->bat   = NULL;
>> -    dbat->max_i = dbat->max_sz = 0;
>> +    dbat->raw_max = dbat->seqbase = 0;
>>      /* */
>>      return 1;
>>  }
>> @@ -145,9 +133,14 @@
>>  #define MINCHUNK 8192
>>  #define MAXCHUNK 67108864
>>  
>> -int dbat_extend(dbat* dbat, int i_mark) {
>> -    /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
>> -    size_t newsize = 
>> MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
>> +int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
>> +    size_t newsize;
>> +    
>> +    if ( forced_size ) {
>> +       newsize = forced_size;
>> +    } else {
>> +       newsize = 
>> MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
>> +    }
>>  
>>      /* first check if the number of BUN's < INT_MAX. If this was the case
>>       * and the previous time INT_MAX was returned this means the BAT cannot
>> @@ -156,94 +149,34 @@
>>      if ( newsize > INT_MAX ) {
>>              newsize = INT_MAX;
>>  
>> -    if ( dbat->max_sz == INT_MAX ) {
>> +    if ( dbat->raw_max == INT_MAX ) {
>>              GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX) 
>> fails\n","incomplete");
>>              return -1;
>>      }
>>      }
>> -    if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d -> 
>> %d)\n",dbat->name,dbat->max_sz,newsize); }
>> -    dbat->max_sz= newsize;
>> +#if 0
>> +    stream_printf(GDKout,"dbat_extend[%s](%d -> 
>> %d)\n",dbat->name,dbat->raw_max,newsize);
>> +#endif
>> +    dbat->raw_max= newsize;
>> +    dbat->seq_max = dbat->raw_max + dbat->seqbase;
>>      if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
>>          GDKerror("dbat_extend: BATextend[\"%s\"](to %d) 
>> fails\n","incomplete",newsize);
>>          return -1;
>>      }
>>      dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>> -    /*
>> -     * now check if there's a default value handler used  
>> -     *
>> -     */
>> -    if ( dbat->dflt ) {
>> -        switch( dbat->bat->ttype ) {
>> -         case TYPE_int : {
>> -            int v   = dbat->dflt_int;
>> -            int *to = &dbat->cast.intCAST[dbat->max_sz];
>> -            for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
>> -                *p++ = v;
>> -                break;
>> -            }
>> -         case TYPE_chr: {
>> -            chr v   = dbat->dflt_chr;
>> -            chr *to = &dbat->cast.chrCAST[dbat->max_sz];
>> -            for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
>> -                *p++ = v;
>> -                break;
>> -                }
>> -         case TYPE_oid: {
>> -            oid v   = dbat->dflt_oid;
>> -            oid *to = &dbat->cast.oidCAST[dbat->max_sz];
>> -            for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
>> -                *p++ = v;
>> -                break;
>> -            }
>> -         default:
>> -                GDKerror("dbat_extend: bad ttype\n");
>> -                return -1;
>> -        }
>> -    }
>> -    /* */
>>      return 1;
>>  }
>>  
>>  int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
>> -        int sizeHint = sizeHint_mark - dbat->oid_mark;
>> -    int estimate = dbat->max_i + sizeHint;
>> -
>> -    return dbat_extend(dbat, estimate);
>> -}
>> -
>> -INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
>> -    register int pos;
>> +        int sizeHint = sizeHint_mark - dbat->seqbase;
>> +    int estimate = dbat->raw_max + sizeHint;
>>  
>> -    if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> -        dbat->cast.oidCAST[pos] = v;
>> -        return 1;
>> -    } else {
>> -        if ( pos >= dbat->max_sz ) {
>> -            if ( dbat_extend(dbat,pos) < 0 )
>> -                return -1;
>> -        }
>> -        dbat->max_i = pos + 1;
>> -        dbat->cast.oidCAST[pos] = v;
>> -        return 1;
>> -    }
>> +    return dbat_extend(dbat, estimate, 0);
>>  }
>>  
>> -INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
>> -    register int pos;
>> +#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
>>  
>> -    if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> -        dbat->cast.intCAST[pos] = v;
>> -        return 1;
>> -    } else {
>> -        if ( pos >= dbat->max_sz ) {
>> -            if ( dbat_extend(dbat,pos) < 0 )
>> -                return -1;
>> -        }
>> -        dbat->max_i = pos + 1;
>> -        dbat->cast.intCAST[pos] = v;
>> -        return 1;
>> -    }
>> -}
>> +#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
>>  
>>  /************************************************
>>   *
>> @@ -315,9 +248,6 @@
>>  
>>  /************************************************
>>   *
>> - *
>> - * First the temporary shredder for Tijah by JF
>> - *
>>   */
>>  
>>  INLINE static oid
>> @@ -328,15 +258,15 @@
>>      BUN bun;
>>  
>>      HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
>> -    if ( bun )
>> -    /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
>> +    if ( bun ) {
>>          return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
>> -    else {
>> -            if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, 
>> FALSE) ) {
>> +    } else {
>> +            if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, 
>> FALSE) ) {
>> +                return tjctx->n_globalTag++;
>> +        } else {
>>                  GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
>>                  return oid_nil;
>> -        } else
>> -                return tjctx->n_globalTag++;
>> +        }
>>      }
>>  #endif
>>  }
>> @@ -349,10 +279,7 @@
>>     BUN bun;
>>  
>>     HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
>> -   if ( bun ) {
>> -       return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>> -   } else
>> -       return oid_nil;
>> +   return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
>>  }
>>  
>>  INLINE static oid
>> @@ -366,22 +293,35 @@
>>      if ( bun )
>>          return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>>      else {
>> -            if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, 
>> FALSE)){
>> +            if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, 
>> FALSE)){
>> +                return tjctx->n_globalTerm++;
>> +        } else { 
>>                  GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
>>                  return oid_nil;
>> -        } else 
>> -                return tjctx->n_globalTerm++;
>> +        }
>>      }
>>  #endif
>>  }
>>  
>> -#define tj_add2plane(TJCTX,O) \
>> -    ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
>> -                    ? oid_nil : ((oid)(TJCTX)->tijahPre++))
>> +INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
>> +    oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase; 
>>  
>> -#define insertPreSize(TJCTX,POS,SIZE) \
>> -    dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
>> +    if ( base >= tjctx->dbat_collPre.raw_max ) {
>> +    if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
>> +        return oid_nil;
>> +    /* IMPORTANT: the size of the two bats is synchronized by the use
>> +     * of the forced size (last) parameter of dbat_extend
>> +     */
>> +    if ( 
>> dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
>> +        return oid_nil;
>> +    }
>> +    return tjctx->tijahPre++;
>> +}
>>  
>> +#define tj_newPre(TJCTX) \
>> +    (((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
>> +    ? \
>> +    ((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
>>  
>>  int
>>  handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
>> @@ -397,13 +337,13 @@
>>          }
>>            }
>>            if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
>> -                return 0;
>> +                return -1;
>>      }
>>      if ( termOid ) { /* term is not a stopword */
>> -            if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> +            if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>>                      return 0;
>> -            if ( insertPreSize(tjctx,tjPre,0) < 0 )
>> -                    return -1;
>> +            dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>> +            dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
>>  #ifdef TJ_TRACE
>>              if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\", 
>> termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
>>  #endif
>> @@ -416,40 +356,13 @@
>>       return 1;
>>  }
>>  
>> -/************
>> - *
>> - * The part where the Strings from Pathfinder are shredded into words
>> - * by Tijah. The USE_FLEX macro determines if the strings is shredded
>> - * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
>> - */
>> -
>> -const char* obsoleteNexiChars = " \t\n\r,:;&[EMAIL PROTECTED]";
>> -
>> -int 
>> -useStrtokScanner(tjCtx* tjctx, char* s)
>> -{
>> -    char *t;
>> -    int  sz = 0;
>> -
>> -#ifdef TJ_TRACE
>> -    if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
>> -#endif
>> -    if ( (t = strtok(s,obsoleteNexiChars)) ) do {
>> -    /* not the empty string here */
>> -        if ( handleTijahTerm(tjctx,t) < 0 )
>> -             return -1;
>> -            sz++;
>> -    } while ( (t=strtok(NULL,obsoleteNexiChars)) );
>> -    return 1;
>> -}
>> -
>>  /************************************************
>>   *
>>   * Now the real output handlers
>>   */
>>  
>>  
>> -#ifdef notused
>> +#if 0
>>  static int
>>  handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
>>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>> @@ -502,14 +415,12 @@
>>          return (str)str_nil;
>>  }
>>  
>> -#define GUESSFORCE FALSE
>> -
>>  /* 
>>   * Replace the value of a collection parameter int the collection parameter
>>   * bat
>>   */
>>  static int replaceCollParam(tjCtx* tjctx, str param, str val) {
>> -    return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
>> +    return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
>>  }
>>  
>>  static BAT*
>> @@ -894,10 +805,10 @@
>>      /* if ( DOEMIT(tjctx) ) { */
>>          if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
>>                  return 0;
>> -        if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> +        if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>>                  return 0;
>> +        dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>>          if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
>> -        if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", 
>> termoid=%d, Tijah pre#=%d, Pathfinder 
>> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>>  #ifdef TJ_TRACE
>>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", 
>> termoid=%d, Tijah pre#=%d, Pathfinder 
>> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>>  #endif
>> @@ -913,8 +824,7 @@
>>      --tjctx->doc_height;
>>      oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>>      int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
>> -    if ( insertPreSize(tjctx,start,size) < 0 )
>> -        return 0;
>> +    dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>>  #ifdef TJ_TRACE
>>      if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement: 
>> \"%s\"\n", tjctx->name,"");
>>  #endif
>> @@ -934,8 +844,7 @@
>>      /* if ( DOEMIT(tjctx) ) { */
>>          oid start = tj_popTag(tjctx); /* oid of the first node of the 
>> element */
>>          int size  = tjctx->tijahPre - start - 1; /* the Tijah element size 
>> */
>> -    if ( insertPreSize(tjctx,start,size) < 0 )
>> -        return 0;
>> +    dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>>  #ifdef TJ_TRACE
>>          if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n", 
>> tjctx->name,name);
>>  #endif
>> @@ -944,8 +853,6 @@
>>      return 1;
>>  }
>>  
>> -#define USE_FLEX 1
>> -
>>  /**
>>   * Output generation handler. Handles equivalent of * SAX characters() 
>> event.
>>   */
>> @@ -954,28 +861,23 @@
>>      EMPTY_CHECK;
>>      tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>>  
>> +    register char* p = (char*)ch;
>> +    while( *p && isspace(*p) ) p++;
>> +    if ( !*p )
>> +        return 1;
>>  #ifdef TJ_TRACE
>> -    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) 
>> start\n",tjctx->name, (char*)ch);
>> +    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) 
>> start\n",tjctx->name, p);
>>  #endif
>>  
>>      if ( DOEMIT(tjctx) ) {
>> -#ifdef USE_FLEX
>> -        return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
>> -#else
>> -        return useStrtokScanner(tjctx,(char*)ch);
>> -#endif
>> +        return useFlexScanner(p,tjctx);
>>      }
>>      return 1;
>>  }
>>  
>>  char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
>>      char *res;
>> -#ifdef USE_FLEX
>> -        res = flexScanOneTerm((char*)term,strlen((char*)term));
>> -#else
>> -    res = strtok(term,obsoleteNexiChars);
>> -#endif
>> -    /* INCOMPLETE, should make shure tijahContext is always avail. here */
>> +        res = flexScanOneTerm((char*)term);
>>          if ( res && tjctx && tjctx->stemCtx->stem) {
>>                  if ( !(res = 
>> (char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
>>              /* must be a stopword */
>> @@ -986,14 +888,6 @@
>>  }
>>  
>>  int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
>> -//Leave tokenization disabled for now
>> -//    char* tokenized;
>> -//#ifdef USE_FLEX
>> -//    tokenized = flexScanOneTerm(term,strlen(term));
>> -//#else
>> -//    tokenized = strtok(term,obsoleteNexiChars);
>> -//#endif
>> -
>>      tjStemCtx* stemCtx = getStemmingContext( stemmer );
>>  
>>      if ( stemCtx->stem ) {
>> @@ -1123,13 +1017,9 @@
>>  #ifdef TJ_TRACE
>>      if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH 
>> INDEXING\n",tjctx->name);
>>  #endif
>> -
>> -        /* feature not used anymore ????? */
>> -    if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
>> -            insertPreSize(tjctx,0,tjctx->tijahPre - 1);
>> -    if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
>> +    if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
>>              return GDK_FAIL;
>> -    if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
>> +    if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
>>              return GDK_FAIL;
>>  #ifdef TJ_TRACE
>>      if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT 
>> BATS\n",tjctx->name);
>> 
>> Index: pftijah_tokenize.l
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
>> retrieving revision 1.12
>> retrieving revision 1.13
>> diff -u -d -r1.12 -r1.13
>> --- pftijah_tokenize.l       9 Jan 2007 15:44:39 -0000       1.12
>> +++ pftijah_tokenize.l       27 Feb 2007 15:43:37 -0000      1.13
>> @@ -115,7 +115,40 @@
>>  
>>  %%
>>  
>> -int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
>> +int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> +  /* UPDATE: this delivers very strange testset results and should not be
>> +   * used I think.
>> +   */
>> +  /* This is an optimized version of the flex scanner which does not copy 
>> the
>> +   * input buffer. The only strange thing about this interface is that it
>> +   * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
>> +   * size of the buffer is inclusive the 2 0's.
>> +   * The last zero is toggled with its original value to prevent corruption
>> +   * of memory management tables. This was for me the only way to prevent
>> +   * copying here.
>> +   */
>> +  int len = strlen(buf);
>> +  char remember = buf[len+1];
>> +  buf[len+1] = YY_END_OF_BUFFER_CHAR;
>> +  YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
>> +
>> +  if ( !myBuf ) {
>> +      stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy 
>> buffer.");
>> +      return 0;
>> +  }
>> +  while ( pftijah_tokenizelex() ) {
>> +      /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
>> +      if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> +          return 0;
>> +  }
>> +  yy_delete_buffer(myBuf);
>> +  buf[len+1] = remember;
>> +  return 1;
>> +}
>> +
>> +int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> +  // the original
>> +  int len = strlen(buf);
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>>    while (pftijah_tokenizelex()) {
>>        if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> @@ -125,6 +158,40 @@
>>    return 1;
>>  }
>>  
>> +int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
>> +{
>> +    /* the fast function. This function is in the pftijah context with lots
>> +     * of small strings to tokenize many times faster as the flex and the 
>> +     * strtok() methods which seem to have a rather larger overhead
>> +     */
>> +    register char* s = input;
>> +    register char x;
>> +// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if 
>> (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +
>> +    while ( 1 ) {
>> +      while ( isspace( *s ) ) s++;
>> +      if ( *s ) {
>> +      char* base = s;
>> +      if ( isalnum(*s) ) {
>> +          if ( isdigit(*s) ) {
>> +              while ( isdigit(*++s) ) ;
>> +              EMIT;
>> +          } else {
>> +              if (isupper(*s)) *s=tolower(*s);
>> +              while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
>> +              EMIT;
>> +          }
>> +      } else {
>> +          // INCOMPLETE, ENTITIES HERE
>> +          // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
>> +          s++;
>> +      }
>> +      } else 
>> +          return 1;
>> +    }
>> +}
>> +
>>  char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
>>    int cnt = 0;
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>> @@ -137,9 +204,10 @@
>>    return outbuf;
>>  }
>>  
>> -char* flexScanOneTerm(char* buf, int len) {
>> +char* flexScanOneTerm(char* buf) {
>>    char *res;
>>    char resBUFF[256];
>> +  int len = strlen(buf);
>>  
>>    YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>>    if ( pftijah_tokenizelex() ) {
>> 
>> Index: nexi.c
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
>> retrieving revision 1.49
>> retrieving revision 1.50
>> diff -u -d -r1.49 -r1.50
>> --- nexi.c   23 Feb 2007 15:11:05 -0000      1.49
>> +++ nexi.c   27 Feb 2007 15:43:37 -0000      1.50
>> @@ -455,6 +455,7 @@
>>      /*
>>       * Now find out if the collection is fragmented or not.
>>       */
>> +    /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
>>      BAT* fb = 
>> pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
>>      if ( ! fb ) {
>>             stream_printf(GDKerr,"Error: cannot find fragments bat for 
>> collection \"%s\".\n",parserCtx->collection);
>> @@ -471,6 +472,8 @@
>>                parserCtx->ffPfx        = "";
>>                parserCtx->flastPfx     = ", str(1)";
>>      }
>> +    BBPunfix(BBPcacheid(fb));
>> +    fb = NULL;
>>      // Some special cases for NLLR, since NLLR only works with COARSE2 at 
>> the moment
>>      if ( txt_retr_model->model == MODEL_NLLR ) {
>>          // Switch to COARSE2 algebra for NLLR
>> 
>> Index: pftijah_util.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
>> retrieving revision 1.2
>> retrieving revision 1.3
>> diff -u -d -r1.2 -r1.3
>> --- pftijah_util.mx  9 Jan 2007 17:15:23 -0000       1.2
>> +++ pftijah_util.mx  27 Feb 2007 15:43:37 -0000      1.3
>> @@ -73,6 +73,7 @@
>>      if ( b == bat_nil ) {
>>              return NULL;
>>      } else {
>> +        BBPfix(b);
>>              return BBPdescriptor(b);
>>      }
>>  }
>> 
>> 
>> -------------------------------------------------------------------------
>> Take Surveys. Earn Cash. Influence the Future of IT
>> Join SourceForge.net's Techsay panel and you'll get the chance to share your
>> opinions on IT & business topics through brief surveys-and earn cash
>> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
>> _______________________________________________
>> Monetdb-pf-checkins mailing list
>> [email protected]
>> https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins
>
>
>-- 
>Sjoerd Mullender
>

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Monetdb-developers mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-developers

Reply via email to