On 2/27/2007, "Sjoerd Mullender" <[EMAIL PROTECTED]> wrote:
>On 2007-02-27 16:43, Jan Flokstra wrote:
>> Update of /cvsroot/monetdb/pathfinder/modules/pftijah
>> In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686
>>
>> Modified Files:
>> nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx
>> Log Message:
>> * repair BBP refcount bug for BAT
>
>Is this a fix which also applies to the stable branch?
I'm not sure yet. The bug only shows up in the HEAD branch and does not
occur in the release branch. The problem was that I did:
bat b = BBPindex(......);
if ( b != bat_nil )
return BBPdescriptor(b)
The refcount assertion failure occurs in BBPdescriptor(). I have used this
construction before and never had any problem. The bug made the
"Current" branch useless, so I decided to (un)fix(:) it quickly with a
BBPfix() / BBPunfix() pair. I will try to figure out what to do next in the
near future. Maybe I will even try to consult the CWI people :-)
>
>> * reimplement the direct bat acces methods in pftijah serialization for more
>> speed (and clarity).
>>
>> * Start optimizing the the pftijah tokenizer. The flex functions are called
>> once
>> per handle_character() call. This leads to 2 malloc's per call. I tried to
>> do without the malloc's but this caused to a lot of strange results:-)
>> I am now planning to craft the flexer by hand. The first small experiment
>> shows there is a lot to gain there. (25% speedup in indexing time).
>>
>>
>>
>> Index: serialize_pftijah.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
>> retrieving revision 1.41
>> retrieving revision 1.42
>> diff -u -d -r1.41 -r1.42
>> --- serialize_pftijah.mx 23 Feb 2007 15:11:07 -0000 1.41
>> +++ serialize_pftijah.mx 27 Feb 2007 15:43:37 -0000 1.42
>> @@ -31,8 +31,8 @@
>>
>> extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
>>
>> -extern int useFlexScanner(char* buf, int len, struct tijahContextStruct*
>> tjCtx); /* FLEX */
>> -extern char* flexScanOneTerm(char* buf, int len);
>> +extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /*
>> FLEX */
>> +extern char* flexScanOneTerm(char* buf);
>>
>> extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
>>
>> @@ -70,15 +70,10 @@
>> typedef struct dbat_struct {
>> const char* name;
>> BAT* bat;
>> - int oid_mark;
>> - int max_i;
>> - int max_sz;
>> - bit dflt; /* fill with default value during extend */
>> - int dflt_int; /* the default int value */
>> - chr dflt_chr; /* the default chr value */
>> - oid dflt_oid; /* the default oid value */
>> - /* */
>> - union { /* cast to perform direct indexex insert in [void,any] BATs
>> */
>> + oid raw_max;
>> + oid seqbase;
>> + oid seq_max;
>> + union { /* cast to perform direct indexe insert in [void,any] BATs
>> */
>> void* voidCAST; /* the basecast */
>> chr* chrCAST; /* cast for [void,chr] BAT */
>> int* intCAST; /* cast for [void,int] BAT */
>> @@ -89,7 +84,6 @@
>> int dbat_init(const char* name, dbat* dbat, BAT* b) {
>> dbat->name = name;
>> dbat->bat = b;
>> - dbat->dflt = FALSE;
>> if ( dbat->bat->htype != TYPE_void ) {
>> stream_printf(GDKerr,"ERROR: dbat_init(%s) non void
>> BAT\n",dbat->name);
>> return 0;
>> @@ -98,31 +92,25 @@
>> stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown
>> ttype(%d)\n",dbat->name,dbat->bat->ttype);
>> return 0;
>> }
>> - dbat->oid_mark = b->hseqbase;
>> - dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
>> + dbat->seqbase = (oid)b->hseqbase;
>> + dbat->raw_max = (oid)BATcount(dbat->bat);
>> + dbat->seq_max = dbat->raw_max + dbat->seqbase;
>> dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>> /* */
>> return 1;
>> }
>>
>> -int dbat_finalize(dbat* dbat) {
>> - BAT* b = dbat->bat;
>>
>> +int dbat_finalize(dbat* dbat, int topidx) {
>> void* top;
>> + BAT* b = dbat->bat;
>>
>> - int bottomTop = dbat->max_i;
>> - if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
>> + topidx -= (int)dbat->seqbase;
>> + int bottomTop = topidx;
>> switch( b->ttype ) {
>> case TYPE_int :
>> top = &dbat->cast.intCAST[bottomTop];
>> break;
>> - case TYPE_chr: {
>> - b->batBuns->free = dbat->max_i;
>> - BATsetcount(b, dbat->max_i);
>> - b->tsorted = 0;
>> - b->batDirty = TRUE; /* VERY important this one */
>> - return 1;
>> - }
>> case TYPE_oid:
>> top = &dbat->cast.oidCAST[bottomTop];
>> break;
>> @@ -137,7 +125,7 @@
>> /* */
>> dbat->name = NULL;
>> dbat->bat = NULL;
>> - dbat->max_i = dbat->max_sz = 0;
>> + dbat->raw_max = dbat->seqbase = 0;
>> /* */
>> return 1;
>> }
>> @@ -145,9 +133,14 @@
>> #define MINCHUNK 8192
>> #define MAXCHUNK 67108864
>>
>> -int dbat_extend(dbat* dbat, int i_mark) {
>> - /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
>> - size_t newsize =
>> MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
>> +int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
>> + size_t newsize;
>> +
>> + if ( forced_size ) {
>> + newsize = forced_size;
>> + } else {
>> + newsize =
>> MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
>> + }
>>
>> /* first check if the number of BUN's < INT_MAX. If this was the case
>> * and the previous time INT_MAX was returned this means the BAT cannot
>> @@ -156,94 +149,34 @@
>> if ( newsize > INT_MAX ) {
>> newsize = INT_MAX;
>>
>> - if ( dbat->max_sz == INT_MAX ) {
>> + if ( dbat->raw_max == INT_MAX ) {
>> GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX)
>> fails\n","incomplete");
>> return -1;
>> }
>> }
>> - if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d ->
>> %d)\n",dbat->name,dbat->max_sz,newsize); }
>> - dbat->max_sz= newsize;
>> +#if 0
>> + stream_printf(GDKout,"dbat_extend[%s](%d ->
>> %d)\n",dbat->name,dbat->raw_max,newsize);
>> +#endif
>> + dbat->raw_max= newsize;
>> + dbat->seq_max = dbat->raw_max + dbat->seqbase;
>> if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
>> GDKerror("dbat_extend: BATextend[\"%s\"](to %d)
>> fails\n","incomplete",newsize);
>> return -1;
>> }
>> dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
>> - /*
>> - * now check if there's a default value handler used
>> - *
>> - */
>> - if ( dbat->dflt ) {
>> - switch( dbat->bat->ttype ) {
>> - case TYPE_int : {
>> - int v = dbat->dflt_int;
>> - int *to = &dbat->cast.intCAST[dbat->max_sz];
>> - for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
>> - *p++ = v;
>> - break;
>> - }
>> - case TYPE_chr: {
>> - chr v = dbat->dflt_chr;
>> - chr *to = &dbat->cast.chrCAST[dbat->max_sz];
>> - for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
>> - *p++ = v;
>> - break;
>> - }
>> - case TYPE_oid: {
>> - oid v = dbat->dflt_oid;
>> - oid *to = &dbat->cast.oidCAST[dbat->max_sz];
>> - for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
>> - *p++ = v;
>> - break;
>> - }
>> - default:
>> - GDKerror("dbat_extend: bad ttype\n");
>> - return -1;
>> - }
>> - }
>> - /* */
>> return 1;
>> }
>>
>> int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
>> - int sizeHint = sizeHint_mark - dbat->oid_mark;
>> - int estimate = dbat->max_i + sizeHint;
>> -
>> - return dbat_extend(dbat, estimate);
>> -}
>> -
>> -INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
>> - register int pos;
>> + int sizeHint = sizeHint_mark - dbat->seqbase;
>> + int estimate = dbat->raw_max + sizeHint;
>>
>> - if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> - dbat->cast.oidCAST[pos] = v;
>> - return 1;
>> - } else {
>> - if ( pos >= dbat->max_sz ) {
>> - if ( dbat_extend(dbat,pos) < 0 )
>> - return -1;
>> - }
>> - dbat->max_i = pos + 1;
>> - dbat->cast.oidCAST[pos] = v;
>> - return 1;
>> - }
>> + return dbat_extend(dbat, estimate, 0);
>> }
>>
>> -INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
>> - register int pos;
>> +#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
>>
>> - if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
>> - dbat->cast.intCAST[pos] = v;
>> - return 1;
>> - } else {
>> - if ( pos >= dbat->max_sz ) {
>> - if ( dbat_extend(dbat,pos) < 0 )
>> - return -1;
>> - }
>> - dbat->max_i = pos + 1;
>> - dbat->cast.intCAST[pos] = v;
>> - return 1;
>> - }
>> -}
>> +#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
>>
>> /************************************************
>> *
>> @@ -315,9 +248,6 @@
>>
>> /************************************************
>> *
>> - *
>> - * First the temporary shredder for Tijah by JF
>> - *
>> */
>>
>> INLINE static oid
>> @@ -328,15 +258,15 @@
>> BUN bun;
>>
>> HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
>> - if ( bun )
>> - /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
>> + if ( bun ) {
>> return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
>> - else {
>> - if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t,
>> FALSE) ) {
>> + } else {
>> + if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t,
>> FALSE) ) {
>> + return tjctx->n_globalTag++;
>> + } else {
>> GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
>> return oid_nil;
>> - } else
>> - return tjctx->n_globalTag++;
>> + }
>> }
>> #endif
>> }
>> @@ -349,10 +279,7 @@
>> BUN bun;
>>
>> HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
>> - if ( bun ) {
>> - return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>> - } else
>> - return oid_nil;
>> + return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
>> }
>>
>> INLINE static oid
>> @@ -366,22 +293,35 @@
>> if ( bun )
>> return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
>> else {
>> - if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t,
>> FALSE)){
>> + if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t,
>> FALSE)){
>> + return tjctx->n_globalTerm++;
>> + } else {
>> GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
>> return oid_nil;
>> - } else
>> - return tjctx->n_globalTerm++;
>> + }
>> }
>> #endif
>> }
>>
>> -#define tj_add2plane(TJCTX,O) \
>> - ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
>> - ? oid_nil : ((oid)(TJCTX)->tijahPre++))
>> +INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
>> + oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase;
>>
>> -#define insertPreSize(TJCTX,POS,SIZE) \
>> - dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
>> + if ( base >= tjctx->dbat_collPre.raw_max ) {
>> + if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
>> + return oid_nil;
>> + /* IMPORTANT: the size of the two bats is synchronized by the use
>> + * of the forced size (last) parameter of dbat_extend
>> + */
>> + if (
>> dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
>> + return oid_nil;
>> + }
>> + return tjctx->tijahPre++;
>> +}
>>
>> +#define tj_newPre(TJCTX) \
>> + (((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
>> + ? \
>> + ((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
>>
>> int
>> handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
>> @@ -397,13 +337,13 @@
>> }
>> }
>> if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
>> - return 0;
>> + return -1;
>> }
>> if ( termOid ) { /* term is not a stopword */
>> - if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> + if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>> return 0;
>> - if ( insertPreSize(tjctx,tjPre,0) < 0 )
>> - return -1;
>> + dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>> + dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\",
>> termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
>> #endif
>> @@ -416,40 +356,13 @@
>> return 1;
>> }
>>
>> -/************
>> - *
>> - * The part where the Strings from Pathfinder are shredded into words
>> - * by Tijah. The USE_FLEX macro determines if the strings is shredded
>> - * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
>> - */
>> -
>> -const char* obsoleteNexiChars = " \t\n\r,:;&[EMAIL PROTECTED]";
>> -
>> -int
>> -useStrtokScanner(tjCtx* tjctx, char* s)
>> -{
>> - char *t;
>> - int sz = 0;
>> -
>> -#ifdef TJ_TRACE
>> - if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
>> -#endif
>> - if ( (t = strtok(s,obsoleteNexiChars)) ) do {
>> - /* not the empty string here */
>> - if ( handleTijahTerm(tjctx,t) < 0 )
>> - return -1;
>> - sz++;
>> - } while ( (t=strtok(NULL,obsoleteNexiChars)) );
>> - return 1;
>> -}
>> -
>> /************************************************
>> *
>> * Now the real output handlers
>> */
>>
>>
>> -#ifdef notused
>> +#if 0
>> static int
>> handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
>> tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>> @@ -502,14 +415,12 @@
>> return (str)str_nil;
>> }
>>
>> -#define GUESSFORCE FALSE
>> -
>> /*
>> * Replace the value of a collection parameter int the collection parameter
>> * bat
>> */
>> static int replaceCollParam(tjCtx* tjctx, str param, str val) {
>> - return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
>> + return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
>> }
>>
>> static BAT*
>> @@ -894,10 +805,10 @@
>> /* if ( DOEMIT(tjctx) ) { */
>> if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
>> return 0;
>> - if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
>> + if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
>> return 0;
>> + dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
>> if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
>> - if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\",
>> termoid=%d, Tijah pre#=%d, Pathfinder
>> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\",
>> termoid=%d, Tijah pre#=%d, Pathfinder
>> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
>> #endif
>> @@ -913,8 +824,7 @@
>> --tjctx->doc_height;
>> oid start = tj_popTag(tjctx); /* oid of the first node of the element */
>> int size = tjctx->tijahPre - start - 1; /* the Tijah element size */
>> - if ( insertPreSize(tjctx,start,size) < 0 )
>> - return 0;
>> + dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement:
>> \"%s\"\n", tjctx->name,"");
>> #endif
>> @@ -934,8 +844,7 @@
>> /* if ( DOEMIT(tjctx) ) { */
>> oid start = tj_popTag(tjctx); /* oid of the first node of the
>> element */
>> int size = tjctx->tijahPre - start - 1; /* the Tijah element size
>> */
>> - if ( insertPreSize(tjctx,start,size) < 0 )
>> - return 0;
>> + dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n",
>> tjctx->name,name);
>> #endif
>> @@ -944,8 +853,6 @@
>> return 1;
>> }
>>
>> -#define USE_FLEX 1
>> -
>> /**
>> * Output generation handler. Handles equivalent of * SAX characters()
>> event.
>> */
>> @@ -954,28 +861,23 @@
>> EMPTY_CHECK;
>> tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>>
>> + register char* p = (char*)ch;
>> + while( *p && isspace(*p) ) p++;
>> + if ( !*p )
>> + return 1;
>> #ifdef TJ_TRACE
>> - if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s)
>> start\n",tjctx->name, (char*)ch);
>> + if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s)
>> start\n",tjctx->name, p);
>> #endif
>>
>> if ( DOEMIT(tjctx) ) {
>> -#ifdef USE_FLEX
>> - return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
>> -#else
>> - return useStrtokScanner(tjctx,(char*)ch);
>> -#endif
>> + return useFlexScanner(p,tjctx);
>> }
>> return 1;
>> }
>>
>> char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
>> char *res;
>> -#ifdef USE_FLEX
>> - res = flexScanOneTerm((char*)term,strlen((char*)term));
>> -#else
>> - res = strtok(term,obsoleteNexiChars);
>> -#endif
>> - /* INCOMPLETE, should make shure tijahContext is always avail. here */
>> + res = flexScanOneTerm((char*)term);
>> if ( res && tjctx && tjctx->stemCtx->stem) {
>> if ( !(res =
>> (char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
>> /* must be a stopword */
>> @@ -986,14 +888,6 @@
>> }
>>
>> int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
>> -//Leave tokenization disabled for now
>> -// char* tokenized;
>> -//#ifdef USE_FLEX
>> -// tokenized = flexScanOneTerm(term,strlen(term));
>> -//#else
>> -// tokenized = strtok(term,obsoleteNexiChars);
>> -//#endif
>> -
>> tjStemCtx* stemCtx = getStemmingContext( stemmer );
>>
>> if ( stemCtx->stem ) {
>> @@ -1123,13 +1017,9 @@
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH
>> INDEXING\n",tjctx->name);
>> #endif
>> -
>> - /* feature not used anymore ????? */
>> - if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
>> - insertPreSize(tjctx,0,tjctx->tijahPre - 1);
>> - if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
>> + if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
>> return GDK_FAIL;
>> - if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
>> + if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
>> return GDK_FAIL;
>> #ifdef TJ_TRACE
>> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT
>> BATS\n",tjctx->name);
>>
>> Index: pftijah_tokenize.l
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
>> retrieving revision 1.12
>> retrieving revision 1.13
>> diff -u -d -r1.12 -r1.13
>> --- pftijah_tokenize.l 9 Jan 2007 15:44:39 -0000 1.12
>> +++ pftijah_tokenize.l 27 Feb 2007 15:43:37 -0000 1.13
>> @@ -115,7 +115,40 @@
>>
>> %%
>>
>> -int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
>> +int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> + /* UPDATE: this delivers very strange testset results and should not be
>> + * used I think.
>> + */
>> + /* This is an optimized version of the flex scanner which does not copy
>> the
>> + * input buffer. The only strange thing about this interface is that it
>> + * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
>> + * size of the buffer is inclusive the 2 0's.
>> + * The last zero is toggled with its original value to prevent corruption
>> + * of memory management tables. This was for me the only way to prevent
>> + * copying here.
>> + */
>> + int len = strlen(buf);
>> + char remember = buf[len+1];
>> + buf[len+1] = YY_END_OF_BUFFER_CHAR;
>> + YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
>> +
>> + if ( !myBuf ) {
>> + stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy
>> buffer.");
>> + return 0;
>> + }
>> + while ( pftijah_tokenizelex() ) {
>> + /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
>> + if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> + return 0;
>> + }
>> + yy_delete_buffer(myBuf);
>> + buf[len+1] = remember;
>> + return 1;
>> +}
>> +
>> +int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
>> + // the original
>> + int len = strlen(buf);
>> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>> while (pftijah_tokenizelex()) {
>> if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
>> @@ -125,6 +158,40 @@
>> return 1;
>> }
>>
>> +int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
>> +{
>> + /* the fast function. This function is in the pftijah context with lots
>> + * of small strings to tokenize many times faster as the flex and the
>> + * strtok() methods which seem to have a rather larger overhead
>> + */
>> + register char* s = input;
>> + register char x;
>> +// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if
>> (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
>> +
>> + while ( 1 ) {
>> + while ( isspace( *s ) ) s++;
>> + if ( *s ) {
>> + char* base = s;
>> + if ( isalnum(*s) ) {
>> + if ( isdigit(*s) ) {
>> + while ( isdigit(*++s) ) ;
>> + EMIT;
>> + } else {
>> + if (isupper(*s)) *s=tolower(*s);
>> + while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
>> + EMIT;
>> + }
>> + } else {
>> + // INCOMPLETE, ENTITIES HERE
>> + // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
>> + s++;
>> + }
>> + } else
>> + return 1;
>> + }
>> +}
>> +
>> char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
>> int cnt = 0;
>> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>> @@ -137,9 +204,10 @@
>> return outbuf;
>> }
>>
>> -char* flexScanOneTerm(char* buf, int len) {
>> +char* flexScanOneTerm(char* buf) {
>> char *res;
>> char resBUFF[256];
>> + int len = strlen(buf);
>>
>> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
>> if ( pftijah_tokenizelex() ) {
>>
>> Index: nexi.c
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
>> retrieving revision 1.49
>> retrieving revision 1.50
>> diff -u -d -r1.49 -r1.50
>> --- nexi.c 23 Feb 2007 15:11:05 -0000 1.49
>> +++ nexi.c 27 Feb 2007 15:43:37 -0000 1.50
>> @@ -455,6 +455,7 @@
>> /*
>> * Now find out if the collection is fragmented or not.
>> */
>> + /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
>> BAT* fb =
>> pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
>> if ( ! fb ) {
>> stream_printf(GDKerr,"Error: cannot find fragments bat for
>> collection \"%s\".\n",parserCtx->collection);
>> @@ -471,6 +472,8 @@
>> parserCtx->ffPfx = "";
>> parserCtx->flastPfx = ", str(1)";
>> }
>> + BBPunfix(BBPcacheid(fb));
>> + fb = NULL;
>> // Some special cases for NLLR, since NLLR only works with COARSE2 at
>> the moment
>> if ( txt_retr_model->model == MODEL_NLLR ) {
>> // Switch to COARSE2 algebra for NLLR
>>
>> Index: pftijah_util.mx
>> ===================================================================
>> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
>> retrieving revision 1.2
>> retrieving revision 1.3
>> diff -u -d -r1.2 -r1.3
>> --- pftijah_util.mx 9 Jan 2007 17:15:23 -0000 1.2
>> +++ pftijah_util.mx 27 Feb 2007 15:43:37 -0000 1.3
>> @@ -73,6 +73,7 @@
>> if ( b == bat_nil ) {
>> return NULL;
>> } else {
>> + BBPfix(b);
>> return BBPdescriptor(b);
>> }
>> }
>>
>>
>> -------------------------------------------------------------------------
>> Take Surveys. Earn Cash. Influence the Future of IT
>> Join SourceForge.net's Techsay panel and you'll get the chance to share your
>> opinions on IT & business topics through brief surveys-and earn cash
>> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
>> _______________________________________________
>> Monetdb-pf-checkins mailing list
>> [email protected]
>> https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins
>
>
>--
>Sjoerd Mullender
>
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Monetdb-developers mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-developers