Inspired by the recent heap overrun patch, I propose to make all position_sets grow dynamically. What do you think?
Paolo Bonzini (5): dfa: use a separate data type for grps dfa: introduce alloc_posset dfa: remove dead assignment dfa: move nalloc to position_set structure dfa: automatically resize position_sets src/dfa.c | 85 +++++++++++++++++++++++++++++++++---------------------------- 1 files changed, 46 insertions(+), 39 deletions(-) -- 1.7.5.2 >From 765d61f65a0b4a193fd712382e85b786222a8fc4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <[email protected]> Date: Tue, 28 Jun 2011 09:38:55 +0200 Subject: [PATCH 1/5] dfa: use a separate data type for grps * src/dfa.c (leaf_set): New. (dfastate): Use leaf_set for grps, as the constraint field is unused. --- src/dfa.c | 29 +++++++++++++++++++---------- 1 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index a36b80a..d875517 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -244,6 +244,13 @@ typedef struct int nelem; /* Number of elements in this set. */ } position_set; +/* Sets of leaves are also stored as arrays. */ +typedef struct +{ + unsigned int *elems; /* Elements of this position set. */ + int nelem; /* Number of elements in this set. */ +} leaf_set; + /* A state of the dfa consists of a set of positions, some flags, and the token value of the lowest-numbered position of the state that contains an END token. */ @@ -2356,7 +2363,7 @@ dfaanalyze (struct dfa *d, int searchflag) void dfastate (int s, struct dfa *d, int trans[]) { - position_set *grps; /* As many as will ever be needed. */ + leaf_set *grps; /* As many as will ever be needed. */ charclass *labels; /* Labels corresponding to the groups. */ int ngrps = 0; /* Number of groups actually used. */ position pos; /* Current position being considered. */ @@ -2483,14 +2490,16 @@ dfastate (int s, struct dfa *d, int trans[]) { copyset(leftovers, labels[ngrps]); copyset(intersect, labels[j]); - MALLOC(grps[ngrps].elems, position, d->nleaves); - copy(&grps[j], &grps[ngrps]); + MALLOC(grps[ngrps].elems, unsigned int, d->nleaves); + memcpy(grps[ngrps].elems, grps[j].elems, + grps[j].nelem * sizeof(unsigned int)); + grps[ngrps].nelem = grps[j].nelem; ++ngrps; } - /* Put the position in the current group. Note that there is no - reason to call insert() here. */ - grps[j].elems[grps[j].nelem++] = pos; + /* Put the position in the current group. The constraint is + irrelevant here. */ + grps[j].elems[grps[j].nelem++] = pos.index; /* If every character matching the current position has been accounted for, we're done. */ @@ -2504,9 +2513,9 @@ dfastate (int s, struct dfa *d, int trans[]) { copyset(matches, labels[ngrps]); zeroset(matches); - MALLOC(grps[ngrps].elems, position, d->nleaves); + MALLOC(grps[ngrps].elems, unsigned int, d->nleaves); grps[ngrps].nelem = 1; - grps[ngrps].elems[0] = pos; + grps[ngrps].elems[0] = pos.index; ++ngrps; } } @@ -2553,8 +2562,8 @@ dfastate (int s, struct dfa *d, int trans[]) /* Find the union of the follows of the positions of the group. This is a hideously inefficient loop. Fix it someday. */ for (j = 0; j < grps[i].nelem; ++j) - for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) - insert(d->follows[grps[i].elems[j].index].elems[k], &follows); + for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) + insert(d->follows[grps[i].elems[j]].elems[k], &follows); #if MBS_SUPPORT if (d->mb_cur_max > 1) -- 1.7.5.2 >From 635579f5fe9aa92fc781eb21901ef08f8be13a48 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <[email protected]> Date: Tue, 28 Jun 2011 09:02:51 +0200 Subject: [PATCH 2/5] dfa: introduce alloc_posset * src/dfa.c (alloc_posset): New function, use it throughout. --- src/dfa.c | 25 ++++++++++++++----------- 1 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index d875517..4ad3a9b 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1842,6 +1842,13 @@ copy (position_set const *src, position_set *dst) dst->nelem = src->nelem; } +static void +alloc_posset (position_set *s, size_t size) +{ + MALLOC(s->elems, position, size); + s->nelem = 0; +} + /* Insert a position in a set. Position sets are maintained in sorted order according to index. If position already exists in the set with the same index then their constraints are logically or'd together. @@ -1945,7 +1952,7 @@ state_index (struct dfa *d, position_set const *s, int newline, int letter) /* We'll have to create a new state. */ REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); d->states[i].hash = hash; - MALLOC(d->states[i].elems.elems, position, s->nelem); + alloc_posset(&d->states[i].elems, s->nelem); copy(s, &d->states[i].elems); d->states[i].newline = newline; d->states[i].letter = letter; @@ -2139,7 +2146,7 @@ dfaanalyze (struct dfa *d, int searchflag) MALLOC(lastpos, position, d->nleaves); o_lastpos = lastpos, lastpos += d->nleaves; CALLOC(nalloc, int, d->tindex); - MALLOC(merged.elems, position, 2 * d->nleaves); + alloc_posset(&merged, d->nleaves); CALLOC(d->follows, position_set, d->tindex); @@ -2249,7 +2256,7 @@ dfaanalyze (struct dfa *d, int searchflag) /* Allocate the follow set for this position. */ nalloc[i] = 1; - MALLOC(d->follows[i].elems, position, nalloc[i]); + alloc_posset(&d->follows[i], nalloc[i]); break; } #ifdef DEBUG @@ -2419,10 +2426,7 @@ dfastate (int s, struct dfa *d, int trans[]) must put it to d->states[s].mbps, which contains the positions which can match with a single character not a byte. */ if (d->states[s].mbps.nelem == 0) - { - MALLOC(d->states[s].mbps.elems, position, - d->states[s].elems.nelem); - } + alloc_posset(&d->states[s].mbps, d->states[s].elems.nelem); insert(pos, &(d->states[s].mbps)); continue; } @@ -2520,8 +2524,8 @@ dfastate (int s, struct dfa *d, int trans[]) } } - MALLOC(follows.elems, position, d->nleaves); - MALLOC(tmp.elems, position, d->nleaves); + alloc_posset(&follows, d->nleaves); + alloc_posset(&tmp, d->nleaves); /* If we are a searching matcher, the default transition is to a state containing the positions of state 0, otherwise the default transition @@ -3145,8 +3149,7 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) } /* This state has some operators which can match a multibyte character. */ - follows.nelem = 0; - MALLOC(follows.elems, position, d->nleaves); + alloc_posset(&follows, d->nleaves); /* `maxlen' may be longer than the length of a character, because it may not be a character but a (multi character) collating element. -- 1.7.5.2 >From 690906eb043236efa06aa1cdc83d980717a0b554 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <[email protected]> Date: Tue, 28 Jun 2011 09:58:37 +0200 Subject: [PATCH 3/5] dfa: remove dead assignment * src/dfa.c (transit_state): transit_state_consume_1char will clear follows, do not do this ourselves. --- src/dfa.c | 1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 4ad3a9b..f174cc9 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3163,7 +3163,6 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) while (*pp - p1 < maxlen) { - follows.nelem = 0; transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); for (i = 0; i < nelem ; i++) -- 1.7.5.2 >From d65738a921827089714c78bcc483bbfc17d721b4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <[email protected]> Date: Tue, 28 Jun 2011 09:07:44 +0200 Subject: [PATCH 4/5] dfa: move nalloc to position_set structure * src/dfa.c (position_set): Add alloc. (alloc_posset): Initialize it. (dfaanalyze): Use it instead of the nalloc array or nelem. --- src/dfa.c | 16 +++++++--------- 1 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index f174cc9..2d3bb6f 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -242,6 +242,7 @@ typedef struct { position *elems; /* Elements of this position set. */ int nelem; /* Number of elements in this set. */ + int alloc; /* Number of elements allocated in ELEMS. */ } position_set; /* Sets of leaves are also stored as arrays. */ @@ -1846,6 +1847,7 @@ static void alloc_posset (position_set *s, size_t size) { MALLOC(s->elems, position, size); + s->alloc = size; s->nelem = 0; } @@ -2113,7 +2115,6 @@ dfaanalyze (struct dfa *d, int searchflag) position *firstpos; /* Array where firstpos elements are stored. */ int *nlastpos; /* Element count stack for lastpos sets. */ position *lastpos; /* Array where lastpos elements are stored. */ - int *nalloc; /* Sizes of arrays allocated to follow sets. */ position_set tmp; /* Temporary set for merging sets. */ position_set merged; /* Result of merging sets. */ int wants_newline; /* True if some position wants newline info. */ @@ -2145,7 +2146,6 @@ dfaanalyze (struct dfa *d, int searchflag) o_nlast = nlastpos; MALLOC(lastpos, position, d->nleaves); o_lastpos = lastpos, lastpos += d->nleaves; - CALLOC(nalloc, int, d->tindex); alloc_posset(&merged, d->nleaves); CALLOC(d->follows, position_set, d->tindex); @@ -2175,7 +2175,7 @@ dfaanalyze (struct dfa *d, int searchflag) { merge(&tmp, &d->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, - nalloc[pos[j].index], merged.nelem - 1); + d->follows[pos[j].index].alloc, merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } @@ -2195,7 +2195,7 @@ dfaanalyze (struct dfa *d, int searchflag) { merge(&tmp, &d->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, - nalloc[pos[j].index], merged.nelem - 1); + d->follows[pos[j].index].alloc, merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } @@ -2255,8 +2255,7 @@ dfaanalyze (struct dfa *d, int searchflag) firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; /* Allocate the follow set for this position. */ - nalloc[i] = 1; - alloc_posset(&d->follows[i], nalloc[i]); + alloc_posset(&d->follows[i], 1); break; } #ifdef DEBUG @@ -2304,8 +2303,8 @@ dfaanalyze (struct dfa *d, int searchflag) #endif copy(&d->follows[i], &merged); epsclosure(&merged, d); - if (d->follows[i].nelem < merged.nelem) - REALLOC(d->follows[i].elems, position, merged.nelem); + REALLOC_IF_NECESSARY(d->follows[i].elems, position, + d->follows[i].alloc, merged.nelem); copy(&merged, &d->follows[i]); } @@ -2333,7 +2332,6 @@ dfaanalyze (struct dfa *d, int searchflag) free(o_firstpos); free(o_nlast); free(o_lastpos); - free(nalloc); free(merged.elems); } -- 1.7.5.2 >From 0348452c8b7830bd381c44dc304e96e2a2ed47f5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <[email protected]> Date: Tue, 28 Jun 2011 09:07:44 +0200 Subject: [PATCH 5/5] dfa: automatically resize position_sets * src/dfa.c (insert, copy, merge): Resize arrays here. (dfaanalyze): Do not track number of allocated elements here. (dfastate): Allocate mbps with only one element. --- src/dfa.c | 26 ++++++++++++-------------- 1 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 2d3bb6f..04c127e 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1839,6 +1839,7 @@ dfaparse (char const *s, size_t len, struct dfa *d) static void copy (position_set const *src, position_set *dst) { + REALLOC_IF_NECESSARY(dst->elems, position, dst->alloc, src->nelem - 1); memcpy(dst->elems, src->elems, sizeof(dst->elems[0]) * src->nelem); dst->nelem = src->nelem; } @@ -1860,6 +1861,7 @@ insert (position p, position_set *s) { int count = s->nelem; int lo = 0, hi = count; + int i; while (lo < hi) { int mid = ((unsigned) lo + (unsigned) hi) >> 1; @@ -1870,15 +1872,16 @@ insert (position p, position_set *s) } if (lo < count && p.index == s->elems[lo].index) - s->elems[lo].constraint |= p.constraint; - else { - int i; - for (i = count; i > lo; i--) - s->elems[i] = s->elems[i - 1]; - s->elems[lo] = p; - ++s->nelem; + s->elems[lo].constraint |= p.constraint; + return; } + + REALLOC_IF_NECESSARY(s->elems, position, s->alloc, count); + for (i = count; i > lo; i--) + s->elems[i] = s->elems[i - 1]; + s->elems[lo] = p; + ++s->nelem; } /* Merge two sets of positions into a third. The result is exactly as if @@ -1888,6 +1891,7 @@ merge (position_set const *s1, position_set const *s2, position_set *m) { int i = 0, j = 0; + REALLOC_IF_NECESSARY(m->elems, position, m->alloc, s1->nelem + s2->nelem - 1); m->nelem = 0; while (i < s1->nelem && j < s2->nelem) if (s1->elems[i].index > s2->elems[j].index) @@ -2174,8 +2178,6 @@ dfaanalyze (struct dfa *d, int searchflag) for (j = 0; j < nlastpos[-1]; ++j) { merge(&tmp, &d->follows[pos[j].index], &merged); - REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, - d->follows[pos[j].index].alloc, merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } @@ -2194,8 +2196,6 @@ dfaanalyze (struct dfa *d, int searchflag) for (j = 0; j < nlastpos[-2]; ++j) { merge(&tmp, &d->follows[pos[j].index], &merged); - REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, - d->follows[pos[j].index].alloc, merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } @@ -2303,8 +2303,6 @@ dfaanalyze (struct dfa *d, int searchflag) #endif copy(&d->follows[i], &merged); epsclosure(&merged, d); - REALLOC_IF_NECESSARY(d->follows[i].elems, position, - d->follows[i].alloc, merged.nelem); copy(&merged, &d->follows[i]); } @@ -2424,7 +2422,7 @@ dfastate (int s, struct dfa *d, int trans[]) must put it to d->states[s].mbps, which contains the positions which can match with a single character not a byte. */ if (d->states[s].mbps.nelem == 0) - alloc_posset(&d->states[s].mbps, d->states[s].elems.nelem); + alloc_posset(&d->states[s].mbps, 1); insert(pos, &(d->states[s].mbps)); continue; } -- 1.7.5.2
