On Mon Dec 30 05:12:16 EST 2013, dexen.devr...@gmail.com wrote:
> hi list,
> 
> 
> both behavior and code indicate that split(1)'s  `-e' (split by regular 
> expression) doesn't play along with either `-n' (line count) or `-f' (output 
> file prefix). the former is somewhat understandable, but the latter is strange 
> in light of `-s' (output file suffix) working just fine.
> 
> is that by accident or is there some rationale?

i think the answer is a little bit of both.  it's easy to make split support
mixing any number of regular expressions with one line count.  (i believe
using -f with -e works already, unless you want a prefix even for re-matched
files.)

proposed version attached

- erik

---
; whatis x xx
x=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20)
fn xx {for(i in $x)echo $i}
; xx | $home/v.split -e '^7$' -n 10
; for(i in *)echo $i && mc $i
xaa
1       2       3       4       5       6
xab
7       8       9       10
xac
11      12      13      14      15      16      17      18      19      20

; xx|$home^/v.split -f X -e '^7$' -n 10
; lc
Xaa     Xab     Xac

; xx|$home^/v.split -e '^(1)$' -e '^(7)$'  -n 10
; for(i in *)echo $i && mc $i
1
1       2       3       4       5       6
7
7       8       9       10
xaa
11      12      13      14      15      16      17      18      19      20
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>

/* State shared between main and the file-naming helpers below. */
char    digit[] = "0123456789";         /* not referenced by the code visible here */
char    *suffix = "";                   /* -s: appended to names taken from a pattern's subexpression */
char    *stem = "x";                    /* -f: prefix of the sequentially generated names */
char    suff[] = "aa";                  /* not referenced by the code visible here */
char    name[200];                      /* name of the file openf() will create next */
Biobuf  bout;                           /* buffer that backs output */
Biobuf  *output;                        /* current output file; nil until the first one is opened */
int     iflag;                          /* -i: fold case before matching (see xlower) */
int     xflag;                          /* -x: lines that match a pattern are not copied */

/* Forward declarations for the helpers defined after main. */
void openf(void);
int nextf(void);
int matchfile(Resub*);          /* NOTE(review): no definition in the visible source; looks stale */
int matching(Reprog**, int, char*);
char* xlower(char*);
void usage(void);

/*
 * split: copy the input (standard input, or the one file named on the
 * command line) into a series of output files.
 *
 * A new output file is started when a line matches one of the -e
 * expressions, or when the line counter exceeds n.  If neither -n nor
 * -e is given, n defaults to 1000.
 *
 *      -n, -l num      line count that triggers a new sequential file
 *      -e exp          start a new file at lines matching exp;
 *                      may be given up to nelem(pat) times
 *      -f stem         prefix for sequentially named files
 *      -s suff         suffix for files named from a subexpression
 *      -x              do not copy the matching line itself
 *      -i              fold case before matching
 */
void
main(int argc, char *argv[])
{
        char *pat[25], *line, buf[256];
        int i, n, npat, lineno;
        Biobuf bin, *b;
        Reprog *re[25];

        n = 0;
        b = &bin;
        npat = 0;

        ARGBEGIN {
        case 'l':
        case 'n':
                n=atoi(EARGF(usage()));
                break;
        case 'e':
                if(npat == nelem(pat))
                        sysfatal("split: too many pats");
                pat[npat++] = EARGF(usage());
                break;
        case 'f':
                stem = EARGF(usage());
                break;
        case 's':
                suffix = EARGF(usage());
                break;
        case 'x':
                xflag++;
                break;
        case 'i':
                iflag++;
                break;
        default:
                usage();
                break;

        } ARGEND;

        if(argc > 1)
                usage();
        else if(argc == 0)
                Binit(b, 0, OREAD);
        else{
                b = Bopen(argv[0], OREAD);
                if(b == nil)
                        sysfatal("split: Bopen %s: %r", argv[0]);
        }

        /* default */
        if(n == 0 && npat == 0)
                n = 1000;

        /* prepare regular expressions */
        for(i = 0; i < npat; i++){
                re[i] = regcomp(xlower(pat[i]));
                if(re[i] == nil)
                        sysfatal("split: bad regular expression: %s", pat[i]);
        }

        /*
         * NOTE(review): lineno is reset only by a count-driven split, not
         * by a pattern-driven one, so the file opened by a match receives
         * fewer than n lines before the next count split -- confirm that
         * this is the intended interaction of -e with -n.
         */
        lineno = 0;
        while((line = Brdline(b,'\n')) != nil) {
                /* replace the newline so the line can be matched as a string */
                line[Blinelen(b)-1] = 0;
                lineno++;
                if(matching(re, npat, line)){
                        /* matching() has already switched to a new file */
                        if(xflag)
                                continue;
                }else if(n > 0 && lineno > n){
                        nextf();
                        lineno = 1;
                }else if(output == nil)
                        nextf();
                Bwrite(output, line, Blinelen(b)-1);
                Bputc(output, '\n');
        }

        /*
         * Brdline gave up before the end of the data (for instance a last
         * line with no newline): append the remainder to the current file.
         * If no complete line was ever read there is no current file yet,
         * so open the first one rather than writing through a nil output.
         */
        while((n = Bread(b, buf, sizeof(buf))) > 0){
                if(output == nil)
                        nextf();
                Bwrite(output, buf, n);
        }
        Bterm(b);
        exits("");
}

/* Sequential file names end in two letters, running from "aa" to "zz". */
enum {
        Base    = 26,                   /* choices per letter position */
        Last    = Base*Base - 1,        /* highest sequence number, i.e. "zz" */
};

int
nextf(void)
{
        static int once, seq;

        if(seq > Last){
                if(!once)
                        fprint(2, "split: file %szz not split\n", stem);
                once = 1;
                return 0;
        }
        snprint(name, sizeof name, "%s%c%c", stem, 'a'+seq/26, 'a'+seq%26);
        seq++;
        openf();
        return 1;
}

/*
 * Make the file called name the current output.  The previously opened
 * output, if there is one, is flushed and closed first.  Exits via
 * sysfatal if the file cannot be created.
 */
void
openf(void)
{
        static int cur = -1;    /* descriptor behind output, -1 before the first call */
        int fd;

        if(cur >= 0){
                Bterm(output);
                close(cur);
        }

        fd = create(name, OWRITE,0666);
        cur = fd;
        if(fd < 0)
                sysfatal("split: can't create %s: %r", name);

        Binit(&bout, fd, OWRITE);
        output = &bout;
}

/*
 * Try line against each of the nre compiled expressions in re, in order,
 * and start a new output file on the first one that matches.
 *
 * If the expression's first parenthesized subexpression took part in the
 * match, the new file is named by exactly the text it matched, followed
 * by suffix; otherwise the next sequential name is used.  With -i the
 * match runs on the case-folded copy of the line, so the name comes from
 * that copy.
 *
 * Returns 1 when a named file was opened, the result of nextf() when the
 * sequential name was used, and 0 when nothing matched.
 */
int
matching(Reprog **re, int nre, char *line)
{
        char *p;
        int i, len;
        Resub m[2];

        p = xlower(line);
        for(i = 0; i < nre; i++){
                /* clear the slots so no stale pointers are handed to regexec */
                memset(m, 0, sizeof m);
                if(regexec(re[i], p, m, nelem(m))){
                        if(m[1].sp == nil)
                                return nextf();
                        /*
                         * The subexpression is not NUL-terminated inside the
                         * line, so copy just its bytes; a bare field width
                         * ("%*s") would drag in the rest of the line.
                         */
                        len = m[1].ep - m[1].sp;
                        if(len >= (int)sizeof name)
                                len = sizeof name - 1;
                        memmove(name, m[1].sp, len);
                        snprint(name+len, sizeof name - len, "%s", suffix);
                        openf();
                        return 1;
                }
        }
        return 0;
}

/*
 * Case folding for -i.  Without -i the argument is returned untouched.
 * With -i the string is copied, rune by rune, in lower case into a
 * static buffer and that buffer is returned; each call overwrites the
 * result of the previous one.
 *
 * The copy is bounded: a string whose folded form does not fit in the
 * buffer is cut short (and still NUL-terminated) instead of running
 * past the end of the buffer.
 */
char*
xlower(char *s)
{
        char *p, *e;
        Rune r;
        static char buf[1024*UTFmax];

        if(!iflag)
                return s;
        p = buf;
        /* last position that still leaves room for one rune and a NUL */
        e = buf + sizeof buf - UTFmax - 1;
        for(;;){
                if(p > e){
                        *p = 0;
                        break;
                }
                if((uchar)*s < 0x80){
                        /* ASCII, including the terminating NUL */
                        *p++ = tolower(*s);
                        if(*s++ == 0)
                                break;
                }
                else{
                        s += chartorune(&r, s);
                        r = tolowerrune(r);
                        p += runetochar(p, &r);
                }
        }
        return buf;
}

/*
 * Print the command synopsis on standard error and exit with status
 * "usage".  The format string is kept on one line: a string literal
 * may not contain a raw newline.
 */
void
usage(void)
{
        fprint(2, "usage: split [-n num] [-e exp] [-f stem] [-s suff] [-x] [-i] [file]\n");
        exits("usage");
}

/*
 * Report an invalid regular expression on standard error and exit,
 * using the same text as the exit status.
 * Nothing in the visible source calls this.
 */
void
badexp(void)
{
        char *why;

        why = "bad regular expression";
        fprint(2, "split: %s\n", why);
        exits(why);
}

Reply via email to