On Mon Dec 30 05:12:16 EST 2013, dexen.devr...@gmail.com wrote:
> hi list,
>
>
> both behavior and code indicate that split(1)'s `-e' (split by regular
> expression) doesn't play along with either `-n' (line count) or `-f' (output
> file prefix). the former is somewhat understandable, but the latter is strange
> in light of `-s' (output file suffix) working just fine.
>
> is that by accident or is there some rationale?
i think the answer is a little bit of both. it's easy to make split support
mixing any number of regular expressions with one line count. (i believe
using -f with -e works already, unless you want a prefix even for re-matched
files.)
proposed version attached
- erik
---
; whatis x xx
x=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20)
fn xx {for(i in $x)echo $i}
; xx | $home/v.split -e '^7$' -n 10
; for(i in *)echo $i && mc $i
xaa
1 2 3 4 5 6
xab
7 8 9 10
xac
11 12 13 14 15 16 17 18 19 20
; xx|$home^/v.split -f X -e '^7$' -n 10
; lc
Xaa Xab Xac
; xx|$home^/v.split -e '^(1)$' -e '^(7)$' -n 10
; for(i in *)echo $i && mc $i
1
1 2 3 4 5 6
7
7 8 9 10
xaa
11 12 13 14 15 16 17 18 19 20
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
/* not referenced by the code visible in this file */
char digit[] = "0123456789";
/* -s: appended to file names that matching() builds from a subexpression */
char *suffix = "";
/* -f: prefix of the sequentially named files produced by nextf() */
char *stem = "x";
/* not referenced by the code visible in this file */
char suff[] = "aa";
/* name of the output file to create; filled in by nextf() or matching(), consumed by openf() */
char name[200];
/* storage for the single output buffer; output points here once a file is open */
Biobuf bout;
/* current output, nil until openf() has run for the first time */
Biobuf *output;
/* -i: when set, xlower() folds patterns and lines to lower case before matching */
int iflag;
/* -x: when set, main() does not copy lines that matched a pattern */
int xflag;
void openf(void);
int nextf(void);
/* NOTE(review): no definition of matchfile is visible in this file */
int matchfile(Resub*);
int matching(Reprog**, int, char*);
char* xlower(char*);
void usage(void);
/*
 * split [-n num] [-e exp] ... [-f stem] [-s suff] [-x] [-i] [file]
 *
 * Copy the input (standard input, or the single named file) line by line
 * into a series of output files.  A new file is started when a line
 * matches one of the -e patterns (see matching()) or, if a line count was
 * given with -n (or -l), when the count for the current file is exceeded.
 * With neither -e nor -n the line count defaults to 1000.
 */
void
main(int argc, char *argv[])
{
	char *pat[25], *line, buf[256];
	int i, n, npat, lineno;
	Biobuf bin, *b;
	Reprog *re[25];

	n = 0;
	b = &bin;
	npat = 0;
	ARGBEGIN {
	case 'l':	/* accepted as a synonym for -n */
	case 'n':
		n=atoi(EARGF(usage()));
		break;
	case 'e':
		if(npat == nelem(pat))
			sysfatal("split: too many pats");
		pat[npat++] = EARGF(usage());
		break;
	case 'f':
		stem = EARGF(usage());
		break;
	case 's':
		suffix = EARGF(usage());
		break;
	case 'x':
		xflag++;
		break;
	case 'i':
		iflag++;
		break;
	default:
		usage();
		break;
	} ARGEND;

	if(argc > 1)
		usage();
	else if(argc == 0)
		Binit(b, 0, OREAD);
	else{
		b = Bopen(argv[0], OREAD);
		if(b == nil)
			sysfatal("split: Bopen %s: %r", argv[0]);
	}

	/* default */
	if(n == 0 && npat == 0)
		n = 1000;

	/*
	 * prepare regular expressions; this happens after all flags are
	 * parsed, so -i applies to every pattern regardless of flag order
	 */
	for(i = 0; i < npat; i++){
		re[i] = regcomp(xlower(pat[i]));
		if(re[i] == nil)
			sysfatal("split: bad regular expression: %s", pat[i]);
	}

	lineno = 0;
	while((line = Brdline(b,'\n')) != nil) {
		/* replace the newline so the line can be matched as a string */
		line[Blinelen(b)-1] = 0;
		lineno++;
		if(matching(re, npat, line)){
			/* matching() has already switched output files */
			if(xflag)
				continue;	/* -x: drop the matched line itself */
		}else if(n > 0 && lineno > n){
			nextf();
			lineno = 1;
		}else if(output == nil)
			nextf();
		Bwrite(output, line, Blinelen(b)-1);
		Bputc(output, '\n');
	}

	/*
	 * Whatever Brdline would not hand over (presumably a final line
	 * with no newline; see bio(2)) is appended to the last file.
	 * If no complete line was ever read there is no output file yet,
	 * so open the first one instead of writing through a nil pointer.
	 */
	while((n = Bread(b, buf, sizeof(buf))) > 0){
		if(output == nil)
			nextf();
		Bwrite(output, buf, n);
	}
	Bterm(b);
	exits("");
}
enum {
	Base = 26,		/* letters available per suffix position */
	Last = Base*Base - 1,	/* sequence number of the final name, <stem>zz */
};

/*
 * Open the next sequentially named output file: stem followed by two
 * letters, aa, ab, ... zz.  Returns 1 after opening a file.  Once all
 * names are used up it returns 0 without touching the current output,
 * printing a diagnostic the first time that happens.
 */
int
nextf(void)
{
	static int warned, next;

	if(next <= Last){
		snprint(name, sizeof name, "%s%c%c", stem, 'a' + next/Base, 'a' + next%Base);
		next++;
		openf();
		return 1;
	}
	if(!warned)
		fprint(2, "split: file %szz not split\n", stem);
	warned = 1;
	return 0;
}
/*
 * Switch output to a freshly created file called name (mode 0666).
 * The previously opened file, if any, is finished with Bterm and closed
 * first.  Failure to create the file is fatal.
 */
void
openf(void)
{
	static int outfd = -1;	/* descriptor behind bout; -1 before the first call */

	if(outfd >= 0){
		Bterm(output);
		close(outfd);
	}
	if((outfd = create(name, OWRITE, 0666)) < 0)
		sysfatal("split: can't create %s: %r", name);
	Binit(&bout, outfd, OWRITE);
	output = &bout;
}
/*
 * Try line against the nre compiled patterns in re, in order, and start
 * a new output file for the first pattern that matches.
 *
 * If the pattern has a parenthesized subexpression, the new file is named
 * after the text that subexpression matched, followed by suffix; otherwise
 * the next sequential name from nextf() is used.  The line is passed
 * through xlower() first, so with -i the name is taken from the
 * lower-cased copy of the line.
 *
 * Returns 1 if a new file was opened, and 0 if nothing matched or if
 * nextf() could not supply another file.
 */
int
matching(Reprog **re, int nre, char *line)
{
	char *p;
	int i, len;
	Resub m[2];

	p = xlower(line);
	for(i = 0; i < nre; i++){
		/* clear m so a missing subexpression shows up as m[1].sp == nil */
		memset(m, 0, sizeof m);
		if(!regexec(re[i], p, m, nelem(m)))
			continue;
		if(m[1].sp == nil)
			return nextf();
		len = m[1].ep - m[1].sp;
		/*
		 * m[1].sp is not terminated at m[1].ep, so the length must be
		 * given as a precision (%.*s).  A plain %*s only sets a minimum
		 * width and would copy the rest of the line into the name.
		 */
		snprint(name, sizeof name, "%.*s%s", len, m[1].sp, suffix);
		openf();
		return 1;
	}
	return 0;
}
char*
xlower(char *s)
{
char *p;
Rune r;
static char buf[1024*UTFmax];
if(!iflag)
return s;
p = buf;
for(;;){
if((uchar)*s < 0x80){
*p++ = tolower(*s);
if(*s++ == 0)
break;
}
else{
s += chartorune(&r, s);
r = tolowerrune(r);
p += runetochar(p, &r);
}
}
return buf;
}
/*
 * Print the command synopsis on standard error and exit with status
 * "usage".  The message is kept on one source line: a string literal
 * may not contain a raw line break.
 */
void
usage(void)
{
	fprint(2, "usage: split [-n num] [-e exp] [-f stem] [-s suff] [-x] [-i] [file]\n");
	exits("usage");
}
/*
 * Report a bad regular expression on standard error and exit with
 * status "bad regular expression".
 * NOTE(review): nothing in the visible code calls this; main() reports
 * regcomp failures through sysfatal instead.
 */
void
badexp(void)
{
fprint(2, "split: bad regular expression\n");
exits("bad regular expression");
}