I have made an implementation of the 'csplit' command in about 160 lines of code.
The implementation is mostly POSIX compliant, but it is missing a few things. It works as a Read-Eval-Print loop, where it prints to a file that changes based on context, so supporting negative offsets would require it to print lines it doesn't accumulate yet. The other main gap is that it doesn't handle "[LINE] {[NUMBER]}" cleanly yet. It also includes the GNU extension "{*}" argument. The other deviations from POSIX are mostly insignificant, such as the fact that it doesn't check locale environment variables and uses "%lu" for the file size instead of "%d".
From 96c08c5c50715b4a8dce492667972751bf91af9c Mon Sep 17 00:00:00 2001 From: Oliver Webb <aquahobby...@proton.me> Date: Mon, 11 Sep 2023 23:53:55 -0500 Subject: [PATCH] Implementation of csplit command --- toys/pending/csplit.c | 164 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 toys/pending/csplit.c diff --git a/toys/pending/csplit.c b/toys/pending/csplit.c new file mode 100644 index 00000000..85db576e --- /dev/null +++ b/toys/pending/csplit.c @@ -0,0 +1,164 @@ +/* csplit.c - split files on context + * + * Copyright 2023 Oliver Webb <aquahobby...@proton.me> + * + * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/csplit.html + * Deviations From POSIX: + * Does not use %d for file size output + * Doesn't do negitive offsets + * GNU Extension: "{*}" + * + +USE_CSPLIT(NEWTOY(csplit, "<2skf:n#", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_MAYFORK)) + +config CSPLIT + bool "csplit" + default n + help + usage: csplit [-ks] [-f PREFIX] [-n INTEGER] file arg... 
+
+    Split files into multiple files based on list of rules
+
+    -k              Does not delete Files on error
+    -s              No file output size messages
+    -f [PREFIX]     Use [PREFIX] as filename prefix instead of "xx"
+    -n [INTEGER]    Make all filename numbers [INTEGER] characters long
+
+    Valid Rules:
+    /regexp/[INTEGER] Break file before line that regexp matches,
+    %regexp%[INTEGER] Break file after line that regexp matches,
+    If a offset is specified for these rules, the break will happen [INTEGER]
+    lines after the regexp match
+    if a offset is specified, it will break at [INTEGER] lines after the offset
+    [INTEGER] Break file at line before [INTEGER]
+    {INTEGER} Repeat Previous Pattern INTEGER Number of times if INTEGER is *
+    The pattern repeats forever
+
+*/
+
+#define FOR_csplit
+#include "toys.h"
+
+GLOBALS(
+  long n;
+  char *f;
+)
+
+// State shared by the helpers below:
+//   indx    - index in toys.optargs of the rule currently being waited on
+//             (optargs[0] is the input file, so the first rule is 1)
+//   lineno  - 1-based number of the input line being examined
+//   btc     - repetitions left for the "{N}" rule at index btcrule
+//             (ULONG_MAX means "{*}", repeat forever)
+//   btcrule - index of the "{N}" rule btc was parsed from, 0 when none
+//   findx   - number of the output file currently being written
+//   eg      - set once every rule has fired; remaining input is just copied
+//   offset  - lines left before a delayed break fires, -1 when none pending
+static size_t indx = 1, lineno = 0, btcrule = 0;
+static unsigned long btc = 0;
+static char *filefmt, *flname, *prefix;
+static int findx = 0, eg = 0, offset = -1;
+
+// Return a freshly allocated name for output file number NUM.
+static char *piece_name(int num)
+{
+  return xmprintf(filefmt, prefix, num);
+}
+
+// Delete every output file created so far (unless -k) and exit with ERR.
+// This is only int so we can exit cleanly in ternary operators
+static int abort_csplit(char *err)
+{
+  int i;
+
+  // Output files are numbered 0..findx, so walk the file numbers (not the
+  // rule index) and remove each one exactly once.
+  if (!FLAG(k) && filefmt) for (i = 0; i <= findx; i++) {
+    char *name = piece_name(i);
+
+    remove(name);
+    free(name);
+  }
+  // NOTE(review): assumes error_exit() appends its own newline - confirm
+  error_exit("%s", err);
+
+  return 1;
+}
+
+// Test LINE against the regex rule RXRL ("/regex/[offset]" or
+// "%regex%[offset]"), which is parsed with the sscanf() format FMT.
+// Returns 1 when the file must be broken before LINE. When the match asks
+// for a break on a later line it arms the global countdown and returns 0.
+// Returns 0 when LINE does not match. Escaped delimiters inside the regex
+// are not supported. The rule is re-parsed and re-compiled for every line.
+static int rgmatch(char *rxrl, char *line, char *fmt)
+{
+  regex_t rxp;
+  int skip = 0, rr;
+
+  // skip keeps its 0 default when the rule carries no trailing offset
+  if (sscanf(rxrl, fmt, toybuf, &skip) < 1) return abort_csplit("bad rule");
+  if (skip < 0) return abort_csplit("negative offset");
+
+  xregcomp(&rxp, toybuf, 0);
+  rr = regexec(&rxp, line, 0, 0, 0);
+  regfree(&rxp);
+  if (rr == REG_NOMATCH) return 0;
+  if (rr) return abort_csplit("bad regex");
+
+  // Only a real match may arm the countdown. '%' rules break after the
+  // matching line, i.e. one line later than the equivalent '/' rule.
+  if (*rxrl == '%') skip++;
+  if (!skip) return 1;
+  offset = skip-1;
+
+  return 0;
+}
+
+// Decide whether the output file must be broken before LINE according to
+// RULE (normally toys.optargs[indx]). Returns 1 to break, 0 to keep writing
+// to the current file. Exits through abort_csplit() on a malformed rule.
+// Repeating a line number rule ("5 {3}") is not supported and is reported
+// as "bad rule order".
+static int cntxt(char *line, char *rule)
+{
+  unsigned long want;
+  char *end;
+
+  if (eg) return 0;
+
+  // "{N}" / "{*}": repeat the previous rule. Handled before the countdown so
+  // the repeat bookkeeping also runs when a delayed break fires.
+  if (*rule == '{') {
+    // There must be a real pattern to repeat: not the input file name and
+    // not another repeat (which would recurse forever).
+    if (indx < 2 || *toys.optargs[indx-1] == '{')
+      return abort_csplit("bad rule");
+
+    // Parse the count once, the first time this rule is reached
+    if (btcrule != indx) {
+      // GNU extention: {*}
+      if (!strcmp(rule, "{*}")) btc = ULONG_MAX;
+      else if (sscanf(rule, "{%lu}", &btc) != 1)
+        return abort_csplit("bad rule");
+      btcrule = indx;
+
+      // Zero repetitions: nothing to wait for, hand LINE to the next rule
+      if (!btc) {
+        if (++indx >= toys.optc) {
+          eg = 1;
+          return 0;
+        }
+        return cntxt(line, toys.optargs[indx]);
+      }
+    }
+
+    if (!cntxt(line, toys.optargs[indx-1])) return 0;
+
+    // Stay on this rule while repetitions remain: stepping indx back here
+    // cancels the indx++ csplit_main() does after every break.
+    if (btc == ULONG_MAX || --btc) indx--;
+
+    return 1;
+  }
+
+  // An earlier regex match asked for a break some lines later
+  if (!offset) {
+    offset = -1;
+    return 1;
+  }
+  if (offset > 0) {
+    offset--;
+    return 0;
+  }
+
+  switch (*rule) {
+    // 4095 leaves room for the terminating NUL in toybuf
+    case '/':
+      return rgmatch(rule, line, "/%4095[^/]/%d");
+    case '%':
+      return rgmatch(rule, line, "%%%4095[^%]%%%d");
+
+    // Plain line number: break before that line
+    default:
+      if (!isdigit((unsigned char)*rule)) return abort_csplit("bad rule");
+      want = strtoul(rule, &end, 10);
+      if (*end || !want) return abort_csplit("bad rule");
+      if (lineno > want) return abort_csplit("bad rule order");
+
+      return lineno == want;
+  }
+}
+
+// Close output file FP (named by flname), print its size unless -s was
+// given, and release the name.
+static void close_piece(FILE *fp)
+{
+  struct stat st;
+
+  // fclose() flushes, so this is where a full disk shows up
+  if (fclose(fp)) abort_csplit("write error");
+  if (!FLAG(s)) {
+    if (stat(flname, &st)) perror_exit("%s", flname);
+    printf("%lld\n", (long long)st.st_size);
+  }
+  free(flname);
+  flname = 0;
+}
+
+// Copy the input (toys.optargs[0], "-" for stdin) into numbered output
+// files, starting a new file each time the current rule fires.
+void csplit_main(void)
+{
+  char *line, *name = *toys.optargs;
+  // Only a lone "-" means stdin, not every name that starts with '-'
+  FILE *fin = strcmp(name, "-") ? xfopen(name, "r") : stdin, *out;
+
+  // -f and -n formatting
+  filefmt = xmprintf("%%s%%0%dd", TT.n ? (int)TT.n : 2);
+  prefix = TT.f ? TT.f : "xx";
+
+  flname = piece_name(findx);
+  out = xfopen(flname, "w");
+
+  while ((line = xgetline(fin))) {
+    lineno++;
+    if (cntxt(line, toys.optargs[indx])) {
+      close_piece(out);
+
+      indx++;
+      flname = piece_name(++findx);
+      out = xfopen(flname, "w");
+
+      if (indx == toys.optc) eg = 1;
+    }
+    if (fprintf(out, "%s\n", line) < 0) abort_csplit("write error");
+    free(line);
+  }
+
+  close_piece(out);
+  if (fin != stdin) fclose(fin);
+
+  // Abort Case: Not All Rules Processed. A trailing "{*}" never completes
+  // by design, so it is the one rule allowed to be left pending.
+  if (indx < toys.optc && strcmp(toys.optargs[indx], "{*}"))
+    abort_csplit("Rules not processed");
+}
-- 
2.34.1
_______________________________________________ Toybox mailing list Toybox@lists.landley.net http://lists.landley.net/listinfo.cgi/toybox-landley.net