I have made an implementation of the 'csplit' command in about 160 lines of code.

The implementation is mostly POSIX compliant, but it is missing a few things:

It works as a Read-Eval-Print loop, where it prints each line to an output 
file that changes based on context. So supporting negative offsets would 
require it to hold lines back before printing them, and it doesn't accumulate 
lines yet.

The other main gap is that it doesn't handle "[LINE] {[NUMBER]}" cleanly yet.

It also includes the GNU extension "{*}" argument

The other breaks from POSIX are mostly insignificant, like the fact that it 
doesn't check locale environment variables, or that it uses "%lu" for file 
size instead of "%d". 


From 96c08c5c50715b4a8dce492667972751bf91af9c Mon Sep 17 00:00:00 2001
From: Oliver Webb <aquahobby...@proton.me>
Date: Mon, 11 Sep 2023 23:53:55 -0500
Subject: [PATCH] Implementation of csplit command

---
 toys/pending/csplit.c | 164 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 toys/pending/csplit.c

diff --git a/toys/pending/csplit.c b/toys/pending/csplit.c
new file mode 100644
index 00000000..85db576e
--- /dev/null
+++ b/toys/pending/csplit.c
@@ -0,0 +1,164 @@
+/* csplit.c - split files on context
+ *
+ * Copyright 2023 Oliver Webb <aquahobby...@proton.me>
+ *
+ * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/csplit.html
+ * Deviations From POSIX:
+ *	Does not use %d for file size output
+ *	Doesn't do negative offsets
+ *	GNU Extension: "{*}"
+ *
+
+USE_CSPLIT(NEWTOY(csplit, "<2skf:n#", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_MAYFORK))
+
+config CSPLIT
+  bool "csplit"
+  default n
+  help
+    usage: csplit [-ks] [-f PREFIX] [-n INTEGER] file arg...
+	
+	Split files into multiple files based on list of rules
+
+	-k Does not delete Files on error
+	-s No file output size messages
+	-f [PREFIX] Use [PREFIX] as filename prefix instead of "xx"
+	-n [INTEGER] Make all filename numbers [INTEGER] characters long
+
+	Valid Rules:
+	/regexp/[INTEGER] Break file before line that regexp matches, 
+	%regexp%[INTEGER] Break file after line that regexp matches, 
+	If a offset is specified for these rules, the break will happen [INTEGER] 
+	lines after the regexp match
+	if a offset is specified, it will break at [INTEGER] lines after the offset
+	[INTEGER] Break file at line before [INTEGER]
+	{INTEGER} Repeat Previous Pattern INTEGER Number of times if INTEGER is *
+	The pattern repeats forever
+	
+*/
+
+#define FOR_csplit
+#include "toys.h"
+
+GLOBALS(
+  long n; // -n value: zero-padded width of the number in output file names
+  char *f; // -f value: output file name prefix ("xx" is used when unset)
+)
+// indx: current rule in toys.optargs; findx: output file number; lineno: input line counter; btc: "{N}" repeat count
+size_t indx = 1, findx = 0, lineno = 1, btc = 0;
+char *filefmt, *flname, *prefix; // name printf format, current output file name, name prefix
+int eg = 0, offset = -1; // eg: all rules consumed; offset: lines left before a pending break (negative = none)
+
+// Delete every output file made so far (unless -k), then exit reporting err.
+int abort_csplit(char *err) { // int only so it can be an operand of ?:
+  // Files are numbered 0..findx, so rebuild each name instead of tracking them
+  if (!FLAG(k)) for (size_t i = 0; i <= findx; i++)
+	remove(xmprintf(filefmt, prefix, (int)i));
+  error_exit("%s\n", err);
+  return 1;
+}
+
+int rgmatch(char *rxrl, char *line, char *fmt) { // 1 if line matches, else 0
+  regex_t rxp;
+  // fmt splits rule rxrl into the pattern (toybuf) and optional number (offset)
+  sscanf(rxrl, fmt, toybuf, &offset);
+  xregcomp(&rxp, toybuf, 0);
+  int rr = regexec(&rxp, line, 0, 0, 0);
+  regfree(&rxp); // runs once per input line, so free the compiled pattern
+  return (rr && rr != REG_NOMATCH) ? abort_csplit("bad regex") : !rr;
+}
+
+// Decide whether a new output file starts before line is written.
+// rule is the csplit argument being processed: /regex/[N], %regex%[N], {N},
+// {*} or a line number. Returns 1 to switch files, 0 to keep the current one.
+int cntxt(char *line, char *rule) {
+  // eg is set by csplit_main once every rule has been consumed
+  if (eg) return 0;
+
+  // A non-negative offset is a countdown left by an earlier call: fire when
+  // it hits zero, otherwise use up one line and keep waiting
+  if (offset == 0) {
+	offset = -1;
+	return 1;
+  }
+  if (offset > 0) {
+	offset--;
+	return 0;
+  }
+
+  switch (rule[0]) {
+	case '/':
+	  return rgmatch(rule, line, "/%[^/%]/%d");
+
+	case '%':
+	  // NOTE(review): with no number in the rule, offset is left at 0 (from
+	  // -1) even on a non-match, so the next call breaks - confirm intent
+	  offset++;
+	  return rgmatch(rule, line, "%%%[^/%]%%%d");
+
+	case '{':
+	  // GNU extension {*}: -1 wraps to the largest size_t repeat count.
+	  // sscanf returns EOF (nonzero) for a truncated rule like "{", so test
+	  // for exactly one conversion rather than for zero
+	  if (!strcmp(rule, "{*}")) btc = -1;
+	  else if (sscanf(rule, "{%lu}", &btc) != 1) abort_csplit("bad rule");
+
+	  // A repeat re-evaluates the argument that precedes it
+	  if (!cntxt(line, toys.optargs[indx-1])) return 0;
+
+	  // Rewrite this argument with one repeat fewer and step indx back so
+	  // the caller's indx++ lands on it again for the new file. This is the
+	  // only reason rule index and file index are tracked separately
+	  if (btc != 1) {
+		toys.optargs[indx] = xmprintf("{%lu}", btc-1);
+		indx--;
+	  }
+	  return 1;
+
+	default:
+	  offset = ((size_t)atoll(rule)) ? (atoll(rule)) : abort_csplit("bad rule");
+	  return (lineno > offset) ? abort_csplit("bad rule order") :
+	    (lineno == offset);
+  }
+}
+
+// Copy the input (first argument, "-" for stdin) into numbered output files,
+// starting a new file whenever cntxt() reports that the current rule fired.
+void csplit_main(void)
+{
+  // Only a lone "-" means stdin: a name that merely starts with '-' is a file
+  FILE *fin = strcmp(*toys.optargs, "-") ? xfopen(*toys.optargs, "r") : stdin;
+  struct stat st;
+
+  // -f and -n formatting
+  filefmt = xmprintf("%%s%%0%dd", TT.n ? (int)TT.n : 2);
+  prefix = TT.f ? TT.f : "xx";
+
+  // findx is a size_t but filefmt consumes it with %d, so pass it as an int
+  flname = xmprintf(filefmt, prefix, (int)findx);
+  FILE *actvfile = xfopen(flname, "w+");
+  for (char *line; (line = xgetline(fin)); free(line)) {
+	lineno++;
+	if (cntxt(line, toys.optargs[indx])) {
+	  // Close before stat() so buffered output is counted in st_size
+	  fclose(actvfile);
+	  if (!FLAG(s) && !stat(flname, &st))
+		printf("%lld\n", (long long)st.st_size);
+
+	  indx++;
+	  findx++;
+	  free(flname); // the previous name is not needed once its size is reported
+	  flname = xmprintf(filefmt, prefix, (int)findx);
+	  actvfile = xfopen(flname, "w+");
+
+	  if (indx == toys.optc) eg = 1;
+	}
+	fprintf(actvfile, "%s\n", line);
+  }
+
+  fclose(actvfile);
+  if (!FLAG(s) && !stat(flname, &st))
+	printf("%lld\n", (long long)st.st_size);
+
+  // Abort Case: Not All Rules Processed
+  if (indx < toys.optc-1) abort_csplit("Rules not processed");
+}
-- 
2.34.1

_______________________________________________
Toybox mailing list
Toybox@lists.landley.net
http://lists.landley.net/listinfo.cgi/toybox-landley.net

Reply via email to