Very often....make that very very very very very very very very very often,
I find myself processing text in Python where, when .split()'ing a line, I'd
like to avoid splitting inside a 'quoted' item...quoted because it contains
whitespace or the sep char.

For example:

s = '      Chan: 11  SNR: 22  ESSID: "Spaced Out Wifi"  Enc: On'

If I want to yank the ESSID in the above example, it's a pain. But with my new
dandy split-quoted method, .split() takes a 3rd argument that specifies the
quote delimiter: no splitting occurs between a pair of quote delimiters, and
the quote chars themselves are dropped:

        s.split(None,-1,'"')[5]
        'Spaced Out Wifi'

Attached is a proof of concept patch against 
Python-2.4.1/Objects/stringobject.c  that implements this. It is limited to 
whitespace splitting only. (sep == None)

As implemented, the quote delimiter also doubles as an additional separator
for splitting out a substring.

For example:
        'There is"no whitespace before these"quotes'.split(None,-1,'"')
        ['There', 'is', 'no whitespace before these', 'quotes']

This is useful, but possibly better put into practice as a separate method??

Comments please.

Dave
--- stringobject.c.orig	2006-05-17 16:12:13.000000000 -0400
+++ stringobject.c	2006-05-17 23:49:52.000000000 -0400
@@ -1336,6 +1336,85 @@
 	return NULL;
 }
 
+/* dc: split quoted.  Split s[0..len) on whitespace, except that the text between
+ * a pair of qsub[0] chars is kept as one item, with the quote chars dropped.
+ * Only qsub[0] is used.  A quote char also ends an unquoted word next to it.
+ * An empty pair of quotes yields '' (not counted against maxsplit); an
+ * unterminated quote runs to the end of s.  Once maxsplit is used up, the rest
+ * of s from the current item on is appended unsplit.  Returns the list, or NULL
+ * if PyList_New fails or SPLIT_APPEND jumps to onError.  quoted: 0 = outside, 1 = open, 2 = just closed.
+ * 'This     string has      "not only this" "and this" but"this mixed in string"as well as this "" empty one and two more at the end""""'.split(None,-1,'"')
+ * CORRECT: ['This', 'string', 'has', 'not only this', 'and this', 'but', 'this mixed in string', 'as', 'well', 'as', 'this', '', 'empty', 'one', 'and', 'two', 'more', 'at', 'the', 'end', '', ''] */
+static PyObject *
+split_whitespace_quoted(const char *s, int len, int maxsplit, const char *qsub)
+{
+	int i, j, quoted = 0;
+	const int quote = Py_CHARMASK(qsub[0]);
+	PyObject *str;
+	PyObject *list = PyList_New(0);
+
+	if (list == NULL)
+		return NULL;
+
+	for (i = j = 0; i < len; ) {
+		if (!quoted) {
+			while (i < len && isspace(Py_CHARMASK(s[i])))
+				i++;
+		}
+		/* i may equal len here, so check the bound before reading s[i]. */
+		if (i < len && Py_CHARMASK(s[i]) == quote) {
+			quoted = 1;
+			i++;
+		}
+		j = i;
+		while (i < len) {
+			if (Py_CHARMASK(s[i]) == quote) {
+				quoted = quoted ? 2 : 1;	/* closes, or opens mid-word */
+				break;
+			}
+			if (!quoted && isspace(Py_CHARMASK(s[i])))
+				break;
+			i++;
+		}
+		if (quoted == 2 && j == i) {
+			SPLIT_APPEND("", 0, 0);	/* empty string in quotes */
+			quoted = 0;
+			i++;
+			j = i;
+		} else if (j < i) {
+			if (maxsplit-- <= 0)
+				break;
+			SPLIT_APPEND(s, j, i);
+			if (quoted == 2) {
+				quoted = 0;
+				i++;	/* step over the closing quote */
+			} else if (quoted == 1) {
+				/* i == len: an unterminated quote ran to the end of s,
+				   so there is no quote to step over or peek past. */
+				if (i < len)
+					i++;
+				if (i < len && Py_CHARMASK(s[i]) == quote) {
+					SPLIT_APPEND("", 0, 0);	/* "" right after a word */
+					quoted = 0;
+					i++;
+				}
+			} else {
+				while (i < len && isspace(Py_CHARMASK(s[i])))
+					i++;
+			}
+			j = i;
+		}
+	}
+	if (j < len) {
+		SPLIT_APPEND(s, j, len);
+	}
+	return list;
+  onError:
+	Py_DECREF(list);
+	return NULL;
+}
+
+
 static PyObject *
 split_char(const char *s, int len, char ch, int maxcount)
 {
@@ -1376,15 +1455,27 @@
 static PyObject *
 string_split(PyStringObject *self, PyObject *args)
 {
-	int len = PyString_GET_SIZE(self), n, i, j, err;
+	int len = PyString_GET_SIZE(self), n, qn, i, j, err;
 	int maxsplit = -1;
-	const char *s = PyString_AS_STRING(self), *sub;
-	PyObject *list, *item, *subobj = Py_None;
+	const char *s = PyString_AS_STRING(self), *sub, *qsub;
+	PyObject *list, *item, *subobj = Py_None, *qsubobj = Py_None;
 
-	if (!PyArg_ParseTuple(args, "|Oi:split", &subobj, &maxsplit))
+	if (!PyArg_ParseTuple(args, "|OiO:split", &subobj, &maxsplit, &qsubobj))
 		return NULL;
 	if (maxsplit < 0)
 		maxsplit = INT_MAX;
+	if (qsubobj != Py_None) {
+		if (PyString_Check(qsubobj)) {
+			qsub = PyString_AS_STRING(qsubobj);
+			qn = PyString_GET_SIZE(qsubobj);
+		}
+		if (qn == 0) {
+			PyErr_SetString(PyExc_ValueError, "empty delimiter");
+			return NULL;
+		}
+		if (subobj == Py_None)
+			return split_whitespace_quoted(s, len, maxsplit, qsub);
+	}		
 	if (subobj == Py_None)
 		return split_whitespace(s, len, maxsplit);
 	if (PyString_Check(subobj)) {
_______________________________________________
Python-Dev mailing list
Python-Dev@python.org
http://mail.python.org/mailman/listinfo/python-dev
Unsubscribe: 
http://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com

Reply via email to