Very often....make that very very very very very very very very very often,
I find myself processing text in python that when .split()'ing a line, I'd
like to exclude the split for a 'quoted' item...quoted because it contains
whitespace or the sep char.
For example:
s = ' Chan: 11 SNR: 22 ESSID: "Spaced Out Wifi" Enc: On'
If I want to yank the ESSID in the above example, it's a pain. But with my new
dandy split-quoted method, .split() takes a 3rd argument that specifies the
quote delimiter: no splitting occurs between a pair of quote chars, and the
quote chars themselves are dropped:
s.split(None,-1,'"')[5]
'Spaced Out Wifi'
Attached is a proof of concept patch against
Python-2.4.1/Objects/stringobject.c that implements this. It is limited to
whitespace splitting only. (sep == None)
As implemented, the quote delimiter also doubles as an additional separator
for splitting out a substring.
For example:
'There is"no whitespace before these"quotes'.split(None,-1,'"')
['There', 'is', 'no whitespace before these', 'quotes']
This is useful, but possibly better put into practice as a separate method??
Comments please.
Dave
--- stringobject.c.orig 2006-05-17 16:12:13.000000000 -0400
+++ stringobject.c 2006-05-17 23:49:52.000000000 -0400
@@ -1336,6 +1336,85 @@
return NULL;
}
+// dc: split_whitespace_quoted(): split s[0..len) on whitespace, keeping text between quote chars (only qsub[0] is used) as one item with the quotes dropped. Example:
+// 'This string has "not only this" "and this" but"this mixed in string"as well as this "" empty one and two more at the end""""'.split(None,-1,'"')
+// CORRECT: ['This', 'string', 'has', 'not only this', 'and this', 'but', 'this mixed in string', 'as', 'well', 'as', 'this', '', 'empty', 'one', 'and', 'two', 'more', 'at', 'the', 'end', '', '']
+static PyObject *
+split_whitespace_quoted(const char *s, int len, int maxsplit, const char *qsub)
+{
+    int i, j, quoted = 0; // i: scan index, j: start of current item; quoted: 0 = outside quotes, 1 = inside quotes, 2 = closing quote just reached
+    PyObject *str; // not referenced directly below; presumably the temporary used by SPLIT_APPEND (defined elsewhere in the file) -- confirm
+    PyObject *list = PyList_New(0);
+
+    if (list == NULL)
+        return NULL;
+
+    for (i = j = 0; i < len; ) {
+
+        if (!quoted) { // whitespace is only skipped outside quotes
+            while (i < len && isspace(Py_CHARMASK(s[i])) )
+                i++;
+        }
+
+        if (i < len && Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) { // bounds check: the skip above can leave i == len, so s[i] must not be read blindly
+            quoted = 1;
+            i++;
+        }
+
+        j = i;
+
+        while (i < len) { // advance i to the end of the current item
+            if (Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) {
+                if (quoted)
+                    quoted = 2; // End of quotes found
+                else {
+                    quoted = 1; // Else start of new quotes in the middle of a string
+                }
+                break;
+            } else if (!quoted && isspace(Py_CHARMASK(s[i])))
+                break;
+            i++;
+        }
+
+        if (quoted == 2 && j == i) { // Empty string in quotes
+            SPLIT_APPEND("", 0, 0);
+            quoted = 0;
+            i++;
+            j = i;
+
+        } else if (j < i) {
+            if (maxsplit-- <= 0) // split budget used up: leave the loop, the tail from j is appended below
+                break;
+            SPLIT_APPEND(s, j, i);
+
+            if (quoted == 2) {
+                quoted = 0;
+                i++;
+            } else if (quoted == 1 && i < len) { // i < len: s[i] is a quote that opened mid-item; an unterminated quote (i == len) takes the else branch so i never passes len
+                i++;
+                if (i < len && Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) { // Embedded empty string in quotes (at end of string?)
+                    SPLIT_APPEND("", 0, 0);
+                    quoted = 0;
+                    i++;
+                }
+            } else {
+                while (i < len && isspace(Py_CHARMASK(s[i])))
+                    i++;
+            }
+
+            j = i;
+        }
+    }
+    if (j < len) { // unsplit remainder (reached after the maxsplit break) becomes the last item
+        SPLIT_APPEND(s, j, len);
+    }
+    return list; // the list created above, now holding the split items
+  onError: // no goto is visible in this function; presumably the failure target inside SPLIT_APPEND -- confirm
+    Py_DECREF(list);
+    return NULL;
+}
+
+
static PyObject *
split_char(const char *s, int len, char ch, int maxcount)
{
@@ -1376,15 +1455,27 @@
static PyObject *
string_split(PyStringObject *self, PyObject *args)
{
- int len = PyString_GET_SIZE(self), n, i, j, err;
+ int len = PyString_GET_SIZE(self), n, qn, i, j, err;
int maxsplit = -1;
- const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *item, *subobj = Py_None;
+ const char *s = PyString_AS_STRING(self), *sub, *qsub;
+ PyObject *list, *item, *subobj = Py_None, *qsubobj = Py_None;
- if (!PyArg_ParseTuple(args, "|Oi:split", &subobj, &maxsplit))
+ if (!PyArg_ParseTuple(args, "|OiO:split", &subobj, &maxsplit, &qsubobj))
return NULL;
if (maxsplit < 0)
maxsplit = INT_MAX;
+ if (qsubobj != Py_None) {
+ if (PyString_Check(qsubobj)) {
+ qsub = PyString_AS_STRING(qsubobj);
+ qn = PyString_GET_SIZE(qsubobj);
+ }
+ if (qn == 0) {
+ PyErr_SetString(PyExc_ValueError, "empty delimiter");
+ return NULL;
+ }
+ if (subobj == Py_None)
+ return split_whitespace_quoted(s, len, maxsplit, qsub);
+ }
if (subobj == Py_None)
return split_whitespace(s, len, maxsplit);
if (PyString_Check(subobj)) {
_______________________________________________
Python-Dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-dev
Unsubscribe:
http://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com