David Watson added the comment:

I've updated the ASCII/surrogateescape patches in line with
various changes to Python since I posted them.

return-ascii-surrogateescape-2015-06-25.diff incorporates the
ascii-surrogateescape and uname-surrogateescape patches, and
accept-ascii-surrogateescape-2015-06-25.diff corresponds to the
try-surrogateescape-first patch.  Neither patch touches
gethostname() on Windows.

Python's existing code now has a fast path for ASCII-only strings
which passes them through unchanged (Unicode -> ASCII), so in
order not to slow down processing of valid IDNs, the latter patch
now effectively tries encodings in the order

   ASCII/strict (existing code, fast path)
   IDNA/strict (existing code)
   ASCII/surrogateescape (added by patch)

rather than the previous

   ASCII/surrogateescape
   IDNA/strict

This doesn't change the behaviour of the patch, since IDNA always
rejects strings containing surrogate codes, and either rejects
ASCII-only strings (e.g. when a label is longer than 63
characters) or passes them through unchanged.

These patches would at least allow getfqdn() to work in Almad's
example.  In that case, however, the host also appears to be
addressable by the IDNA equivalent ("xn--didejo-noas-1ic") of its
Unicode hostname.  (I haven't checked, as I'm not a Windows user,
but I presume the UnicodeDecodeError came from gethost_common() in
socketmodule.c, and hence that the name lookup was successful.)
So it would certainly be more helpful to return Unicode for
non-ASCII gethostbyaddr() results there, if they were guaranteed
to map to real IDNA hostnames in Windows environments.

(That isn't guaranteed in Unix environments of course, which is
why I'm still suggesting ASCII/surrogateescape for the general
case.)

----------
Added file: 
http://bugs.python.org/file39812/return-ascii-surrogateescape-2015-06-25.diff
Added file: 
http://bugs.python.org/file39813/accept-ascii-surrogateescape-2015-06-25.diff

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue9377>
_______________________________________
# HG changeset patch
# Parent  a497e7faa09b9a836983635b96adbf216a39e4f5
Decode hostnames as ASCII/surrogateescape (except
socket.gethostname() on Windows).

diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -587,12 +587,21 @@ process and user.
    :func:`socket.gethostname`  or even
    ``socket.gethostbyaddr(socket.gethostname())``.
 
+   The strings are converted using the file system encoding and
+   the ``'surrogateescape'`` error handler, except for
+   :attr:`nodename`, which is converted as ASCII with the
+   ``'surrogateescape'`` error handler (see the :mod:`socket`
+   module documentation for details).
+
    Availability: recent flavors of Unix.
 
    .. versionchanged:: 3.3
       Return type changed from a tuple to a tuple-like object
       with named attributes.
 
+   .. versionchanged:: XXX
+      The :attr:`nodename` attribute is now converted as
+      ASCII/``surrogateescape``.
 
 .. function:: unsetenv(key)
 
diff --git a/Doc/library/socket.rst b/Doc/library/socket.rst
--- a/Doc/library/socket.rst
+++ b/Doc/library/socket.rst
@@ -124,6 +124,14 @@ differently into an actual IPv4/v6 addre
 resolution and/or the host configuration.  For deterministic behavior use a
 numeric address in *host* portion.
 
+When a hostname is returned by a system interface (except
+:func:`gethostname` on Windows), it is decoded into a string
+using the ``'ascii'`` codec and the ``'surrogateescape'`` error
+handler; this leaves ASCII bytes as ASCII, including IDNA
+ASCII-compatible encodings (see :mod:`encodings.idna`), but
+converts any non-ASCII bytes to the Unicode lone surrogate codes
+U+DC80...U+DCFF.
+
 All errors raise exceptions.  The normal exceptions for invalid argument types
 and out-of-memory conditions can be raised; starting from Python 3.3, errors
 related to socket or address semantics raise :exc:`OSError` or one of its
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
@@ -4464,6 +4464,7 @@ os_uname_impl(PyModuleDef *module)
     struct utsname u;
     int res;
     PyObject *value;
+    PyObject *o;
 
     Py_BEGIN_ALLOW_THREADS
     res = uname(&u);
@@ -4486,7 +4487,13 @@ os_uname_impl(PyModuleDef *module)
     } \
 
     SET(0, u.sysname);
-    SET(1, u.nodename);
+    o = PyUnicode_DecodeASCII(u.nodename, strlen(u.nodename),
+                              "surrogateescape");
+    if (!o) {
+        Py_DECREF(value);
+        return NULL;
+    }
+    PyStructSequence_SET_ITEM(value, 1, o);
     SET(2, u.release);
     SET(3, u.version);
     SET(4, u.machine);
diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c
--- a/Modules/socketmodule.c
+++ b/Modules/socketmodule.c
@@ -895,6 +895,15 @@ static PyThread_type_lock netdb_lock;
 #endif
 
 
+/* Return the string representation for the given hostname. */
+
+static PyObject *
+decode_hostname(const char *name)
+{
+    return PyUnicode_DecodeASCII(name, strlen(name), "surrogateescape");
+}
+
+
 /* Convert a string specifying a host name or one of a few symbolic
    names to a numeric IP address.  This usually calls gethostbyname()
    to do the work; the names "" and "<broadcast>" are special.
@@ -4440,7 +4449,7 @@ socket_gethostname(PyObject *self, PyObj
     if (res < 0)
         return set_error();
     buf[sizeof buf - 1] = '\0';
-    return PyUnicode_DecodeFSDefault(buf);
+    return decode_hostname(buf);
 #endif
 }
 
@@ -4562,7 +4571,7 @@ gethost_common(struct hostent *h, struct
     if (h->h_aliases) {
         for (pch = h->h_aliases; *pch != NULL; pch++) {
             int status;
-            tmp = PyUnicode_FromString(*pch);
+            tmp = decode_hostname(*pch);
             if (tmp == NULL)
                 goto err;
 
@@ -4630,7 +4639,8 @@ gethost_common(struct hostent *h, struct
             goto err;
     }
 
-    rtn_tuple = Py_BuildValue("sOO", h->h_name, name_list, addr_list);
+    rtn_tuple = Py_BuildValue("NOO", decode_hostname(h->h_name),
+                              name_list, addr_list);
 
  err:
     Py_XDECREF(name_list);
@@ -5573,9 +5583,9 @@ socket_getaddrinfo(PyObject *self, PyObj
             makesockaddr(-1, res->ai_addr, res->ai_addrlen, protocol);
         if (addr == NULL)
             goto err;
-        single = Py_BuildValue("iiisO", res->ai_family,
+        single = Py_BuildValue("iiiNO", res->ai_family,
             res->ai_socktype, res->ai_protocol,
-            res->ai_canonname ? res->ai_canonname : "",
+            decode_hostname(res->ai_canonname ? res->ai_canonname : ""),
             addr);
         Py_DECREF(addr);
         if (single == NULL)
@@ -5681,7 +5691,7 @@ socket_getnameinfo(PyObject *self, PyObj
         set_gaierror(error);
         goto fail;
     }
-    ret = Py_BuildValue("ss", hbuf, pbuf);
+    ret = Py_BuildValue("Ns", decode_hostname(hbuf), pbuf);
 
 fail:
     if (res)
# HG changeset patch
# Parent  5c2d7aceb46865110857879702bed186f8aa1ad7
Accept ASCII/surrogateescape strings as hostname arguments.

diff --git a/Doc/library/socket.rst b/Doc/library/socket.rst
--- a/Doc/library/socket.rst
+++ b/Doc/library/socket.rst
@@ -132,6 +132,19 @@ ASCII-compatible encodings (see :mod:`en
 converts any non-ASCII bytes to the Unicode lone surrogate codes
 U+DC80...U+DCFF.
 
+When a string is passed as a hostname argument, it will be
+encoded as ASCII if it contains only ASCII characters; otherwise,
+the :mod:`~encodings.idna` codec will be used, and if that fails
+(for instance, because the string contains lone surrogate codes),
+then the string will be encoded using the ``'ascii'`` codec and
+the ``'surrogateescape'`` error handler.
+
+  .. versionchanged:: XXX
+     Previously, hostnames were decoded as UTF-8 (except by
+     :func:`gethostname`, which on non-Windows platforms used the
+     file system encoding and the ``'surrogateescape'`` error
+     handler), and encoded as strict ASCII or IDNA.
+
 All errors raise exceptions.  The normal exceptions for invalid argument types
 and out-of-memory conditions can be raised; starting from Python 3.3, errors
 related to socket or address semantics raise :exc:`OSError` or one of its
diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py
--- a/Lib/test/test_socket.py
+++ b/Lib/test/test_socket.py
@@ -1434,6 +1434,66 @@ class GeneralModuleTests(unittest.TestCa
             self.assertEqual(s.family, 42424)
             self.assertEqual(s.type, 13331)
 
+
+# This should produce the same results with or without network access,
+# but can hang for some time if the upstream DNS servers are
+# unreachable.
+@unittest.skipUnless(support.is_resource_enabled('network'),
+                     'network is not enabled')
+class TestHostnameRepresentations(unittest.TestCase):
+
+    def tryHostnameArgs(self, function, notfounderror):
+        # Call the given one-argument function with various valid and
+        # invalid representations of nonexistent hostnames - it should
+        # raise notfounderror for valid representations.
+
+        # An RFC 1123-compliant host name (".invalid" TLD is reserved
+        # under RFC 2606).
+        self.assertRaises(notfounderror, function, "host.domain.invalid.")
+        # Previous name as bytes and bytearray.
+        self.assertRaises(notfounderror, function, b"host.domain.invalid.")
+        self.assertRaises(notfounderror, function,
+                          bytearray(b"host.domain.invalid."))
+        # A domain name with a non-ASCII octet, as bytes and bytearray.
+        self.assertRaises(notfounderror, function, b"\xff.domain.invalid.")
+        self.assertRaises(notfounderror, function,
+                          bytearray(b"\xff.domain.invalid."))
+        # Previous domain name as ASCII/surrogateescape string representation.
+        self.assertRaises(notfounderror, function, "\udcff.domain.invalid.")
+        # A legal IDN.
+        self.assertRaises(notfounderror, function, "€.domain.invalid.")
+        # A combination of the previous two, which may make sense in
+        # theory, but is not accepted (the Euro sign means it must be
+        # interpreted as an IDN, but it is not a legal IDN, because
+        # it contains a surrogate character).
+        self.assertRaises(TypeError, function, "\udcff.€.domain.invalid.")
+
+    def testGethostbynameHostnames(self):
+        self.tryHostnameArgs(socket.gethostbyname,
+                             (socket.herror, socket.gaierror))
+
+    def testGethostbyname_exHostnames(self):
+        self.tryHostnameArgs(socket.gethostbyname_ex,
+                             (socket.herror, socket.gaierror))
+
+    def testGethostbyaddrHostnames(self):
+        self.tryHostnameArgs(socket.gethostbyaddr,
+                             (socket.herror, socket.gaierror))
+
+    def testGetaddrinfoHostnames(self):
+        self.tryHostnameArgs(lambda host: socket.getaddrinfo(host, None),
+                             socket.gaierror)
+
+    def testSocketObjectHostnames(self):
+        def f(host):
+            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            try:
+                s.connect((host, 80))
+            finally:
+                s.close()
+        self.tryHostnameArgs(f, socket.error)
+
+
 @unittest.skipUnless(HAVE_SOCKET_CAN, 'SocketCan required for this test.')
 class BasicCANTest(unittest.TestCase):
 
@@ -5305,6 +5365,7 @@ def test_main():
     tests.append(TestUnixDomain)
     tests.append(TestLinuxAbstractNamespace)
     tests.extend([TIPCTest, TIPCThreadableTest])
+    tests.append(TestHostnameRepresentations)
     tests.extend([BasicCANTest, CANTest])
     tests.extend([BasicRDSTest, RDSTest])
     tests.extend([
diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c
--- a/Modules/socketmodule.c
+++ b/Modules/socketmodule.c
@@ -1413,6 +1413,11 @@ idna_converter(PyObject *obj, struct may
             return 0;
         }
         obj3 = PyUnicode_AsEncodedString(obj2, "idna", NULL);
+        /* IDNA codec doesn't use UnicodeEncodeError. */
+        if (!obj3 && PyErr_ExceptionMatches(PyExc_UnicodeError)) {
+            PyErr_Clear();
+            obj3 = PyUnicode_AsEncodedString(obj2, "ascii", "surrogateescape");
+        }
         Py_DECREF(obj2);
         if (!obj3) {
             PyErr_SetString(PyExc_TypeError, "encoding of hostname failed");
@@ -4501,17 +4506,17 @@ extern int sethostname(const char *, siz
 static PyObject *
 socket_gethostbyname(PyObject *self, PyObject *args)
 {
-    char *name;
+    struct maybe_idna host = {NULL, NULL};
     sock_addr_t addrbuf;
     PyObject *ret = NULL;
 
-    if (!PyArg_ParseTuple(args, "et:gethostbyname", "idna", &name))
+    if (!PyArg_ParseTuple(args, "O&:gethostbyname", idna_converter, &host))
         return NULL;
-    if (setipaddr(name, SAS2SA(&addrbuf),  sizeof(addrbuf), AF_INET) < 0)
+    if (setipaddr(host.buf, SAS2SA(&addrbuf),  sizeof(addrbuf), AF_INET) < 0)
         goto finally;
     ret = makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in));
 finally:
-    PyMem_Free(name);
+    idna_cleanup(&host);
     return ret;
 }
 
@@ -4655,7 +4660,7 @@ gethost_common(struct hostent *h, struct
 static PyObject *
 socket_gethostbyname_ex(PyObject *self, PyObject *args)
 {
-    char *name;
+    struct maybe_idna host = {NULL, NULL};
     struct hostent *h;
     sock_addr_t addr;
     struct sockaddr *sa;
@@ -4674,27 +4679,27 @@ socket_gethostbyname_ex(PyObject *self, 
 #endif
 #endif /* HAVE_GETHOSTBYNAME_R */
 
-    if (!PyArg_ParseTuple(args, "et:gethostbyname_ex", "idna", &name))
+    if (!PyArg_ParseTuple(args, "O&:gethostbyname_ex", idna_converter, &host))
         return NULL;
-    if (setipaddr(name, SAS2SA(&addr), sizeof(addr), AF_INET) < 0)
+    if (setipaddr(host.buf, SAS2SA(&addr), sizeof(addr), AF_INET) < 0)
         goto finally;
     Py_BEGIN_ALLOW_THREADS
 #ifdef HAVE_GETHOSTBYNAME_R
 #if   defined(HAVE_GETHOSTBYNAME_R_6_ARG)
-    gethostbyname_r(name, &hp_allocated, buf, buf_len,
+    gethostbyname_r(host.buf, &hp_allocated, buf, buf_len,
                              &h, &errnop);
 #elif defined(HAVE_GETHOSTBYNAME_R_5_ARG)
-    h = gethostbyname_r(name, &hp_allocated, buf, buf_len, &errnop);
+    h = gethostbyname_r(host.buf, &hp_allocated, buf, buf_len, &errnop);
 #else /* HAVE_GETHOSTBYNAME_R_3_ARG */
     memset((void *) &data, '\0', sizeof(data));
-    result = gethostbyname_r(name, &hp_allocated, &data);
+    result = gethostbyname_r(host.buf, &hp_allocated, &data);
     h = (result != 0) ? NULL : &hp_allocated;
 #endif
 #else /* not HAVE_GETHOSTBYNAME_R */
 #ifdef USE_GETHOSTBYNAME_LOCK
     PyThread_acquire_lock(netdb_lock, 1);
 #endif
-    h = gethostbyname(name);
+    h = gethostbyname(host.buf);
 #endif /* HAVE_GETHOSTBYNAME_R */
     Py_END_ALLOW_THREADS
     /* Some C libraries would require addr.__ss_family instead of
@@ -4708,7 +4713,7 @@ socket_gethostbyname_ex(PyObject *self, 
     PyThread_release_lock(netdb_lock);
 #endif
 finally:
-    PyMem_Free(name);
+    idna_cleanup(&host);
     return ret;
 }
 
@@ -4727,7 +4732,7 @@ socket_gethostbyaddr(PyObject *self, PyO
 {
     sock_addr_t addr;
     struct sockaddr *sa = SAS2SA(&addr);
-    char *ip_num;
+    struct maybe_idna ip_num = {NULL, NULL};
     struct hostent *h;
     PyObject *ret = NULL;
 #ifdef HAVE_GETHOSTBYNAME_R
@@ -4751,10 +4756,10 @@ socket_gethostbyaddr(PyObject *self, PyO
     int al;
     int af;
 
-    if (!PyArg_ParseTuple(args, "et:gethostbyaddr", "idna", &ip_num))
+    if (!PyArg_ParseTuple(args, "O&:gethostbyaddr", idna_converter, &ip_num))
         return NULL;
     af = AF_UNSPEC;
-    if (setipaddr(ip_num, sa, sizeof(addr), af) < 0)
+    if (setipaddr(ip_num.buf, sa, sizeof(addr), af) < 0)
         goto finally;
     af = sa->sa_family;
     ap = NULL;
@@ -4800,7 +4805,7 @@ socket_gethostbyaddr(PyObject *self, PyO
     PyThread_release_lock(netdb_lock);
 #endif
 finally:
-    PyMem_Free(ip_num);
+    idna_cleanup(&ip_num);
     return ret;
 }
 
@@ -5502,11 +5507,11 @@ socket_getaddrinfo(PyObject *self, PyObj
     PyObject *hobj = NULL;
     PyObject *pobj = (PyObject *)NULL;
     char pbuf[30];
-    char *hptr, *pptr;
+    char *pptr;
     int family, socktype, protocol, flags;
     int error;
     PyObject *all = (PyObject *)NULL;
-    PyObject *idna = NULL;
+    struct maybe_idna host = {NULL, NULL};
 
     socktype = protocol = flags = 0;
     family = AF_UNSPEC;
@@ -5516,20 +5521,8 @@ socket_getaddrinfo(PyObject *self, PyObj
         return NULL;
     }
     if (hobj == Py_None) {
-        hptr = NULL;
-    } else if (PyUnicode_Check(hobj)) {
-        _Py_IDENTIFIER(encode);
-
-        idna = _PyObject_CallMethodId(hobj, &PyId_encode, "s", "idna");
-        if (!idna)
-            return NULL;
-        assert(PyBytes_Check(idna));
-        hptr = PyBytes_AS_STRING(idna);
-    } else if (PyBytes_Check(hobj)) {
-        hptr = PyBytes_AsString(hobj);
-    } else {
-        PyErr_SetString(PyExc_TypeError,
-                        "getaddrinfo() argument 1 must be string or None");
+        host.buf = NULL;
+    } else if (!idna_converter(hobj, &host)) {
         return NULL;
     }
     if (PyLong_CheckExact(pobj)) {
@@ -5566,7 +5559,7 @@ socket_getaddrinfo(PyObject *self, PyObj
     hints.ai_flags = flags;
     Py_BEGIN_ALLOW_THREADS
     ACQUIRE_GETADDRINFO_LOCK
-    error = getaddrinfo(hptr, pptr, &hints, &res0);
+    error = getaddrinfo(host.buf, pptr, &hints, &res0);
     Py_END_ALLOW_THREADS
     RELEASE_GETADDRINFO_LOCK  /* see comment in setipaddr() */
     if (error) {
@@ -5595,13 +5588,13 @@ socket_getaddrinfo(PyObject *self, PyObj
             goto err;
         Py_XDECREF(single);
     }
-    Py_XDECREF(idna);
+    idna_cleanup(&host);
     if (res0)
         freeaddrinfo(res0);
     return all;
  err:
     Py_XDECREF(all);
-    Py_XDECREF(idna);
+    idna_cleanup(&host);
     if (res0)
         freeaddrinfo(res0);
     return (PyObject *)NULL;
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to