On 26/01/2026 04:30, Gianluca Cannata wrote:
Hi Amos, Samuel, The Hurd Team,

this a follow up of my previous email that strive to handle HTTP redirect.

There are two files: one adds simply the User-Agent because some
websites does not like HTTP/1.0 requests without an User-Agent header;
the second one implements a simple redirect mechanism if the first
HEAD request returns a response with a Location header.

Tried with:

settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
In the HTML parser for parsing tmp
Connecting to gnu.org via gnu.org:80
HTTP Protocol Verified. Status: 301
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
entering the main loop

ls -1 /tmp/site/
filling out dir tmp
index.html

cat /tmp/site/index.html
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200

I have a question: in the next patch shall I focus on removing the
HEAD and using only a GET ? Because this patch does not handle the
case if eventually the GET request replies with a Location header.

PS: the result after ls command is much longer but i cut it off for brevity.

Sincerely,

Gianluca


Looking at just the HEAD handling loop ...


Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
        size_t towrite;
        char buffer[4096];
        ssize_t bytes_read;
+       int redirects_followed = 0;
- /* 1. Target selection. - * If ip_addr (proxy global variable) is set, we use it. - * Otherwise we use the node URL.
-        */
-       const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : 
node->url;
-
-       /* 2. Agnostic resolution (IPv4/IPv6) */
-       if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, 
&protocol)) != 0) {
-               fprintf (stderr, "Cannot resolve host: %s\n", target_host);
-               return err;
-       }
+       while (1) {
+               if (redirects_followed > max_redirects)
+                       return ELOOP;
+
+               /* 1. Target selection.
+ * If ip_addr (proxy global variable) is set, we use it. + * Otherwise we use the node URL.
+                */
+               const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? 
ip_addr : node->url;
+
+               /* 2. Agnostic resolution (IPv4/IPv6) */
+               if ((err = lookup_host (target_host, &server_addr, &addr_len, 
&sock_type, &protocol)) != 0) {
+                       fprintf (stderr, "Cannot resolve host: %s\n", 
target_host);
+                       return err;
+               }

Might be better to have this as a do {...} while (!err) loop, since at least one attempt should always happen.


- /* 3. Set of the port. */
-       if (server_addr.ss_family == AF_INET) {
-               ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
-       } else if (server_addr.ss_family == AF_INET6) {
-               ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
-       }
+               /* 3. Set of the port. */
+               if (server_addr.ss_family == AF_INET) {
+                       ((struct sockaddr_in *)&server_addr)->sin_port = htons 
(port);
+               } else if (server_addr.ss_family == AF_INET6) {
+                       ((struct sockaddr_in6 *)&server_addr)->sin6_port = 
htons (port);
+               }
- if (debug_flag)
-               fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, 
target_host, port);
+               if (debug_flag)
+                       fprintf (stderr, "Connecting to %s via %s:%d\n", 
node->url, target_host, port);
- /* 4. First connection: HEAD request */
-       *fd = socket (server_addr.ss_family, sock_type, protocol);
-       if (*fd == -1)
-       {
-               perror ("Socket creation error for HEAD request");
-               return errno;
-       }
+               /* 4. First connection: HEAD request */
+               *fd = socket (server_addr.ss_family, sock_type, protocol);
+               if (*fd == -1)
+               {
+                       perror ("Socket creation error for HEAD request");
+                       return errno;
+               }
- if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
-               perror ("Connection to remote host failed");
-               close (*fd);
-               return errno;
-       }
+               if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == 
-1) {
+                       perror ("Connection to remote host failed");
+                       close (*fd);
+                       return errno;
+               }
- /* Send a HEAD request find header length */
-       sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: 
%s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
-       towrite = strlen (buffer);
-       written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
-       if ( written == -1 || written < towrite )
-       {
-               fprintf(stderr,"Could not send an HTTP request to host\n");
-               return errno;
-       }
+               /* Send a HEAD request find header length */
+               sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: 
%s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+               towrite = strlen (buffer);
+               written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
+               if ( written == -1 || written < towrite )
+               {
+                       fprintf(stderr,"Could not send an HTTP request to 
host\n");
+                       return errno;
+               }
+
+               /* Check HTTP status code and handle other than 200 OK only */
+               err = translate_http_status (*fd, &bytes_read);
+
+               /* Follow a redirect up to max_redirects */
+               if (err == EAGAIN) {
+                       /* Read the HEAD response headers line by line and find 
Location: string */
+                       char line[1024];


HTTP agents are required to support URLs of at least 8000 bytes.

So allocating a much larger buffer dynamically instead of on the stack would be best.


+                       char *new_url = NULL;
+                       ssize_t nheader;
+
+                       while (1) {
+                               size_t i = 0;
+                               char c;
+                               while (i < sizeof (line) - 1) {
+                                       if (read (*fd, &c, 1) <= 0) break;
+                                       line[i++] = c;
+                                       if (c == '\n') break;
+                               }
+
+                               line[i] = '\0';
+
+                               if (line[0] == '\r' || line[0] == '\n' || i == 
0) break;


FYI, Bare-CR is a known security issue with HTTP/1.x. There are two cases to check for:

1) '\r' existing by itself in the middle of a logical-line.
  This is a protocol error.

2) multiple '\r' before the '\n' line terminator.
  - These can be ignored, or treated as a protocol error.



+
+                               if (strncasecmp (line, "Location:", 9) == 0) {
+                                       char *url_start = line + 9;
+                                       while (*url_start == ' ' || *url_start 
== '\t') url_start++;
+
+                                       char *url_end = strpbrk (url_start, 
"\r\n");
+                                       if (url_end) *url_end = '\0';
+
+                                       new_url = strdup (url_start);
+                               }
+                       }
+
+                       close (*fd);
+
+                       if (new_url) {
+                               if (strncasecmp (new_url, "https://";, 8) == 0) {
+                                       free (new_url);
+                                       return EPROTO;
+                               }
+
+                               char *host = new_url;
+                               if (strncasecmp (new_url, "http://";, 7) == 0) 
host = host + 7;
+
+                               char *slash = strchr (host, '/');
+

FYI, There is also the weird case of URI path-abempty which means the delimiter here can also be '?', or '#', or the end of string marker.


+                               if (node->url) free (node->url);
+                               if (node->conn_req) free (node->conn_req);
+
+                               if (slash) {
+                                       node->url = strndup (host, slash - 
host);
+                                       node->conn_req = strdup (slash);
+                               } else {
+                                       node->url = strdup (host);
+                                       node->conn_req = strdup ("/");
+                               }
+
+                               if (node->comm_buf) free (node->comm_buf);
+                               asprintf (&node->comm_buf, "GET %s HTTP/1.0\r\nHost: 
%s\r\nUser-Agent: %s/%s\r\n\r\n",
+                                               node->conn_req, node->url, 
HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
> +> +                               free (new_url);
+                               redirects_followed++;
+                               continue;
+                       }
+
+                       return EPROTO;
+               }
+
+               if (err != 0) {
+                       close (*fd);
+                       return err;
+               }
+
+               int n = read (*fd, buffer, sizeof (buffer));
+               if (n >= 0) {
+                       buffer[n] = '\0';
+                       *head_len = bytes_read + n;
+               }
- /* Check HTTP status code and handle other than 200 OK only */
-       if ((err = translate_http_status (*fd, &bytes_read)) != 0)
-       {
-               close (*fd);
-               return err;
-       }
- - int n = read(*fd,buffer,sizeof(buffer)); - if ( n < 0 ) - {
-               perror ("Failed to read HEAD response");
                close (*fd);
-               return errno;
+               break;
        }
-       buffer[n] = '\0';
-
-       *head_len = bytes_read + n;
- - close(*fd); /* 5. Second connection: GET request */
        /* Send the GET request for the url */



Reply via email to