Il giorno lun 26 gen 2026 alle ore 08:38 Amos Jeffries
<[email protected]> ha scritto:
>
> On 26/01/2026 04:30, Gianluca Cannata wrote:
> > Hi Amos, Samuel, The Hurd Team,
> >
> > this a follow up of my previous email that strive to handle HTTP redirect.
> >
> > There are two files: one adds simply the User-Agent because some
> > websites does not like HTTP/1.0 requests without an User-Agent header;
> > the second one implements a simple redirect mechanism if the first
> > HEAD request returns a response with a Location header.
> >
> > Tried with:
> >
> > settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
> > In the HTML parser for parsing tmp
> > Connecting to gnu.org via gnu.org:80
> > HTTP Protocol Verified. Status: 301
> > Connecting to www.gnu.org via www.gnu.org:80
> > HTTP Protocol Verified. Status: 200
> > entering the main loop
> >
> > ls -1 /tmp/site/
> > filling out dir tmp
> > index.html
> >
> > cat /tmp/site/index.html
> > Connecting to www.gnu.org via www.gnu.org:80
> > HTTP Protocol Verified. Status: 200
> >
> > I have a question: in the next patch shall I focus on removing the
> > HEAD and using only a GET ? Because this patch does not handle the
> > case if eventually the GET request replies with a Location header.
> >
> > PS: the result after ls command is much longer but i cut it off for brevity.
> >
> > Sincerely,
> >
> > Gianluca
>
>
> Looking at just the HEAD handling loop ...
>
>
> > Index: httpfs/http.c
> > ===================================================================
> > --- httpfs.orig/http.c
> > +++ httpfs/http.c
> > @@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
> >         size_t towrite;
> >         char buffer[4096];
> >         ssize_t bytes_read;
> > +       int redirects_followed = 0;
> >
> > -       /* 1. Target selection.
> > -        * If ip_addr (proxy global variable) is set, we use it.
> > -        * Otherwise we use the node URL.
> > -        */
> > -       const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? 
> > ip_addr : node->url;
> > -
> > -       /* 2. Agnostic resolution (IPv4/IPv6) */
> > -       if ((err = lookup_host (target_host, &server_addr, &addr_len, 
> > &sock_type, &protocol)) != 0) {
> > -               fprintf (stderr, "Cannot resolve host: %s\n", target_host);
> > -               return err;
> > -       }
> > +       while (1) {
> > +               if (redirects_followed > max_redirects)
> > +                       return ELOOP;
> > +
> > +               /* 1. Target selection.
> > +                * If ip_addr (proxy global variable) is set, we use it.
> > +                * Otherwise we use the node URL.
> > +                */
> > +               const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 
> > 0) ? ip_addr : node->url;
> > +
> > +               /* 2. Agnostic resolution (IPv4/IPv6) */
> > +               if ((err = lookup_host (target_host, &server_addr, 
> > &addr_len, &sock_type, &protocol)) != 0) {
> > +                       fprintf (stderr, "Cannot resolve host: %s\n", 
> > target_host);
> > +                       return err;
> > +               }
>
> Might be better to have this as a do {...} while (!err) loop, since at
> least one attempt should always happen.

I may be wrong, but isn't while (1) always true and it will iterate at
least once like a do-while ?

>
> >
> > -       /* 3. Set of the port. */
> > -       if (server_addr.ss_family == AF_INET) {
> > -               ((struct sockaddr_in *)&server_addr)->sin_port = htons 
> > (port);
> > -       } else if (server_addr.ss_family == AF_INET6) {
> > -               ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons 
> > (port);
> > -       }
> > +               /* 3. Set of the port. */
> > +               if (server_addr.ss_family == AF_INET) {
> > +                       ((struct sockaddr_in *)&server_addr)->sin_port = 
> > htons (port);
> > +               } else if (server_addr.ss_family == AF_INET6) {
> > +                       ((struct sockaddr_in6 *)&server_addr)->sin6_port = 
> > htons (port);
> > +               }
> >
> > -       if (debug_flag)
> > -               fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, 
> > target_host, port);
> > +               if (debug_flag)
> > +                       fprintf (stderr, "Connecting to %s via %s:%d\n", 
> > node->url, target_host, port);
> >
> > -       /* 4. First connection: HEAD request */
> > -       *fd = socket (server_addr.ss_family, sock_type, protocol);
> > -       if (*fd == -1)
> > -       {
> > -               perror ("Socket creation error for HEAD request");
> > -               return errno;
> > -       }
> > +               /* 4. First connection: HEAD request */
> > +               *fd = socket (server_addr.ss_family, sock_type, protocol);
> > +               if (*fd == -1)
> > +               {
> > +                       perror ("Socket creation error for HEAD request");
> > +                       return errno;
> > +               }
> >
> > -       if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) 
> > {
> > -               perror ("Connection to remote host failed");
> > -               close (*fd);
> > -               return errno;
> > -       }
> > +               if (connect (*fd, (struct sockaddr *)&server_addr, 
> > addr_len) == -1) {
> > +                       perror ("Connection to remote host failed");
> > +                       close (*fd);
> > +                       return errno;
> > +               }
> >
> > -       /* Send a HEAD request find header length */
> > -       sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: 
> > %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, 
> > HTTPFS_SERVER_VERSION);
> > -       towrite = strlen (buffer);
> > -       written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
> > -       if ( written == -1 || written < towrite )
> > -       {
> > -               fprintf(stderr,"Could not send an HTTP request to host\n");
> > -               return errno;
> > -       }
> > +               /* Send a HEAD request find header length */
> > +               sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: 
> > %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, 
> > HTTPFS_SERVER_VERSION);
> > +               towrite = strlen (buffer);
> > +               written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
> > +               if ( written == -1 || written < towrite )
> > +               {
> > +                       fprintf(stderr,"Could not send an HTTP request to 
> > host\n");
> > +                       return errno;
> > +               }
> > +
> > +               /* Check HTTP status code and handle other than 200 OK only 
> > */
> > +               err = translate_http_status (*fd, &bytes_read);
> > +
> > +               /* Follow a redirect up to max_redirects */
> > +               if (err == EAGAIN) {
> > +                       /* Read the HEAD response headers line by line and 
> > find Location: string */
> > +                       char line[1024];
>
>
> HTTP agents are required to support URLs of at least 8000 bytes.
>
> So allocating a much larger buffer dynamically instead of on the stack
> would be best.
>
>
> > +                       char *new_url = NULL;
> > +                       ssize_t nheader;
> > +
> > +                       while (1) {
> > +                               size_t i = 0;
> > +                               char c;
> > +                               while (i < sizeof (line) - 1) {
> > +                                       if (read (*fd, &c, 1) <= 0) break;
> > +                                       line[i++] = c;
> > +                                       if (c == '\n') break;
> > +                               }
> > +
> > +                               line[i] = '\0';
> > +
> > +                               if (line[0] == '\r' || line[0] == '\n' || i 
> > == 0) break;
>
>
> FYI,  Bare-CR is a known security issue with HTTP/1.x. There are two
> cases to check for:
>
> 1) '\r' existing by itself in the middle of a logical-line.
>    This is a protocol error.
>
> 2) multiple '\r' before the '\n' line terminator.
>    - These can be ignored, or treated as a protocol error.
>
>
>
> > +
> > +                               if (strncasecmp (line, "Location:", 9) == 
> > 0) {
> > +                                       char *url_start = line + 9;
> > +                                       while (*url_start == ' ' || 
> > *url_start == '\t') url_start++;
> > +
> > +                                       char *url_end = strpbrk (url_start, 
> > "\r\n");
> > +                                       if (url_end) *url_end = '\0';
> > +
> > +                                       new_url = strdup (url_start);
> > +                               }
> > +                       }
> > +
> > +                       close (*fd);
> > +
> > +                       if (new_url) {
> > +                               if (strncasecmp (new_url, "https://";, 8) == 
> > 0) {
> > +                                       free (new_url);
> > +                                       return EPROTO;
> > +                               }
> > +
> > +                               char *host = new_url;
> > +                               if (strncasecmp (new_url, "http://";, 7) == 
> > 0) host = host + 7;
> > +
> > +                               char *slash = strchr (host, '/');
> > +
>
> FYI, There is also the weird case of URI path-abempty which means the
> delimiter here can also be '?', or '#', or the end of string marker.
>
>
> > +                               if (node->url) free (node->url);
> > +                               if (node->conn_req) free (node->conn_req);
> > +
> > +                               if (slash) {
> > +                                       node->url = strndup (host, slash - 
> > host);
> > +                                       node->conn_req = strdup (slash);
> > +                               } else {
> > +                                       node->url = strdup (host);
> > +                                       node->conn_req = strdup ("/");
> > +                               }
> > +
> > +                               if (node->comm_buf) free (node->comm_buf);
> > +                               asprintf (&node->comm_buf, "GET %s 
> > HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
> > +                                               node->conn_req, node->url, 
> > HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
>  > +> +                               free (new_url);
> > +                               redirects_followed++;
> > +                               continue;
> > +                       }
> > +
> > +                       return EPROTO;
> > +               }
> > +
> > +               if (err != 0) {
> > +                       close (*fd);
> > +                       return err;
> > +               }
> > +
> > +               int n = read (*fd, buffer, sizeof (buffer));
> > +               if (n >= 0) {
> > +                       buffer[n] = '\0';
> > +                       *head_len = bytes_read + n;
> > +               }
> >
> > -       /* Check HTTP status code and handle other than 200 OK only */
> > -       if ((err = translate_http_status (*fd, &bytes_read)) != 0)
> > -       {
> > -               close (*fd);
> > -               return err;
> > -       }
> > -
> > -       int n = read(*fd,buffer,sizeof(buffer));
> > -       if ( n < 0 )
> > -       {
> > -               perror ("Failed to read HEAD response");
> >                 close (*fd);
> > -               return errno;
> > +               break;
> >         }
> > -       buffer[n] = '\0';
> > -
> > -       *head_len = bytes_read + n;
> > -
> > -       close(*fd);
> >
> >         /* 5. Second connection: GET request */
> >         /* Send the GET request for the url */
>
>
>

Reply via email to