Il giorno lun 26 gen 2026 alle ore 08:38 Amos Jeffries
<[email protected]> ha scritto:
>
> On 26/01/2026 04:30, Gianluca Cannata wrote:
> > Hi Amos, Samuel, The Hurd Team,
> >
> > this a follow up of my previous email that strive to handle HTTP redirect.
> >
> > There are two files: one adds simply the User-Agent because some
> > websites does not like HTTP/1.0 requests without an User-Agent header;
> > the second one implements a simple redirect mechanism if the first
> > HEAD request returns a response with a Location header.
> >
> > Tried with:
> >
> > settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
> > In the HTML parser for parsing tmp
> > Connecting to gnu.org via gnu.org:80
> > HTTP Protocol Verified. Status: 301
> > Connecting to www.gnu.org via www.gnu.org:80
> > HTTP Protocol Verified. Status: 200
> > entering the main loop
> >
> > ls -1 /tmp/site/
> > filling out dir tmp
> > index.html
> >
> > cat /tmp/site/index.html
> > Connecting to www.gnu.org via www.gnu.org:80
> > HTTP Protocol Verified. Status: 200
> >
> > I have a question: in the next patch shall I focus on removing the
> > HEAD and using only a GET ? Because this patch does not handle the
> > case if eventually the GET request replies with a Location header.
> >
> > PS: the result after ls command is much longer but i cut it off for brevity.
> >
> > Sincerely,
> >
> > Gianluca
>
>
> Looking at just the HEAD handling loop ...
>
>
> > Index: httpfs/http.c
> > ===================================================================
> > --- httpfs.orig/http.c
> > +++ httpfs/http.c
> > @@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
> > size_t towrite;
> > char buffer[4096];
> > ssize_t bytes_read;
> > + int redirects_followed = 0;
> >
> > - /* 1. Target selection.
> > - * If ip_addr (proxy global variable) is set, we use it.
> > - * Otherwise we use the node URL.
> > - */
> > - const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ?
> > ip_addr : node->url;
> > -
> > - /* 2. Agnostic resolution (IPv4/IPv6) */
> > - if ((err = lookup_host (target_host, &server_addr, &addr_len,
> > &sock_type, &protocol)) != 0) {
> > - fprintf (stderr, "Cannot resolve host: %s\n", target_host);
> > - return err;
> > - }
> > + while (1) {
> > + if (redirects_followed > max_redirects)
> > + return ELOOP;
> > +
> > + /* 1. Target selection.
> > + * If ip_addr (proxy global variable) is set, we use it.
> > + * Otherwise we use the node URL.
> > + */
> > + const char *target_host = (strcmp (ip_addr, "0.0.0.0") !=
> > 0) ? ip_addr : node->url;
> > +
> > + /* 2. Agnostic resolution (IPv4/IPv6) */
> > + if ((err = lookup_host (target_host, &server_addr,
> > &addr_len, &sock_type, &protocol)) != 0) {
> > + fprintf (stderr, "Cannot resolve host: %s\n",
> > target_host);
> > + return err;
> > + }
>
> Might be better to have this as a do {...} while (!err) loop, since at
> least one attempt should always happen.
I may be wrong, but isn't while (1) always true and it will iterate at
least once like a do-while ?
>
> >
> > - /* 3. Set of the port. */
> > - if (server_addr.ss_family == AF_INET) {
> > - ((struct sockaddr_in *)&server_addr)->sin_port = htons
> > (port);
> > - } else if (server_addr.ss_family == AF_INET6) {
> > - ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons
> > (port);
> > - }
> > + /* 3. Set of the port. */
> > + if (server_addr.ss_family == AF_INET) {
> > + ((struct sockaddr_in *)&server_addr)->sin_port =
> > htons (port);
> > + } else if (server_addr.ss_family == AF_INET6) {
> > + ((struct sockaddr_in6 *)&server_addr)->sin6_port =
> > htons (port);
> > + }
> >
> > - if (debug_flag)
> > - fprintf (stderr, "Connecting to %s via %s:%d\n", node->url,
> > target_host, port);
> > + if (debug_flag)
> > + fprintf (stderr, "Connecting to %s via %s:%d\n",
> > node->url, target_host, port);
> >
> > - /* 4. First connection: HEAD request */
> > - *fd = socket (server_addr.ss_family, sock_type, protocol);
> > - if (*fd == -1)
> > - {
> > - perror ("Socket creation error for HEAD request");
> > - return errno;
> > - }
> > + /* 4. First connection: HEAD request */
> > + *fd = socket (server_addr.ss_family, sock_type, protocol);
> > + if (*fd == -1)
> > + {
> > + perror ("Socket creation error for HEAD request");
> > + return errno;
> > + }
> >
> > - if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1)
> > {
> > - perror ("Connection to remote host failed");
> > - close (*fd);
> > - return errno;
> > - }
> > + if (connect (*fd, (struct sockaddr *)&server_addr,
> > addr_len) == -1) {
> > + perror ("Connection to remote host failed");
> > + close (*fd);
> > + return errno;
> > + }
> >
> > - /* Send a HEAD request find header length */
> > - sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent:
> > %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME,
> > HTTPFS_SERVER_VERSION);
> > - towrite = strlen (buffer);
> > - written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
> > - if ( written == -1 || written < towrite )
> > - {
> > - fprintf(stderr,"Could not send an HTTP request to host\n");
> > - return errno;
> > - }
> > + /* Send a HEAD request find header length */
> > + sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent:
> > %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME,
> > HTTPFS_SERVER_VERSION);
> > + towrite = strlen (buffer);
> > + written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
> > + if ( written == -1 || written < towrite )
> > + {
> > + fprintf(stderr,"Could not send an HTTP request to
> > host\n");
> > + return errno;
> > + }
> > +
> > + /* Check HTTP status code and handle other than 200 OK only
> > */
> > + err = translate_http_status (*fd, &bytes_read);
> > +
> > + /* Follow a redirect up to max_redirects */
> > + if (err == EAGAIN) {
> > + /* Read the HEAD response headers line by line and
> > find Location: string */
> > + char line[1024];
>
>
> HTTP agents are required to support URLs of at least 8000 bytes.
>
> So allocating a much larger buffer dynamically instead of on the stack
> would be best.
>
>
> > + char *new_url = NULL;
> > + ssize_t nheader;
> > +
> > + while (1) {
> > + size_t i = 0;
> > + char c;
> > + while (i < sizeof (line) - 1) {
> > + if (read (*fd, &c, 1) <= 0) break;
> > + line[i++] = c;
> > + if (c == '\n') break;
> > + }
> > +
> > + line[i] = '\0';
> > +
> > + if (line[0] == '\r' || line[0] == '\n' || i
> > == 0) break;
>
>
> FYI, Bare-CR is a known security issue with HTTP/1.x. There are two
> cases to check for:
>
> 1) '\r' existing by itself in the middle of a logical-line.
> This is a protocol error.
>
> 2) multiple '\r' before the '\n' line terminator.
> - These can be ignored, or treated as a protocol error.
>
>
>
> > +
> > + if (strncasecmp (line, "Location:", 9) ==
> > 0) {
> > + char *url_start = line + 9;
> > + while (*url_start == ' ' ||
> > *url_start == '\t') url_start++;
> > +
> > + char *url_end = strpbrk (url_start,
> > "\r\n");
> > + if (url_end) *url_end = '\0';
> > +
> > + new_url = strdup (url_start);
> > + }
> > + }
> > +
> > + close (*fd);
> > +
> > + if (new_url) {
> > + if (strncasecmp (new_url, "https://", 8) ==
> > 0) {
> > + free (new_url);
> > + return EPROTO;
> > + }
> > +
> > + char *host = new_url;
> > + if (strncasecmp (new_url, "http://", 7) ==
> > 0) host = host + 7;
> > +
> > + char *slash = strchr (host, '/');
> > +
>
> FYI, There is also the weird case of URI path-abempty which means the
> delimiter here can also be '?', or '#', or the end of string marker.
>
>
> > + if (node->url) free (node->url);
> > + if (node->conn_req) free (node->conn_req);
> > +
> > + if (slash) {
> > + node->url = strndup (host, slash -
> > host);
> > + node->conn_req = strdup (slash);
> > + } else {
> > + node->url = strdup (host);
> > + node->conn_req = strdup ("/");
> > + }
> > +
> > + if (node->comm_buf) free (node->comm_buf);
> > + asprintf (&node->comm_buf, "GET %s
> > HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
> > + node->conn_req, node->url,
> > HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
> > +> + free (new_url);
> > + redirects_followed++;
> > + continue;
> > + }
> > +
> > + return EPROTO;
> > + }
> > +
> > + if (err != 0) {
> > + close (*fd);
> > + return err;
> > + }
> > +
> > + int n = read (*fd, buffer, sizeof (buffer));
> > + if (n >= 0) {
> > + buffer[n] = '\0';
> > + *head_len = bytes_read + n;
> > + }
> >
> > - /* Check HTTP status code and handle other than 200 OK only */
> > - if ((err = translate_http_status (*fd, &bytes_read)) != 0)
> > - {
> > - close (*fd);
> > - return err;
> > - }
> > -
> > - int n = read(*fd,buffer,sizeof(buffer));
> > - if ( n < 0 )
> > - {
> > - perror ("Failed to read HEAD response");
> > close (*fd);
> > - return errno;
> > + break;
> > }
> > - buffer[n] = '\0';
> > -
> > - *head_len = bytes_read + n;
> > -
> > - close(*fd);
> >
> > /* 5. Second connection: GET request */
> > /* Send the GET request for the url */
>
>
>