On 26/01/2026 04:30, Gianluca Cannata wrote:
Hi Amos, Samuel, The Hurd Team,
this is a follow-up to my previous email and tries to handle HTTP
redirects. There are two files: one simply adds the User-Agent header,
because some websites do not like HTTP/1.0 requests without one; the
second implements a simple redirect mechanism for the case where the
first HEAD request returns a response with a Location header.
Tried with:
settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
In the HTML parser for parsing tmp
Connecting to gnu.org via gnu.org:80
HTTP Protocol Verified. Status: 301
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
entering the main loop
ls -1 /tmp/site/
filling out dir tmp
index.html
cat /tmp/site/index.html
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
I have a question: in the next patch, should I focus on removing the
HEAD request and using only a GET? This patch does not handle the case
where the eventual GET request itself replies with a Location header.
PS: the output of the ls command is much longer, but I cut it off for brevity.
Sincerely,
Gianluca
Looking at just the HEAD handling loop ...
Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
size_t towrite;
char buffer[4096];
ssize_t bytes_read;
+ int redirects_followed = 0;
- /* 1. Target selection.
- * If ip_addr (proxy global variable) is set, we use it.
- * Otherwise we use the node URL.
- */
- const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
-
- /* 2. Agnostic resolution (IPv4/IPv6) */
- if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
- fprintf (stderr, "Cannot resolve host: %s\n", target_host);
- return err;
- }
+ while (1) {
+ if (redirects_followed > max_redirects)
+ return ELOOP;
+
+ /* 1. Target selection.
+ * If ip_addr (proxy global variable) is set, we use it.
+ * Otherwise we use the node URL.
+ */
+ const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
+
+ /* 2. Agnostic resolution (IPv4/IPv6) */
+ if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
+ fprintf (stderr, "Cannot resolve host: %s\n", target_host);
+ return err;
+ }
Might be better to have this as a do {...} while (!err) loop, since at
least one attempt should always happen.
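Roughly this shape (just a sketch, bodies elided; it keeps this patch's
convention of translate_http_status returning EAGAIN for a redirect):

  redirects_followed = 0;
  do
    {
      /* ... resolve, connect, send the HEAD request ... */

      err = translate_http_status (*fd, &bytes_read);
      if (err == EAGAIN)
        {
          /* ... parse Location, rewrite node->url / node->conn_req,
             close *fd ... */
          redirects_followed++;
        }
    }
  while (err == EAGAIN && redirects_followed <= max_redirects);

  if (err == EAGAIN)
    return ELOOP;   /* too many redirects */
  if (err)
    {
      close (*fd);
      return err;
    }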
- /* 3. Set of the port. */
- if (server_addr.ss_family == AF_INET) {
- ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
- } else if (server_addr.ss_family == AF_INET6) {
- ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
- }
+ /* 3. Set of the port. */
+ if (server_addr.ss_family == AF_INET) {
+ ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
+ } else if (server_addr.ss_family == AF_INET6) {
+ ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
+ }
- if (debug_flag)
- fprintf (stderr, "Connecting to %s via %s:%d\n", node->url,
target_host, port);
+ if (debug_flag)
+ fprintf (stderr, "Connecting to %s via %s:%d\n",
node->url, target_host, port);
- /* 4. First connection: HEAD request */
- *fd = socket (server_addr.ss_family, sock_type, protocol);
- if (*fd == -1)
- {
- perror ("Socket creation error for HEAD request");
- return errno;
- }
+ /* 4. First connection: HEAD request */
+ *fd = socket (server_addr.ss_family, sock_type, protocol);
+ if (*fd == -1)
+ {
+ perror ("Socket creation error for HEAD request");
+ return errno;
+ }
- if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
- perror ("Connection to remote host failed");
- close (*fd);
- return errno;
- }
+ if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
+ perror ("Connection to remote host failed");
+ close (*fd);
+ return errno;
+ }
- /* Send a HEAD request find header length */
- sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent:
%s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
- towrite = strlen (buffer);
- written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
- if ( written == -1 || written < towrite )
- {
- fprintf(stderr,"Could not send an HTTP request to host\n");
- return errno;
- }
+ /* Send a HEAD request find header length */
+ sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent:
%s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+ towrite = strlen (buffer);
+ written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
+ if ( written == -1 || written < towrite )
+ {
+ fprintf(stderr,"Could not send an HTTP request to
host\n");
+ return errno;
+ }
+
+ /* Check HTTP status code and handle other than 200 OK only */
+ err = translate_http_status (*fd, &bytes_read);
+
+ /* Follow a redirect up to max_redirects */
+ if (err == EAGAIN) {
+ /* Read the HEAD response headers line by line and find Location: string */
+ char line[1024];
HTTP agents are required to support URLs of at least 8000 bytes.
So allocating a much larger buffer dynamically instead of on the stack
would be best.
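Something along these lines, as a rough sketch of such a helper
(read_header_line is a made-up name, not anything existing in httpfs):

  #include <stdlib.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* Read one header line from FD into a heap buffer that grows as
     needed, so a Location header carrying an 8000+ byte URL is not
     truncated.  Returns the line length, or -1 on error; the caller
     frees *OUT. */
  static ssize_t
  read_header_line (int fd, char **out)
  {
    size_t cap = 256, len = 0;
    char *line = malloc (cap);
    char c;

    if (!line)
      return -1;

    while (read (fd, &c, 1) == 1)
      {
        if (len + 2 > cap)
          {
            char *bigger = realloc (line, cap *= 2);
            if (!bigger)
              {
                free (line);
                return -1;
              }
            line = bigger;
          }
        line[len++] = c;
        if (c == '\n')
          break;
      }

    line[len] = '\0';
    *out = line;
    return len;
  }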
+ char *new_url = NULL;
+ ssize_t nheader;
+
+ while (1) {
+ size_t i = 0;
+ char c;
+ while (i < sizeof (line) - 1) {
+ if (read (*fd, &c, 1) <= 0) break;
+ line[i++] = c;
+ if (c == '\n') break;
+ }
+
+ line[i] = '\0';
+
+ if (line[0] == '\r' || line[0] == '\n' || i == 0) break;
FYI, Bare-CR is a known security issue with HTTP/1.x. There are two
cases to check for:
1) '\r' existing by itself in the middle of a logical line.
   This is a protocol error.
2) multiple '\r' before the '\n' line terminator.
   These can be ignored, or treated as a protocol error.
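A minimal sketch of such a check on a completed line (the helper name
is made up; this variant rejects both cases as protocol errors):

  #include <stddef.h>

  /* Return non-zero if LINE of length LEN contains no bare CR, i.e.
     every '\r' is immediately followed by '\n'.  A '\r' anywhere
     else, including '\r\r\n', is rejected. */
  static int
  header_line_is_clean (const char *line, size_t len)
  {
    size_t i;

    for (i = 0; i < len; i++)
      if (line[i] == '\r' && !(i + 1 < len && line[i + 1] == '\n'))
        return 0;

    return 1;
  }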
+
+ if (strncasecmp (line, "Location:", 9) == 0) {
+ char *url_start = line + 9;
+ while (*url_start == ' ' || *url_start == '\t') url_start++;
+
+ char *url_end = strpbrk (url_start, "\r\n");
+ if (url_end) *url_end = '\0';
+
+ new_url = strdup (url_start);
+ }
+ }
+
+ close (*fd);
+
+ if (new_url) {
+ if (strncasecmp (new_url, "https://", 8) == 0) {
+ free (new_url);
+ return EPROTO;
+ }
+
+ char *host = new_url;
+ if (strncasecmp (new_url, "http://", 7) == 0) host = host + 7;
+
+ char *slash = strchr (host, '/');
+
FYI, there is also the weird case of URI path-abempty, which means the
delimiter here can also be '?', '#', or the end of the string.
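For illustration, a sketch that also treats '?' and '#' as delimiters;
split_redirect_target is a made-up helper, and it assumes the "http://"
prefix has already been stripped as above:

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>

  /* Split the authority from the rest of the target: '/', '?', '#'
     or the end of the string ends the host part (path-abempty).
     The caller frees *HOST_OUT and *REQ_OUT. */
  static void
  split_redirect_target (const char *host, char **host_out, char **req_out)
  {
    const char *delim = strpbrk (host, "/?#");

    if (delim == NULL)
      {
        *host_out = strdup (host);
        *req_out = strdup ("/");
      }
    else
      {
        *host_out = strndup (host, delim - host);
        if (*delim == '/')
          *req_out = strdup (delim);
        else if (asprintf (req_out, "/%s", delim) < 0) /* keep '?'/'#' part */
          *req_out = NULL;
      }
  }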
+ if (node->url) free (node->url);
+ if (node->conn_req) free (node->conn_req);
+
+ if (slash) {
+ node->url = strndup (host, slash -
host);
+ node->conn_req = strdup (slash);
+ } else {
+ node->url = strdup (host);
+ node->conn_req = strdup ("/");
+ }
+
+ if (node->comm_buf) free (node->comm_buf);
+ asprintf (&node->comm_buf, "GET %s HTTP/1.0\r\nHost:
%s\r\nUser-Agent: %s/%s\r\n\r\n",
+ node->conn_req, node->url,
HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
> +> + free (new_url);
+ redirects_followed++;
+ continue;
+ }
+
+ return EPROTO;
+ }
+
+ if (err != 0) {
+ close (*fd);
+ return err;
+ }
+
+ int n = read (*fd, buffer, sizeof (buffer));
+ if (n >= 0) {
+ buffer[n] = '\0';
+ *head_len = bytes_read + n;
+ }
- /* Check HTTP status code and handle other than 200 OK only */
- if ((err = translate_http_status (*fd, &bytes_read)) != 0)
- {
- close (*fd);
- return err;
- }
-
- int n = read(*fd,buffer,sizeof(buffer));
- if ( n < 0 )
- {
- perror ("Failed to read HEAD response");
close (*fd);
- return errno;
+ break;
}
- buffer[n] = '\0';
-
- *head_len = bytes_read + n;
-
- close(*fd);
/* 5. Second connection: GET request */
/* Send the GET request for the url */