Hi Amos, Samuel, The Hurd Team,
this is a follow-up to my previous email; it strives to handle HTTP redirects.
There are two files: the first simply adds a User-Agent header, because some
websites do not like HTTP/1.0 requests without a User-Agent header;
the second one implements a simple redirect mechanism for when the first
HEAD request returns a response with a Location header.
Tried with:
settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
In the HTML parser for parsing tmp
Connecting to gnu.org via gnu.org:80
HTTP Protocol Verified. Status: 301
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
entering the main loop
ls -1 /tmp/site/
filling out dir tmp
index.html
cat /tmp/site/index.html
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
I have a question: in the next patch, shall I focus on removing the
HEAD request and using only a GET? I ask because this patch does not handle
the case where the GET request itself replies with a Location header.
PS: the output of the ls command is much longer, but I cut it off for brevity.
Sincerely,
Gianluca
Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -225,7 +225,7 @@ error_t open_connection(struct netnode *
}
/* Send a HEAD request find header length */
- sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\n\r\n",node->conn_req,node->url);
+ sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
towrite = strlen (buffer);
written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
if ( written == -1 || written < towrite )
@@ -389,7 +389,7 @@ error_t fill_dirnode (struct netnode *di
strcat(conn_req,"/");
}
comm_buf=(char *)malloc((strlen(conn_req)+20)*sizeof(char));
- sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\n\r\n",conn_req,url);
+ sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",conn_req,url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
nd = httpfs_make_node (go->f_type,url,conn_req,comm_buf,f_name);
if (!nd)
Index: httpfs/args.c
===================================================================
--- httpfs.orig/args.c
+++ httpfs/args.c
@@ -40,6 +40,7 @@ static const struct argp_option options[
{"proxy",'X',"STRING",0,"Specify IP address of proxy server"},
{"port",'P',"NUMBER",0,"Specify a non-standard port"},
{"mode",'M',"STRING",0,"Set to directory or file (--mode=dir or --mode=file)"},
+ {"location",'L',"MAX",0,"The maximum number of redirects to follow"},
{0}
};
@@ -78,6 +79,11 @@ static error_t parse_opt (int opt, char
else
return ARGP_ERR_UNKNOWN;
break;
+ case 'L':
+ max_redirects = atoi (arg);
+ if (max_redirects < 0)
+ argp_error (state, "Invalid redirect limit: %s", arg);
+ break;
case ARGP_KEY_ARG:
url = arg;
break;
Index: httpfs/httpfs.c
===================================================================
--- httpfs.orig/httpfs.c
+++ httpfs/httpfs.c
@@ -38,6 +38,7 @@ char *url, *conn_req;
char *ip_addr;
char *dir_tok[25];
struct files *list_of_entries = NULL, *this_entry;
+int max_redirects = 0;
struct httpfs *httpfs; /* filesystem global pointer */
volatile struct mapped_time_value *httpfs_maptime;
Index: httpfs/httpfs.h
===================================================================
--- httpfs.orig/httpfs.h
+++ httpfs/httpfs.h
@@ -52,6 +52,9 @@ extern int no_of_slashes;
* like www.gnu.org/gpl.html and www.gnu.org/ no file given so index.html */
extern char *dir_tok[25];
+/* The maximum number of redirects to follow */
+extern int max_redirects;
+
/* handle all initial parameter parsing */
error_t httpfs_parse_args (int argc, char **argv);
Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
size_t towrite;
char buffer[4096];
ssize_t bytes_read;
+ int redirects_followed = 0;
- /* 1. Target selection.
- * If ip_addr (proxy global variable) is set, we use it.
- * Otherwise we use the node URL.
- */
- const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
-
- /* 2. Agnostic resolution (IPv4/IPv6) */
- if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
- fprintf (stderr, "Cannot resolve host: %s\n", target_host);
- return err;
- }
+ while (1) {
+ if (redirects_followed > max_redirects)
+ return ELOOP;
+
+ /* 1. Target selection.
+ * If ip_addr (proxy global variable) is set, we use it.
+ * Otherwise we use the node URL.
+ */
+ const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
+
+ /* 2. Agnostic resolution (IPv4/IPv6) */
+ if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
+ fprintf (stderr, "Cannot resolve host: %s\n", target_host);
+ return err;
+ }
- /* 3. Set of the port. */
- if (server_addr.ss_family == AF_INET) {
- ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
- } else if (server_addr.ss_family == AF_INET6) {
- ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
- }
+ /* 3. Set of the port. */
+ if (server_addr.ss_family == AF_INET) {
+ ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
+ } else if (server_addr.ss_family == AF_INET6) {
+ ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
+ }
- if (debug_flag)
- fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, target_host, port);
+ if (debug_flag)
+ fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, target_host, port);
- /* 4. First connection: HEAD request */
- *fd = socket (server_addr.ss_family, sock_type, protocol);
- if (*fd == -1)
- {
- perror ("Socket creation error for HEAD request");
- return errno;
- }
+ /* 4. First connection: HEAD request */
+ *fd = socket (server_addr.ss_family, sock_type, protocol);
+ if (*fd == -1)
+ {
+ perror ("Socket creation error for HEAD request");
+ return errno;
+ }
- if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
- perror ("Connection to remote host failed");
- close (*fd);
- return errno;
- }
+ if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
+ perror ("Connection to remote host failed");
+ close (*fd);
+ return errno;
+ }
- /* Send a HEAD request find header length */
- sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
- towrite = strlen (buffer);
- written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
- if ( written == -1 || written < towrite )
- {
- fprintf(stderr,"Could not send an HTTP request to host\n");
- return errno;
- }
+ /* Send a HEAD request find header length */
+ sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+ towrite = strlen (buffer);
+ written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
+ if ( written == -1 || written < towrite )
+ {
+ fprintf(stderr,"Could not send an HTTP request to host\n");
+ return errno;
+ }
+
+ /* Check HTTP status code and handle other than 200 OK only */
+ err = translate_http_status (*fd, &bytes_read);
+
+ /* Follow a redirect up to max_redirects */
+ if (err == EAGAIN) {
+ /* Read the HEAD response headers line by line and find Location: string */
+ char line[1024];
+ char *new_url = NULL;
+ ssize_t nheader;
+
+ while (1) {
+ size_t i = 0;
+ char c;
+ while (i < sizeof (line) - 1) {
+ if (read (*fd, &c, 1) <= 0) break;
+ line[i++] = c;
+ if (c == '\n') break;
+ }
+
+ line[i] = '\0';
+
+ if (line[0] == '\r' || line[0] == '\n' || i == 0) break;
+
+ if (strncasecmp (line, "Location:", 9) == 0) {
+ char *url_start = line + 9;
+ while (*url_start == ' ' || *url_start == '\t') url_start++;
+
+ char *url_end = strpbrk (url_start, "\r\n");
+ if (url_end) *url_end = '\0';
+
+ new_url = strdup (url_start);
+ }
+ }
+
+ close (*fd);
+
+ if (new_url) {
+ if (strncasecmp (new_url, "https://", 8) == 0) {
+ free (new_url);
+ return EPROTO;
+ }
+
+ char *host = new_url;
+ if (strncasecmp (new_url, "http://", 7) == 0) host = host + 7;
+
+ char *slash = strchr (host, '/');
+
+ if (node->url) free (node->url);
+ if (node->conn_req) free (node->conn_req);
+
+ if (slash) {
+ node->url = strndup (host, slash - host);
+ node->conn_req = strdup (slash);
+ } else {
+ node->url = strdup (host);
+ node->conn_req = strdup ("/");
+ }
+
+ if (node->comm_buf) free (node->comm_buf);
+ asprintf (&node->comm_buf, "GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
+ node->conn_req, node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+
+ free (new_url);
+ redirects_followed++;
+ continue;
+ }
+
+ return EPROTO;
+ }
+
+ if (err != 0) {
+ close (*fd);
+ return err;
+ }
+
+ int n = read (*fd, buffer, sizeof (buffer));
+ if (n >= 0) {
+ buffer[n] = '\0';
+ *head_len = bytes_read + n;
+ }
- /* Check HTTP status code and handle other than 200 OK only */
- if ((err = translate_http_status (*fd, &bytes_read)) != 0)
- {
- close (*fd);
- return err;
- }
-
- int n = read(*fd,buffer,sizeof(buffer));
- if ( n < 0 )
- {
- perror ("Failed to read HEAD response");
close (*fd);
- return errno;
+ break;
}
- buffer[n] = '\0';
-
- *head_len = bytes_read + n;
-
- close(*fd);
/* 5. Second connection: GET request */
/* Send the GET request for the url */
@@ -288,126 +358,76 @@ error_t fill_dirnode (struct netnode *di
error_t err = 0;
struct node *nd, **prevp;
struct files *go;
- char *comm_buf,*url,*conn_req,*f_name,*temp,*temp1;
+ char *comm_buf = NULL, *url = NULL, *conn_req = NULL, *f_name = NULL;
if (debug_flag)
fprintf (stderr, "filling out dir %s\n", dir->file_name);
-
- if ( dir->type == HTTP_DIR_NOT_FILLED ) {
- /* it is an unfilled directory so send a GET request for that
- * directory and parse the incoming HTML stream to get the file
- * and directories within that
- * and Fill the intermediate data-structure *file */
- err = parse(dir);
- if ( err )
- return err;
+
+ if (dir->type == HTTP_DIR_NOT_FILLED) {
+ err = parse (dir);
+ if (err) return err;
dir->type = HTTP_DIR;
}
-
dir->noents = TRUE;
dir->num_ents = 0;
prevp = &dir->ents;
-
- for(go=list_of_entries;go!=NULL;go=go->next)
- {
- /* *file linked list contains all the file info obtained from
- * parsing the <a href="..">
- * select the ones belonging to this particular directory
- * and fill its node */
-
- if(strcmp(dir->file_name,go->parent)==0)
- {
- /* got a file in this directory
- * directory under consideration is dir->file_name
- * so have to fetch all files whose parent is
- * dir->file_name, i.e. dir->file_name==go->parent */
-
- if ( go->f_type == HTTP_URL )
- {
- /* its an url
- * url is shown as regular file
- * its name is altered by changing / to .
- * www.gnu.org/gpl.html will be changed to
- * www.gnu.org.gpl.html */
- char *slash;
- conn_req=(char *)malloc((strlen(go->f_name)+8)*sizeof(char));
- slash = strchr(go->f_name, '/');
- if (slash)
- url = strndup(go->f_name, slash - go->f_name);
- else
- url = strdup(go->f_name);
- f_name = strdup(go->f_name);
- int i;
- for (i = 0; f_name[i] != '\0'; i++)
- if (f_name[i] == '/')
- f_name[i] = '.';
-
- sprintf(conn_req,"%s%s","http://",go->f_name);
- }
- else
- {
- /* its not an url */
- f_name = strdup(go->f_name);
- url=strdup(dir->url);
- if ( go != list_of_entries )
- {
- size_t conn_req_size = strlen(dir->conn_req) + strlen(go->f_name) + 1;
- if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
- conn_req_size++; /* We'll need to add a trailing slash later. */
- conn_req=(char *)malloc(conn_req_size*sizeof(char));
- sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
- }
- else
- {
- if ( dir_tok[no_of_slashes] == NULL )
- {
- /* the file corresponding to base url
- * user has given a file explicitly in
- * the url */
- size_t conn_req_size = strlen(dir->conn_req) + strlen(go->f_name) + 1;
- if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
- conn_req_size++; /* We'll need to add a trailing slash later. */
- conn_req=(char *)malloc(conn_req_size*sizeof(char));
- sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
- }
- else
- {
- /* the file corresponding to base url
- * user has not given a file explicitly
- * the url so its the index.html */
- size_t conn_req_size = strlen(dir->conn_req) + 1;
- if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
- conn_req_size++; /* We'll need to add a trailing slash later. */
- conn_req=(char *)malloc(conn_req_size*sizeof(char));
- sprintf(conn_req,"%s",dir->conn_req);
- }
- }
- if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
- /* the filled file is directory so it has to end
- * with a / */
- strcat(conn_req,"/");
- }
- comm_buf=(char *)malloc((strlen(conn_req)+20)*sizeof(char));
- sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",conn_req,url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
- nd = httpfs_make_node (go->f_type,url,conn_req,comm_buf,f_name);
- if (!nd)
- {
- err = ENOMEM;
- return err;
- }
- free(comm_buf);
- free(conn_req);
- free(f_name);
- *prevp = nd;
- nd->prevp = prevp;
- prevp = &nd->next;
- dir->num_ents++;
- if (dir->noents)
- dir->noents = FALSE;
- }
- }
- return err;
+ for (go = list_of_entries; go != NULL; go = go->next) {
+ if (strcmp(dir->file_name, go->parent) == 0) {
+
+ /* Handle URL */
+ if (go->f_type == HTTP_URL) {
+ char *slash = strchr(go->f_name, '/');
+ url = slash ? strndup(go->f_name, slash - go->f_name) : strdup(go->f_name);
+ f_name = strdup(go->f_name);
+
+ for (int i = 0; f_name[i] != '\0'; i++)
+ if (f_name[i] == '/') f_name[i] = '.';
+
+ if (asprintf(&conn_req, "http://%s", go->f_name) < 0) return ENOMEM;
+ }
+ else {
+ /* Handle Local File/Directories */
+ f_name = strdup(go->f_name);
+ url = strdup(dir->url);
+
+ /* Build conn_req if it is a root element or a sub-element */
+ if (go != list_of_entries) {
+ if (asprintf(&conn_req, "%s%s%s", dir->conn_req, go->f_name,
+ (go->f_type == HTTP_DIR || go->f_type == HTTP_DIR_NOT_FILLED) ? "/" : "") < 0)
+ return ENOMEM;
+ } else {
+ /* Base URL */
+ if (asprintf(&conn_req, "%s%s", dir->conn_req,
+ (go->f_type == HTTP_DIR || go->f_type == HTTP_DIR_NOT_FILLED) ? "/" : "") < 0)
+ return ENOMEM;
+ }
+ }
+
+ /* Build comm_buf safely */
+ if (asprintf(&comm_buf, "GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
+ conn_req, url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION) < 0) {
+ return ENOMEM;
+ }
+
+ nd = httpfs_make_node (go->f_type, url, conn_req, comm_buf, f_name);
+
+ /* Final cleanup */
+ free(comm_buf); comm_buf = NULL;
+ free(conn_req); conn_req = NULL;
+ free(url); url = NULL;
+ free(f_name); f_name = NULL;
+
+ if (!nd) return ENOMEM;
+
+ *prevp = nd;
+ nd->prevp = prevp;
+ prevp = &nd->next;
+ dir->num_ents++;
+ dir->noents = FALSE;
+ }
+ }
+ return 0;
}