On Thu, 13 Jun 2019 01:22:48 +0300 Adrian Grigore <adrian.emil.grig...@gmail.com> wrote:
Dear Adrian, > I started writing a RFC3986 compliant URI parser. It's not done yet. > Can I get some feedback? > > https://adi.tilde.institute/tmp/uri.h > https://adi.tilde.institute/tmp/parseuri.c please find my feedback and suggestions below: 1) First off, you might want to think about hosting your own git or (if that is not possible) at least upload it to GitHub or Sourcehut. Given such links tend to die and the files themselves aren't too large, I've attached them to this response so they are preserved for posterity. 2) Don't forget to add a LICENSE. 3) Don't typedef non-opaque structs!!! Remove the typedefs and rename the structs to lowercase equivalents, e.g. "struct Uri -> struct uri" etc. (uppercase names are reserved for opaque types, and your structs are not opaque) 4) For parsers, it always makes sense to write a test-program that parses a number of test-cases. Take a look at [0] for a set of test-cases (especially uris.xslt). Judging from the filename, you are also probably planning to write a resolver. [0] also provides test-cases for that. 5) If you are not planning to write a resolver. better rename parseuri.c to uri.c, so it matches the header. Even if you are in fact planning to write a resolver, I'd just put it all inside the single uri.c so it is easier to handle. 6) #define EURINOSCHEME 96 #define EURIBADSCHEME 97 #define EURIBADUSERINFO 98 #define EURIBADHOST 99 #define EURIBADIPLIT 100 This can be written as an enum, starting with 96. Also maybe the names could be a bit better to read, as follows. You can then use this enum as a type as well, in case you pass these errors around as arguments. enum uri_error { EURI_NOSCHEME = 96, EURI_BADSCHEME, EURI_BADUSERINFO, EURI_BADHOST, EURI_BADIPLIT, } 7) I don't see any reason why you would allocate memory anywhere in the code. There is no need for that. Also, if you insist on allocating memory, check your malloc()'s and strdup()'s. 8) Regarding the memory management, the way you do it is questionable. Your function syntax for the main function is Uri *parseuri(const char *uri); and you proceed to allocate a struct Uri within parseuri() that you later return as a pointer. This is very very bad practice, and you should rather make it such that the user passes a pointer to an allocated struct to your parseuri()-function. Then you can change the return value to an int and reflect error-conditions as follows: 0 -> all good 1,2,3 -> error, could be matched with the uri_error enum above 9) Most importantly, don't you think this could be done simpler? Your code has a lot of commented out stuff. I don't know why you would send your code to the mailing list for feedback when it contains so much commented out stuff. Try to clean it up and refactor for more simplicity. I hope this feedback is helpful. With best regards Laslo Hunhold -- Laslo Hunhold <d...@frign.de>
#define SCHEME_MAX 32 #define AUTHORITY_MAX USERINFO_MAX + 1 + AUTHORITY_HOST_MAX + 1 + \ AUTHORITY_PORT_MAX #define AUTHORITY_HOST_MAX HOST_NAME_MAX #define AUTHORITY_PORT_MAX 5 #define USERINFO_MAX USERINFO_USERNAME_MAX + 1 + USERINFO_PASSWORD_MAX #define USERINFO_USERNAME_MAX 255 #define USERINFO_PASSWORD_MAX 255 #define QUERY_MAX 1024 #define FRAGMENT_MAX 1024 typedef struct Uri Uri; typedef struct Authority Authority; typedef struct Userinfo Userinfo; struct Uri { char scheme[SCHEME_MAX]; char username[USERINFO_USERNAME_MAX]; char password[USERINFO_PASSWORD_MAX]; char host[AUTHORITY_HOST_MAX]; char port[AUTHORITY_PORT_MAX]; char path[PATH_MAX]; char query[QUERY_MAX]; char fragment[FRAGMENT_MAX]; }; struct Authority { char username[USERINFO_USERNAME_MAX]; char password[USERINFO_PASSWORD_MAX]; char host[AUTHORITY_HOST_MAX]; char port[AUTHORITY_PORT_MAX]; }; struct Userinfo { char username[USERINFO_USERNAME_MAX]; char password[USERINFO_PASSWORD_MAX]; }; Uri *parseuri(const char *uri); char *parsescheme(const char *scheme); Authority *parseauthority(const char *authority); Userinfo *parseuserinfo(const char *userinfo); char *parsequery(const char *query);
#include <ctype.h> #include <errno.h> #include <limits.h> #include <search.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "uri.h" size_t estrlcpy(char *dst, const char *src, size_t dstsize) { size_t n; n = strlcpy(dst, src, dstsize); if (n >= dstsize) { fputs("liburi: Buffer overflow\n", stderr); exit(1); } return n; } #define EURINOSCHEME 96 #define EURIBADSCHEME 97 #define EURIBADUSERINFO 98 #define EURIBADHOST 99 #define EURIBADIPLIT 100 int isschemechr(int c); int iuserinfochr(int c); int ishostchr(int c); int isportchr(int c); int isiplitchr(int c); int isipv6addresschr(int c); int isipvfuture(int c); int ispctencoded(int c); int isunreserved(int c); int isreserved(int c); int isgendelim(int c); int issubdelim(int c); int isschemechr(int c) { return isalpha(c) || isdigit(c) || c == '+' || c == '.' || c == '-'; } int isuserinfochr(int c) { return isunreserved(c) || issubdelim(c) || c == ':'; } int isiplitchr(int c) { // return c == '[' || } int isipv6chr(int c) { } int ispchar(int c) { return isunreserved(c) || ispctencoded(c) || issubdelim(c) || c == ':' || c == '@'; } int isquerychr(int c) { return ispchar(c) || c == '/' || c == '?'; } int isfragmentchr(int c) { return ispchar(c) || c == '/' || c == '?'; } int ispctencoded(int c) { return isxdigit(c) || c == '%'; } int isunreserved(int c) { return isalpha(c) || isdigit(c) || c == '-' || c == '.' || c == '_' || c == '~'; } int isreserved(int c) { return isgendelim(c) || issubdelim(c); } int isgendelim(int c) { return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || c == ']' || c == '@'; } int issubdelim(int c) { return c == '!' || c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || c == '='; } char * parsescheme(const char *scheme) { char *Scheme; int i = 0, c; if (!isalpha(scheme[0])) { goto err; } Scheme = strdup(scheme); while((c = Scheme[i])) { if (!isschemechr(c)) goto err; Scheme[i] = tolower(c); i++; } if (i != 0) goto done; err: errno = EURIBADSCHEME; return NULL; done: return Scheme; } Userinfo * parseuserinfo(const char *userinfo) { Userinfo *Userinfo; char username[USERINFO_USERNAME_MAX]; char password[USERINFO_PASSWORD_MAX]; int i = 0, usernamelen = 0, c; Userinfo = malloc(sizeof(struct Userinfo)); if (Userinfo == NULL) { free(Userinfo); return NULL; } while((c = userinfo[i])) { if (!isuserinfochr(c)) { free(Userinfo); errno = EURIBADUSERINFO; return NULL; } if (c == ':') { username[i] = 0; i++; usernamelen = i; goto password; } username[i] = c; i++; } goto done; password: while((c = userinfo[i])) { if (!isuserinfochr(c)) { free(Userinfo); errno = EURIBADUSERINFO; return NULL; } password[i - usernamelen] = c; i++; } done: strcpy(Userinfo->username, username); strcpy(Userinfo->password, password); return Userinfo; } /* char * parseipliteral(const char *iplit) { char *IPLit; int i = 0, c; if (iplit[0] != '[') { errno = EURIBADIPLIT; return NULL; } IPLit = strdup(iplit); while((c = IPLit[i])) { if (!isiplitchr(c)) goto err; Scheme[i] = tolower(c); i++; } if (i != 0) goto done; err: errno = EURIBADSCHEME; return NULL; } char * parsehost(const char *host) { char *Host; if (host[0] == '[') { return parseipliteral(host); } } */ Authority * parseauthority(const char *authority) { char userinfo[USERINFO_MAX]; char host[AUTHORITY_HOST_MAX]; char port[AUTHORITY_PORT_MAX]; Userinfo *Userinfo; Authority *Authority; const char *p = authority, *t; Authority = malloc(sizeof(struct Authority)); if (Authority == NULL) { free(Authority); return NULL; } t = strchr(authority, '@'); if (t) { strncpy(userinfo, p, t - p); userinfo[t - p + 1] = 0; Userinfo = parseuserinfo(userinfo); strcpy(Authority->username, Userinfo->username); strcpy(Authority->password, Userinfo->password); p = t + 1; } t = strchr(p, ':'); if (t) { strncpy(host, p, t - p); host[t - p + 1] = 0; estrlcpy(port, t + 1, sizeof(port)); } else { estrlcpy(host, p, sizeof(host)); } strcpy(Authority->host, host); strcpy(Authority->port, port); return Authority; } /* Query * parsequery(const char *query) { Query *Query; Query = malloc(sizeof(struct Query)); if (Query == NULL) { free(Query); return NULL; } Query->params = malloc(1 * sizeof(ENTRY)); Query->params[0] = malloc(sizeof(ENTRY)); Query->params[0]->key = "foo"; Query->params[0]->data = "bar"; Query->len = 1; return Query; } */ Uri * parseuri(const char *uri) { char scheme[SCHEME_MAX]; char authority[AUTHORITY_MAX]; char path[PATH_MAX]; char query[QUERY_MAX]; Authority *Authority; Uri *Uri; const char *p = uri, *t; char *s; int c; Uri = malloc(sizeof(struct Uri)); if (Uri == NULL) { free(Uri); return NULL; } t = strchr(uri, ':'); if (t == NULL) { free(Uri); errno = EURINOSCHEME; return NULL; } strncpy(scheme, p, t - p); scheme[t - p] = 0; s = parsescheme(scheme); if (s == NULL) { free(s); free(Uri); return NULL; } estrlcpy(Uri->scheme, s, sizeof(Uri->scheme)); p = t + 1; if (*p == '/') { if (*(p + 1) != '/') goto path; p = p + 2; t = p + strcspn(p, "/#?"); strncpy(authority, p, t - p); authority[t - p] = 0; Authority = parseauthority(authority); if(Authority == NULL) { free(Authority); free(Uri); return NULL; } strcpy(Uri->username, Authority->username); strcpy(Uri->password, Authority->password); strcpy(Uri->host, Authority->host); strcpy(Uri->port, Authority->port); p = t; switch(*t) { case '/': goto path; case '?': goto query; case '#': goto fragment; } } path: t = p + strcspn(p, "?#"); if (t == p) goto query; strncpy(Uri->path, p, t - p); Uri->path[t - p] = 0; p = t; query: if (*p != '?') goto fragment; t = p + strcspn(p, "#"); if (t == p) goto fragment; strncpy(query, p + 1, t - p - 1); query[t - p - 1] = 0; //Uri->query = parsequery(query); strcpy(Uri->query, query); p = t; fragment: if (*p != '#') goto done; estrlcpy(Uri->fragment, p + 1, sizeof(Uri->fragment)); done: return Uri; }
pgpq2NyKSiXOw.pgp
Description: PGP signature