On Thu, 13 Jun 2019 01:22:48 +0300
Adrian Grigore <adrian.emil.grig...@gmail.com> wrote:

Dear Adrian,

> I started writing a RFC3986 compliant URI parser. It's not done yet.
> Can I get some feedback?
> 
> https://adi.tilde.institute/tmp/uri.h
> https://adi.tilde.institute/tmp/parseuri.c

please find my feedback and suggestions below:

1) First off, you might want to think about hosting your own git or (if
that is not possible) at least upload it to GitHub or Sourcehut. Given
such links tend to die and the files themselves aren't too large, I've
attached them to this response so they are preserved for posterity.

2) Don't forget to add a LICENSE.

3) Don't typedef non-opaque structs!!! Remove the typedefs and
rename the structs to lowercase equivalents, e.g. "struct Uri -> struct
uri" etc. (uppercase names are reserved for opaque types, and your
structs are not opaque)

4) For parsers, it always makes sense to write a test-program that
parses a number of test-cases. Take a look at [0] for a set of
test-cases (especially uris.xslt). Judging from the filename, you are
also probably planning to write a resolver. [0] also provides
test-cases for that.

5) If you are not planning to write a resolver. better rename
parseuri.c to uri.c, so it matches the header. Even if you are in fact
planning to write a resolver, I'd just put it all inside the single
uri.c so it is easier to handle.

6)
#define EURINOSCHEME 96
#define EURIBADSCHEME 97
#define EURIBADUSERINFO 98
#define EURIBADHOST 99
#define EURIBADIPLIT 100

This can be written as an enum, starting with 96. Also maybe the names
could be a bit better to read, as follows. You can then use this enum
as a type as well, in case you pass these errors around as arguments.

enum uri_error {
        EURI_NOSCHEME = 96,
        EURI_BADSCHEME,
        EURI_BADUSERINFO,
        EURI_BADHOST,
        EURI_BADIPLIT,
}

7) I don't see any reason why you would allocate memory anywhere in the
code. There is no need for that. Also, if you insist on allocating
memory, check your malloc()'s and strdup()'s.

8) Regarding the memory management, the way you do it is questionable.
Your function syntax for the main function is

        Uri *parseuri(const char *uri);

and you proceed to allocate a struct Uri within parseuri() that you
later return as a pointer. This is very very bad practice, and you
should rather make it such that the user passes a pointer to an
allocated struct to your parseuri()-function. Then you can change the
return value to an int and reflect error-conditions as follows:

        0     -> all good
        1,2,3 -> error, could be matched with the uri_error enum above

9) Most importantly, don't you think this could be done simpler? Your
code has a lot of commented out stuff. I don't know why you would send
your code to the mailing list for feedback when it contains so much
commented out stuff. Try to clean it up and refactor for more
simplicity.

I hope this feedback is helpful.

With best regards

Laslo Hunhold

-- 
Laslo Hunhold <d...@frign.de>
#define SCHEME_MAX   		32
#define AUTHORITY_MAX USERINFO_MAX + 1 + AUTHORITY_HOST_MAX + 1 + \
		 	AUTHORITY_PORT_MAX
#define AUTHORITY_HOST_MAX	HOST_NAME_MAX
#define AUTHORITY_PORT_MAX     	5
#define USERINFO_MAX USERINFO_USERNAME_MAX + 1 + USERINFO_PASSWORD_MAX
#define USERINFO_USERNAME_MAX 	255
#define USERINFO_PASSWORD_MAX 	255
#define QUERY_MAX		1024
#define FRAGMENT_MAX		1024

typedef struct Uri Uri;
typedef struct Authority Authority;
typedef struct Userinfo Userinfo;

struct Uri {
	char scheme[SCHEME_MAX];
	char username[USERINFO_USERNAME_MAX];
	char password[USERINFO_PASSWORD_MAX];
	char host[AUTHORITY_HOST_MAX];
	char port[AUTHORITY_PORT_MAX];
	char path[PATH_MAX];
	char query[QUERY_MAX];
	char fragment[FRAGMENT_MAX];
};

struct Authority {
	char username[USERINFO_USERNAME_MAX];
	char password[USERINFO_PASSWORD_MAX];
	char host[AUTHORITY_HOST_MAX];
	char port[AUTHORITY_PORT_MAX];
};

struct Userinfo {
	char username[USERINFO_USERNAME_MAX];
	char password[USERINFO_PASSWORD_MAX];
};

Uri *parseuri(const char *uri);
char *parsescheme(const char *scheme);
Authority *parseauthority(const char *authority);
Userinfo *parseuserinfo(const char *userinfo);
char *parsequery(const char *query);
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <search.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "uri.h"

size_t
estrlcpy(char *dst, const char *src, size_t dstsize)
{
	size_t n;

	n = strlcpy(dst, src, dstsize);

	if (n >= dstsize) {
		fputs("liburi: Buffer overflow\n", stderr);
		exit(1);
	}

	return n;
}

#define EURINOSCHEME 96
#define EURIBADSCHEME 97
#define EURIBADUSERINFO 98
#define EURIBADHOST 99
#define EURIBADIPLIT 100

int isschemechr(int c);
int iuserinfochr(int c);
int ishostchr(int c);
int isportchr(int c);
int isiplitchr(int c);
int isipv6addresschr(int c);
int isipvfuture(int c);
int ispctencoded(int c);
int isunreserved(int c);
int isreserved(int c);
int isgendelim(int c);
int issubdelim(int c);

int
isschemechr(int c)
{
	return isalpha(c) ||
		isdigit(c) ||
		c == '+' ||
		c == '.' ||
		c == '-';
}

int
isuserinfochr(int c)
{
	return isunreserved(c) ||
		issubdelim(c) ||
		c == ':';
}

int
isiplitchr(int c)
{
//	return c == '[' ||
}

int
isipv6chr(int c)
{
}

int
ispchar(int c)
{
	return isunreserved(c) ||
		ispctencoded(c) ||
		issubdelim(c) ||
		c == ':' ||
		c == '@';
}

int
isquerychr(int c)
{
	return ispchar(c) ||
		c == '/' ||
		c == '?';
}

int
isfragmentchr(int c)
{
	return ispchar(c) ||
		c == '/' ||
		c == '?';
}

int
ispctencoded(int c) {
	return isxdigit(c) || c == '%';
}

int
isunreserved(int c)
{
	return isalpha(c) ||
		isdigit(c) ||
		c == '-' ||
		c == '.' ||
		c == '_' ||
		c == '~';
}

int
isreserved(int c)
{
	return isgendelim(c) || issubdelim(c);
}

int
isgendelim(int c)
{
	return c == ':' ||
		c == '/' ||
		c == '?' ||
		c == '#' ||
		c == '[' ||
		c == ']' ||
		c == '@';
}

int
issubdelim(int c)
{
	return c == '!' ||
		c == '$' ||
		c == '&' ||
		c == '\'' ||
		c == '(' ||
		c == ')' ||
		c == '*' ||
		c == '+' ||
		c == ',' ||
		c == ';' ||
		c == '=';
}

char *
parsescheme(const char *scheme)
{
	char *Scheme;
	int i = 0, c;

	if (!isalpha(scheme[0])) {
		goto err;
	}

	Scheme = strdup(scheme);

	while((c = Scheme[i])) {
		if (!isschemechr(c))
			goto err;

		Scheme[i] = tolower(c);
		i++;
	}

	if (i != 0)
		goto done;

	err:
	errno = EURIBADSCHEME;
	return NULL;

	done:
	return Scheme;
}

Userinfo *
parseuserinfo(const char *userinfo)
{
	Userinfo *Userinfo;
	char username[USERINFO_USERNAME_MAX];
	char password[USERINFO_PASSWORD_MAX];
	int i = 0, usernamelen = 0, c;

	Userinfo = malloc(sizeof(struct Userinfo));
	if (Userinfo == NULL) {
		free(Userinfo);
		return NULL;
	}

	while((c = userinfo[i])) {
		if (!isuserinfochr(c)) {
			free(Userinfo);
			errno = EURIBADUSERINFO;
			return NULL;
		}
		if (c == ':') {
			username[i] = 0;
			i++;
			usernamelen = i;
			goto password;
		}
		
		username[i] = c;
		i++;
	}

	goto done;

	password:
	while((c = userinfo[i])) {
		if (!isuserinfochr(c)) {
			free(Userinfo);
			errno = EURIBADUSERINFO;
			return NULL;
		}
		password[i - usernamelen] = c;
		i++;
	}

	done:
	strcpy(Userinfo->username, username);
	strcpy(Userinfo->password, password);
	return Userinfo;
}

/*
char *
parseipliteral(const char *iplit)
{
	char *IPLit;
	int i = 0, c;

	if (iplit[0] != '[') {
		errno = EURIBADIPLIT;
		return NULL;
	}

	IPLit = strdup(iplit);

	while((c = IPLit[i])) {
		if (!isiplitchr(c))
			goto err;

		Scheme[i] = tolower(c);
		i++;
	}

	if (i != 0)
		goto done;

	err:
	errno = EURIBADSCHEME;
	return NULL;

}

char *
parsehost(const char *host)
{
	char *Host;

	if (host[0] == '[') {
		return parseipliteral(host);
	}
}
*/
Authority *
parseauthority(const char *authority)
{
	char userinfo[USERINFO_MAX];
	char host[AUTHORITY_HOST_MAX];
	char port[AUTHORITY_PORT_MAX];
	Userinfo *Userinfo;
	Authority *Authority;
	const char *p = authority, *t;

	Authority = malloc(sizeof(struct Authority));
	if (Authority == NULL) {
		free(Authority);
		return NULL;
	}

	t = strchr(authority, '@');
	if (t) {
		strncpy(userinfo, p, t - p);
		userinfo[t - p + 1] = 0;

		Userinfo = parseuserinfo(userinfo);
		strcpy(Authority->username, Userinfo->username);
		strcpy(Authority->password, Userinfo->password);
		p = t + 1;
	}
	
	t = strchr(p, ':');
	if (t) {
		strncpy(host, p, t - p);
		host[t - p + 1] = 0;
		estrlcpy(port, t + 1, sizeof(port));
	} else {
		estrlcpy(host, p, sizeof(host));
	}

	strcpy(Authority->host, host);
	strcpy(Authority->port, port);
	return Authority;
}

/*
Query *
parsequery(const char *query)
{
	Query *Query;

	Query = malloc(sizeof(struct Query));
	if (Query == NULL) {
		free(Query);
		return NULL;
	}

	Query->params = malloc(1 * sizeof(ENTRY));
	Query->params[0] = malloc(sizeof(ENTRY));
	Query->params[0]->key = "foo";
	Query->params[0]->data = "bar";
	Query->len = 1;
	return Query;
}
*/

Uri *
parseuri(const char *uri)
{
	char scheme[SCHEME_MAX];
	char authority[AUTHORITY_MAX];
	char path[PATH_MAX];
	char query[QUERY_MAX];
	Authority *Authority;
	Uri *Uri;
	const char *p = uri, *t;
	char *s;
	int c;

	Uri = malloc(sizeof(struct Uri));
	if (Uri == NULL) {
		free(Uri);
		return NULL;
	}

	t = strchr(uri, ':');
	if (t == NULL) {
		free(Uri);
		errno = EURINOSCHEME;
		return NULL;
	}

	strncpy(scheme, p, t - p);
	scheme[t - p] = 0;

	s = parsescheme(scheme);
	if (s == NULL) {
		free(s);
		free(Uri);
		return NULL;
	}
	estrlcpy(Uri->scheme, s, sizeof(Uri->scheme));

	p = t + 1;

	if (*p == '/') {
		if (*(p + 1) != '/')
			goto path;
		p = p + 2;
		t = p + strcspn(p, "/#?");
		strncpy(authority, p, t - p);
		authority[t - p] = 0;
		Authority = parseauthority(authority);
		if(Authority == NULL) {
			free(Authority);
			free(Uri);
			return NULL;
		}

		strcpy(Uri->username, Authority->username);
		strcpy(Uri->password, Authority->password);
		strcpy(Uri->host, Authority->host);
		strcpy(Uri->port, Authority->port);

		p = t;
		switch(*t) {
			case '/':
				goto path;
			case '?':
				goto query;
			case '#':
				goto fragment;
		}
	}

	path:
		t = p + strcspn(p, "?#");
		if (t == p)
			goto query;
		
		strncpy(Uri->path, p, t - p);
		Uri->path[t - p] = 0;
		p = t;

	query:
		if (*p != '?')
			goto fragment;

		t = p + strcspn(p, "#");

		if (t == p)
			goto fragment;

		strncpy(query, p + 1, t - p - 1);
		query[t - p - 1] = 0;
		//Uri->query = parsequery(query);
		strcpy(Uri->query, query);
		p = t;

	fragment:
		if (*p != '#')
			goto done;
		estrlcpy(Uri->fragment, p + 1, sizeof(Uri->fragment));

	done:
	return Uri;
}

Attachment: pgpq2NyKSiXOw.pgp
Description: PGP signature

Reply via email to