Browse Source

Write parser for urls.

tags/0.9.0
Vsevolod Stakhov 9 years ago
parent
commit
37b4f84e4e
1 changed files with 275 additions and 6 deletions
  1. 275
    6
      src/libserver/url.c

+ 275
- 6
src/libserver/url.c View File

@@ -928,6 +928,10 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str)
}

if (st == parse_domain) {
if (p - c == 0) {
return 1;
}

u->field_set |= 1 << UF_HOST;
u->field_data[UF_HOST].len = p - c;
u->field_data[UF_HOST].off = c - str;
@@ -935,9 +939,11 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str)
return 0;
}
else if (st == parse_query) {
u->field_set |= 1 << UF_QUERY;
u->field_data[UF_QUERY].len = p - c;
u->field_data[UF_QUERY].off = c - str;
if (p - c > 0) {
u->field_set |= 1 << UF_QUERY;
u->field_data[UF_QUERY].len = p - c;
u->field_data[UF_QUERY].off = c - str;
}

return 0;
}
@@ -945,6 +951,270 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str)
return 1;
}

static gint
rspamd_web_parse (struct http_parser_url *u, const gchar *str)
{
const gchar *p = str, *c = str;
gchar t;
gunichar uc;
glong pt;
gint ret = 1;
enum {
parse_protocol,
parse_slash,
parse_slash_slash,
parse_semicolon,
parse_user,
parse_at,
parse_password_start,
parse_password,
parse_domain,
parse_port,
parse_suffix_slash,
parse_path,
parse_query,
parse_part
} st = parse_protocol;

while (*p) {
t = *p;

switch (st) {
case parse_protocol:
if (t == ':') {
st = parse_semicolon;
u->field_set |= 1 << UF_SCHEMA;
u->field_data[UF_SCHEMA].len = p - c;
u->field_data[UF_SCHEMA].off = 0;
}
p ++;
break;
case parse_semicolon:
if (t == '/') {
st = parse_slash;
p ++;
}
else {
st = parse_slash_slash;
}
break;
case parse_slash:
if (t == '/') {
st = parse_slash_slash;
}
else {
return 1;
}
p ++;
break;
case parse_slash_slash:
c = p;

/* XXX: inefficient lookahead */
if (strchr (p, '@') != NULL) {
st = parse_user;
}
else {
st = parse_domain;
}

break;
case parse_user:
if (t == ':') {
if (p - c == 0) {
return 1;
}
u->field_set |= 1 << UF_USERINFO;
u->field_data[UF_USERINFO].len = p - c;
u->field_data[UF_USERINFO].off = c - str;
st = parse_password_start;
}
else if (t == '@') {
/* No password */
if (p - c == 0) {
return 1;
}
u->field_set |= 1 << UF_USERINFO;
u->field_data[UF_USERINFO].len = p - c;
u->field_data[UF_USERINFO].off = c - str;
st = parse_at;
}
else if (!is_atom (t)) {
return 1;
}
p ++;
break;
case parse_password_start:
if (t == '@') {
/* Empty password */
st = parse_at;
}
else {
c = p;
st = parse_password;
}
p ++;
break;
case parse_password:
if (t == '@') {
/* XXX: password is not stored */
st = parse_at;
}
else if (!is_atom (t)) {
return 1;
}
p ++;
break;
case parse_at:
c = p;
st = parse_domain;
break;
case parse_domain:
if (t == '/' || t == ':') {
if (p - c == 0) {
return 1;
}
u->field_set |= 1 << UF_HOST;
u->field_data[UF_HOST].len = p - c;
u->field_data[UF_HOST].off = c - str;

if (t == '/') {
st = parse_suffix_slash;
}
else {
st = parse_port;
}
p ++;
}
else {
if (*p != '.' && *p != '-' && *p != '_') {
uc = g_utf8_get_char_validated (p, -1);

if (uc == (gunichar)-1) {
/* Bad utf8 */
return 1;
}

if (!g_unichar_isalnum (uc)) {
/* Bad symbol */
return 1;
}

p = g_utf8_next_char (p);
}
else {
p ++;
}
}
break;
case parse_port:
if (t == '/') {
pt = strtoul (c, NULL, 10);
if (pt == 0 || pt > 65535) {
return 1;
}
u->port = pt;
st = parse_suffix_slash;
}
else if (!g_ascii_isdigit (t)) {
return 1;
}
p ++;
break;
case parse_suffix_slash:
if (t != '/') {
c = p;
st = parse_path;
}
else {
/* Skip extra slashes */
p ++;
}
break;
case parse_path:
if (t == '?') {
if (p - c != 0) {
u->field_set |= 1 << UF_PATH;
u->field_data[UF_PATH].len = p - c;
u->field_data[UF_PATH].off = c - str;
}
c = p + 1;
st = parse_query;
}
p ++;
break;
case parse_query:
if (t == '#') {
if (p - c != 0) {
u->field_set |= 1 << UF_QUERY;
u->field_data[UF_QUERY].len = p - c;
u->field_data[UF_QUERY].off = c - str;
}
c = p + 1;
st = parse_part;
}
p ++;
break;
case parse_part:
/* Allow anything here */
p ++;
break;
}
}

/* Parse remaining */
switch (st) {
case parse_domain:
if (p - c == 0) {
return 1;
}
u->field_set |= 1 << UF_HOST;
u->field_data[UF_HOST].len = p - c;
u->field_data[UF_HOST].off = c - str;
ret = 0;

break;
case parse_port:
pt = strtoul (c, NULL, 10);
if (pt == 0 || pt > 65535) {
return 1;
}
u->port = pt;
ret = 0;
break;
case parse_path:
if (p - c > 0) {
u->field_set |= 1 << UF_PATH;
u->field_data[UF_PATH].len = p - c;
u->field_data[UF_PATH].off = c - str;
}
ret = 0;
break;
case parse_query:
if (p - c > 0) {
u->field_set |= 1 << UF_QUERY;
u->field_data[UF_QUERY].len = p - c;
u->field_data[UF_QUERY].off = c - str;
}
ret = 0;
break;
case parse_part:
if (p - c > 0) {
u->field_set |= 1 << UF_FRAGMENT;
u->field_data[UF_FRAGMENT].len = p - c;
u->field_data[UF_FRAGMENT].off = c - str;
}
ret = 0;
break;
default:
/* Error state */
ret = 1;
break;
}

return ret;
}

enum uri_errno
rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
rspamd_mempool_t *pool)
@@ -1003,7 +1273,6 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
}

uri->string = p;
len = strlen (p);

rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p);

@@ -1013,11 +1282,11 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
ret = rspamd_mailto_parse (&u, p);
}
else {
ret = http_parser_parse_url (p, len, 0, &u);
ret = rspamd_web_parse (&u, p);
}
}
else {
ret = http_parser_parse_url (p, len, 0, &u);
ret = rspamd_web_parse (&u, p);
}

if (ret != 0) {

Loading…
Cancel
Save