aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-20 12:01:44 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-20 12:01:44 +0000
commit03b9158983703b45c4eda25ac416668a1a5b24b2 (patch)
treee15c0ea71c47bec512d0aa7c9430555cc8cabbb8
parentc18fc9cc0f5d3de75e813a138d12361637f71570 (diff)
downloadrspamd-03b9158983703b45c4eda25ac416668a1a5b24b2.tar.gz
rspamd-03b9158983703b45c4eda25ac416668a1a5b24b2.zip
Rework url detection and decoding.
-rw-r--r--src/libserver/url.c296
1 files changed, 81 insertions, 215 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 6fdc53ef0..0f804a6b2 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -739,9 +739,9 @@ enum {
IS_URLSAFE)) != 0)
void
-rspamd_unescape_uri (u_char **dst, u_char **src, size_t size)
+rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
{
- u_char *d, *s, ch, c, decoded;
+ gchar *d, *s, ch, c, decoded;
enum {
sw_usual = 0,
sw_quoted,
@@ -776,14 +776,14 @@ rspamd_unescape_uri (u_char **dst, u_char **src, size_t size)
case sw_quoted:
if (ch >= '0' && ch <= '9') {
- decoded = (u_char) (ch - '0');
+ decoded = (ch - '0');
state = sw_quoted_second;
break;
}
- c = (u_char) (ch | 0x20);
+ c = (ch | 0x20);
if (c >= 'a' && c <= 'f') {
- decoded = (u_char) (c - 'a' + 10);
+ decoded = (c - 'a' + 10);
state = sw_quoted_second;
break;
}
@@ -801,7 +801,7 @@ rspamd_unescape_uri (u_char **dst, u_char **src, size_t size)
state = sw_usual;
if (ch >= '0' && ch <= '9') {
- ch = (u_char) ((decoded << 4) + ch - '0');
+ ch = ((decoded << 4) + ch - '0');
*d++ = ch;
break;
@@ -809,7 +809,7 @@ rspamd_unescape_uri (u_char **dst, u_char **src, size_t size)
c = (u_char) (ch | 0x20);
if (c >= 'a' && c <= 'f') {
- ch = (u_char) ((decoded << 4) + c - 'a' + 10);
+ ch = ((decoded << 4) + c - 'a' + 10);
if (ch == '?') {
*d++ = ch;
@@ -896,11 +896,19 @@ url_init (void)
return 0;
}
+#define SET_U(u, field) do { \
+ if ((u) != NULL) { \
+ (u)->field_set |= 1 << (field); \
+ (u)->field_data[(field)].len = p - c; \
+ (u)->field_data[(field)].off = c - str; \
+ } \
+} while (0)
+
static gint
-rspamd_mailto_parse (struct http_parser_url *u, const gchar *str,
+rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
gchar const **end)
{
- const gchar *p = str, *c = str;
+ const gchar *p = str, *c = str, *last = str + len;
gchar t;
gint ret = 1;
enum {
@@ -918,16 +926,14 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str,
parse_query
} st = parse_mailto;
- while (*p) {
+ while (p < last) {
t = *p;
switch (st) {
case parse_mailto:
if (t == ':') {
st = parse_semicolon;
- u->field_set |= 1 << UF_SCHEMA;
- u->field_data[UF_SCHEMA].len = p - c;
- u->field_data[UF_SCHEMA].off = 0;
+ SET_U (u, UF_SCHEMA);
}
p ++;
break;
@@ -983,9 +989,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str,
if (p - c == 0) {
goto out;
}
- u->field_set |= 1 << UF_USERINFO;
- u->field_data[UF_USERINFO].len = p - c;
- u->field_data[UF_USERINFO].off = c - str;
+ SET_U (u, UF_USERINFO);
st = parse_at;
}
else if (!is_atom (t)) {
@@ -999,10 +1003,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str,
break;
case parse_domain:
if (t == '?') {
- u->field_set |= 1 << UF_HOST;
- u->field_data[UF_HOST].len = p - c;
- u->field_data[UF_HOST].off = c - str;
-
+ SET_U (u, UF_HOST);
st = parse_suffix_question;
}
else if (!is_domain (t) && t != '.' && t != '_') {
@@ -1025,18 +1026,13 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str,
if (st == parse_domain) {
if (p - c != 0) {
- u->field_set |= 1 << UF_HOST;
- u->field_data[UF_HOST].len = p - c;
- u->field_data[UF_HOST].off = c - str;
-
+ SET_U (u, UF_HOST);
ret = 0;
}
}
else if (st == parse_query) {
if (p - c > 0) {
- u->field_set |= 1 << UF_QUERY;
- u->field_data[UF_QUERY].len = p - c;
- u->field_data[UF_QUERY].off = c - str;
+ SET_U (u, UF_QUERY);
}
ret = 0;
@@ -1051,9 +1047,10 @@ out:
}
static gint
-rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end)
+rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
+ gchar const **end, gboolean strict)
{
- const gchar *p = str, *c = str;
+ const gchar *p = str, *c = str, *last = str + len;
gchar t;
gunichar uc;
glong pt;
@@ -1075,16 +1072,14 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
parse_part
} st = parse_protocol;
- while (*p) {
+ while (p < last) {
t = *p;
switch (st) {
case parse_protocol:
if (t == ':') {
st = parse_semicolon;
- u->field_set |= 1 << UF_SCHEMA;
- u->field_data[UF_SCHEMA].len = p - c;
- u->field_data[UF_SCHEMA].off = 0;
+ SET_U (u, UF_SCHEMA);
}
p ++;
break;
@@ -1123,9 +1118,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
if (p - c == 0) {
goto out;
}
- u->field_set |= 1 << UF_USERINFO;
- u->field_data[UF_USERINFO].len = p - c;
- u->field_data[UF_USERINFO].off = c - str;
+ SET_U (u, UF_USERINFO);
st = parse_password_start;
}
else if (t == '@') {
@@ -1133,9 +1126,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
if (p - c == 0) {
goto out;
}
- u->field_set |= 1 << UF_USERINFO;
- u->field_data[UF_USERINFO].len = p - c;
- u->field_data[UF_USERINFO].off = c - str;
+ SET_U (u, UF_USERINFO);
st = parse_at;
}
else if (!is_atom (t)) {
@@ -1173,9 +1164,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
if (p - c == 0) {
goto out;
}
- u->field_set |= 1 << UF_HOST;
- u->field_data[UF_HOST].len = p - c;
- u->field_data[UF_HOST].off = c - str;
+ SET_U (u, UF_HOST);
if (t == '/') {
st = parse_suffix_slash;
@@ -1188,7 +1177,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
}
else {
if (*p != '.' && *p != '-' && *p != '_') {
- uc = g_utf8_get_char_validated (p, -1);
+ uc = g_utf8_get_char_validated (p, last - p);
if (uc == (gunichar)-1) {
/* Bad utf8 */
@@ -1234,43 +1223,61 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
case parse_path:
if (t == '?') {
if (p - c != 0) {
- u->field_set |= 1 << UF_PATH;
- u->field_data[UF_PATH].len = p - c;
- u->field_data[UF_PATH].off = c - str;
+ SET_U (u, UF_PATH);
}
c = p + 1;
st = parse_query;
}
+ else if (!is_urlsafe (t)) {
+ if (strict) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
+ }
p ++;
break;
case parse_query:
if (t == '#') {
if (p - c != 0) {
- u->field_set |= 1 << UF_QUERY;
- u->field_data[UF_QUERY].len = p - c;
- u->field_data[UF_QUERY].off = c - str;
+ SET_U (u, UF_QUERY);
}
c = p + 1;
st = parse_part;
}
+ else if (!is_urlsafe (t)) {
+ if (strict) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
+ }
p ++;
break;
case parse_part:
- /* Allow anything here */
+ if (!is_urlsafe (t)) {
+ if (strict) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
+ }
p ++;
break;
}
}
+set:
/* Parse remaining */
switch (st) {
case parse_domain:
if (p - c == 0) {
goto out;
}
- u->field_set |= 1 << UF_HOST;
- u->field_data[UF_HOST].len = p - c;
- u->field_data[UF_HOST].off = c - str;
+ SET_U (u, UF_HOST);
ret = 0;
break;
@@ -1284,25 +1291,19 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gchar const **end
break;
case parse_path:
if (p - c > 0) {
- u->field_set |= 1 << UF_PATH;
- u->field_data[UF_PATH].len = p - c;
- u->field_data[UF_PATH].off = c - str;
+ SET_U (u, UF_PATH);
}
ret = 0;
break;
case parse_query:
if (p - c > 0) {
- u->field_set |= 1 << UF_QUERY;
- u->field_data[UF_QUERY].len = p - c;
- u->field_data[UF_QUERY].off = c - str;
+ SET_U (u, UF_QUERY);
}
ret = 0;
break;
case parse_part:
if (p - c > 0) {
- u->field_set |= 1 << UF_FRAGMENT;
- u->field_data[UF_FRAGMENT].len = p - c;
- u->field_data[UF_FRAGMENT].off = c - str;
+ SET_U (u, UF_FRAGMENT);
}
ret = 0;
break;
@@ -1319,6 +1320,8 @@ out:
return ret;
}
+#undef SET_U
+
enum uri_errno
rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
rspamd_mempool_t *pool)
@@ -1371,32 +1374,24 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
return URI_ERRNO_EMPTY;
}
- p = g_uri_unescape_string (uristring, NULL);
- if (p == NULL) {
- return URI_ERRNO_BAD_ENCODING;
- }
-
- uri->string = p;
-
- rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p);
-
if (len > sizeof ("mailto:") - 1) {
/* For mailto: urls we also need to add slashes to make it a valid URL */
if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
- ret = rspamd_mailto_parse (&u, p, NULL);
+ ret = rspamd_mailto_parse (&u, uristring, len, NULL);
}
else {
- ret = rspamd_web_parse (&u, p, NULL);
+ ret = rspamd_web_parse (&u, uristring, len, NULL, TRUE);
}
}
else {
- ret = rspamd_web_parse (&u, p, NULL);
+ ret = rspamd_web_parse (&u, uristring, len, NULL, TRUE);
}
if (ret != 0) {
return URI_ERRNO_BAD_FORMAT;
}
+ p = uristring;
for (i = 0; i < UF_MAX; i ++) {
if (u.field_set & (1 << i)) {
comp = p + u.field_data[i].off;
@@ -1435,6 +1430,15 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
return URI_ERRNO_BAD_FORMAT;
}
+ /* Now decode url symbols */
+ uri->string = rspamd_mempool_strdup (pool, p);
+
+ if (uri->datalen > 0) {
+ rspamd_unescape_uri (&uri->data, &uri->data, uri->datalen);
+ }
+ if (uri->querylen > 0) {
+ rspamd_unescape_uri (&uri->query, &uri->query, uri->querylen);
+ }
rspamd_str_lc (uri->string, uri->protocollen);
rspamd_str_lc (uri->host, uri->hostlen);
@@ -1630,151 +1634,13 @@ url_web_end (const gchar *begin,
const gchar *pos,
url_match_t *match)
{
- const gchar *p, *c;
- gchar open_brace = '\0', close_brace = '\0';
- gint brace_stack = 0;
- gboolean passwd = FALSE;
- guint port, i;
+ const gchar *last = NULL;
- p = pos + strlen (match->pattern);
- for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) {
- if (*p == url_braces[i]) {
- close_brace = url_braces[i + 1];
- open_brace = *p;
- break;
- }
- }
-
- /* find the end of the domain */
- if (is_atom (*p)) {
- c = p;
- while (p < end) {
- if (!is_atom (*p) && !(*p & 0x80)) {
- break;
- }
-
- p++;
-
- while (p < end && (is_atom (*p) || (*p & 0x80))) {
- p++;
- }
-
- if ((p + 1) < end && *p == '.' &&
- (is_atom (*(p + 1)) || *(p + 1) == '/' || (*(p + 1) & 0x80))) {
- p++;
- }
- }
-
- if (*p != '@') {
- p = c;
- }
- else {
- p++;
- }
-
- goto domain;
- }
- else if (is_domain (*p) || (*p & 0x80)) {
-domain:
- while (p < end) {
- if (!is_domain (*p) && !(*p & 0x80)) {
- break;
- }
-
- p++;
-
- while (p < end && (is_domain (*p) || (*p & 0x80))) {
- p++;
- }
-
- if ((p + 1) < end && *p == '.' &&
- (is_domain (*(p + 1)) || *(p + 1) == '/' ||
- (*(p + 1) & 0x80))) {
- p++;
- }
- }
- }
- else {
+ if (rspamd_web_parse (NULL, pos, end - pos, &last, FALSE) != 0) {
return FALSE;
}
- if (p < end) {
- switch (*p) {
- case ':': /* we either have a port or a password */
- p++;
-
- if (is_digit (*p) || passwd) {
- port = (*p++ - '0');
-
- while (p < end && is_digit (*p) && port < 65536) {
- port = (port * 10) + (*p++ - '0');
- }
-
- if (!passwd && (port >= 65536 || *p == '@')) {
- if (p < end && *p == '@') {
- /* this must be a password? */
- goto passwd;
- }
- else if (p < end) {
- return FALSE;
- }
-
- p--;
- }
- }
- else {
-passwd:
- passwd = TRUE;
- c = p;
-
- while (p < end && is_atom (*p)) {
- p++;
- }
-
- if ((p + 2) < end) {
- if (*p == '@') {
- p++;
- if (is_domain (*p)) {
- goto domain;
- }
- }
-
- return FALSE;
- }
- }
-
- if (p >= end || *p != '/') {
- break;
- }
-
- /* we have a '/' so there could be a path - fall through */
- case '/': /* we've detected a path component to our url */
- p++;
- case '?':
- while (p < end && is_urlsafe (*p)) {
- if (*p == open_brace) {
- brace_stack++;
- }
- else if (*p == close_brace) {
- brace_stack--;
- if (brace_stack == -1) {
- break;
- }
- }
- p++;
- }
-
- break;
- default:
- break;
- }
- }
-
- while (p > pos && strchr (",.:;?!-|}])\"", p[-1])) {
- p--;
- }
-
- match->m_len = (p - pos);
+ match->m_len = (last - pos);
return TRUE;
}