aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/url.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/url.c')
-rw-r--r--src/libserver/url.c108
1 files changed, 68 insertions, 40 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 918f7eee3..824c0dff8 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -46,6 +46,8 @@
#include "message.h"
#include "multipattern.h"
#include "contrib/uthash/utlist.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
typedef struct url_match_s {
const gchar *m_begin;
@@ -153,21 +155,17 @@ struct url_matcher static_matchers[] = {
/* Common prefixes */
{"file://", "", url_file_start, url_file_end,
0, 0},
- {"file:\\\\", "", url_file_start, url_file_end,
+ {"file:\\\\", "", url_file_start, url_file_end,
0, 0},
{"ftp://", "", url_web_start, url_web_end,
0, 0},
- {"ftp:\\\\", "", url_web_start, url_web_end,
+ {"ftp:\\\\", "", url_web_start, url_web_end,
0, 0},
{"sftp://", "", url_web_start, url_web_end,
0, 0},
- {"http://", "", url_web_start, url_web_end,
+ {"http:", "", url_web_start, url_web_end,
0, 0},
- {"http:\\\\", "", url_web_start, url_web_end,
- 0, 0},
- {"https://", "", url_web_start, url_web_end,
- 0, 0},
- {"https:\\\\", "", url_web_start, url_web_end,
+ {"https:", "", url_web_start, url_web_end,
0, 0},
{"news://", "", url_web_start, url_web_end,
0, 0},
@@ -550,7 +548,7 @@ is_url_end (gchar c)
static gint
rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict)
+ gchar const **end, gboolean strict, guint *flags)
{
const gchar *p = str, *c = str, *last = str + len;
gchar t;
@@ -591,6 +589,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
p++;
}
else {
+ *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES;
st = parse_slash_slash;
}
break;
@@ -704,11 +703,11 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
static gint
rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict, gboolean *obscured)
+ gchar const **end, gboolean strict, guint *flags)
{
const gchar *p = str, *c = str, *last = str + len, *slash = NULL;
gchar t;
- gunichar uc;
+ UChar32 uc;
glong pt;
gint ret = 1;
gboolean user_seen = FALSE;
@@ -766,6 +765,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
}
else {
st = parse_slash_slash;
+ *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES;
}
break;
case parse_slash:
@@ -833,15 +833,13 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
/* We have multiple at in fact */
st = parse_multiple_at;
user_seen = TRUE;
-
- if (obscured) {
- *obscured = TRUE;
- }
+ *flags |= RSPAMD_URL_FLAG_OBSCURED;
continue;
}
SET_U (u, UF_USERINFO);
+ *flags |= RSPAMD_URL_FLAG_HAS_USER;
st = parse_at;
}
else if (!g_ascii_isgraph (t)) {
@@ -867,6 +865,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
if (t == '@') {
/* Empty password */
SET_U (u, UF_USERINFO);
+ *flags |= RSPAMD_URL_FLAG_HAS_USER;
st = parse_at;
}
else {
@@ -878,6 +877,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
case parse_password:
if (t == '@') {
/* XXX: password is not stored */
+ *flags |= RSPAMD_URL_FLAG_HAS_USER;
SET_U (u, UF_USERINFO);
st = parse_at;
}
@@ -942,24 +942,32 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
st = parse_user;
}
else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
- uc = g_utf8_get_char_validated (p, last - p);
+ if (*p & 0x80) {
+ *flags |= RSPAMD_URL_FLAG_IDN;
+ guint i = 0;
- if (uc == (gunichar) -1) {
- /* Bad utf8 */
- goto out;
- }
+ U8_NEXT (p, i, last - p, uc);
- if (!g_unichar_isalnum (uc)) {
- /* Bad symbol */
- if (strict) {
+ if (uc < 0) {
+ /* Bad utf8 */
goto out;
}
- else {
- goto set;
+
+ if (!u_isalnum (uc)) {
+ /* Bad symbol */
+ if (strict) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
}
- }
- p = g_utf8_next_char (p);
+ p = p + i;
+ }
+ else {
+ p ++;
+ }
}
else {
p++;
@@ -992,6 +1000,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
}
if (u != NULL) {
u->port = pt;
+ *flags |= RSPAMD_URL_FLAG_HAS_PORT;
}
st = parse_suffix_slash;
}
@@ -1002,6 +1011,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
}
if (u != NULL) {
u->port = pt;
+ *flags |= RSPAMD_URL_FLAG_HAS_PORT;
}
c = p + 1;
@@ -1014,6 +1024,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
}
if (u != NULL) {
u->port = pt;
+ *flags |= RSPAMD_URL_FLAG_HAS_PORT;
}
c = p + 1;
@@ -1514,9 +1525,8 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
struct http_parser_url u;
gchar *p, *comp;
const gchar *end;
- guint i, complen, ret;
+ guint i, complen, ret, flags = 0;
gsize unquoted_len = 0;
- gboolean obscured = FALSE;
memset (uri, 0, sizeof (*uri));
memset (&u, 0, sizeof (u));
@@ -1531,14 +1541,14 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
if (len > sizeof ("mailto:") - 1) {
/* For mailto: urls we also need to add slashes to make it a valid URL */
if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
- ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE);
+ ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags);
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured);
+ ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
}
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured);
+ ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
}
if (ret != 0) {
@@ -1551,8 +1561,27 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
uri->raw = p;
uri->rawlen = len;
- uri->string = rspamd_mempool_alloc (pool, len + 1);
- rspamd_strlcpy (uri->string, p, len + 1);
+
+ if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) {
+ len += 2;
+ uri->string = rspamd_mempool_alloc (pool, len + 1);
+ memcpy (uri->string, p, u.field_data[UF_SCHEMA].len);
+ memcpy (uri->string + u.field_data[UF_SCHEMA].len, "://", 3);
+ rspamd_strlcpy (uri->string + u.field_data[UF_SCHEMA].len + 3,
+ p + u.field_data[UF_SCHEMA].len + 1,
+ len - 1 - u.field_data[UF_SCHEMA].len);
+ /* Compensate slashes added */
+ for (i = UF_SCHEMA + 1; i < UF_MAX; i++) {
+ if (u.field_set & (1 << i)) {
+ u.field_data[i].off += 2;
+ }
+ }
+ }
+ else {
+ uri->string = rspamd_mempool_alloc (pool, len + 1);
+ rspamd_strlcpy (uri->string, p, len + 1);
+ }
+
uri->urllen = len;
for (i = 0; i < UF_MAX; i++) {
@@ -1591,15 +1620,12 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
}
uri->port = u.port;
+ uri->flags = flags;
if (!uri->hostlen) {
return URI_ERRNO_HOST_MISSING;
}
- if (obscured) {
- uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
/* Now decode url symbols */
unquoted_len = rspamd_url_decode (uri->string,
uri->string,
@@ -1961,13 +1987,14 @@ url_web_end (struct url_callback_data *cb,
{
const gchar *last = NULL;
gint len = cb->end - pos;
+ guint flags = 0;
if (match->newline_pos && match->st != '<') {
/* We should also limit our match end to the newline */
len = MIN (len, match->newline_pos - pos);
}
- if (rspamd_web_parse (NULL, pos, len, &last, FALSE, NULL) != 0) {
+ if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) {
return FALSE;
}
@@ -2026,6 +2053,7 @@ url_email_end (struct url_callback_data *cb,
const gchar *last = NULL;
struct http_parser_url u;
gint len = cb->end - pos;
+ guint flags = 0;
if (match->newline_pos && match->st != '<') {
/* We should also limit our match end to the newline */
@@ -2034,7 +2062,7 @@ url_email_end (struct url_callback_data *cb,
if (!match->prefix || match->prefix[0] == '\0') {
/* We have mailto:// at the beginning */
- if (rspamd_mailto_parse (&u, pos, len, &last, FALSE) != 0) {
+ if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) {
return FALSE;
}