diff options
-rw-r--r-- | src/libserver/url.c | 29 | ||||
-rw-r--r-- | test/lua/unit/url.lua | 3 |
2 files changed, 30 insertions, 2 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c
index eb663519d..8a33b4915 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1113,10 +1113,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
 			if (t != '/' && t != '\\') {
 				c = p;
-				st = parse_domain_start;
 				slash = p;
+				st = parse_domain_start;
+
+				/*
+				 * Unfortunately, due to brain damage of the RFC 3986 authors,
+				 * we have to distinguish two possibilities here:
+				 * authority = [ userinfo "@" ] host [ ":" port ]
+				 * So if we have @ somewhere before hostname then we must process
+				 * with the username state. Otherwise, we have to process via
+				 * the hostname state. Unfortunately, there is no way to distinguish
+				 * them aside of running NFA or two DFA or performing lookahead.
+				 * Lookahead approach looks easier to implement.
+				 */
+
+				const char *tp = p;
+				while (tp < last) {
+					if (*tp == '@') {
+						user_seen = TRUE;
+						st = parse_user;
+						break;
+					}
+					else if (*tp == '/' || *tp == '#' || *tp == '?') {
+						st = parse_domain_start;
+					}
+
+					tp ++;
+				}
 
-				if (*p == '[') {
+				if (st == parse_domain_start && *p == '[') {
 					st = parse_ipv6;
 					p++;
 					c = p;
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 97eda91c6..40d684bfc 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -133,6 +133,9 @@ context("URL check functions", function()
     {"http://hehe。example。com#test", true, {
       host = 'hehe.example.com', fragment = 'test'
     }},
+    {"http:////$%^&****((@example.org//#f@f", true, {
+      user = '$%^&****((', host = 'example.org', fragment = 'f@f'
+    }},
   }
 
   -- Some cases from https://code.google.com/p/google-url/source/browse/trunk/src/url_canon_unittest.cc