diff options
Diffstat (limited to 'src/libserver/url.c')
-rw-r--r-- | src/libserver/url.c | 771 |
1 files changed, 390 insertions, 381 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c index 0ec256902..ea46c0353 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -915,6 +915,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, parse_multiple_at, parse_password_start, parse_password, + parse_domain_start, parse_domain, parse_ipv6, parse_port_password, @@ -933,465 +934,473 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, t = *p; switch (st) { - case parse_protocol: - if (t == ':') { - st = parse_semicolon; - SET_U (u, UF_SCHEMA); + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U (u, UF_SCHEMA); + } + else if (!g_ascii_isalnum (t) && t != '+' && t != '-') { + if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { + /* We might have some domain, but no protocol */ + st = parse_domain_start; + p = c; + slash = c; + break; } - else if (!g_ascii_isalnum (t) && t != '+' && t != '-') { - if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { - /* We might have some domain, but no protocol */ - st = parse_domain; - p = c; - slash = c; - break; - } - else { - goto out; - } + else { + goto out; } + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; p++; - break; - case parse_semicolon: - if (t == '/' || t == '\\') { - st = parse_slash; + } + else { + st = parse_slash_slash; + *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + + if (t != '/' && t != '\\') { + c = p; + st = parse_domain_start; + slash = p; + + if (*p == '[') { + st = parse_ipv6; p++; + c = p; } - else { - st = parse_slash_slash; - *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; - } - break; - case parse_slash: - if (t == '/' || t == '\\') { - st = parse_slash_slash; - } - else { + } + else { + /* Skip multiple slashes */ + p++; + } + break; + case parse_ipv6: + if (t == ']') { + if (p - c == 0) { goto out; } + SET_U (u, UF_HOST); p++; - break; - case parse_slash_slash: - - if (t != '/' && t != '\\') { - c = p; - st = parse_domain; - slash = p; - if (*p == '[') { - st = parse_ipv6; - p++; - c = p; - } + if (*p == ':') { + st = parse_port; + c = p + 1; } - else { - /* Skip multiple slashes */ - p++; + else if (*p == '/') { + st = parse_path; + c = p + 1; } - break; - case parse_ipv6: - if (t == ']') { - if (p - c == 0) { - goto out; - } - SET_U (u, UF_HOST); - p++; - - if (*p == ':') { - st = parse_port; - c = p + 1; - } - else if (*p == '/') { - st = parse_path; - c = p + 1; - } - else if (p != last) { - goto out; - } + else if (p != last) { + goto out; } - else if (!g_ascii_isxdigit (t) && t != ':' && t != '.') { + } + else if (!g_ascii_isxdigit (t) && t != ':' && t != '.') { + goto out; + } + p++; + break; + case parse_user: + if (t == ':') { + if (p - c == 0) { goto out; } - p++; - break; - case parse_user: - if (t == ':') { - if (p - c == 0) { - goto out; - } - user_start = c; - st = parse_password_start; + user_start = c; + st = parse_password_start; + } + else if (t == '@') { + /* No password */ + if (p - c == 0) { + /* We have multiple at in fact */ + st = parse_multiple_at; + user_seen = TRUE; + *flags |= RSPAMD_URL_FLAG_OBSCURED; + + continue; + } + + SET_U (u, UF_USERINFO); + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else if (!g_ascii_isgraph (t)) { + goto out; + } + p++; + break; + case parse_multiple_at: + if (t != '@') { + if (p - c == 0) { + goto out; } - else if (t == '@') { - /* No password */ - if (p - c == 0) { - /* We have multiple at in fact */ - st = parse_multiple_at; - user_seen = TRUE; - *flags |= RSPAMD_URL_FLAG_OBSCURED; - continue; + /* For now, we ignore all that stuff as it is bogus */ + SET_U (u, UF_USERINFO); + st = parse_at; + } + else { + p ++; + } + break; + case parse_password_start: + if (t == '@') { + /* Empty password */ + SET_U (u, UF_USERINFO); + if (u != NULL && u->field_data[UF_USERINFO].len > 0) { + /* Eat semicolon */ + u->field_data[UF_USERINFO].len--; + } + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else { + c = p; + password_start = p; + st = parse_password; + } + p++; + break; + case parse_password: + if (t == '@') { + /* XXX: password is not stored */ + if (u != NULL) { + if (u->field_data[UF_USERINFO].len == 0 + && password_start + && user_start && password_start > user_start + 1) { + *flags |= RSPAMD_URL_FLAG_HAS_USER; + u->field_set |= 1u << (UF_USERINFO); + u->field_data[UF_USERINFO].len = + password_start - user_start - 1; + u->field_data[UF_USERINFO].off = + user_start - str; } - SET_U (u, UF_USERINFO); - *flags |= RSPAMD_URL_FLAG_HAS_USER; - st = parse_at; } - else if (!g_ascii_isgraph (t)) { + st = parse_at; + } + else if (!g_ascii_isgraph (t)) { + goto out; + } + p++; + break; + case parse_at: + c = p; + + if (t == '@') { + *flags |= RSPAMD_URL_FLAG_OBSCURED; + p ++; + } + else if (t == '[') { + st = parse_ipv6; + p++; + c = p; + } + else { + st = parse_domain_start; + } + break; + case parse_domain_start: + if (g_ascii_isalnum (t) || t & 0x80) { + st = parse_domain; + } + else { + goto out; + } + break; + case parse_domain: + if (t == '/' || t == ':' || t == '?' || t == '#') { + if (p - c == 0) { goto out; } - p++; - break; - case parse_multiple_at: - if (t != '@') { - if (p - c == 0) { - goto out; - } - - /* For now, we ignore all that stuff as it is bogus */ - SET_U (u, UF_USERINFO); - st = parse_at; + if (t == '/') { + SET_U (u, UF_HOST); + st = parse_suffix_slash; } - else { - p ++; + else if (t == '?') { + SET_U (u, UF_HOST); + st = parse_query; + c = p + 1; } - break; - case parse_password_start: - if (t == '@') { - /* Empty password */ - SET_U (u, UF_USERINFO); - if (u != NULL && u->field_data[UF_USERINFO].len > 0) { - /* Eat semicolon */ - u->field_data[UF_USERINFO].len--; - } - *flags |= RSPAMD_URL_FLAG_HAS_USER; - st = parse_at; + else if (t == '#') { + SET_U (u, UF_HOST); + st = parse_part; + c = p + 1; + } + else if (!user_seen) { + /* + * Here we can have both port and password, hence we need + * to apply some heuristic here + */ + st = parse_port_password; } else { - c = p; - password_start = p; - st = parse_password; + /* + * We can go only for parsing port here + */ + SET_U (u, UF_HOST); + st = parse_port; + c = p + 1; } p++; - break; - case parse_password: - if (t == '@') { - /* XXX: password is not stored */ - if (u != NULL) { - if (u->field_data[UF_USERINFO].len == 0 - && password_start - && user_start && password_start > user_start + 1) { - *flags |= RSPAMD_URL_FLAG_HAS_USER; - u->field_set |= 1u << (UF_USERINFO); - u->field_data[UF_USERINFO].len = - password_start - user_start - 1; - u->field_data[UF_USERINFO].off = - user_start - str; - } - - } - st = parse_at; + } + else { + if (is_url_end (t)) { + goto set; } - else if (!g_ascii_isgraph (t)) { - goto out; + else if (*p == '@' && !user_seen) { + /* We need to fallback and test user */ + p = slash; + user_seen = TRUE; + st = parse_user; } - p++; - break; - case parse_at: - c = p; + else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { + if (*p & 0x80) { + (*flags) |= RSPAMD_URL_FLAG_IDN; + guint i = 0; - if (t == '@') { - *flags |= RSPAMD_URL_FLAG_OBSCURED; - p ++; - } - else if (t == '[') { - st = parse_ipv6; - p++; - c = p; - } - else { - st = parse_domain; - } - break; - case parse_domain: - if (t == '/' || t == ':' || t == '?' || t == '#') { - if (p - c == 0) { - goto out; - } - if (t == '/') { - SET_U (u, UF_HOST); - st = parse_suffix_slash; - } - else if (t == '?') { - SET_U (u, UF_HOST); - st = parse_query; - c = p + 1; - } - else if (t == '#') { - SET_U (u, UF_HOST); - st = parse_part; - c = p + 1; - } - else if (!user_seen) { - /* - * Here we can have both port and password, hence we need - * to apply some heuristic here - */ - st = parse_port_password; - } - else { - /* - * We can go only for parsing port here - */ - SET_U (u, UF_HOST); - st = parse_port; - c = p + 1; - } - p++; - } - else { - if (is_url_end (t)) { - goto set; - } - else if (*p == '@' && !user_seen) { - /* We need to fallback and test user */ - p = slash; - user_seen = TRUE; - st = parse_user; - } - else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { - if (*p & 0x80) { - (*flags) |= RSPAMD_URL_FLAG_IDN; - guint i = 0; + U8_NEXT (p, i, last - p, uc); - U8_NEXT (p, i, last - p, uc); + if (uc < 0) { + /* Bad utf8 */ + goto out; + } - if (uc < 0) { - /* Bad utf8 */ - goto out; + if (!u_isalnum (uc)) { + /* Bad symbol */ + if (IS_ZERO_WIDTH_SPACE (uc)) { + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; } - - if (!u_isalnum (uc)) { - /* Bad symbol */ - if (IS_ZERO_WIDTH_SPACE (uc)) { - (*flags) |= RSPAMD_URL_FLAG_OBSCURED; - } - else { - if (!u_isgraph (uc)) { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { - goto out; - } - else { - goto set; - } + else { + if (!u_isgraph (uc)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; } } } - - p = p + i; } - else if (is_urlsafe (*p)) { + + p = p + i; + } + else if (is_urlsafe (*p)) { + p ++; + } + else { + if (parse_flags & RSPAMD_URL_PARSE_HREF) { + /* We have to use all shit we are given here */ p ++; + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; } else { - if (parse_flags & RSPAMD_URL_PARSE_HREF) { - /* We have to use all shit we are given here */ - p ++; - (*flags) |= RSPAMD_URL_FLAG_OBSCURED; + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; } else { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { - goto out; - } - else { - goto set; - } + goto set; } } } - else { - p++; - } } - break; - case parse_port_password: - if (g_ascii_isdigit (t)) { - const gchar *tmp = p; - - while (tmp < last) { - if (!g_ascii_isdigit (*tmp)) { - if (*tmp == '/' || *tmp == '#' || *tmp == '?' || - is_url_end (*tmp) || g_ascii_isspace (*tmp)) { - /* Port + something */ - st = parse_port; - c = slash; - p--; - SET_U (u, UF_HOST); - p++; - c = p; - break; - } - else { - /* Not a port, bad character at the end */ - break; - } + else { + p++; + } + } + break; + case parse_port_password: + if (g_ascii_isdigit (t)) { + const gchar *tmp = p; + + while (tmp < last) { + if (!g_ascii_isdigit (*tmp)) { + if (*tmp == '/' || *tmp == '#' || *tmp == '?' || + is_url_end (*tmp) || g_ascii_isspace (*tmp)) { + /* Port + something */ + st = parse_port; + c = slash; + p--; + SET_U (u, UF_HOST); + p++; + c = p; + break; + } + else { + /* Not a port, bad character at the end */ + break; } - tmp ++; - } - - if (tmp == last) { - /* Host + port only */ - st = parse_port; - c = slash; - p--; - SET_U (u, UF_HOST); - p++; - c = p; } + tmp ++; + } - if (st != parse_port) { - /* Fallback to user:password */ - p = slash; - c = slash; - user_seen = TRUE; - st = parse_user; - } + if (tmp == last) { + /* Host + port only */ + st = parse_port; + c = slash; + p--; + SET_U (u, UF_HOST); + p++; + c = p; } - else { - /* Rewind back */ + + if (st != parse_port) { + /* Fallback to user:password */ p = slash; c = slash; user_seen = TRUE; st = parse_user; } - break; - case parse_port: - if (t == '/') { - pt = strtoul (c, NULL, 10); - if (pt == 0 || pt > 65535) { - goto out; - } - if (u != NULL) { - u->port = pt; - *flags |= RSPAMD_URL_FLAG_HAS_PORT; - } - st = parse_suffix_slash; + } + else { + /* Rewind back */ + p = slash; + c = slash; + user_seen = TRUE; + st = parse_user; + } + break; + case parse_port: + if (t == '/') { + pt = strtoul (c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + st = parse_suffix_slash; + } + else if (t == '?') { + pt = strtoul (c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; } - else if (t == '?') { - pt = strtoul (c, NULL, 10); - if (pt == 0 || pt > 65535) { - goto out; - } - if (u != NULL) { - u->port = pt; - *flags |= RSPAMD_URL_FLAG_HAS_PORT; - } - c = p + 1; - st = parse_query; + c = p + 1; + st = parse_query; + } + else if (t == '#') { + pt = strtoul (c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; } - else if (t == '#') { - pt = strtoul (c, NULL, 10); - if (pt == 0 || pt > 65535) { - goto out; - } - if (u != NULL) { - u->port = pt; - *flags |= RSPAMD_URL_FLAG_HAS_PORT; - } - c = p + 1; - st = parse_part; + c = p + 1; + st = parse_part; + } + else if (is_url_end (t)) { + goto set; + } + else if (!g_ascii_isdigit (t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || + !g_ascii_isspace (t)) { + goto out; } - else if (is_url_end (t)) { + else { goto set; } - else if (!g_ascii_isdigit (t)) { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || - !g_ascii_isspace (t)) { - goto out; - } - else { - goto set; - } - } + } + p++; + break; + case parse_suffix_slash: + if (t != '/') { + c = p; + st = parse_path; + } + else { + /* Skip extra slashes */ p++; - break; - case parse_suffix_slash: - if (t != '/') { - c = p; - st = parse_path; - } - else { - /* Skip extra slashes */ - p++; + } + break; + case parse_path: + if (t == '?') { + if (p - c != 0) { + SET_U (u, UF_PATH); } - break; - case parse_path: - if (t == '?') { - if (p - c != 0) { - SET_U (u, UF_PATH); + c = p + 1; + st = parse_query; + } + else if (is_url_end (t)) { + goto set; + } + else if (is_lwsp (t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace (t)) { + goto set; } - c = p + 1; - st = parse_query; + goto out; } - else if (is_url_end (t)) { + else { goto set; } - else if (is_lwsp (t)) { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { - if (g_ascii_isspace (t)) { - goto set; - } - goto out; - } - else { - goto set; - } + } + p++; + break; + case parse_query: + if (t == '#') { + if (p - c != 0) { + SET_U (u, UF_QUERY); } - p++; - break; - case parse_query: - if (t == '#') { - if (p - c != 0) { - SET_U (u, UF_QUERY); + c = p + 1; + st = parse_part; + } + else if (is_url_end (t)) { + goto set; + } + else if (is_lwsp (t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace (t)) { + goto set; } - c = p + 1; - st = parse_part; + goto out; } - else if (is_url_end (t)) { + else { goto set; } - else if (is_lwsp (t)) { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { - if (g_ascii_isspace (t)) { - goto set; - } - goto out; - } - else { + } + p++; + break; + case parse_part: + if (is_url_end (t)) { + goto set; + } + else if (is_lwsp (t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace (t)) { goto set; } + goto out; } - p++; - break; - case parse_part: - if (is_url_end (t)) { + else { goto set; } - else if (is_lwsp (t)) { - if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { - if (g_ascii_isspace (t)) { - goto set; - } - goto out; - } - else { - goto set; - } - } - p++; - break; + } + p++; + break; } } |