IS_URLSAFE)) != 0)
void
-rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
+rspamd_unescape_uri (gchar *dst, const gchar *src, gsize size)
{
- gchar *d, *s, ch, c, decoded;
+ gchar *d, ch, c, decoded;
+ const gchar *s;
enum {
sw_usual = 0,
sw_quoted,
sw_quoted_second
} state;
- d = *dst;
- s = *src;
+ d = dst;
+ s = src;
state = 0;
decoded = 0;
switch (state) {
case sw_usual:
- if (ch == '?') {
- *d++ = ch;
- goto done;
- }
if (ch == '%') {
state = sw_quoted;
if (c >= 'a' && c <= 'f') {
ch = ((decoded << 4) + c - 'a' + 10);
- if (ch == '?') {
- *d++ = ch;
- goto done;
- }
-
*d++ = ch;
break;
}
}
}
- done:
-
- *dst = d;
- *src = s;
+ *d = '\0';
}
const gchar *
rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
gchar const **end, gboolean strict)
{
- const gchar *p = str, *c = str, *last = str + len;
+ const gchar *p = str, *c = str, *last = str + len, *slash = NULL;
gchar t;
gunichar uc;
glong pt;
gint ret = 1;
+ gboolean user_seen = FALSE;
enum {
parse_protocol,
parse_slash,
parse_password_start,
parse_password,
parse_domain,
+ parse_port_password,
parse_port,
parse_suffix_slash,
parse_path,
break;
case parse_slash_slash:
c = p;
-
- /* XXX: inefficient lookahead */
- if (strchr (p, '@') != NULL) {
- st = parse_user;
- }
- else {
- st = parse_domain;
- }
-
+ st = parse_domain;
+ slash = p;
break;
case parse_user:
if (t == ':') {
if (p - c == 0) {
goto out;
}
- SET_U (u, UF_HOST);
-
if (t == '/') {
+ SET_U (u, UF_HOST);
st = parse_suffix_slash;
}
- else {
- st = parse_port;
+ else if (!user_seen) {
+ /*
+ * After ':' we can have either a port or a password, hence we
+ * need to apply a heuristic to tell them apart
+ */
+ st = parse_port_password;
c = p + 1;
}
p ++;
}
}
break;
+ case parse_port_password:
+ if (g_ascii_isdigit (t)) {
+ /* XXX: this breaks URLs whose password starts with a digit */
+ st = parse_port;
+ c = slash;
+ SET_U (u, UF_HOST);
+ c = p;
+ }
+ else {
+ /* Rewind back */
+ p = slash;
+ c = slash;
+ user_seen = TRUE;
+ st = parse_user;
+ }
+ break;
case parse_port:
if (t == '/') {
pt = strtoul (c, NULL, 10);
if (pt == 0 || pt > 65535) {
goto out;
}
- u->port = pt;
+ if (u != NULL) {
+ u->port = pt;
+ }
st = parse_suffix_slash;
}
else if (!g_ascii_isdigit (t)) {
}
/* Now decode url symbols */
- uri->string = rspamd_mempool_strdup (pool, p);
-
- if (uri->datalen > 0) {
- rspamd_unescape_uri (&uri->data, &uri->data, uri->datalen);
- }
- if (uri->querylen > 0) {
- rspamd_unescape_uri (&uri->query, &uri->query, uri->querylen);
- }
+ uri->string = p;
+ rspamd_unescape_uri (uri->string, uri->string, len);
rspamd_str_lc (uri->string, uri->protocollen);
rspamd_str_lc (uri->host, uri->hostlen);