From 938ac78ad1099c5c216c2cec7ce6c55310744f1a Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 1 Mar 2019 18:30:33 +0000 Subject: [Rework] Rework telephone urls parsing logic --- src/libserver/html.c | 3 +- src/libserver/url.c | 335 +++++++++++++++++++++++++++++++++++---------------- src/libserver/url.h | 2 +- src/lua/lua_config.c | 1 - 4 files changed, 234 insertions(+), 107 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index 31438ddad..47d8ec836 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1349,7 +1349,8 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, if (rspamd_substring_search (start, len, "://", 3) == -1) { if (len >= sizeof ("mailto:") && (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 || - memcmp (start, "tel:", sizeof ("tel:") - 1) == 0)) { + memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 || + memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) { /* Exclusion, has valid but 'strange' prefix */ } else { diff --git a/src/libserver/url.c b/src/libserver/url.c index b14d1cfa7..6b4a0d2d0 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -103,6 +103,11 @@ static const struct { .name = "tel", .len = 3 }, + { + .proto = PROTOCOL_TELEPHONE, + .name = "callto", + .len = 3 + }, { .proto = PROTOCOL_UNKNOWN, .name = NULL, @@ -193,7 +198,7 @@ struct url_matcher static_matchers[] = { 0, 0}, {"mailto:", "", url_email_start, url_email_end, 0, 0}, - {"callto://", "", url_web_start, url_web_end, + {"callto:", "", url_tel_start, url_tel_end, 0, 0}, {"h323:", "", url_web_start, url_web_end, 0, 0}, @@ -738,6 +743,144 @@ rspamd_mailto_parse (struct http_parser_url *u, return ret; } +static gint +rspamd_telephone_parse (struct http_parser_url *u, + const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) +{ + enum { + parse_protocol, + parse_semicolon, + parse_slash, + parse_slash_slash, + parse_spaces, + parse_plus, + parse_phone_start, + parse_phone, + } st = parse_protocol; + + const gchar *p = str, *c = str, *last = str + len; + gchar t; + gint ret = 1, i; + UChar32 uc; + + if (u != NULL) { + memset (u, 0, sizeof (*u)); + } + + while (p < last) { + t = *p; + + switch (st) { + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U (u, UF_SCHEMA); + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + st = parse_slash_slash; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + if (g_ascii_isspace (t)) { + st = parse_spaces; + p++; + } + else if (t == '+') { + c = p; + st = parse_plus; + } + else if (t == '/') { + /* Skip multiple slashes */ + p++; + } + else { + st = parse_phone_start; + c = p; + } + break; + case parse_spaces: + if (t == '+') { + c = p; + st = parse_plus; + } + else if (!g_ascii_isspace (t)) { + st = parse_phone_start; + c = p; + } + else { + p ++; + } + break; + case parse_plus: + c = p; + p ++; + st = parse_phone_start; + break; + case parse_phone_start: + if (*p == '%' || *p == '(' || g_ascii_isdigit (*p)) { + st = parse_phone; + p ++; + } + else { + goto out; + } + break; + case parse_phone: + i = p - str; + U8_NEXT (str, i, len, uc); + p = str + i; + + if (u_isdigit (uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']' + || u_isspace (uc) || uc == '%') { + p ++; + } + else if (uc <= 0 || is_url_end (uc)) { + ret = 0; + goto set; + } + break; + } + } + + set: + if (st == parse_phone) { + if (p - c != 0) { + SET_U (u, UF_HOST); + ret = 0; + } + } + + out: + if (end != NULL) { + *end = p; + } + + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { + return 0; + } + + return ret; +} + static gint rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, gchar const **end, @@ -1638,6 +1781,40 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen, } } +static void +rspamd_telephone_normalise_inplace (struct rspamd_url *uri) +{ + gchar *t, *h, *end; + gint i = 0, w, orig_len; + UChar32 uc; + + t = uri->host; + h = t; + end = uri->host + uri->hostlen; + orig_len = uri->hostlen; + + if (*h == '+') { + h ++; + t ++; + } + + while (h < end) { + i = 0; + U8_NEXT (h, i, end - h, uc); + + if (u_isdigit (uc)) { + w = 0; + U8_APPEND_UNSAFE (t, w, uc); + t += w; + } + + h += i; + } + + uri->hostlen = t - uri->host; + uri->urllen -= (orig_len - uri->hostlen); +} + enum uri_errno rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, @@ -1659,6 +1836,7 @@ rspamd_url_parse (struct rspamd_url *uri, } p = uristring; + uri->protocol = PROTOCOL_UNKNOWN; if (len > sizeof ("mailto:") - 1) { /* For mailto: urls we also need to add slashes to make it a valid URL */ @@ -1666,75 +1844,11 @@ rspamd_url_parse (struct rspamd_url *uri, ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags, &flags); } - else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0) { - /* Telephone url */ - gint nlen = 0; - gboolean has_plus = FALSE; - end = p + len; - gchar *t, *tend; - UChar32 uc; - - uri->raw = p; - uri->rawlen = len; - uri->string = rspamd_mempool_alloc (pool, len + 1); - t = uri->string; - tend = t + len; - i = 4; - - memcpy (t, "tel:", 4); - t += 4; - p += 4; - nlen = 4; - - if (*p == '+') { - has_plus = TRUE; - *t++ = *p++; - nlen ++; - i ++; - } - - while (t < tend && i < len) { - U8_NEXT (uristring, i, len, uc); - - if (u_isdigit (uc)) { - if (g_ascii_isdigit (uc)) { - *t++ = uc; - nlen ++; - } - else { - /* Obfuscated number */ - uri->flags |= RSPAMD_URL_FLAG_OBSCURED; - } - } - else if (IS_OBSCURED_CHAR (uc)) { - uri->flags |= RSPAMD_URL_FLAG_OBSCURED; - } - } - - *t = '\0'; - - if (rspamd_normalise_unicode_inplace (pool, uri->string, &nlen)) { - uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } - - uri->urllen = nlen; - + else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0 || + g_ascii_strncasecmp (p, "callto:", sizeof ("callto:") - 1) == 0) { + ret = rspamd_telephone_parse (&u, uristring, len, &end, parse_flags, + &flags); uri->protocol = PROTOCOL_TELEPHONE; - uri->protocollen = 4; - - uri->host = uri->string + 4; - uri->hostlen = nlen - 4; - - if (has_plus) { - uri->tld = uri->string + 5; - uri->tldlen = nlen - 5; - } - else { - uri->tld = uri->string + 4; - uri->tldlen = nlen - 4; - } - - return URI_ERRNO_OK; } else { ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, @@ -1867,32 +1981,46 @@ rspamd_url_parse (struct rspamd_url *uri, rspamd_str_lc (uri->string, uri->protocollen); rspamd_str_lc_utf8 (uri->host, uri->hostlen); - uri->protocol = PROTOCOL_UNKNOWN; - - for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) { - if (uri->protocollen == rspamd_url_protocols[i].len) { - if (memcmp (uri->string, - rspamd_url_protocols[i].name, uri->protocollen) == 0) { - uri->protocol = rspamd_url_protocols[i].proto; - break; + if (uri->protocol == PROTOCOL_UNKNOWN) { + for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) { + if (uri->protocollen == rspamd_url_protocols[i].len) { + if (memcmp (uri->string, + rspamd_url_protocols[i].name, uri->protocollen) == 0) { + uri->protocol = rspamd_url_protocols[i].proto; + break; + } } } } - /* Find TLD part */ - rspamd_multipattern_lookup (url_scanner->search_trie, + if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP)) { + /* Find TLD part */ + rspamd_multipattern_lookup (url_scanner->search_trie, uri->host, uri->hostlen, rspamd_tld_trie_callback, uri, NULL); - if (uri->tldlen == 0) { - if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { - /* Ignore URL's without TLD if it is not a numeric URL */ - if (!rspamd_url_is_ip (uri, pool)) { - return URI_ERRNO_TLD_MISSING; + if (uri->tldlen == 0) { + if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { + /* Ignore URL's without TLD if it is not a numeric URL */ + if (!rspamd_url_is_ip (uri, pool)) { + return URI_ERRNO_TLD_MISSING; + } + } else { + /* Assume tld equal to host */ + uri->tld = uri->host; + uri->tldlen = uri->hostlen; } } + } + else if (uri->protocol & PROTOCOL_TELEPHONE) { + /* We need to normalise phone number: remove all spaces and braces */ + rspamd_telephone_normalise_inplace (uri); + + if (uri->host[0] == '+') { + uri->tld = uri->host + 1; + uri->tldlen = uri->hostlen - 1; + } else { - /* Assume tld equal to host */ uri->tld = uri->host; uri->tldlen = uri->hostlen; } @@ -2371,13 +2499,15 @@ url_tel_start (struct url_callback_data *cb, const gchar *pos, url_match_t *match) { - if (!(*pos == '+' || g_ascii_isdigit (*pos))) { - /* Urls cannot start with . */ - return FALSE; - } - match->m_begin = pos; + if (pos >= cb->begin + 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + return TRUE; } @@ -2386,29 +2516,26 @@ url_tel_end (struct url_callback_data *cb, const gchar *pos, url_match_t *match) { - UChar32 uc; - gint len = cb->end - pos, i = 0; + const gchar *last = NULL; + struct http_parser_url u; + gint len = cb->end - pos; + guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ len = MIN (len, match->newline_pos - pos); } - while (i < len) { - U8_NEXT (pos, i, len, uc); - - if (uc < 0) { - break; - } + if (rspamd_telephone_parse (&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } - if (!(u_isdigit (uc) || u_isspace (uc) || - IS_OBSCURED_CHAR (uc) || uc == '+' || - uc == '-' || uc == '.')) { - break; - } + if (!(u.field_set & (1 << UF_HOST))) { + return FALSE; } - match->m_len = i; + match->m_len = (last - pos); return TRUE; } diff --git a/src/libserver/url.h b/src/libserver/url.h index 4d1948921..3deeb8cf5 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -85,7 +85,7 @@ enum rspamd_url_protocol { PROTOCOL_HTTPS = 1u << 3, PROTOCOL_MAILTO = 1u << 4, PROTOCOL_TELEPHONE = 1u << 5, - PROTOCOL_UNKNOWN = -1, + PROTOCOL_UNKNOWN = 1u << 31, }; /** diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c index 884467e10..2ea72f075 100644 --- a/src/lua/lua_config.c +++ b/src/lua/lua_config.c @@ -2619,7 +2619,6 @@ lua_config_newindex (lua_State *L) gint type = SYMBOL_TYPE_NORMAL, priority = 0, idx; gdouble weight = 1.0, score = NAN; const char *type_str, *group = NULL, *description = NULL; - guint flags = 0; no_squeeze = cfg->disable_lua_squeeze; /* -- cgit v1.2.3