} url_match_t;
#define URL_FLAG_NOHTML (1 << 0)
-#define URL_FLAG_STRICT_MATCH (1 << 1)
+#define URL_FLAG_TLD_MATCH (1 << 1)
#define URL_FLAG_STAR_MATCH (1 << 2)
struct url_matcher {
continue;
}
- flags = URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH;
+ flags = URL_FLAG_NOHTML | URL_FLAG_TLD_MATCH;
if (linebuf[0] == '*') {
flags |= URL_FLAG_STAR_MATCH;
rspamd_mempool_t *pool)
{
struct http_parser_url u;
- gchar *p, *comp;
+ gchar *p, *comp, t;
const gchar *end;
guint i, complen, ret;
gint state = 0;
}
/* Find TLD part */
- acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
- rspamd_tld_trie_callback, uri, &state, true);
+ if (acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
+ rspamd_tld_trie_callback, uri, &state, true) == 0) {
+ /* Ignore URLs without a TLD unless the host is numeric (contains no letters) */
+ for (i = 0; i < uri->hostlen; i ++) {
+ t = uri->host[i];
+
+ if (g_ascii_isalpha (t)) {
+ return URI_ERRNO_BAD_FORMAT;
+ }
+ }
+ }
if (uri->protocol == PROTOCOL_UNKNOWN) {
return URI_ERRNO_INVALID_PROTOCOL;
/* Do not try to match non-html like urls in html texts */
return 0;
}
+
+ if (matcher->flags & URL_FLAG_TLD_MATCH) {
+ /* Immediately check pos + 1 for valid chars */
+ pos = &cb->begin[textpos + 1];
+ if (pos < cb->end) {
+ if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' && *pos != ':') {
+ return 0;
+ }
+ }
+ }
+
pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
m.pattern = matcher->pattern;