From a4977e18de67905b2863514feeff7a77025d4087 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 19 Mar 2020 18:33:16 +0000 Subject: [PATCH] [Rework] Fix various url extraction issues --- src/libserver/url.c | 14 ++++++++------ src/libutil/multipattern.c | 17 +++++++++-------- test/lua/unit/url.lua | 4 +++- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/libserver/url.c b/src/libserver/url.c index ff8c30819..2e0991406 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -496,12 +496,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc) rspamd_multipattern_add_pattern (url_scanner->search_trie_strict, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| - RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD); + RSPAMD_MULTIPATTERN_RE); } else { rspamd_multipattern_add_pattern (url_scanner->search_trie_strict, static_matchers[i].pattern, - RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD); + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); } } @@ -513,12 +513,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc) rspamd_multipattern_add_pattern (url_scanner->search_trie_full, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| - RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD); + RSPAMD_MULTIPATTERN_RE); } else { rspamd_multipattern_add_pattern (url_scanner->search_trie_full, static_matchers[i].pattern, - RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD); + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); } } g_array_append_vals (sc->matchers_full, static_matchers, n); @@ -558,14 +558,14 @@ rspamd_url_init (const gchar *tld_file) sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers)); url_scanner->search_trie_strict = rspamd_multipattern_create_sized ( G_N_ELEMENTS (static_matchers), - RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); if (tld_file) { /* Reserve larger multipattern */ url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE, sizeof (struct url_matcher), 13000); url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000, - RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); } else { url_scanner->matchers_full = NULL; @@ -3173,6 +3173,8 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, } else { cb->url_str = NULL; + /* Continue search if no pattern has been found */ + return 0; } /* Continue search if required (return 0 means continue) */ diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c index f1295a9e4..547762d26 100644 --- a/src/libutil/multipattern.c +++ b/src/libutil/multipattern.c @@ -103,7 +103,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen, /* * We understand the following cases * 1) blah -> .blah\b - * 2) *.blah -> ..*\\.blah\b + * 2) *.blah -> ..*\\.blah\b|$ * 3) ??? */ @@ -127,7 +127,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen, len = slen + strlen (prefix); } - suffix = "\\b"; + suffix = "(:?\\b|$)"; len += strlen (suffix); res = g_malloc (len + 1); @@ -329,26 +329,27 @@ rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp, if (rspamd_hs_check ()) { gchar *np; gint fl = HS_FLAG_SOM_LEFTMOST; + gint adjusted_flags = mp->flags | flags; - if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) { fl |= HS_FLAG_CASELESS; } - if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) { - if (mp->flags & RSPAMD_MULTIPATTERN_TLD) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) { fl |= HS_FLAG_UTF8; } else { fl |= HS_FLAG_UTF8 | HS_FLAG_UCP; } } - if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) { fl |= HS_FLAG_DOTALL; } - if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) { fl |= HS_FLAG_SINGLEMATCH; fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */ } - if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) { fl &= ~HS_FLAG_SOM_LEFTMOST; } diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua index 24c354960..9647db79b 100644 --- a/test/lua/unit/url.lua +++ b/test/lua/unit/url.lua @@ -83,7 +83,9 @@ context("URL check functions", function() {"http:/\\www.google.com/foo?bar=baz#", true, { host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com' }}, - {"http://[www.google.com]/", false}, + {"http://[www.google.com]/", true, { + host = 'www.google.com', + }}, {"