source.dussan.org Git - rspamd.git/commitdiff
[Rework] Fix various url extraction issues
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 19 Mar 2020 18:33:16 +0000 (18:33 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 19 Mar 2020 18:33:16 +0000 (18:33 +0000)
src/libserver/url.c
src/libutil/multipattern.c
test/lua/unit/url.lua

index ff8c30819c73fa06c4c6f67f16e49b9e4848c7dc..2e099140638b3b3dd7f974e93649f51ca92b7994 100644 (file)
@@ -496,12 +496,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
                        rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
                                        static_matchers[i].pattern,
                                        RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
-                                                       RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+                                                       RSPAMD_MULTIPATTERN_RE);
                }
                else {
                        rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
                                        static_matchers[i].pattern,
-                                       RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+                                       RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
                }
        }
 
@@ -513,12 +513,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
                                rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
                                                static_matchers[i].pattern,
                                                RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
-                                               RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+                                               RSPAMD_MULTIPATTERN_RE);
                        }
                        else {
                                rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
                                                static_matchers[i].pattern,
-                                               RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+                                               RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
                        }
                }
                g_array_append_vals (sc->matchers_full, static_matchers, n);
@@ -558,14 +558,14 @@ rspamd_url_init (const gchar *tld_file)
                        sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
        url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
                        G_N_ELEMENTS (static_matchers),
-                       RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+                       RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 
        if (tld_file) {
                /* Reserve larger multipattern */
                url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
                                sizeof (struct url_matcher), 13000);
                url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
-                               RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+                               RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
        }
        else {
                url_scanner->matchers_full = NULL;
@@ -3173,6 +3173,8 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
        }
        else {
                cb->url_str = NULL;
+               /* Continue search if no pattern has been found */
+               return 0;
        }
 
        /* Continue search if required (return 0 means continue) */
index f1295a9e43a5f9bd2cd347687ad1b258935ad317..547762d26160b222dbd43fc5f38fe1294b60b5b3 100644 (file)
@@ -103,7 +103,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
        /*
         * We understand the following cases
         * 1) blah -> .blah\b
-        * 2) *.blah -> ..*\\.blah\b
+        * 2) *.blah -> ..*\\.blah\b|$
         * 3) ???
         */
 
@@ -127,7 +127,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
                len = slen + strlen (prefix);
        }
 
-       suffix = "\\b";
+       suffix = "(:?\\b|$)";
        len += strlen (suffix);
 
        res = g_malloc (len + 1);
@@ -329,26 +329,27 @@ rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
        if (rspamd_hs_check ()) {
                gchar *np;
                gint fl = HS_FLAG_SOM_LEFTMOST;
+               gint adjusted_flags = mp->flags | flags;
 
-               if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
+               if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
                        fl |= HS_FLAG_CASELESS;
                }
-               if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
-                       if (mp->flags & RSPAMD_MULTIPATTERN_TLD) {
+               if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
+                       if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
                                fl |= HS_FLAG_UTF8;
                        }
                        else {
                                fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
                        }
                }
-               if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) {
+               if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
                        fl |= HS_FLAG_DOTALL;
                }
-               if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
+               if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
                        fl |= HS_FLAG_SINGLEMATCH;
                        fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
                }
-               if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) {
+               if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
                        fl &= ~HS_FLAG_SOM_LEFTMOST;
                }
 
index 24c354960c822d44da8774cd8b7163594785a972..9647db79b7af6c63f13b3414f714658f68de99f3 100644 (file)
@@ -83,7 +83,9 @@ context("URL check functions", function()
     {"http:/\\www.google.com/foo?bar=baz#", true, {
       host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com'
     }},
-    {"http://[www.google.com]/", false},
+    {"http://[www.google.com]/", true, {
+      host = 'www.google.com',
+    }},
     {"<test.com", true, {
       host = 'test.com', tld = 'test.com',
     }},