Browse Source

[Rework] Fix various url extraction issues

tags/2.5
Vsevolod Stakhov 4 years ago
parent
commit
a4977e18de
3 changed files with 20 additions and 15 deletions
  1. 8
    6
      src/libserver/url.c
  2. 9
    8
      src/libutil/multipattern.c
  3. 3
    1
      test/lua/unit/url.lua

+ 8
- 6
src/libserver/url.c View File

@@ -496,12 +496,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_RE);
}
else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
}

@@ -513,12 +513,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_RE);
}
else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
}
g_array_append_vals (sc->matchers_full, static_matchers, n);
@@ -558,14 +558,14 @@ rspamd_url_init (const gchar *tld_file)
sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
G_N_ELEMENTS (static_matchers),
RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);

if (tld_file) {
/* Reserve larger multipattern */
url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
sizeof (struct url_matcher), 13000);
url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
else {
url_scanner->matchers_full = NULL;
@@ -3173,6 +3173,8 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
}
else {
cb->url_str = NULL;
/* Continue search if no pattern has been found */
return 0;
}

/* Continue search if required (return 0 means continue) */

+ 9
- 8
src/libutil/multipattern.c View File

@@ -103,7 +103,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
/*
* We understand the following cases
* 1) blah -> .blah\b
* 2) *.blah -> ..*\\.blah\b
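* 2) *.blah -> ..*\\.blah(:?\b|$)
*    (the appended suffix is a single group: word boundary OR end of input;
*    NOTE(review): "(:?" looks like a typo for the non-capturing "(?:" - confirm)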
* 3) ???
*/

@@ -127,7 +127,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
len = slen + strlen (prefix);
}

suffix = "\\b";
suffix = "(:?\\b|$)";
len += strlen (suffix);

res = g_malloc (len + 1);
@@ -329,26 +329,27 @@ rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
if (rspamd_hs_check ()) {
gchar *np;
gint fl = HS_FLAG_SOM_LEFTMOST;
gint adjusted_flags = mp->flags | flags;

if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
fl |= HS_FLAG_CASELESS;
}
if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
if (mp->flags & RSPAMD_MULTIPATTERN_TLD) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
fl |= HS_FLAG_UTF8;
}
else {
fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
}
}
if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
fl |= HS_FLAG_DOTALL;
}
if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
fl |= HS_FLAG_SINGLEMATCH;
fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
}
if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
fl &= ~HS_FLAG_SOM_LEFTMOST;
}


+ 3
- 1
test/lua/unit/url.lua View File

@@ -83,7 +83,9 @@ context("URL check functions", function()
{"http:/\\www.google.com/foo?bar=baz#", true, {
host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com'
}},
{"http://[www.google.com]/", false},
{"http://[www.google.com]/", true, {
host = 'www.google.com',
}},
{"<test.com", true, {
host = 'test.com', tld = 'test.com',
}},

Loading…
Cancel
Save