Browse Source

[Rework] Fix various url extraction issues

tags/2.5
Vsevolod Stakhov 4 years ago
parent
commit
a4977e18de
3 changed files with 20 additions and 15 deletions
  1. src/libserver/url.c (+8 -6)
  2. src/libutil/multipattern.c (+9 -8)
  3. test/lua/unit/url.lua (+3 -1)

+ 8
- 6
src/libserver/url.c View File

rspamd_multipattern_add_pattern (url_scanner->search_trie_strict, rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern, static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_RE);
} }
else { else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_strict, rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern, static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
} }
} }


rspamd_multipattern_add_pattern (url_scanner->search_trie_full, rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern, static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_RE);
} }
else { else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_full, rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern, static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
} }
} }
g_array_append_vals (sc->matchers_full, static_matchers, n); g_array_append_vals (sc->matchers_full, static_matchers, n);
sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers)); sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
url_scanner->search_trie_strict = rspamd_multipattern_create_sized ( url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
G_N_ELEMENTS (static_matchers), G_N_ELEMENTS (static_matchers),
RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);


if (tld_file) { if (tld_file) {
/* Reserve larger multipattern */ /* Reserve larger multipattern */
url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE, url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
sizeof (struct url_matcher), 13000); sizeof (struct url_matcher), 13000);
url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000, url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
} }
else { else {
url_scanner->matchers_full = NULL; url_scanner->matchers_full = NULL;
} }
else { else {
cb->url_str = NULL; cb->url_str = NULL;
/* Continue search if no pattern has been found */
return 0;
} }


/* Continue search if required (return 0 means continue) */ /* Continue search if required (return 0 means continue) */

+ 9
- 8
src/libutil/multipattern.c View File

/* /*
* We understand the following cases * We understand the following cases
* 1) blah -> .blah\b * 1) blah -> .blah\b
* 2) *.blah -> ..*\\.blah\b
* 2) *.blah -> ..*\\.blah\b|$
* 3) ??? * 3) ???
*/ */


len = slen + strlen (prefix); len = slen + strlen (prefix);
} }


suffix = "\\b";
suffix = "(:?\\b|$)";
len += strlen (suffix); len += strlen (suffix);


res = g_malloc (len + 1); res = g_malloc (len + 1);
if (rspamd_hs_check ()) { if (rspamd_hs_check ()) {
gchar *np; gchar *np;
gint fl = HS_FLAG_SOM_LEFTMOST; gint fl = HS_FLAG_SOM_LEFTMOST;
gint adjusted_flags = mp->flags | flags;


if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
fl |= HS_FLAG_CASELESS; fl |= HS_FLAG_CASELESS;
} }
if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
if (mp->flags & RSPAMD_MULTIPATTERN_TLD) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
fl |= HS_FLAG_UTF8; fl |= HS_FLAG_UTF8;
} }
else { else {
fl |= HS_FLAG_UTF8 | HS_FLAG_UCP; fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
} }
} }
if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
fl |= HS_FLAG_DOTALL; fl |= HS_FLAG_DOTALL;
} }
if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
fl |= HS_FLAG_SINGLEMATCH; fl |= HS_FLAG_SINGLEMATCH;
fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */ fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
} }
if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) {
if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
fl &= ~HS_FLAG_SOM_LEFTMOST; fl &= ~HS_FLAG_SOM_LEFTMOST;
} }



+ 3
- 1
test/lua/unit/url.lua View File

{"http:/\\www.google.com/foo?bar=baz#", true, { {"http:/\\www.google.com/foo?bar=baz#", true, {
host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com' host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com'
}}, }},
{"http://[www.google.com]/", false},
{"http://[www.google.com]/", true, {
host = 'www.google.com',
}},
{"<test.com", true, { {"<test.com", true, {
host = 'test.com', tld = 'test.com', host = 'test.com', tld = 'test.com',
}}, }},

Loading…
Cancel
Save