From 9ec0dbfd35d3fd9bd61ae74e30214089f95305c4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 14 Feb 2019 17:27:45 +0000 Subject: [PATCH] [Minor] Fix loading of unicode multipatterns --- src/libmime/lang_detection.c | 14 ++++++++++++++ src/libserver/cfg_rcl.c | 4 +++- src/libserver/url.c | 6 +++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 72964a93a..9c29212dd 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -459,9 +459,16 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, const char *word = ucl_object_tolstring (w, &wlen); const char *saved; +#ifdef WITH_HYPERSCAN + rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp, + word, wlen, + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8 + |RSPAMD_MULTIPATTERN_RE); +#else rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp, word, wlen, RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); +#endif nelt->stop_words ++; nstop ++; @@ -817,8 +824,15 @@ rspamd_language_detector_init (struct rspamd_config *cfg) /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { ret->trigramms[i] = kh_init (rspamd_trigram_hash); +#ifdef WITH_HYPERSCAN + ret->stop_words[i].mp = rspamd_multipattern_create ( + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| + RSPAMD_MULTIPATTERN_RE); +#else ret->stop_words[i].mp = rspamd_multipattern_create ( RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); +#endif + ret->stop_words[i].ranges = g_array_new (FALSE, FALSE, sizeof (struct rspamd_stop_word_range)); } diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index fa1c07f6e..21a89c06e 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -1314,7 +1314,9 @@ rspamd_rcl_composite_handler (rspamd_mempool_t *pool, } rspamd_config_add_symbol (cfg, composite_name, score, - description, group, FALSE, FALSE, + description, group, + 0, + ucl_object_get_priority (obj) + 1, 1); elt = ucl_object_lookup (obj, "groups"); diff --git a/src/libserver/url.c b/src/libserver/url.c index 421c8a181..0effe4d6b 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -454,7 +454,7 @@ rspamd_url_parse_tld_file (const gchar *fname, m.flags = flags; rspamd_multipattern_add_pattern (url_scanner->search_trie, p, - RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE); + RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); m.pattern = rspamd_multipattern_get_pattern (url_scanner->search_trie, rspamd_multipattern_get_npatterns (url_scanner->search_trie) - 1); m.patlen = strlen (m.pattern); @@ -517,13 +517,13 @@ rspamd_url_init (const gchar *tld_file) url_scanner->matchers = g_array_sized_new (FALSE, TRUE, sizeof (struct url_matcher), 13000); url_scanner->search_trie = rspamd_multipattern_create_sized (13000, - RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE); + RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); } else { url_scanner->matchers = g_array_sized_new (FALSE, TRUE, sizeof (struct url_matcher), 128); url_scanner->search_trie = rspamd_multipattern_create_sized (128, - RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE); + RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); } rspamd_url_add_static_matchers (url_scanner); -- 2.39.5