diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/lang_detection.c | 1116 | ||||
-rw-r--r-- | src/libmime/lang_detection.h | 25 | ||||
-rw-r--r-- | src/libmime/message.c | 354 | ||||
-rw-r--r-- | src/libmime/message.h | 23 | ||||
-rw-r--r-- | src/libmime/mime_encoding.c | 50 | ||||
-rw-r--r-- | src/libmime/mime_encoding.h | 7 | ||||
-rw-r--r-- | src/libserver/re_cache.c | 10 | ||||
-rw-r--r-- | src/libserver/task.c | 11 | ||||
-rw-r--r-- | src/libserver/url.c | 6 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 20 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 429 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 5 | ||||
-rw-r--r-- | src/libutil/logger.c | 6 | ||||
-rw-r--r-- | src/lua/lua_mimepart.c | 42 | ||||
-rw-r--r-- | src/lua/lua_trie.c | 6 | ||||
-rw-r--r-- | src/lua/lua_util.c | 11 | ||||
-rw-r--r-- | src/plugins/chartable.c | 22 | ||||
-rw-r--r-- | src/plugins/fuzzy_check.c | 18 | ||||
-rw-r--r-- | src/plugins/lua/antivirus.lua | 8 | ||||
-rw-r--r-- | src/plugins/lua/arc.lua | 3 | ||||
-rw-r--r-- | src/plugins/lua/dkim_signing.lua | 3 | ||||
-rw-r--r-- | src/rspamadm/confighelp.c | 2 |
22 files changed, 1285 insertions, 892 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 8763365af..d4237690d 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -17,6 +17,7 @@ #include "lang_detection.h" #include "libutil/logger.h" #include "libcryptobox/cryptobox.h" +#include "libutil/multipattern.h" #include "ucl.h" #include "khash.h" #include <glob.h> @@ -26,7 +27,7 @@ #include <unicode/ustring.h> #include <math.h> -static const gsize default_short_text_limit = 200; +static const gsize default_short_text_limit = 20; static const gsize default_words = 80; static const gdouble update_prob = 0.6; static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages"; @@ -42,28 +43,17 @@ struct rspamd_language_unicode_match { * List of languages detected by unicode scripts */ static const struct rspamd_language_unicode_match unicode_langs[] = { - {"el", UBLOCK_GREEK}, - {"ml", UBLOCK_MALAYALAM}, - {"te", UBLOCK_TELUGU}, - {"ta", UBLOCK_TAMIL}, - {"gu", UBLOCK_GUJARATI}, - {"th", UBLOCK_THAI}, - {"kn", UBLOCK_KANNADA}, - {"ka", UBLOCK_GEORGIAN}, - {"si", UBLOCK_SINHALA}, - {"hy", UBLOCK_ARMENIAN}, - {"lo", UBLOCK_LAO}, - {"km", UBLOCK_KHMER} -}; - -/* - * List of languages to apply unigramms only - */ -static const gchar *unigramms_langs[] = { - "ja", - "ko", - "zh-CN", - "zh-TW" + {"el", RSPAMD_UNICODE_GREEK}, + {"ml", RSPAMD_UNICODE_MALAYALAM}, + {"te", RSPAMD_UNICODE_TELUGU}, + {"ta", RSPAMD_UNICODE_TAMIL}, + {"gu", RSPAMD_UNICODE_GUJARATI}, + {"th", RSPAMD_UNICODE_THAI}, + {"ka", RSPAMD_UNICODE_GEORGIAN}, + {"si", RSPAMD_UNICODE_SINHALA}, + {"hy", RSPAMD_UNICODE_ARMENIAN}, + {"ja", RSPAMD_UNICODE_JP}, + {"ko", RSPAMD_UNICODE_HANGUL}, }; /* @@ -73,24 +63,29 @@ static const gchar *tier0_langs[] = { "en", }; static const gchar *tier1_langs[] = { - "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja", - "ko", "pt", "ru", "pl", "tk", "th", "ar" + "fr", "it", "de", "es", "nl", + "pt", "ru", "pl", "tk", "th", "ar" }; enum rspamd_language_elt_flags { RS_LANGUAGE_DEFAULT = 0, RS_LANGUAGE_LATIN = (1 << 0), - RS_LANGUAGE_UNISCRIPT = (1 << 1), - RS_LANGUAGE_UNIGRAMM = (1 << 2), RS_LANGUAGE_TIER1 = (1 << 3), RS_LANGUAGE_TIER0 = (1 << 4), }; +enum rspamd_language_category { + RSPAMD_LANGUAGE_LATIN = 0, + RSPAMD_LANGUAGE_CYRILLIC, + RSPAMD_LANGUAGE_DEVANAGARI, + RSPAMD_LANGUAGE_ARAB, + RSPAMD_LANGUAGE_MAX, +}; + struct rspamd_language_elt { const gchar *name; /* e.g. "en" or "ru" */ enum rspamd_language_elt_flags flags; - guint ngramms_total; - guint unigramms_words; + enum rspamd_language_category category; guint trigramms_words; gdouble mean; gdouble std; @@ -109,6 +104,17 @@ struct rspamd_ngramm_chain { gchar *utf; }; +struct rspamd_stop_word_range { + guint start; + guint stop; + struct rspamd_language_elt *elt; +}; + +struct rspamd_stop_word_elt { + struct rspamd_multipattern *mp; + GArray *ranges; /* of rspamd_stop_word_range */ +}; + #define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \ rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ G_STRFUNC, \ @@ -149,18 +155,6 @@ rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts) } static guint -rspamd_unigram_hash_func (gconstpointer key) -{ - return rspamd_cryptobox_fast_hash (key, sizeof (UChar), rspamd_hash_seed ()); -} - -static gboolean -rspamd_unigram_equal_func (gconstpointer v, gconstpointer v2) -{ - return memcmp (v, v2, sizeof (UChar)) == 0; -} - -static guint rspamd_trigram_hash_func (gconstpointer key) { return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ()); @@ -172,8 +166,6 @@ rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2) return memcmp (v, v2, 3 * sizeof (UChar)) == 0; } -KHASH_INIT (rspamd_unigram_hash, const UChar *, struct rspamd_ngramm_chain, true, - rspamd_unigram_hash_func, rspamd_unigram_equal_func); KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true, rspamd_trigram_hash_func, rspamd_trigram_equal_func); KHASH_INIT (rspamd_candidates_hash, const gchar *, @@ -182,9 +174,8 @@ KHASH_INIT (rspamd_candidates_hash, const gchar *, struct rspamd_lang_detector { GPtrArray *languages; - khash_t(rspamd_unigram_hash) *unigramms; /* unigramms frequencies */ - khash_t(rspamd_trigram_hash) *trigramms; /* trigramms frequencies */ - GHashTable *unicode_scripts; /* indexed by unicode script */ + khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */ + struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; UConverter *uchar_converter; gsize short_text_limit; gsize total_occurencies; /* number of all languages found */ @@ -226,9 +217,13 @@ struct rspamd_language_ucs_elt { static void rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, - struct rspamd_lang_detector *d, - struct rspamd_language_elt *lelt, - struct rspamd_language_ucs_elt *ucs, guint len, guint freq, guint total) + struct rspamd_lang_detector *d, + struct rspamd_language_elt *lelt, + struct rspamd_language_ucs_elt *ucs, + guint len, + guint freq, + guint total, + khash_t (rspamd_trigram_hash) *htb) { struct rspamd_ngramm_chain *chain = NULL, st_chain; struct rspamd_ngramm_elt *elt; @@ -238,18 +233,13 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, switch (len) { case 1: - k = kh_get (rspamd_unigram_hash, d->unigramms, ucs->s); - if (k != kh_end (d->unigramms)) { - chain = &kh_value (d->unigramms, k); - } - break; case 2: g_assert_not_reached (); break; case 3: - k = kh_get (rspamd_trigram_hash, d->trigramms, ucs->s); - if (k != kh_end (d->trigramms)) { - chain = &kh_value (d->trigramms, k); + k = kh_get (rspamd_trigram_hash, htb, ucs->s); + if (k != kh_end (htb)) { + chain = &kh_value (htb, k); } break; default: @@ -270,14 +260,8 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, elt->prob = ((gdouble)freq) / ((gdouble)total); g_ptr_array_add (chain->languages, elt); - if (len == 1) { - k = kh_put (rspamd_unigram_hash, d->unigramms, ucs->s, &i); - kh_value (d->unigramms, k) = *chain; - } - else { - k = kh_put (rspamd_trigram_hash, d->trigramms, ucs->s, &i); - kh_value (d->trigramms, k) = *chain; - } + k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i); + kh_value (htb, k) = *chain; } else { /* Check sanity */ @@ -300,6 +284,23 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, } } +static inline enum rspamd_language_category +rspamd_language_detector_get_category (guint uflags) +{ + enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN; + + if (uflags & RSPAMD_UNICODE_CYRILLIC) { + cat = RSPAMD_LANGUAGE_CYRILLIC; + } + else if (uflags & RSPAMD_UNICODE_DEVANAGARI) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else if (uflags & RSPAMD_UNICODE_ARABIC) { + cat = RSPAMD_LANGUAGE_ARAB; + } + + return cat; +} static const gchar * rspamd_language_detector_print_flags (struct rspamd_language_elt *elt) @@ -307,9 +308,6 @@ rspamd_language_detector_print_flags (struct rspamd_language_elt *elt) static gchar flags_buf[256]; goffset r = 0; - if (elt->flags & RS_LANGUAGE_UNIGRAMM) { - r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "unigrams,"); - } if (elt->flags & RS_LANGUAGE_TIER1) { r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,"); } @@ -342,19 +340,22 @@ rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b) static void rspamd_language_detector_read_file (struct rspamd_config *cfg, struct rspamd_lang_detector *d, - const gchar *path) + const gchar *path, + const ucl_object_t *stop_words) { struct ucl_parser *parser; ucl_object_t *top; - const ucl_object_t *freqs, *n_words, *cur; + const ucl_object_t *freqs, *n_words, *cur, *type; ucl_object_iter_t it = NULL; UErrorCode uc_err = U_ZERO_ERROR; struct rspamd_language_elt *nelt; - const struct rspamd_language_unicode_match *uc_match; struct rspamd_language_ucs_elt *ucs_elt; + khash_t (rspamd_trigram_hash) *htb = NULL; gchar *pos; - guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, loaded; + guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, + loaded, nstop = 0; gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; + enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS); if (!ucl_parser_add_file (parser, path)) { @@ -396,141 +397,181 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, return; } else { - nelt->unigramms_words = ucl_object_toint (ucl_array_find_index (n_words, - 0)); nelt->trigramms_words = ucl_object_toint (ucl_array_find_index (n_words, 2)); } - if ((uc_match = rspamd_language_search_unicode_match (nelt->name, unicode_langs, - G_N_ELEMENTS (unicode_langs))) != NULL) { - g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code, - nelt); - nelt->flags |= RS_LANGUAGE_UNISCRIPT; - msg_info_config ("loaded unicode script only %s language: %d", - nelt->name, - uc_match->unicode_code); + type = ucl_object_lookup (top, "type"); + + if (type == NULL || ucl_object_type (type) != UCL_STRING) { + msg_warn_config ("cannot find type in language %s", nelt->name); + ucl_object_unref (top); + + return; } else { - GPtrArray *ngramms; - guint nsym; + const gchar *stype = ucl_object_tostring (type); - if (rspamd_language_search_str (nelt->name, unigramms_langs, - G_N_ELEMENTS (unigramms_langs))) { - nelt->flags |= RS_LANGUAGE_UNIGRAMM; + if (strcmp (stype, "latin") == 0) { + cat = RSPAMD_LANGUAGE_LATIN; + } + else if (strcmp (stype, "cyrillic") == 0) { + cat = RSPAMD_LANGUAGE_CYRILLIC; } + else if (strcmp (stype, "arab") == 0) { + cat = RSPAMD_LANGUAGE_ARAB; + } + else if (strcmp (stype, "devanagari") == 0) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else { + msg_warn_config ("unknown type %s of language %s", stype, nelt->name); + ucl_object_unref (top); - if (rspamd_language_search_str (nelt->name, tier1_langs, - G_N_ELEMENTS (tier1_langs))) { - nelt->flags |= RS_LANGUAGE_TIER1; + return; } + } + + if (stop_words) { + const ucl_object_t *specific_stop_words; + + specific_stop_words = ucl_object_lookup (stop_words, nelt->name); + + if (specific_stop_words) { + it = NULL; + const ucl_object_t *w; + guint start, stop; + + start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + + while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) { + rspamd_multipattern_add_pattern (d->stop_words[cat].mp, + ucl_object_tostring (w), 0); + nstop ++; + } - if (rspamd_language_search_str (nelt->name, tier0_langs, - G_N_ELEMENTS (tier0_langs))) { - nelt->flags |= RS_LANGUAGE_TIER0; + stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + + struct rspamd_stop_word_range r; + + r.start = start; + r.stop = stop; + r.elt = nelt; + + g_array_append_val (d->stop_words[cat].ranges, r); + it = NULL; } + } - it = NULL; - ngramms = g_ptr_array_sized_new (freqs->len); - i = 0; - skipped = 0; - loaded = 0; + nelt->category = cat; + htb = d->trigramms[cat]; - while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) { - const gchar *key; - gsize keylen; - guint freq; + GPtrArray *ngramms; + guint nsym; - key = ucl_object_keyl (cur, &keylen); - freq = ucl_object_toint (cur); + if (rspamd_language_search_str (nelt->name, tier1_langs, + G_N_ELEMENTS (tier1_langs))) { + nelt->flags |= RS_LANGUAGE_TIER1; + } - i ++; - delta = freq - mean; - mean += delta / i; - delta2 = freq - mean; - m2 += delta * delta2; + if (rspamd_language_search_str (nelt->name, tier0_langs, + G_N_ELEMENTS (tier0_langs))) { + nelt->flags |= RS_LANGUAGE_TIER0; + } - if (key != NULL) { - ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool, - sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar)); + it = NULL; + ngramms = g_ptr_array_sized_new (freqs->len); + i = 0; + skipped = 0; + loaded = 0; - nsym = ucnv_toUChars (d->uchar_converter, - ucs_elt->s, keylen + 1, - key, - keylen, &uc_err); - ucs_elt->utf = key; + while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) { + const gchar *key; + gsize keylen; + guint freq; - if (uc_err != U_ZERO_ERROR) { - msg_warn_config ("cannot convert key to unicode: %s", - u_errorName (uc_err)); + key = ucl_object_keyl (cur, &keylen); + freq = ucl_object_toint (cur); - continue; - } + i ++; + delta = freq - mean; + mean += delta / i; + delta2 = freq - mean; + m2 += delta * delta2; - rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym); + if (key != NULL) { + ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool, + sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar)); - if (nsym == 3 && !(nelt->flags & RS_LANGUAGE_UNIGRAMM)) { - g_ptr_array_add (ngramms, ucs_elt); - } - else if (nsym == 1 && nelt->flags & RS_LANGUAGE_UNIGRAMM) { - g_ptr_array_add (ngramms, ucs_elt); - } - else { - continue; - } + nsym = ucnv_toUChars (d->uchar_converter, + ucs_elt->s, keylen + 1, + key, + keylen, &uc_err); + ucs_elt->utf = key; - if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { - total_latin++; - } + if (uc_err != U_ZERO_ERROR) { + msg_warn_config ("cannot convert key to unicode: %s", + u_errorName (uc_err)); + + continue; + } - ucs_elt->freq = freq; + rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym); - total_ngramms++; + if (nsym == 3) { + g_ptr_array_add (ngramms, ucs_elt); + } + else { + continue; } - } - std = sqrt (m2 / (i - 1)); + if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { + total_latin++; + } - if (total_latin >= total_ngramms / 3) { - nelt->flags |= RS_LANGUAGE_LATIN; - } + ucs_elt->freq = freq; - if (nelt->flags & RS_LANGUAGE_UNIGRAMM) { - nsym = 1; - } - else { - nsym = 3; + total_ngramms++; } + } - total = 0; - PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { + std = sqrt (m2 / (i - 1)); - if (!(nelt->flags & RS_LANGUAGE_LATIN) && - rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { - ucs_elt->freq = 0; - /* Skip latin ngramm for non-latin language to avoid garbadge */ - skipped ++; - continue; - } + if (total_latin >= total_ngramms / 3) { + nelt->flags |= RS_LANGUAGE_LATIN; + } + + nsym = 3; - /* Now, discriminate low frequency ngramms */ + total = 0; + PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { - total += ucs_elt->freq; - loaded ++; + if (!(nelt->flags & RS_LANGUAGE_LATIN) && + rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { + ucs_elt->freq = 0; + /* Skip latin ngramm for non-latin language to avoid garbadge */ + skipped ++; + continue; } - g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm); + /* Now, discriminate low frequency ngramms */ - PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { - if (ucs_elt->freq > 0) { - rspamd_language_detector_init_ngramm (cfg, d, - nelt, ucs_elt, nsym, - ucs_elt->freq, total); - } + total += ucs_elt->freq; + loaded ++; + } + + g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm); + + PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { + if (ucs_elt->freq > 0) { + rspamd_language_detector_init_ngramm (cfg, d, + nelt, ucs_elt, nsym, + ucs_elt->freq, total, htb); } + } #ifdef EXTRA_LANGDET_DEBUG - /* Useful for debug */ + /* Useful for debug */ for (i = 0; i < 10; i ++) { ucs_elt = g_ptr_array_index (ngramms, i); @@ -539,22 +580,20 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, } #endif - g_ptr_array_free (ngramms, TRUE); - nelt->mean = mean; - nelt->std = std; - nelt->ngramms_total = total; - msg_info_config ("loaded %s language, %d unigramms, %d trigramms, " - "%d ngramms loaded; " - "std=%.2f, mean=%.2f, skipped=%d, loaded=%d; " - "(%s)", - nelt->name, - (gint)nelt->unigramms_words, - (gint)nelt->trigramms_words, - total, - std, mean, - skipped, loaded, - rspamd_language_detector_print_flags (nelt)); - } + g_ptr_array_free (ngramms, TRUE); + nelt->mean = mean; + nelt->std = std; + + msg_info_config ("loaded %s language, %d trigramms, " + "%d ngramms loaded; " + "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " + "(%s)", + nelt->name, + (gint)nelt->trigramms_words, + total, + std, mean, + skipped, loaded, nstop, + rspamd_language_detector_print_flags (nelt)); g_ptr_array_add (d->languages, nelt); ucl_object_unref (top); @@ -631,16 +670,10 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d) ucnv_close (d->uchar_converter); } - if (d->unicode_scripts) { - g_hash_table_unref (d->unicode_scripts); - } - - if (d->unigramms) { - kh_destroy (rspamd_unigram_hash, d->unigramms); - } - - if (d->trigramms) { - kh_destroy (rspamd_trigram_hash, d->trigramms); + for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { + kh_destroy (rspamd_trigram_hash, d->trigramms[i]); + rspamd_multipattern_destroy (d->stop_words[i].mp); + g_array_free (d->stop_words[i].ranges, TRUE); } if (d->languages) { @@ -656,12 +689,14 @@ rspamd_language_detector_init (struct rspamd_config *cfg) *languages_disable = NULL; const gchar *languages_path = default_languages_path; glob_t gl; - size_t i, short_text_limit = default_short_text_limit; + size_t i, short_text_limit = default_short_text_limit, total = 0; UErrorCode uc_err = U_ZERO_ERROR; GString *languages_pattern; struct rspamd_ngramm_chain *chain, schain; gchar *fname; struct rspamd_lang_detector *ret = NULL; + struct ucl_parser *parser; + ucl_object_t *stop_words; section = ucl_object_lookup (cfg->rcl_obj, "lang_detection"); @@ -683,6 +718,22 @@ rspamd_language_detector_init (struct rspamd_config *cfg) } languages_pattern = g_string_sized_new (PATH_MAX); + rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path); + parser = ucl_parser_new (UCL_PARSER_DEFAULT); + + if (ucl_parser_add_file (parser, languages_pattern->str)) { + stop_words = ucl_parser_get_object (parser); + } + else { + msg_err_config ("cannot read stop words from %s: %s", + languages_pattern->str, + ucl_parser_get_error (parser)); + stop_words = NULL; + } + + ucl_parser_free (parser); + languages_pattern->len = 0; + rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path); memset (&gl, 0, sizeof (gl)); @@ -696,9 +747,13 @@ rspamd_language_detector_init (struct rspamd_config *cfg) ret->uchar_converter = ucnv_open ("UTF-8", &uc_err); ret->short_text_limit = short_text_limit; /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ - ret->unigramms = kh_init (rspamd_unigram_hash); - ret->trigramms = kh_init (rspamd_trigram_hash); - ret->unicode_scripts = g_hash_table_new (g_int_hash, g_int_equal); + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { + ret->trigramms[i] = kh_init (rspamd_trigram_hash); + ret->stop_words[i].mp = rspamd_multipattern_create ( + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); + ret->stop_words[i].ranges = g_array_new (FALSE, FALSE, + sizeof (struct rspamd_stop_word_range)); + } g_assert (uc_err == U_ZERO_ERROR); @@ -708,7 +763,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg) if (!rspamd_ucl_array_find_str (fname, languages_disable) || (languages_enable == NULL || rspamd_ucl_array_find_str (fname, languages_enable))) { - rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]); + rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i], + stop_words); } else { msg_info_config ("skip language file %s: disabled", fname); @@ -717,18 +773,27 @@ rspamd_language_detector_init (struct rspamd_config *cfg) g_free (fname); } - kh_foreach_value (ret->trigramms, schain, { - chain = &schain; - rspamd_language_detector_process_chain (cfg, chain); - }); + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { + GError *err = NULL; + + kh_foreach_value (ret->trigramms[i], schain, { + chain = &schain; + rspamd_language_detector_process_chain (cfg, chain); + }); + + if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) { + msg_err_config ("cannot compile stop words for %d language group: %e", + i, err); + g_error_free (err); + } + + total += kh_size (ret->trigramms[i]); + } - msg_info_config ("loaded %d languages, %d unicode only languages, " - "%d unigramms, " + msg_info_config ("loaded %d languages, " "%d trigramms", (gint)ret->languages->len, - (gint)g_hash_table_size (ret->unicode_scripts), - (gint)kh_size (ret->unigramms), - (gint)kh_size (ret->trigramms)); + (gint)total); REF_INIT_RETAIN (ret, rspamd_language_detector_dtor); rspamd_mempool_add_destructor (cfg->cfg_pool, @@ -859,11 +924,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, #endif } -enum rspamd_language_gramm_type { - rs_unigramm = 0, - rs_trigramm -}; - static goffset rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, guint wlen, goffset cur_off) @@ -914,9 +974,10 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, */ static void rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, - struct rspamd_lang_detector *d, - UChar *window, enum rspamd_language_gramm_type type, - khash_t(rspamd_candidates_hash) *candidates) + struct rspamd_lang_detector *d, + UChar *window, + khash_t(rspamd_candidates_hash) *candidates, + khash_t(rspamd_trigram_hash) *trigramms) { guint i; gint ret; @@ -926,19 +987,9 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, khiter_t k; gdouble prob; - switch (type) { - case rs_unigramm: - k = kh_get (rspamd_unigram_hash, d->unigramms, window); - if (k != kh_end (d->unigramms)) { - chain = &kh_value (d->unigramms, k); - } - break; - case rs_trigramm: - k = kh_get (rspamd_trigram_hash, d->trigramms, window); - if (k != kh_end (d->trigramms)) { - chain = &kh_value (d->trigramms, k); - } - break; + k = kh_get (rspamd_trigram_hash, trigramms, window); + if (k != kh_end (trigramms)) { + chain = &kh_value (trigramms, k); } if (chain) { @@ -980,29 +1031,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, static void rspamd_language_detector_detect_word (struct rspamd_task *task, - struct rspamd_lang_detector *d, - rspamd_stat_token_t *tok, - khash_t(rspamd_candidates_hash) *candidates, - enum rspamd_language_gramm_type type) + struct rspamd_lang_detector *d, + rspamd_stat_token_t *tok, + khash_t(rspamd_candidates_hash) *candidates, + khash_t(rspamd_trigram_hash) *trigramms) { - guint wlen; + const guint wlen = 3; UChar window[3]; goffset cur = 0; - switch (type) { - case rs_unigramm: - wlen = 1; - break; - case rs_trigramm: - wlen = 3; - break; - } - /* Split words */ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) != -1) { rspamd_language_detector_process_ngramm_full (task, - d, window, type, candidates); + d, window, candidates, trigramms); } } @@ -1074,113 +1116,35 @@ rspamd_language_detector_filter_negligible (struct rspamd_task *task, msg_debug_lang_det ("removed %d languages", filtered); } -static gboolean -rspamd_language_detector_is_unicode (struct rspamd_task *task, - struct rspamd_lang_detector *d, - GArray *ucs_tokens, - goffset *selected_words, - gsize nparts, - khash_t(rspamd_candidates_hash) *candidates) -{ - guint i, j, total_found = 0, total_checked = 0; - rspamd_stat_token_t *tok; - UChar t; - gint uc_script, ret; - khint_t k; - struct rspamd_language_elt *elt; - struct rspamd_lang_detector_res *cand; - - for (i = 0; i < nparts; i++) { - tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, - selected_words[i]); - - for (j = 0; j < tok->len; j ++) { - t = *(((UChar *)tok->begin) + j); - - uc_script = ublock_getCode (t); - elt = g_hash_table_lookup (d->unicode_scripts, &uc_script); - - if (elt) { - k = kh_get (rspamd_candidates_hash, candidates, elt->name); - if (k != kh_end (candidates)) { - cand = kh_value (candidates, k); - } - else { - cand = NULL; - } - - if (cand == NULL) { - cand = rspamd_mempool_alloc (task->task_pool, - sizeof (*cand)); - cand->elt = elt; - cand->lang = elt->name; - cand->prob = 1; - - k = kh_put (rspamd_candidates_hash, candidates, elt->name, &ret); - kh_value (candidates, k) = cand; - } else { - /* Update guess */ - cand->prob ++; - } - - total_found ++; - } - - total_checked ++; - } - - if (i >= nparts / 2 && total_found == 0) { - /* No special scripts found, stop processing */ - return FALSE; - } - } - - if (total_found < total_checked / 2) { - /* Not enough confidence */ - return FALSE; - } - else { - /* Filter candidates */ - kh_foreach_value (candidates, cand, { - cand->prob = cand->prob / total_checked; - }); - } - - return TRUE; -} - static void rspamd_language_detector_detect_type (struct rspamd_task *task, - guint nwords, - struct rspamd_lang_detector *d, - GArray *ucs_tokens, - khash_t(rspamd_candidates_hash) *candidates, - enum rspamd_language_gramm_type type) { - guint nparts = MIN (ucs_tokens->len, nwords); + guint nwords, + struct rspamd_lang_detector *d, + GArray *words, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) *candidates) +{ + guint nparts = MIN (words->len, nwords); goffset *selected_words; - rspamd_stat_token_t *tok; + rspamd_stat_token_t *tok, ucs_w; guint i; selected_words = g_new0 (goffset, nparts); - rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words); + rspamd_language_detector_random_select (words, nparts, selected_words); msg_debug_lang_det ("randomly selected %d words", nparts); - /* Check unicode scripts */ - if (kh_size (candidates) != 0 || - !rspamd_language_detector_is_unicode (task, d, ucs_tokens, - selected_words, nparts, candidates)) { - - for (i = 0; i < nparts; i++) { - tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, - selected_words[i]); - rspamd_language_detector_detect_word (task, d, tok, candidates, - type); - } - - /* Filter negligible candidates */ - rspamd_language_detector_filter_negligible (task, candidates); + for (i = 0; i < nparts; i++) { + tok = &g_array_index (words, rspamd_stat_token_t, + selected_words[i]); + rspamd_language_detector_to_ucs (task->lang_det, + task->task_pool, + tok, &ucs_w); + rspamd_language_detector_detect_word (task, d, &ucs_w, candidates, + d->trigramms[cat]); } + /* Filter negligible candidates */ + rspamd_language_detector_filter_negligible (task, candidates); g_free (selected_words); } @@ -1209,11 +1173,11 @@ enum rspamd_language_detected_type { static enum rspamd_language_detected_type rspamd_language_detector_try_ngramm (struct rspamd_task *task, - guint nwords, - struct rspamd_lang_detector *d, - GArray *ucs_tokens, - enum rspamd_language_gramm_type type, - khash_t(rspamd_candidates_hash) *candidates) + guint nwords, + struct rspamd_lang_detector *d, + GArray *ucs_tokens, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) *candidates) { guint cand_len = 0; struct rspamd_lang_detector_res *cand; @@ -1222,8 +1186,8 @@ rspamd_language_detector_try_ngramm (struct rspamd_task *task, nwords, d, ucs_tokens, - candidates, - type); + cat, + candidates); kh_foreach_value (candidates, cand, { if (!isnan (cand->prob)) { @@ -1320,117 +1284,429 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, return 0; } -GPtrArray * +static void +rspamd_language_detector_unicode_scripts (struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + const gchar *p = part->utf_stripped_content->data, *end; + guint i = 0; + end = p + part->utf_stripped_content->len; + gint32 uc, sc; + guint nlatin = 0, nchinese = 0, nspecial = 0; + + while (p + i < end) { + U8_NEXT (p, i, part->utf_stripped_content->len, uc); + + if (((gint32) uc) < 0) { + break; + } + + if (u_isalpha (uc)) { + sc = ublock_getCode (uc); + + switch (sc) { + case UBLOCK_BASIC_LATIN: + case UBLOCK_LATIN_1_SUPPLEMENT: + part->unicode_scripts |= RSPAMD_UNICODE_LATIN; + nlatin ++; + break; + case UBLOCK_HEBREW: + part->unicode_scripts |= RSPAMD_UNICODE_HEBREW; + nspecial ++; + break; + case UBLOCK_GREEK: + part->unicode_scripts |= RSPAMD_UNICODE_GREEK; + nspecial ++; + break; + case UBLOCK_CYRILLIC: + part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC; + nspecial ++; + break; + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: + case UBLOCK_CJK_COMPATIBILITY: + case UBLOCK_CJK_RADICALS_SUPPLEMENT: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: + part->unicode_scripts |= RSPAMD_UNICODE_CJK; + nchinese ++; + break; + case UBLOCK_HIRAGANA: + case UBLOCK_KATAKANA: + part->unicode_scripts |= RSPAMD_UNICODE_JP; + nspecial ++; + break; + case UBLOCK_HANGUL_JAMO: + case UBLOCK_HANGUL_COMPATIBILITY_JAMO: + part->unicode_scripts |= RSPAMD_UNICODE_HANGUL; + nspecial ++; + break; + case UBLOCK_ARABIC: + part->unicode_scripts |= RSPAMD_UNICODE_ARABIC; + nspecial ++; + break; + case UBLOCK_DEVANAGARI: + part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI; + nspecial ++; + break; + case UBLOCK_ARMENIAN: + part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN; + nspecial ++; + break; + case UBLOCK_GEORGIAN: + part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN; + nspecial ++; + break; + case UBLOCK_GUJARATI: + part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI; + nspecial ++; + break; + case UBLOCK_TELUGU: + part->unicode_scripts |= RSPAMD_UNICODE_TELUGU; + nspecial ++; + break; + case UBLOCK_TAMIL: + part->unicode_scripts |= RSPAMD_UNICODE_TAMIL; + nspecial ++; + break; + case UBLOCK_THAI: + part->unicode_scripts |= RSPAMD_UNICODE_THAI; + nspecial ++; + break; + case RSPAMD_UNICODE_MALAYALAM: + part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM; + nspecial ++; + break; + case RSPAMD_UNICODE_SINHALA: + part->unicode_scripts |= RSPAMD_UNICODE_SINHALA; + nspecial ++; + break; + } + } + + if (nspecial > 6 && nspecial > nlatin) { + break; + } + else if (nchinese > 6 && nchinese > nlatin) { + if (nspecial > 0) { + /* Likely japanese */ + break; + } + } + } + + msg_debug_lang_det ("stop after checking %d characters, " + "%d latin, %d special, %d chinese", + i, nlatin, nspecial, nchinese); +} + +static inline void +rspamd_language_detector_set_language (struct rspamd_task *task, + struct rspamd_mime_text_part *part, + const gchar *code) +{ + struct rspamd_lang_detector_res *r; + + r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r)); + r->prob = 1.0; + r->lang = code; + + part->languages = g_ptr_array_sized_new (1); + g_ptr_array_add (part->languages, r); + part->language = code; +} + +static gboolean +rspamd_language_detector_try_uniscript (struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + guint i; + + for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) { + if (unicode_langs[i].unicode_code & part->unicode_scripts) { + msg_debug_lang_det ("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language (task, part, + unicode_langs[i].lang); + + return TRUE; + } + } + + if (part->unicode_scripts & RSPAMD_UNICODE_CJK) { + rspamd_language_detector_set_language (task, part, + "zh-CN"); + + return TRUE; + } + + return FALSE; +} + + +KHASH_MAP_INIT_STR (rspamd_sw_hash, int); + +struct rspamd_sw_cbdata { + khash_t (rspamd_sw_hash) *res; + GArray *ranges; +}; + +static gint +rspamd_ranges_cmp (const void *k, const void *memb) +{ + gint pos = GPOINTER_TO_INT (k); + const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb; + + if (pos >= r->start && pos < r->stop) { + return 0; + } + else if (pos < r->start) { + return -1; + } + + return 1; +} + +static gint +rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + /* Check if boundary */ + const gchar *prev, *next; + struct rspamd_stop_word_range *r; + struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context; + khiter_t k; + + if (match_start > 0) { + prev = text + match_start - 1; + + if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) { + return 0; + } + } + else if (match_pos < len) { + next = text + match_pos + 1; + + if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) { + return 0; + } + } + + /* We have a word on the boundary, check range */ + r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data, + cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp); + + g_assert (r != NULL); + + k = kh_get (rspamd_sw_hash, cbdata->res, r->elt->name); + + if (k != kh_end (cbdata->res)) { + kh_value (cbdata->res, k) ++; + } + else { + gint tt; + + k = kh_put (rspamd_sw_hash, cbdata->res, r->elt->name, &tt); + kh_value (cbdata->res, k) = 1; + } + + return 0; +} + +static gboolean +rspamd_language_detector_try_stop_words (struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part, + enum rspamd_language_category cat) +{ + struct rspamd_stop_word_elt *elt; + struct rspamd_sw_cbdata cbdata; + gboolean ret = FALSE; + + elt = &d->stop_words[cat]; + cbdata.res = kh_init (rspamd_sw_hash); + cbdata.ranges = elt->ranges; + + rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data, + part->utf_stripped_content->len, rspamd_language_detector_sw_cb, + &cbdata, NULL); + + if (kh_size (cbdata.res) > 0) { + gint max = G_MININT, cur_matches; + const gchar *sel = NULL, *cur_lang; + + kh_foreach (cbdata.res, cur_lang, cur_matches, { + if (cur_matches > max) { + max = cur_matches; + sel = cur_lang; + } + }); + + if (max > 0 && sel) { + msg_debug_lang_det ("set language based on stop words script %s, %d found", + sel, max); + rspamd_language_detector_set_language (task, part, + sel); + + ret = TRUE; + } + } + + kh_destroy (rspamd_sw_hash, cbdata.res); + + return ret; +} + +gboolean rspamd_language_detector_detect (struct rspamd_task *task, - struct rspamd_lang_detector *d, - GArray *ucs_tokens, gsize words_len) + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part) { khash_t(rspamd_candidates_hash) *candidates; GPtrArray *result; gdouble mean, std, start_ticks, end_ticks; guint cand_len; + enum rspamd_language_category cat; struct rspamd_lang_detector_res *cand; enum rspamd_language_detected_type r; struct rspamd_frequency_sort_cbdata cbd; /* Check if we have sorted candidates based on frequency */ - gboolean frequency_heuristic_applied = FALSE; + gboolean frequency_heuristic_applied = FALSE, ret = FALSE; - if (ucs_tokens->len == 0) { - return g_ptr_array_new (); + if (!part->utf_stripped_content) { + return FALSE; } start_ticks = rspamd_get_ticks (TRUE); - candidates = kh_init (rspamd_candidates_hash); - kh_resize (rspamd_candidates_hash, candidates, 32); - r = rspamd_language_detector_try_ngramm (task, default_words, d, - ucs_tokens, rs_trigramm, - candidates); + rspamd_language_detector_unicode_scripts (task, part); + /* Apply unicode scripts heuristic */ - if (r == rs_detect_none) { - msg_debug_lang_det ("no trigramms found, switch to unigramms"); - r = rspamd_language_detector_try_ngramm (task, default_words, - d, ucs_tokens, rs_unigramm, - candidates); + if (rspamd_language_detector_try_uniscript (task, part)) { + ret = TRUE; } - else if (r == rs_detect_multiple) { - /* Check our guess */ - mean = 0.0; - std = 0.0; - cand_len = 0; + cat = rspamd_language_detector_get_category (part->unicode_scripts); + + if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) { + ret = TRUE; + } - /* Check distirbution */ - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - mean += cand->prob; - cand_len ++; + if (!ret) { + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det ("text is too short for trigramms detection: " + "%d words; at least %d words required", + (int)part->utf_words->len, + (int)default_short_text_limit); + rspamd_language_detector_set_language (task, part, "en"); + candidates = kh_init (rspamd_candidates_hash); + } + else { + candidates = kh_init (rspamd_candidates_hash); + kh_resize (rspamd_candidates_hash, candidates, 32); + + r = rspamd_language_detector_try_ngramm (task, + default_words, + d, + part->utf_words, + cat, + candidates); + + if (r == rs_detect_none) { + msg_debug_lang_det ("no trigramms found, fallback to english"); + rspamd_language_detector_set_language (task, part, "en"); + } else if (r == rs_detect_multiple) { + /* Check our guess */ + + mean = 0.0; + std = 0.0; + cand_len = 0; + + /* Check distirbution */ + kh_foreach_value (candidates, cand, { + if (!isnan (cand->prob)) { + mean += cand->prob; + cand_len++; + } + }); + + if (cand_len > 0) { + mean /= cand_len; + + kh_foreach_value (candidates, cand, { + gdouble err; + if (!isnan (cand->prob)) { + err = cand->prob - mean; + std += fabs (err); + } + }); + + std /= cand_len; + } + + msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); + + if (cand_len > 0 && std / fabs (mean) < 0.25) { + msg_debug_lang_det ("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + + if (part->utf_words->len < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } + } } - }); + } - if (cand_len > 0) { - mean /= cand_len; + /* Now, convert hash to array and sort it */ + if (r != rs_detect_none && kh_size (candidates) > 0) { + result = g_ptr_array_sized_new (kh_size (candidates)); kh_foreach_value (candidates, cand, { - gdouble err; if (!isnan (cand->prob)) { - err = cand->prob - mean; - std += fabs (err); + msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, + cand->prob); + g_ptr_array_add (result, cand); } }); - std /= cand_len; - } - - msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev", - cand_len, mean, std); - - if (cand_len > 0 && std / fabs (mean) < 0.25) { - msg_debug_lang_det ("apply frequency heuristic sorting"); - frequency_heuristic_applied = TRUE; - cbd.d = d; - cbd.mean = mean; - cbd.std = std; - cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; - - if (ucs_tokens->len < default_words / 2) { - cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + if (frequency_heuristic_applied) { + g_ptr_array_sort_with_data (result, + rspamd_language_detector_cmp_heuristic, (gpointer) &cbd); + } else { + g_ptr_array_sort (result, rspamd_language_detector_cmp); } - } - } - /* Now, convert hash to array and sort it */ - result = g_ptr_array_sized_new (kh_size (candidates)); + if (result->len > 0 && !frequency_heuristic_applied) { + cand = g_ptr_array_index (result, 0); + cand->elt->occurencies++; + d->total_occurencies++; + } - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, - cand->prob); - g_ptr_array_add (result, cand); + part->languages = result; + ret = TRUE; + } + else if (part->languages == NULL) { + rspamd_language_detector_set_language (task, part, "en"); } - }); - - if (frequency_heuristic_applied) { - g_ptr_array_sort_with_data (result, - rspamd_language_detector_cmp_heuristic, (gpointer)&cbd); - } - else { - g_ptr_array_sort (result, rspamd_language_detector_cmp); - } - - kh_destroy (rspamd_candidates_hash, candidates); - if (result->len > 0 && !frequency_heuristic_applied) { - cand = g_ptr_array_index (result, 0); - cand->elt->occurencies ++; - d->total_occurencies ++; + kh_destroy (rspamd_candidates_hash, candidates); } end_ticks = rspamd_get_ticks (TRUE); msg_debug_lang_det ("detected languages in %.0f ticks", (end_ticks - start_ticks)); - return result; + return ret; } diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index 2d28ec65a..50fe19b6e 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -20,11 +20,32 @@ #include "config.h" #include "libserver/cfg_file.h" #include "libstat/stat_api.h" +#include "libmime/message.h" struct rspamd_lang_detector; struct rspamd_language_elt; struct rspamd_task; +enum rspamd_unicode_scripts { + RSPAMD_UNICODE_LATIN = (1 << 0), + RSPAMD_UNICODE_GREEK = (1 << 1), + RSPAMD_UNICODE_CYRILLIC = (1 << 2), + RSPAMD_UNICODE_HEBREW = (1 << 3), + RSPAMD_UNICODE_CJK = (1 << 4), + RSPAMD_UNICODE_JP = (1 << 5), + RSPAMD_UNICODE_ARABIC = (1 << 6), + RSPAMD_UNICODE_DEVANAGARI = (1 << 7), + RSPAMD_UNICODE_THAI = (1 << 8), + RSPAMD_UNICODE_ARMENIAN = (1 << 9), + RSPAMD_UNICODE_GEORGIAN = (1 << 10), + RSPAMD_UNICODE_GUJARATI = (1 << 11), + RSPAMD_UNICODE_TAMIL = (1 << 12), + RSPAMD_UNICODE_TELUGU = (1 << 13), + RSPAMD_UNICODE_MALAYALAM = (1 << 14), + RSPAMD_UNICODE_SINHALA = (1 << 15), + RSPAMD_UNICODE_HANGUL = (1 << 16), +}; + struct rspamd_lang_detector_res { gdouble prob; const gchar *lang; @@ -59,8 +80,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, * @param words_len * @return array of struct rspamd_lang_detector_res sorted by freq descending */ -GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, +gboolean rspamd_language_detector_detect (struct rspamd_task *task, struct rspamd_lang_detector *d, - GArray *ucs_tokens, gsize words_len); + struct rspamd_mime_text_part *part); #endif diff --git a/src/libmime/message.c b/src/libmime/message.c index e6cb63504..0d4581ad7 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, guint i, nlen, total_len = 0, short_len = 0; gdouble avg_len = 0; - if (part->normalized_words) { + if (part->utf_words) { #ifdef WITH_SNOWBALL static GHashTable *stemmers = NULL; @@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, #endif - for (i = 0; i < part->normalized_words->len; i++) { + for (i = 0; i < part->utf_words->len; i++) { guint64 h; - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); r = NULL; #ifdef WITH_SNOWBALL if (stem) { @@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, } } - if (part->normalized_words && part->normalized_words->len) { + if (part->utf_words && part->utf_words->len) { gdouble *avg_len_p, *short_len_p; avg_len_p = rspamd_mempool_get_variable (task->task_pool, @@ -188,12 +188,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, } } -static guint +static void rspamd_mime_part_create_words (struct rspamd_task *task, struct rspamd_mime_text_part *part) { - rspamd_stat_token_t *w, ucs_w; - guint i, ucs_len = 0; enum rspamd_tokenize_type tok_type; if (IS_PART_UTF (part)) { @@ -203,69 +201,39 @@ rspamd_mime_part_create_words (struct rspamd_task *task, tok_type = RSPAMD_TOKENIZE_RAW; } - /* Ugly workaround */ - if (IS_PART_HTML (part)) { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, tok_type, task->cfg, - part->exceptions, - NULL); - } - else { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, tok_type, task->cfg, - part->exceptions, - NULL); - } - - if (part->normalized_words) { - part->normalized_hashes = g_array_sized_new (FALSE, FALSE, - sizeof (guint64), part->normalized_words->len); - - if (IS_PART_UTF (part) && task->lang_det) { - part->ucs32_words = g_array_sized_new (FALSE, FALSE, - sizeof (rspamd_stat_token_t), part->normalized_words->len); - } - - if (part->ucs32_words) { - + part->utf_words = rspamd_tokenize_text ( + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &part->utf_stripped_text, + tok_type, task->cfg, + part->exceptions, + NULL); - for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, - i); - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { - rspamd_language_detector_to_ucs (task->lang_det, - task->task_pool, - w, &ucs_w); - g_array_append_val (part->ucs32_words, ucs_w); - ucs_len += ucs_w.len; - } - } - } + if (part->utf_words) { + part->normalized_hashes = g_array_sized_new (FALSE, FALSE, + sizeof (guint64), part->utf_words->len); } - return ucs_len; } static void rspamd_mime_part_detect_language (struct rspamd_task *task, - struct rspamd_mime_text_part *part, guint ucs_len) + struct rspamd_mime_text_part *part) { struct rspamd_lang_detector_res *lang; - if (part->ucs32_words) { - part->languages = rspamd_language_detector_detect (task, - task->lang_det, - part->ucs32_words, ucs_len); - - if (part->languages->len > 0) { + if (!IS_PART_EMPTY (part) && part->utf_words && part->utf_words->len > 0 && + task->lang_det) { + if (rspamd_language_detector_detect (task, task->lang_det, part)) { lang = g_ptr_array_index (part->languages, 0); part->language = lang->lang; msg_info_task ("detected part language: %s", part->language); } + else { + part->language = "en"; /* Safe fallback */ + } } } @@ -289,7 +257,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, state = seen_cr; if (p > c) { last_c = *(p - 1); - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); } @@ -299,11 +267,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, case seen_cr: /* Double \r\r */ if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); crlf_added = TRUE; g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines ++; @@ -326,17 +294,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, if (p > c) { last_c = *(p - 1); - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); } c = p + 1; if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); crlf_added = TRUE; } else { @@ -348,13 +316,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, /* \r\n */ if (!crlf_added) { if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *) " ", 1); crlf_added = TRUE; } g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } c = p + 1; @@ -364,11 +332,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, case seen_lf: /* Double \n\n */ if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); crlf_added = TRUE; g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines++; @@ -414,13 +382,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, if (!crlf_added) { g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } /* Skip initial spaces */ if (G_UNLIKELY (*p == ' ')) { if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); } @@ -451,7 +419,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, switch (state) { case normal_char: - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); while (c < p) { @@ -479,10 +447,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, default: if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines++; @@ -495,34 +463,52 @@ static void rspamd_normalize_text_part (struct rspamd_task *task, struct rspamd_mime_text_part *part) { - const gchar *p, *end; guint i; goffset off; struct rspamd_process_exception *ex; + UErrorCode uc_err = U_ZERO_ERROR; - /* Strip newlines */ - part->stripped_content = g_byte_array_sized_new (part->content->len); part->newlines = g_ptr_array_sized_new (128); - p = (const gchar *)part->content->data; - end = p + part->content->len; - - rspamd_strip_newlines_parse (p, end, part); - - for (i = 0; i < part->newlines->len; i ++) { - ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); - off = (goffset)g_ptr_array_index (part->newlines, i); - g_ptr_array_index (part->newlines, i) = (gpointer)(goffset) - (part->stripped_content->data + off); - ex->pos = off; - ex->len = 0; - ex->type = RSPAMD_EXCEPTION_NEWLINE; - part->exceptions = g_list_prepend (part->exceptions, ex); + + if (IS_PART_EMPTY (part)) { + part->utf_stripped_content = g_byte_array_new (); + } + else { + part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len); + + p = (const gchar *)part->utf_content->data; + end = p + part->utf_content->len; + + rspamd_strip_newlines_parse (p, end, part); + + for (i = 0; i < part->newlines->len; i ++) { + ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); + off = (goffset)g_ptr_array_index (part->newlines, i); + g_ptr_array_index (part->newlines, i) = (gpointer)(goffset) + (part->utf_stripped_content->data + off); + ex->pos = off; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend (part->exceptions, ex); + } + } + + if (IS_PART_UTF (part)) { + utext_openUTF8 (&part->utf_stripped_text, + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_warn_task ("cannot open text from utf content"); + /* Probably, should be an assertion */ + } } rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) free_byte_array_callback, - part->stripped_content); + part->utf_stripped_content); rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, part->newlines); @@ -615,10 +601,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part g_assert (rspamd_multipattern_compile (gtube_matcher, NULL)); } - if (part->content && part->content->len >= sizeof (gtube_pattern_reject) && - part->content->len <= max_check_size) { - if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data, - part->content->len, + if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) && + part->utf_content->len <= max_check_size) { + if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data, + part->utf_content->len, rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) { switch (ret) { @@ -639,7 +625,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part msg_info_task ( "<%s>: gtube %s pattern has been found in part of length %ud", task->message_id, rspamd_action_to_str (act), - part->content->len); + part->utf_content->len); } } } @@ -655,9 +641,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b) return ea->pos - eb->pos; } +static gboolean +rspamd_message_process_plain_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) +{ + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert (task, text_part); + + if (text_part->utf_raw_content != NULL) { + /* Different from HTML, where we also parse HTML and strip tags */ + text_part->utf_content = text_part->utf_raw_content; + text_part->unicode_content = text_part->unicode_raw_content; + } + else { + /* + * We ignore unconverted parts from now as it is dangerous + * to treat them as text parts + */ + + return FALSE; + } + + return TRUE; +} + +static gboolean +rspamd_message_process_html_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) +{ + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; + + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert (task, text_part); + + if (text_part->utf_raw_content == NULL) { + return FALSE; + } + + text_part->html = rspamd_mempool_alloc0 (task->task_pool, + sizeof (*text_part->html)); + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; + text_part->utf_content = rspamd_html_process_part_full ( + task->task_pool, + text_part->html, + text_part->utf_raw_content, + &text_part->exceptions, + task->urls, + task->emails); + + if (text_part->utf_content->len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + } + + /* Also add unicode content */ + text_part->unicode_content = g_array_sized_new (FALSE, FALSE, + sizeof (UChar), text_part->utf_content->len + 1); + rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content); + + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) free_byte_array_callback, + text_part->utf_content); + rspamd_mempool_add_destructor (task->task_pool, + rspamd_array_free_hard, + text_part->unicode_content); + + return TRUE; +} + static void -rspamd_message_process_text_part (struct rspamd_task *task, - struct rspamd_mime_part *mime_part) +rspamd_message_process_text_part_maybe (struct rspamd_task *task, + struct rspamd_mime_part *mime_part) { struct rspamd_mime_text_part *text_part; rspamd_ftok_t html_tok, xhtml_tok; @@ -738,87 +801,32 @@ rspamd_message_process_text_part (struct rspamd_task *task, debug_task ("skip attachments for checking as text parts"); return; } - - if (found_html) { - text_part = rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_text_part)); - text_part->raw.begin = mime_part->raw_data.begin; - text_part->raw.len = mime_part->raw_data.len; - text_part->parsed.begin = mime_part->parsed_data.begin; - text_part->parsed.len = mime_part->parsed_data.len; - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; - text_part->mime_part = mime_part; - - if (mime_part->parsed_data.len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - g_ptr_array_add (task->text_parts, text_part); - return; - } - - rspamd_mime_text_part_maybe_convert (task, text_part); - - if (text_part->utf_raw_content == NULL) { - return; - } - - text_part->html = rspamd_mempool_alloc0 (task->task_pool, - sizeof (*text_part->html)); - text_part->mime_part = mime_part; - - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; - text_part->content = rspamd_html_process_part_full ( - task->task_pool, - text_part->html, - text_part->utf_raw_content, - &text_part->exceptions, - task->urls, - task->emails); - - if (text_part->content->len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - } - - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - text_part->content); - g_ptr_array_add (task->text_parts, text_part); + else if (!(found_txt || found_html)) { + /* Not a text part */ + return; } - else if (found_txt) { - text_part = - rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_text_part)); - text_part->mime_part = mime_part; - text_part->raw.begin = mime_part->raw_data.begin; - text_part->raw.len = mime_part->raw_data.len; - text_part->parsed.begin = mime_part->parsed_data.begin; - text_part->parsed.len = mime_part->parsed_data.len; - text_part->mime_part = mime_part; - - if (mime_part->parsed_data.len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - g_ptr_array_add (task->text_parts, text_part); - return; - } - rspamd_mime_text_part_maybe_convert (task, text_part); + text_part = rspamd_mempool_alloc0 (task->task_pool, + sizeof (struct rspamd_mime_text_part)); + text_part->mime_part = mime_part; + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; + text_part->utf_stripped_text = (UText)UTEXT_INITIALIZER; - if (text_part->utf_raw_content != NULL) { - /* - * We ignore unconverted parts from now as it is dangerous - * to treat them as text parts - */ - text_part->content = text_part->utf_raw_content; - g_ptr_array_add (task->text_parts, text_part); - } - else { + if (found_html) { + if (!rspamd_message_process_html_text_part (task, text_part)) { return; } } else { - return; + if (!rspamd_message_process_plain_text_part (task, text_part)) { + return; + } } - + g_ptr_array_add (task->text_parts, text_part); mime_part->flags |= RSPAMD_MIME_PART_TEXT; mime_part->specific.txt = text_part; @@ -867,7 +875,7 @@ rspamd_message_process_text_part (struct rspamd_task *task, text_part->exceptions); } - text_part->ucs_len = rspamd_mime_part_create_words (task, text_part); + rspamd_mime_part_create_words (task, text_part); } /* Creates message from various data using libmagic to detect type */ @@ -1172,7 +1180,7 @@ rspamd_message_process (struct rspamd_task *task) struct rspamd_mime_part *part; part = g_ptr_array_index (task->parts, i); - rspamd_message_process_text_part (task, part); + rspamd_message_process_text_part_maybe (task, part); } rspamd_images_process (task); @@ -1207,7 +1215,7 @@ rspamd_message_process (struct rspamd_task *task) sel = p2; } else { - if (p1->ucs_len > p2->ucs_len) { + if (p1->unicode_content->len > p2->unicode_content->len) { sel = p1; } else { @@ -1215,7 +1223,7 @@ rspamd_message_process (struct rspamd_task *task) } } - rspamd_mime_part_detect_language (task, sel, sel->ucs_len); + rspamd_mime_part_detect_language (task, sel); if (sel->language && sel->language[0]) { /* Propagate language */ @@ -1274,13 +1282,13 @@ rspamd_message_process (struct rspamd_task *task) PTR_ARRAY_FOREACH (task->text_parts, i, text_part) { if (!text_part->language) { - rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len); + rspamd_mime_part_detect_language (task, text_part); } rspamd_mime_part_extract_words (task, text_part); - if (text_part->normalized_words) { - total_words += text_part->normalized_words->len; + if (text_part->utf_words) { + total_words += text_part->utf_words->len; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index baabb762a..205bf5bb2 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -14,6 +14,7 @@ #include "content_type.h" #include <unicode/uchar.h> +#include <unicode/utext.h> struct rspamd_task; struct controller_session; @@ -86,20 +87,28 @@ struct rspamd_mime_text_part { const gchar *language; GPtrArray *languages; const gchar *real_charset; + + /* Raw data in native encoding */ rspamd_ftok_t raw; rspamd_ftok_t parsed; /* decoded from mime encodings */ - GByteArray *content; /* utf8 encoded processed content */ - GArray *ucs_raw_content; /* unicode raw content (of UChar) */ + /* UTF8 content */ + GByteArray *utf_content; /* utf8 encoded processed content */ GByteArray *utf_raw_content; /* utf raw content */ - GByteArray *stripped_content; /* utf content with no newlines */ + GByteArray *utf_stripped_content; /* utf content with no newlines */ + GArray *normalized_hashes; + GArray *utf_words; + UText utf_stripped_text; /* Used by libicu to represent the utf8 content */ + + /* Unicode content, used by libicu */ + GArray *unicode_raw_content; /* unicode raw content (of UChar) */ + GArray *unicode_content; /* unicode processed content (of UChar) */ + GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ struct html_content *html; GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; - GArray *normalized_words; - GArray *ucs32_words; - GArray *normalized_hashes; + guint flags; guint nlines; guint spaces; @@ -110,7 +119,7 @@ struct rspamd_mime_text_part { guint empty_lines; guint capital_letters; guint numeric_characters; - guint ucs_len; + guint unicode_scripts; }; enum rspamd_received_type { diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index d3f255740..a0abb1bb0 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, rspamd_mime_utf8_conv_init (); utf = text_part->utf_raw_content; - text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, + text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), utf->len + 1); - text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter, - (UChar *)text_part->ucs_raw_content->data, + text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter, + (UChar *)text_part->unicode_raw_content->data, utf->len + 1, utf->data, utf->len, &uc_err); if (!U_SUCCESS (uc_err)) { - g_array_free (text_part->ucs_raw_content, TRUE); - text_part->ucs_raw_content = NULL; + g_array_free (text_part->unicode_raw_content, TRUE); + text_part->unicode_raw_content = NULL; } } @@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); } - if (!text_part->ucs_raw_content) { + if (!text_part->unicode_raw_content) { return; } - src = (UChar *)text_part->ucs_raw_content->data; - nsym = text_part->ucs_raw_content->len; + src = (UChar *)text_part->unicode_raw_content->data; + nsym = text_part->unicode_raw_content->len; /* We can now check if we need to decompose */ end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err); @@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, } else { /* Copy normalised back */ - memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar)); - text_part->ucs_raw_content->len = nsym; + memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar)); + text_part->unicode_raw_content->len = nsym; text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED; } @@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, rspamd_mime_utf8_conv_init (); if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && - text_part->ucs_raw_content) { + text_part->unicode_raw_content) { clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len, + dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len, clen); g_byte_array_set_size (text_part->utf_raw_content, dlen); r = ucnv_fromUChars (utf8_converter, text_part->utf_raw_content->data, dlen, - (UChar *)text_part->ucs_raw_content->data, - text_part->ucs_raw_content->len, + (UChar *)text_part->unicode_raw_content->data, + text_part->unicode_raw_content->len, &uc_err); text_part->utf_raw_content->len = r; } @@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, } - text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, + text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), input->len + 1); r = ucnv_toUChars (conv, - (UChar *)text_part->ucs_raw_content->data, + (UChar *)text_part->unicode_raw_content->data, input->len + 1, input->data, input->len, @@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, return FALSE; } - text_part->ucs_raw_content->len = r; + text_part->unicode_raw_content->len = r; rspamd_mime_text_part_normalise (task, text_part); /* Now, convert to utf8 */ @@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); d = rspamd_mempool_alloc (task->task_pool, dlen); r = ucnv_fromUChars (utf8_converter, d, dlen, - (UChar *)text_part->ucs_raw_content->data, r, &uc_err); + (UChar *)text_part->unicode_raw_content->data, r, &uc_err); if (!U_SUCCESS (uc_err)) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -750,3 +750,17 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, SET_PART_UTF (text_part); } + +void +rspamd_utf_to_unicode (GByteArray *in, GArray *dest) +{ + UErrorCode uc_err = U_ZERO_ERROR; + + g_array_set_size (dest, in->len + 1); + dest->len = ucnv_toUChars (utf8_converter, + (UChar *)dest->data, + in->len + 1, + in->data, + in->len, + &uc_err); +} diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h index 5e30efdae..0754bb348 100644 --- a/src/libmime/mime_encoding.h +++ b/src/libmime/mime_encoding.h @@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, */ void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); +/** + * Converts utf8 to libicu unichars + * @param in + * @param dest + */ +void rspamd_utf_to_unicode (GByteArray *in, GArray *dest); + #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c47db5761..268376e4d 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, raw = TRUE; } - in = part->content->data; - len = part->content->len; + in = part->utf_content->data; + len = part->utf_content->len; } } @@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); - if (part->stripped_content) { - scvec[i + 1] = (guchar *)part->stripped_content->data; - lenvec[i + 1] = part->stripped_content->len; + if (part->utf_stripped_content) { + scvec[i + 1] = (guchar *)part->utf_stripped_content->data; + lenvec[i + 1] = part->utf_stripped_content->len; } else { scvec[i + 1] = (guchar *)""; diff --git a/src/libserver/task.c b/src/libserver/task.c index bfeec990b..d77fc0145 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -242,20 +242,17 @@ rspamd_task_free (struct rspamd_task *task) for (i = 0; i < task->text_parts->len; i ++) { tp = g_ptr_array_index (task->text_parts, i); - if (tp->normalized_words) { - g_array_free (tp->normalized_words, TRUE); + if (tp->utf_words) { + g_array_free (tp->utf_words, TRUE); } if (tp->normalized_hashes) { g_array_free (tp->normalized_hashes, TRUE); } - if (tp->ucs32_words) { - g_array_free (tp->ucs32_words, TRUE); - } if (tp->languages) { g_ptr_array_unref (tp->languages); } - if (tp->ucs_raw_content) { - g_array_free (tp->ucs_raw_content, TRUE); + if (tp->unicode_raw_content) { + g_array_free (tp->unicode_raw_content, TRUE); } } diff --git a/src/libserver/url.c b/src/libserver/url.c index 653cc3570..9e6ab72db 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, { struct rspamd_url_mimepart_cbdata mcbd; - if (part->stripped_content == NULL || part->stripped_content->len == 0) { + if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { msg_warn_task ("got empty text part"); return; } @@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, mcbd.task = task; mcbd.part = part; - rspamd_url_find_multiple (task->task_pool, part->stripped_content->data, - part->stripped_content->len, is_html, part->newlines, + rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, + part->utf_stripped_content->len, is_html, part->newlines, rspamd_url_text_part_callback, &mcbd); } diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 540a9e23f..6d34ba51c 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { - reserved_len += part->normalized_words->len; + if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { + reserved_len += part->utf_words->len; } /* XXX: normal window size */ reserved_len += 5; @@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { + if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, - part->normalized_words, IS_PART_UTF (part), + part->utf_words, IS_PART_UTF (part), NULL, task->tokens); } @@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF, + UText utxt = UTEXT_INITIALIZER; + UErrorCode uc_err = U_ZERO_ERROR; + gsize slen = strlen (sub); + + utext_openUTF8 (&utxt, + sub, + slen, + &uc_err); + + words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF, NULL, NULL, NULL); + if (words != NULL) { for (i = 0; i < words->len; i ++) { diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index fce98c53f..ac7f8be85 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -21,8 +21,10 @@ #include "tokenizers.h" #include "stat_internal.h" #include "../../../contrib/mumhash/mum.h" -#include "unicode/utf8.h" -#include "unicode/uchar.h" +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include <unicode/uiter.h> +#include <unicode/ubrk.h> typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, rspamd_stat_token_t * token, @@ -59,7 +61,7 @@ const gchar t_delimiters[255] = { /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, +rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gsize *rl, gboolean unused) { @@ -148,187 +150,97 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, return TRUE; } -static gboolean -rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, - gchar const **cur, rspamd_stat_token_t * token, - GList **exceptions, gsize *rl, - gboolean check_signature) +static inline gboolean +rspamd_tokenize_check_limit (gboolean decay, + guint word_decay, + guint nwords, + guint64 *hv, + guint64 *prob, + const rspamd_stat_token_t *token, + gssize remain, + gssize total) { - gint32 i, siglen = 0, remain; - goffset pos; - const gchar *p, *s, *sig = NULL; - UChar32 uc; - guint processed = 0; - struct rspamd_process_exception *ex = NULL; - enum { - skip_delimiters = 0, - feed_token, - process_signature - } state = skip_delimiters; - - if (buf == NULL) { - return FALSE; - } - - if (exceptions != NULL && *exceptions != NULL) { - ex = (*exceptions)->data; - } - - g_assert (cur != NULL); + static const gdouble avg_word_len = 6.0; - if (*cur == NULL) { - *cur = buf->begin; - } - - token->len = 0; + if (!decay) { + if (token->len >= sizeof (guint64)) { +#ifdef _MUM_UNALIGNED_ACCESS + *hv = mum_hash_step (*hv, *(guint64 *)token->begin); +#else + guint64 tmp; + memcpy (&tmp, token->begin, sizeof (tmp)); + *hv = mum_hash_step (*hv, tmp); +#endif + } - pos = *cur - buf->begin; - if (pos >= buf->len) { - return FALSE; - } + /* Check for decay */ + if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) { + /* Start decay */ + gdouble decay_prob; - remain = buf->len - pos; - s = *cur; - p = s; - token->begin = s; + *hv = mum_hash_finish (*hv); - for (i = 0; i < remain; ) { - p = &s[i]; - U8_NEXT (s, i, remain, uc); /* This also advances i */ + /* We assume that word is 6 symbols length in average */ + decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len); - if (uc < 0) { - if (i < remain) { - uc = 0xFFFD; + if (decay_prob >= 1.0) { + *prob = G_MAXUINT64; } else { - return FALSE; + *prob = decay_prob * G_MAXUINT64; } - } - switch (state) { - case skip_delimiters: - if (ex != NULL && p - buf->begin == ex->pos) { - goto process_exception; - } - else if (u_isgraph (uc)) { - if (u_isalnum (uc)) { - state = feed_token; - token->begin = p; - continue; - } - else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) { - sig = p; - siglen = remain - i; - state = process_signature; - continue; - } - } - break; - case feed_token: - if (ex != NULL && p - buf->begin == (gint)ex->pos) { - token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; - goto process_exception; - } - else if (!u_isalnum (uc)) { - token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; - goto set_token; - } - processed ++; - break; - case process_signature: - if (*p == '\r' || *p == '\n') { - msg_debug ("signature found: %*s", (gint)siglen, sig); - return FALSE; - } - else if (*p != ' ' && *p != '-' && *p != '_') { - state = skip_delimiters; - continue; - } - break; + return TRUE; } } + else { + /* Decaying probability */ + /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ + *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL; - /* Last character */ - if (state == feed_token) { - p = &s[i]; - goto set_token; + if (*hv > *prob) { + return TRUE; + } } return FALSE; +} -set_token: - if (rl) { - *rl = processed; - } +static inline gboolean +rspamd_utf_word_valid (const gchar *text, const gchar *end, + gint32 start, gint32 finish) +{ + const gchar *st = text + start, *fin = text + finish; + UChar32 c; - if (token->len == 0 && processed > 0) { - token->len = p - token->begin; - g_assert (token->len > 0); + if (st >= end || fin > end || st >= fin) { + return FALSE; } - *cur = &s[i]; - - return TRUE; - -process_exception: - if (token->len == 0 && processed > 0) { - /* - * We have processed something before the next exception, so - * continue processing on next iteration of this function call - */ - token->len = p - token->begin; - g_assert (token->len > 0); - - *cur = p; + U8_NEXT (text, start, finish, c); + if (u_isalnum (c)) { return TRUE; } - if (ex->type == RSPAMD_EXCEPTION_URL) { - token->begin = "!!EX!!"; - token->len = sizeof ("!!EX!!") - 1; - token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - processed = token->len; - } - - p += ex->len; - - /* We need to skip all exceptions that are within this exception */ - *exceptions = g_list_next (*exceptions); - - while (*exceptions) { - ex = (*exceptions)->data; - - if (ex->pos < p - buf->begin) { - /* Nested exception */ - if (ex->pos + ex->len > p - buf->begin) { - /* - * We have somehow overlapping nesting exception, - * extend current offset - */ - p = buf->begin + ex->pos + ex->len; - } - - *exceptions = g_list_next (*exceptions); - } - else { - break; - } - } - - *cur = p; - - if (rl) { - *rl = processed; - } - - return TRUE; + return FALSE; } +#define SHIFT_EX do { \ + cur = g_list_next (cur); \ + if (cur) { \ + ex = (struct rspamd_process_exception *) cur->data; \ + } \ + else { \ + ex = NULL; \ + } \ +} while(0) GArray * rspamd_tokenize_text (const gchar *text, gsize len, + const UText *utxt, enum rspamd_tokenize_type how, - struct rspamd_config *cfg, GList *exceptions, + struct rspamd_config *cfg, + GList *exceptions, guint64 *hash) { rspamd_stat_token_t token, buf; @@ -336,11 +248,11 @@ rspamd_tokenize_text (const gchar *text, gsize len, gsize l = 0; GArray *res; GList *cur = exceptions; - token_get_function func; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; gboolean decay = FALSE; guint64 prob; + static UBreakIterator* bi = NULL; if (text == NULL) { return NULL; @@ -353,18 +265,6 @@ rspamd_tokenize_text (const gchar *text, gsize len, token.len = 0; token.flags = 0; - switch (how) { - case RSPAMD_TOKENIZE_RAW: - func = rspamd_tokenizer_get_word_compat; - break; - case RSPAMD_TOKENIZE_UTF: - func = rspamd_tokenizer_get_word; - break; - default: - g_assert_not_reached (); - break; - } - if (cfg != NULL) { min_len = cfg->min_word_len; max_len = cfg->max_word_len; @@ -375,56 +275,175 @@ rspamd_tokenize_text (const gchar *text, gsize len, res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), initial_size); - while (func (&buf, &pos, &token, &cur, &l, FALSE)) { - if (l == 0 || (min_len > 0 && l < min_len) || - (max_len > 0 && l > max_len)) { + if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { + while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { + if (l == 0 || (min_len > 0 && l < min_len) || + (max_len > 0 && l > max_len)) { + token.begin = pos; + continue; + } + + if (rspamd_tokenize_check_limit (decay, word_decay, res->len, + &hv, &prob, &token, pos - text, len)) { + if (!decay) { + decay = TRUE; + } + else { + token.begin = pos; + continue; + } + } + + g_array_append_val (res, token); token.begin = pos; - continue; } + } + else { + /* UTF8 boundaries */ + UErrorCode uc_err = U_ZERO_ERROR; + int32_t last, p; + struct rspamd_process_exception *ex = NULL; - if (!decay) { - if (token.len >= sizeof (guint64)) { -#ifdef _MUM_UNALIGNED_ACCESS - hv = mum_hash_step (hv, *(guint64 *)token.begin); -#else - guint64 tmp; - memcpy (&tmp, token.begin, sizeof (tmp)); - hv = mum_hash_step (hv, tmp); -#endif - } + if (bi == NULL) { + bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err); - /* Check for decay */ - if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { - /* Start decay */ - gdouble decay_prob; + g_assert (U_SUCCESS (uc_err)); + } - decay = TRUE; - hv = mum_hash_finish (hv); + ubrk_setUText (bi, (UText*)utxt, &uc_err); + last = ubrk_first (bi); + p = last; - /* We assume that word is 6 symbols length in average */ - decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); + if (cur) { + ex = (struct rspamd_process_exception *)cur->data; + } - if (decay_prob >= 1.0) { - prob = G_MAXUINT64; + while (p != UBRK_DONE) { +start_over: + token.len = 0; + + if (p > last) { + if (ex && cur) { + /* Check exception */ + if (ex->pos >= last && ex->pos <= p) { + /* We have an exception within boundary */ + /* First, start to drain exceptions from the start */ + while (cur && ex->pos <= last) { + /* We have an exception at the beginning, skip those */ + last += ex->len; + + if (ex->type == RSPAMD_EXCEPTION_URL) { + token.begin = "!!EX!!"; + token.len = sizeof ("!!EX!!") - 1; + token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + + g_array_append_val (res, token); + token.flags = 0; + } + + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } + + /* We need to reset our scan with new p and last */ + SHIFT_EX; + goto start_over; + } + + SHIFT_EX; + } + + /* Now, we can have an exception within boundary again */ + if (cur && ex->pos >= last && ex->pos <= p) { + /* Append the first part */ + if (rspamd_utf_word_valid (text, text + len, last, + ex->pos)) { + token.begin = text + last; + token.len = ex->pos - last; + token.flags = 0; + g_array_append_val (res, token); + } + + /* Process the current exception */ + last += ex->len + (ex->pos - last); + + if (ex->type == RSPAMD_EXCEPTION_URL) { + token.begin = "!!EX!!"; + token.len = sizeof ("!!EX!!") - 1; + token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + + g_array_append_val (res, token); + } + + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } + /* We need to reset our scan with new p and last */ + SHIFT_EX; + goto start_over; + } + + SHIFT_EX; + } + else if (p > last) { + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + } + } + } + else if (ex->pos < last) { + /* Forward exceptions list */ + while (cur && ex->pos <= last) { + /* We have an exception at the beginning, skip those */ + SHIFT_EX; + } + + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + } + } + else { + /* No exceptions within boundary */ + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + } + } } else { - prob = decay_prob * G_MAXUINT64; + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + } + } + + if (rspamd_tokenize_check_limit (decay, word_decay, res->len, + &hv, &prob, &token, pos - text, len)) { + if (!decay) { + decay = TRUE; + } else { + token.len = 0; + } } } - } - else { - /* Decaying probability */ - /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ - hv = 2862933555777941757ULL * hv + 3037000493ULL; - if (hv > prob) { - token.begin = pos; - continue; + if (token.len > 0) { + g_array_append_val (res, token); } - } - g_array_append_val (res, token); - token.begin = pos; + last = p; + p = ubrk_next (bi); + } } if (!decay) { @@ -438,6 +457,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, return res; } +#undef SHIFT_EX + /* * vi:ts=4 */ diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 8be5f98a8..6c538eafc 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -7,6 +7,8 @@ #include "rspamd.h" #include "stat_api.h" +#include <unicode/utext.h> + #define RSPAMD_DEFAULT_TOKENIZER "osb" struct rspamd_tokenizer_runtime; @@ -28,7 +30,7 @@ struct rspamd_stat_tokenizer { enum rspamd_tokenize_type { RSPAMD_TOKENIZE_UTF = 0, RSPAMD_TOKENIZE_RAW, - RSPAMD_TOKENIZE_UCS + RSPAMD_TOKENIZE_UNICODE }; /* Compare two token nodes */ @@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_stat_token_t type) */ GArray * rspamd_tokenize_text (const gchar *text, gsize len, + const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, diff --git a/src/libutil/logger.c b/src/libutil/logger.c index 027c21da1..cd624f831 100644 --- a/src/libutil/logger.c +++ b/src/libutil/logger.c @@ -273,13 +273,13 @@ rspamd_log_open_priv (rspamd_logger_t *rspamd_log, uid_t uid, gid_t gid) S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH); if (rspamd_log->fd == -1) { fprintf (stderr, - "open_log: cannot open desired log file: %s, %s my pid: %d", - rspamd_log->log_file, strerror (errno), getpid ()); + "open_log: cannot open desired log file: %s, %s\n", + rspamd_log->log_file, strerror (errno)); return -1; } if (fchown (rspamd_log->fd, uid, gid) == -1) { fprintf (stderr, - "open_log: cannot chown desired log file: %s, %s", + "open_log: cannot chown desired log file: %s, %s\n", rspamd_log->log_file, strerror (errno)); close (rspamd_log->fd); return -1; diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index bb3406e80..78c3e05b9 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L) rspamd_lua_setclass (L, "rspamd{text}", -1); if (!type) { - start = part->content->data; - len = part->content->len; + start = part->utf_content->data; + len = part->utf_content->len; } else if (strcmp (type, "content") == 0) { - start = part->content->data; - len = part->content->len; + start = part->utf_content->data; + len = part->utf_content->len; } else if (strcmp (type, "content_oneline") == 0) { - start = part->stripped_content->data; - len = part->stripped_content->len; + start = part->utf_stripped_content->data; + len = part->utf_stripped_content->len; } else if (strcmp (type, "raw_parsed") == 0) { start = part->parsed.begin; @@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L) t = lua_newuserdata (L, sizeof (*t)); rspamd_lua_setclass (L, "rspamd{text}", -1); - t->start = part->stripped_content->data; - t->len = part->stripped_content->len; + t->start = part->utf_stripped_content->data; + t->len = part->utf_stripped_content->len; t->flags = 0; return 1; @@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L) return 1; } - if (IS_PART_EMPTY (part) || part->content == NULL) { + if (IS_PART_EMPTY (part) || part->utf_content == NULL) { lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->content->len); + lua_pushinteger (L, part->utf_content->len); } return 1; @@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L) return 1; } - if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { + if (IS_PART_EMPTY (part) || part->utf_words == NULL) { lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->normalized_words->len); + lua_pushinteger (L, part->utf_words->len); } return 1; @@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L) return luaL_error (L, "invalid arguments"); } - if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { + if (IS_PART_EMPTY (part) || part->utf_words == NULL) { lua_createtable (L, 0, 0); } else { - lua_createtable (L, part->normalized_words->len, 0); + lua_createtable (L, part->utf_words->len, 0); - for (i = 0; i < part->normalized_words->len; i ++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i ++) { + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); lua_pushlstring (L, w->begin, w->len); lua_rawseti (L, -2, i + 1); @@ -876,8 +876,8 @@ struct lua_shingle_data { }; #define STORE_TOKEN(i, t) do { \ - if ((i) < part->normalized_words->len) { \ - word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \ + if ((i) < part->utf_words->len) { \ + word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \ sd->t.begin = word->begin; \ sd->t.len = word->len; \ } \ @@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) /* Calculate direct hash */ rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES); - for (i = 0; i < part->normalized_words->len; i ++) { - word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i ++) { + word = &g_array_index (part->utf_words, rspamd_stat_token_t, i); rspamd_cryptobox_hash_update (&st, word->begin, word->len); } @@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) sizeof (hexdigest)); lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1); - sgl = rspamd_shingles_from_text (part->normalized_words, key, + sgl = rspamd_shingles_from_text (part->utf_words, key, pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH); if (sgl == NULL) { diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c index 16a8ace0c..e6a6052d4 100644 --- a/src/lua/lua_trie.c +++ b/src/lua/lua_trie.c @@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L) for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->content != NULL) { - text = part->content->data; - len = part->content->len; + if (!IS_PART_EMPTY (part) && part->utf_content != NULL) { + text = part->utf_content->data; + len = part->utf_content->len; if (lua_trie_search_str (L, trie, text, len) != 0) { found = TRUE; diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 3de68e60a..d6095ab52 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -1078,6 +1078,7 @@ lua_util_tokenize_text (lua_State *L) GList *exceptions = NULL, *cur; struct rspamd_lua_text *t; struct rspamd_process_exception *ex; + UText utxt = UTEXT_INITIALIZER; GArray *res; rspamd_stat_token_t *w; @@ -1129,7 +1130,15 @@ lua_util_tokenize_text (lua_State *L) exceptions = g_list_reverse (exceptions); } - res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL, + UErrorCode uc_err = U_ZERO_ERROR; + utext_openUTF8 (&utxt, + in, + len, + &uc_err); + + res = rspamd_tokenize_text ((gchar *)in, len, + &utxt, + RSPAMD_TOKENIZE_UTF, NULL, exceptions, NULL); diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 987879258..f917c26c8 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task, guint i, ncap = 0; gdouble cur_score = 0.0; - if (part == NULL || part->normalized_words == NULL || - part->normalized_words->len == 0) { + if (part == NULL || part->utf_words == NULL || + part->utf_words->len == 0) { return; } - for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i++) { + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { @@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, */ part->capital_letters += ncap; - cur_score /= (gdouble)part->normalized_words->len; + cur_score /= (gdouble)part->utf_words->len; if (cur_score > 2.0) { cur_score = 2.0; @@ -619,7 +619,17 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused) guint i; gdouble cur_score = 0.0; - words = rspamd_tokenize_text (task->subject, strlen (task->subject), + UText utxt = UTEXT_INITIALIZER; + UErrorCode uc_err = U_ZERO_ERROR; + gsize slen = strlen (task->subject); + + utext_openUTF8 (&utxt, + task->subject, + slen, + &uc_err); + + words = rspamd_tokenize_text (task->subject, slen, + &utxt, RSPAMD_TOKENIZE_UTF, NULL, NULL, diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index c0fd8aa4c..bf08c0e46 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud) static GArray * fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool) { - return part->normalized_words; + return part->utf_words; } static void @@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task, rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len); - rspamd_cryptobox_hash_update (&st, part->stripped_content->data, - part->stripped_content->len); + rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data, + part->utf_stripped_content->len); if (task->subject) { /* We also include subject */ @@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, } /* Check length of part */ - fac = rule->ctx->text_multiplier * part->content->len; + fac = rule->ctx->text_multiplier * part->utf_content->len; if ((double)min_bytes > fac) { if (!rule->short_text_direct_hash) { msg_info_task ( @@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, "skip fuzzy check", task->message_id, min_bytes, fac, - part->content->len, + part->utf_content->len, rule->ctx->text_multiplier); continue; } @@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, "use direct hash", task->message_id, min_bytes, fac, - part->content->len, + part->utf_content->len, rule->ctx->text_multiplier); short_text = TRUE; } } - if (part->normalized_words == NULL || - part->normalized_words->len == 0) { + if (part->utf_words == NULL || + part->utf_words->len == 0) { msg_info_task ("<%s>, part hash empty, skip fuzzy check", task->message_id); continue; } if (rule->ctx->min_hash_len != 0 && - part->normalized_words->len < + part->utf_words->len < rule->ctx->min_hash_len) { if (!rule->short_text_direct_hash) { msg_info_task ( diff --git a/src/plugins/lua/antivirus.lua b/src/plugins/lua/antivirus.lua index 37c58bcf8..46ea8c40d 100644 --- a/src/plugins/lua/antivirus.lua +++ b/src/plugins/lua/antivirus.lua @@ -889,10 +889,16 @@ if opts and type(opts) == 'table' then for _, p in ipairs(m['patterns']) do if type(p) == 'table' then for sym in pairs(p) do + rspamd_logger.debugm(N, rspamd_config, 'registering: %1', { + type = 'virtual', + name = sym, + parent = m['symbol'], + parent_id = id, + }) rspamd_config:register_symbol({ type = 'virtual', name = sym, - parent = m['symbol'] + parent = id }) end end diff --git a/src/plugins/lua/arc.lua b/src/plugins/lua/arc.lua index 30ae0cd19..53fb7466a 100644 --- a/src/plugins/lua/arc.lua +++ b/src/plugins/lua/arc.lua @@ -608,7 +608,8 @@ end rspamd_config:register_symbol({ name = settings['sign_symbol'], - callback = arc_signing_cb + callback = arc_signing_cb, + groups = {"policies", "arc"} }) -- Do not sign unless valid diff --git a/src/plugins/lua/dkim_signing.lua b/src/plugins/lua/dkim_signing.lua index 99e1fca68..f9c6ecdb6 100644 --- a/src/plugins/lua/dkim_signing.lua +++ b/src/plugins/lua/dkim_signing.lua @@ -213,5 +213,6 @@ end rspamd_config:register_symbol({ name = settings['symbol'], - callback = dkim_signing_cb + callback = dkim_signing_cb, + groups = {"policies", "dkim"} }) diff --git a/src/rspamadm/confighelp.c b/src/rspamadm/confighelp.c index ff80341ea..d3461489e 100644 --- a/src/rspamadm/confighelp.c +++ b/src/rspamadm/confighelp.c @@ -71,7 +71,7 @@ rspamadm_confighelp_help (gboolean full_help, const struct rspamadm_command *cmd "-P: use specific Lua plugins path\n" "--no-color: disable coloured output\n" "--short: show only option names\n" - "--no-examples: do not show examples (impied by --short)\n" + "--no-examples: do not show examples (implied by --short)\n" "--help: shows available options and commands"; } else { |