about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/lang_detection.c 1116
-rw-r--r-- src/libmime/lang_detection.h 25
-rw-r--r-- src/libmime/message.c 354
-rw-r--r-- src/libmime/message.h 23
-rw-r--r-- src/libmime/mime_encoding.c 50
-rw-r--r-- src/libmime/mime_encoding.h 7
-rw-r--r-- src/libserver/re_cache.c 10
-rw-r--r-- src/libserver/task.c 11
-rw-r--r-- src/libserver/url.c 6
-rw-r--r-- src/libstat/stat_process.c 20
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 429
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 5
-rw-r--r-- src/libutil/logger.c 6
-rw-r--r-- src/lua/lua_mimepart.c 42
-rw-r--r-- src/lua/lua_trie.c 6
-rw-r--r-- src/lua/lua_util.c 11
-rw-r--r-- src/plugins/chartable.c 22
-rw-r--r-- src/plugins/fuzzy_check.c 18
-rw-r--r-- src/plugins/lua/antivirus.lua 8
-rw-r--r-- src/plugins/lua/arc.lua 3
-rw-r--r-- src/plugins/lua/dkim_signing.lua 3
-rw-r--r-- src/rspamadm/confighelp.c 2
22 files changed, 1285 insertions, 892 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 8763365af..d4237690d 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -17,6 +17,7 @@
#include "lang_detection.h"
#include "libutil/logger.h"
#include "libcryptobox/cryptobox.h"
+#include "libutil/multipattern.h"
#include "ucl.h"
#include "khash.h"
#include <glob.h>
@@ -26,7 +27,7 @@
#include <unicode/ustring.h>
#include <math.h>
-static const gsize default_short_text_limit = 200;
+static const gsize default_short_text_limit = 20;
static const gsize default_words = 80;
static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
@@ -42,28 +43,17 @@ struct rspamd_language_unicode_match {
* List of languages detected by unicode scripts
*/
static const struct rspamd_language_unicode_match unicode_langs[] = {
- {"el", UBLOCK_GREEK},
- {"ml", UBLOCK_MALAYALAM},
- {"te", UBLOCK_TELUGU},
- {"ta", UBLOCK_TAMIL},
- {"gu", UBLOCK_GUJARATI},
- {"th", UBLOCK_THAI},
- {"kn", UBLOCK_KANNADA},
- {"ka", UBLOCK_GEORGIAN},
- {"si", UBLOCK_SINHALA},
- {"hy", UBLOCK_ARMENIAN},
- {"lo", UBLOCK_LAO},
- {"km", UBLOCK_KHMER}
-};
-
-/*
- * List of languages to apply unigramms only
- */
-static const gchar *unigramms_langs[] = {
- "ja",
- "ko",
- "zh-CN",
- "zh-TW"
+ {"el", RSPAMD_UNICODE_GREEK},
+ {"ml", RSPAMD_UNICODE_MALAYALAM},
+ {"te", RSPAMD_UNICODE_TELUGU},
+ {"ta", RSPAMD_UNICODE_TAMIL},
+ {"gu", RSPAMD_UNICODE_GUJARATI},
+ {"th", RSPAMD_UNICODE_THAI},
+ {"ka", RSPAMD_UNICODE_GEORGIAN},
+ {"si", RSPAMD_UNICODE_SINHALA},
+ {"hy", RSPAMD_UNICODE_ARMENIAN},
+ {"ja", RSPAMD_UNICODE_JP},
+ {"ko", RSPAMD_UNICODE_HANGUL},
};
/*
@@ -73,24 +63,29 @@ static const gchar *tier0_langs[] = {
"en",
};
static const gchar *tier1_langs[] = {
- "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
- "ko", "pt", "ru", "pl", "tk", "th", "ar"
+ "fr", "it", "de", "es", "nl",
+ "pt", "ru", "pl", "tk", "th", "ar"
};
enum rspamd_language_elt_flags {
RS_LANGUAGE_DEFAULT = 0,
RS_LANGUAGE_LATIN = (1 << 0),
- RS_LANGUAGE_UNISCRIPT = (1 << 1),
- RS_LANGUAGE_UNIGRAMM = (1 << 2),
RS_LANGUAGE_TIER1 = (1 << 3),
RS_LANGUAGE_TIER0 = (1 << 4),
};
+enum rspamd_language_category {
+ RSPAMD_LANGUAGE_LATIN = 0,
+ RSPAMD_LANGUAGE_CYRILLIC,
+ RSPAMD_LANGUAGE_DEVANAGARI,
+ RSPAMD_LANGUAGE_ARAB,
+ RSPAMD_LANGUAGE_MAX,
+};
+
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
- guint ngramms_total;
- guint unigramms_words;
+ enum rspamd_language_category category;
guint trigramms_words;
gdouble mean;
gdouble std;
@@ -109,6 +104,17 @@ struct rspamd_ngramm_chain {
gchar *utf;
};
+struct rspamd_stop_word_range {
+ guint start;
+ guint stop;
+ struct rspamd_language_elt *elt;
+};
+
+struct rspamd_stop_word_elt {
+ struct rspamd_multipattern *mp;
+ GArray *ranges; /* of rspamd_stop_word_range */
+};
+
#define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \
rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
G_STRFUNC, \
@@ -149,18 +155,6 @@ rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
}
static guint
-rspamd_unigram_hash_func (gconstpointer key)
-{
- return rspamd_cryptobox_fast_hash (key, sizeof (UChar), rspamd_hash_seed ());
-}
-
-static gboolean
-rspamd_unigram_equal_func (gconstpointer v, gconstpointer v2)
-{
- return memcmp (v, v2, sizeof (UChar)) == 0;
-}
-
-static guint
rspamd_trigram_hash_func (gconstpointer key)
{
return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ());
@@ -172,8 +166,6 @@ rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2)
return memcmp (v, v2, 3 * sizeof (UChar)) == 0;
}
-KHASH_INIT (rspamd_unigram_hash, const UChar *, struct rspamd_ngramm_chain, true,
- rspamd_unigram_hash_func, rspamd_unigram_equal_func);
KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true,
rspamd_trigram_hash_func, rspamd_trigram_equal_func);
KHASH_INIT (rspamd_candidates_hash, const gchar *,
@@ -182,9 +174,8 @@ KHASH_INIT (rspamd_candidates_hash, const gchar *,
struct rspamd_lang_detector {
GPtrArray *languages;
- khash_t(rspamd_unigram_hash) *unigramms; /* unigramms frequencies */
- khash_t(rspamd_trigram_hash) *trigramms; /* trigramms frequencies */
- GHashTable *unicode_scripts; /* indexed by unicode script */
+ khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */
+ struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
UConverter *uchar_converter;
gsize short_text_limit;
gsize total_occurencies; /* number of all languages found */
@@ -226,9 +217,13 @@ struct rspamd_language_ucs_elt {
static void
rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
- struct rspamd_lang_detector *d,
- struct rspamd_language_elt *lelt,
- struct rspamd_language_ucs_elt *ucs, guint len, guint freq, guint total)
+ struct rspamd_lang_detector *d,
+ struct rspamd_language_elt *lelt,
+ struct rspamd_language_ucs_elt *ucs,
+ guint len,
+ guint freq,
+ guint total,
+ khash_t (rspamd_trigram_hash) *htb)
{
struct rspamd_ngramm_chain *chain = NULL, st_chain;
struct rspamd_ngramm_elt *elt;
@@ -238,18 +233,13 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
switch (len) {
case 1:
- k = kh_get (rspamd_unigram_hash, d->unigramms, ucs->s);
- if (k != kh_end (d->unigramms)) {
- chain = &kh_value (d->unigramms, k);
- }
- break;
case 2:
g_assert_not_reached ();
break;
case 3:
- k = kh_get (rspamd_trigram_hash, d->trigramms, ucs->s);
- if (k != kh_end (d->trigramms)) {
- chain = &kh_value (d->trigramms, k);
+ k = kh_get (rspamd_trigram_hash, htb, ucs->s);
+ if (k != kh_end (htb)) {
+ chain = &kh_value (htb, k);
}
break;
default:
@@ -270,14 +260,8 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
elt->prob = ((gdouble)freq) / ((gdouble)total);
g_ptr_array_add (chain->languages, elt);
- if (len == 1) {
- k = kh_put (rspamd_unigram_hash, d->unigramms, ucs->s, &i);
- kh_value (d->unigramms, k) = *chain;
- }
- else {
- k = kh_put (rspamd_trigram_hash, d->trigramms, ucs->s, &i);
- kh_value (d->trigramms, k) = *chain;
- }
+ k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i);
+ kh_value (htb, k) = *chain;
}
else {
/* Check sanity */
@@ -300,6 +284,23 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
}
}
+static inline enum rspamd_language_category
+rspamd_language_detector_get_category (guint uflags)
+{
+ enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
+
+ if (uflags & RSPAMD_UNICODE_CYRILLIC) {
+ cat = RSPAMD_LANGUAGE_CYRILLIC;
+ }
+ else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
+ cat = RSPAMD_LANGUAGE_DEVANAGARI;
+ }
+ else if (uflags & RSPAMD_UNICODE_ARABIC) {
+ cat = RSPAMD_LANGUAGE_ARAB;
+ }
+
+ return cat;
+}
static const gchar *
rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
@@ -307,9 +308,6 @@ rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
static gchar flags_buf[256];
goffset r = 0;
- if (elt->flags & RS_LANGUAGE_UNIGRAMM) {
- r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "unigrams,");
- }
if (elt->flags & RS_LANGUAGE_TIER1) {
r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,");
}
@@ -342,19 +340,22 @@ rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
static void
rspamd_language_detector_read_file (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
- const gchar *path)
+ const gchar *path,
+ const ucl_object_t *stop_words)
{
struct ucl_parser *parser;
ucl_object_t *top;
- const ucl_object_t *freqs, *n_words, *cur;
+ const ucl_object_t *freqs, *n_words, *cur, *type;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
- const struct rspamd_language_unicode_match *uc_match;
struct rspamd_language_ucs_elt *ucs_elt;
+ khash_t (rspamd_trigram_hash) *htb = NULL;
gchar *pos;
- guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, loaded;
+ guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
+ loaded, nstop = 0;
gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
+ enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
if (!ucl_parser_add_file (parser, path)) {
@@ -396,141 +397,181 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
return;
}
else {
- nelt->unigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
- 0));
nelt->trigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
2));
}
- if ((uc_match = rspamd_language_search_unicode_match (nelt->name, unicode_langs,
- G_N_ELEMENTS (unicode_langs))) != NULL) {
- g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code,
- nelt);
- nelt->flags |= RS_LANGUAGE_UNISCRIPT;
- msg_info_config ("loaded unicode script only %s language: %d",
- nelt->name,
- uc_match->unicode_code);
+ type = ucl_object_lookup (top, "type");
+
+ if (type == NULL || ucl_object_type (type) != UCL_STRING) {
+ msg_warn_config ("cannot find type in language %s", nelt->name);
+ ucl_object_unref (top);
+
+ return;
}
else {
- GPtrArray *ngramms;
- guint nsym;
+ const gchar *stype = ucl_object_tostring (type);
- if (rspamd_language_search_str (nelt->name, unigramms_langs,
- G_N_ELEMENTS (unigramms_langs))) {
- nelt->flags |= RS_LANGUAGE_UNIGRAMM;
+ if (strcmp (stype, "latin") == 0) {
+ cat = RSPAMD_LANGUAGE_LATIN;
+ }
+ else if (strcmp (stype, "cyrillic") == 0) {
+ cat = RSPAMD_LANGUAGE_CYRILLIC;
}
+ else if (strcmp (stype, "arab") == 0) {
+ cat = RSPAMD_LANGUAGE_ARAB;
+ }
+ else if (strcmp (stype, "devanagari") == 0) {
+ cat = RSPAMD_LANGUAGE_DEVANAGARI;
+ }
+ else {
+ msg_warn_config ("unknown type %s of language %s", stype, nelt->name);
+ ucl_object_unref (top);
- if (rspamd_language_search_str (nelt->name, tier1_langs,
- G_N_ELEMENTS (tier1_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER1;
+ return;
}
+ }
+
+ if (stop_words) {
+ const ucl_object_t *specific_stop_words;
+
+ specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
+
+ if (specific_stop_words) {
+ it = NULL;
+ const ucl_object_t *w;
+ guint start, stop;
+
+ start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
+
+ while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
+ rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
+ ucl_object_tostring (w), 0);
+ nstop ++;
+ }
- if (rspamd_language_search_str (nelt->name, tier0_langs,
- G_N_ELEMENTS (tier0_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER0;
+ stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
+
+ struct rspamd_stop_word_range r;
+
+ r.start = start;
+ r.stop = stop;
+ r.elt = nelt;
+
+ g_array_append_val (d->stop_words[cat].ranges, r);
+ it = NULL;
}
+ }
- it = NULL;
- ngramms = g_ptr_array_sized_new (freqs->len);
- i = 0;
- skipped = 0;
- loaded = 0;
+ nelt->category = cat;
+ htb = d->trigramms[cat];
- while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
- const gchar *key;
- gsize keylen;
- guint freq;
+ GPtrArray *ngramms;
+ guint nsym;
- key = ucl_object_keyl (cur, &keylen);
- freq = ucl_object_toint (cur);
+ if (rspamd_language_search_str (nelt->name, tier1_langs,
+ G_N_ELEMENTS (tier1_langs))) {
+ nelt->flags |= RS_LANGUAGE_TIER1;
+ }
- i ++;
- delta = freq - mean;
- mean += delta / i;
- delta2 = freq - mean;
- m2 += delta * delta2;
+ if (rspamd_language_search_str (nelt->name, tier0_langs,
+ G_N_ELEMENTS (tier0_langs))) {
+ nelt->flags |= RS_LANGUAGE_TIER0;
+ }
- if (key != NULL) {
- ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
- sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
+ it = NULL;
+ ngramms = g_ptr_array_sized_new (freqs->len);
+ i = 0;
+ skipped = 0;
+ loaded = 0;
- nsym = ucnv_toUChars (d->uchar_converter,
- ucs_elt->s, keylen + 1,
- key,
- keylen, &uc_err);
- ucs_elt->utf = key;
+ while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
+ const gchar *key;
+ gsize keylen;
+ guint freq;
- if (uc_err != U_ZERO_ERROR) {
- msg_warn_config ("cannot convert key to unicode: %s",
- u_errorName (uc_err));
+ key = ucl_object_keyl (cur, &keylen);
+ freq = ucl_object_toint (cur);
- continue;
- }
+ i ++;
+ delta = freq - mean;
+ mean += delta / i;
+ delta2 = freq - mean;
+ m2 += delta * delta2;
- rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
+ if (key != NULL) {
+ ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
+ sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
- if (nsym == 3 && !(nelt->flags & RS_LANGUAGE_UNIGRAMM)) {
- g_ptr_array_add (ngramms, ucs_elt);
- }
- else if (nsym == 1 && nelt->flags & RS_LANGUAGE_UNIGRAMM) {
- g_ptr_array_add (ngramms, ucs_elt);
- }
- else {
- continue;
- }
+ nsym = ucnv_toUChars (d->uchar_converter,
+ ucs_elt->s, keylen + 1,
+ key,
+ keylen, &uc_err);
+ ucs_elt->utf = key;
- if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
- total_latin++;
- }
+ if (uc_err != U_ZERO_ERROR) {
+ msg_warn_config ("cannot convert key to unicode: %s",
+ u_errorName (uc_err));
+
+ continue;
+ }
- ucs_elt->freq = freq;
+ rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
- total_ngramms++;
+ if (nsym == 3) {
+ g_ptr_array_add (ngramms, ucs_elt);
+ }
+ else {
+ continue;
}
- }
- std = sqrt (m2 / (i - 1));
+ if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ total_latin++;
+ }
- if (total_latin >= total_ngramms / 3) {
- nelt->flags |= RS_LANGUAGE_LATIN;
- }
+ ucs_elt->freq = freq;
- if (nelt->flags & RS_LANGUAGE_UNIGRAMM) {
- nsym = 1;
- }
- else {
- nsym = 3;
+ total_ngramms++;
}
+ }
- total = 0;
- PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
+ std = sqrt (m2 / (i - 1));
- if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
- rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
- ucs_elt->freq = 0;
- /* Skip latin ngramm for non-latin language to avoid garbadge */
- skipped ++;
- continue;
- }
+ if (total_latin >= total_ngramms / 3) {
+ nelt->flags |= RS_LANGUAGE_LATIN;
+ }
+
+ nsym = 3;
- /* Now, discriminate low frequency ngramms */
+ total = 0;
+ PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
- total += ucs_elt->freq;
- loaded ++;
+ if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
+ rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ ucs_elt->freq = 0;
+ /* Skip latin ngramm for non-latin language to avoid garbadge */
+ skipped ++;
+ continue;
}
- g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+ /* Now, discriminate low frequency ngramms */
- PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
- if (ucs_elt->freq > 0) {
- rspamd_language_detector_init_ngramm (cfg, d,
- nelt, ucs_elt, nsym,
- ucs_elt->freq, total);
- }
+ total += ucs_elt->freq;
+ loaded ++;
+ }
+
+ g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+
+ PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
+ if (ucs_elt->freq > 0) {
+ rspamd_language_detector_init_ngramm (cfg, d,
+ nelt, ucs_elt, nsym,
+ ucs_elt->freq, total, htb);
}
+ }
#ifdef EXTRA_LANGDET_DEBUG
- /* Useful for debug */
+ /* Useful for debug */
for (i = 0; i < 10; i ++) {
ucs_elt = g_ptr_array_index (ngramms, i);
@@ -539,22 +580,20 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
#endif
- g_ptr_array_free (ngramms, TRUE);
- nelt->mean = mean;
- nelt->std = std;
- nelt->ngramms_total = total;
- msg_info_config ("loaded %s language, %d unigramms, %d trigramms, "
- "%d ngramms loaded; "
- "std=%.2f, mean=%.2f, skipped=%d, loaded=%d; "
- "(%s)",
- nelt->name,
- (gint)nelt->unigramms_words,
- (gint)nelt->trigramms_words,
- total,
- std, mean,
- skipped, loaded,
- rspamd_language_detector_print_flags (nelt));
- }
+ g_ptr_array_free (ngramms, TRUE);
+ nelt->mean = mean;
+ nelt->std = std;
+
+ msg_info_config ("loaded %s language, %d trigramms, "
+ "%d ngramms loaded; "
+ "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
+ "(%s)",
+ nelt->name,
+ (gint)nelt->trigramms_words,
+ total,
+ std, mean,
+ skipped, loaded, nstop,
+ rspamd_language_detector_print_flags (nelt));
g_ptr_array_add (d->languages, nelt);
ucl_object_unref (top);
@@ -631,16 +670,10 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
ucnv_close (d->uchar_converter);
}
- if (d->unicode_scripts) {
- g_hash_table_unref (d->unicode_scripts);
- }
-
- if (d->unigramms) {
- kh_destroy (rspamd_unigram_hash, d->unigramms);
- }
-
- if (d->trigramms) {
- kh_destroy (rspamd_trigram_hash, d->trigramms);
+ for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ kh_destroy (rspamd_trigram_hash, d->trigramms[i]);
+ rspamd_multipattern_destroy (d->stop_words[i].mp);
+ g_array_free (d->stop_words[i].ranges, TRUE);
}
if (d->languages) {
@@ -656,12 +689,14 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
*languages_disable = NULL;
const gchar *languages_path = default_languages_path;
glob_t gl;
- size_t i, short_text_limit = default_short_text_limit;
+ size_t i, short_text_limit = default_short_text_limit, total = 0;
UErrorCode uc_err = U_ZERO_ERROR;
GString *languages_pattern;
struct rspamd_ngramm_chain *chain, schain;
gchar *fname;
struct rspamd_lang_detector *ret = NULL;
+ struct ucl_parser *parser;
+ ucl_object_t *stop_words;
section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
@@ -683,6 +718,22 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
}
languages_pattern = g_string_sized_new (PATH_MAX);
+ rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path);
+ parser = ucl_parser_new (UCL_PARSER_DEFAULT);
+
+ if (ucl_parser_add_file (parser, languages_pattern->str)) {
+ stop_words = ucl_parser_get_object (parser);
+ }
+ else {
+ msg_err_config ("cannot read stop words from %s: %s",
+ languages_pattern->str,
+ ucl_parser_get_error (parser));
+ stop_words = NULL;
+ }
+
+ ucl_parser_free (parser);
+ languages_pattern->len = 0;
+
rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path);
memset (&gl, 0, sizeof (gl));
@@ -696,9 +747,13 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
ret->short_text_limit = short_text_limit;
/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
- ret->unigramms = kh_init (rspamd_unigram_hash);
- ret->trigramms = kh_init (rspamd_trigram_hash);
- ret->unicode_scripts = g_hash_table_new (g_int_hash, g_int_equal);
+ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ ret->trigramms[i] = kh_init (rspamd_trigram_hash);
+ ret->stop_words[i].mp = rspamd_multipattern_create (
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+ ret->stop_words[i].ranges = g_array_new (FALSE, FALSE,
+ sizeof (struct rspamd_stop_word_range));
+ }
g_assert (uc_err == U_ZERO_ERROR);
@@ -708,7 +763,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
if (!rspamd_ucl_array_find_str (fname, languages_disable) ||
(languages_enable == NULL ||
rspamd_ucl_array_find_str (fname, languages_enable))) {
- rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]);
+ rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i],
+ stop_words);
}
else {
msg_info_config ("skip language file %s: disabled", fname);
@@ -717,18 +773,27 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
g_free (fname);
}
- kh_foreach_value (ret->trigramms, schain, {
- chain = &schain;
- rspamd_language_detector_process_chain (cfg, chain);
- });
+ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ GError *err = NULL;
+
+ kh_foreach_value (ret->trigramms[i], schain, {
+ chain = &schain;
+ rspamd_language_detector_process_chain (cfg, chain);
+ });
+
+ if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) {
+ msg_err_config ("cannot compile stop words for %d language group: %e",
+ i, err);
+ g_error_free (err);
+ }
+
+ total += kh_size (ret->trigramms[i]);
+ }
- msg_info_config ("loaded %d languages, %d unicode only languages, "
- "%d unigramms, "
+ msg_info_config ("loaded %d languages, "
"%d trigramms",
(gint)ret->languages->len,
- (gint)g_hash_table_size (ret->unicode_scripts),
- (gint)kh_size (ret->unigramms),
- (gint)kh_size (ret->trigramms));
+ (gint)total);
REF_INIT_RETAIN (ret, rspamd_language_detector_dtor);
rspamd_mempool_add_destructor (cfg->cfg_pool,
@@ -859,11 +924,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
#endif
}
-enum rspamd_language_gramm_type {
- rs_unigramm = 0,
- rs_trigramm
-};
-
static goffset
rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
guint wlen, goffset cur_off)
@@ -914,9 +974,10 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
*/
static void
rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- UChar *window, enum rspamd_language_gramm_type type,
- khash_t(rspamd_candidates_hash) *candidates)
+ struct rspamd_lang_detector *d,
+ UChar *window,
+ khash_t(rspamd_candidates_hash) *candidates,
+ khash_t(rspamd_trigram_hash) *trigramms)
{
guint i;
gint ret;
@@ -926,19 +987,9 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
khiter_t k;
gdouble prob;
- switch (type) {
- case rs_unigramm:
- k = kh_get (rspamd_unigram_hash, d->unigramms, window);
- if (k != kh_end (d->unigramms)) {
- chain = &kh_value (d->unigramms, k);
- }
- break;
- case rs_trigramm:
- k = kh_get (rspamd_trigram_hash, d->trigramms, window);
- if (k != kh_end (d->trigramms)) {
- chain = &kh_value (d->trigramms, k);
- }
- break;
+ k = kh_get (rspamd_trigram_hash, trigramms, window);
+ if (k != kh_end (trigramms)) {
+ chain = &kh_value (trigramms, k);
}
if (chain) {
@@ -980,29 +1031,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
static void
rspamd_language_detector_detect_word (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- rspamd_stat_token_t *tok,
- khash_t(rspamd_candidates_hash) *candidates,
- enum rspamd_language_gramm_type type)
+ struct rspamd_lang_detector *d,
+ rspamd_stat_token_t *tok,
+ khash_t(rspamd_candidates_hash) *candidates,
+ khash_t(rspamd_trigram_hash) *trigramms)
{
- guint wlen;
+ const guint wlen = 3;
UChar window[3];
goffset cur = 0;
- switch (type) {
- case rs_unigramm:
- wlen = 1;
- break;
- case rs_trigramm:
- wlen = 3;
- break;
- }
-
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
rspamd_language_detector_process_ngramm_full (task,
- d, window, type, candidates);
+ d, window, candidates, trigramms);
}
}
@@ -1074,113 +1116,35 @@ rspamd_language_detector_filter_negligible (struct rspamd_task *task,
msg_debug_lang_det ("removed %d languages", filtered);
}
-static gboolean
-rspamd_language_detector_is_unicode (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- goffset *selected_words,
- gsize nparts,
- khash_t(rspamd_candidates_hash) *candidates)
-{
- guint i, j, total_found = 0, total_checked = 0;
- rspamd_stat_token_t *tok;
- UChar t;
- gint uc_script, ret;
- khint_t k;
- struct rspamd_language_elt *elt;
- struct rspamd_lang_detector_res *cand;
-
- for (i = 0; i < nparts; i++) {
- tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
- selected_words[i]);
-
- for (j = 0; j < tok->len; j ++) {
- t = *(((UChar *)tok->begin) + j);
-
- uc_script = ublock_getCode (t);
- elt = g_hash_table_lookup (d->unicode_scripts, &uc_script);
-
- if (elt) {
- k = kh_get (rspamd_candidates_hash, candidates, elt->name);
- if (k != kh_end (candidates)) {
- cand = kh_value (candidates, k);
- }
- else {
- cand = NULL;
- }
-
- if (cand == NULL) {
- cand = rspamd_mempool_alloc (task->task_pool,
- sizeof (*cand));
- cand->elt = elt;
- cand->lang = elt->name;
- cand->prob = 1;
-
- k = kh_put (rspamd_candidates_hash, candidates, elt->name, &ret);
- kh_value (candidates, k) = cand;
- } else {
- /* Update guess */
- cand->prob ++;
- }
-
- total_found ++;
- }
-
- total_checked ++;
- }
-
- if (i >= nparts / 2 && total_found == 0) {
- /* No special scripts found, stop processing */
- return FALSE;
- }
- }
-
- if (total_found < total_checked / 2) {
- /* Not enough confidence */
- return FALSE;
- }
- else {
- /* Filter candidates */
- kh_foreach_value (candidates, cand, {
- cand->prob = cand->prob / total_checked;
- });
- }
-
- return TRUE;
-}
-
static void
rspamd_language_detector_detect_type (struct rspamd_task *task,
- guint nwords,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- khash_t(rspamd_candidates_hash) *candidates,
- enum rspamd_language_gramm_type type) {
- guint nparts = MIN (ucs_tokens->len, nwords);
+ guint nwords,
+ struct rspamd_lang_detector *d,
+ GArray *words,
+ enum rspamd_language_category cat,
+ khash_t(rspamd_candidates_hash) *candidates)
+{
+ guint nparts = MIN (words->len, nwords);
goffset *selected_words;
- rspamd_stat_token_t *tok;
+ rspamd_stat_token_t *tok, ucs_w;
guint i;
selected_words = g_new0 (goffset, nparts);
- rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
+ rspamd_language_detector_random_select (words, nparts, selected_words);
msg_debug_lang_det ("randomly selected %d words", nparts);
- /* Check unicode scripts */
- if (kh_size (candidates) != 0 ||
- !rspamd_language_detector_is_unicode (task, d, ucs_tokens,
- selected_words, nparts, candidates)) {
-
- for (i = 0; i < nparts; i++) {
- tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
- selected_words[i]);
- rspamd_language_detector_detect_word (task, d, tok, candidates,
- type);
- }
-
- /* Filter negligible candidates */
- rspamd_language_detector_filter_negligible (task, candidates);
+ for (i = 0; i < nparts; i++) {
+ tok = &g_array_index (words, rspamd_stat_token_t,
+ selected_words[i]);
+ rspamd_language_detector_to_ucs (task->lang_det,
+ task->task_pool,
+ tok, &ucs_w);
+ rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+ d->trigramms[cat]);
}
+ /* Filter negligible candidates */
+ rspamd_language_detector_filter_negligible (task, candidates);
g_free (selected_words);
}
@@ -1209,11 +1173,11 @@ enum rspamd_language_detected_type {
static enum rspamd_language_detected_type
rspamd_language_detector_try_ngramm (struct rspamd_task *task,
- guint nwords,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- enum rspamd_language_gramm_type type,
- khash_t(rspamd_candidates_hash) *candidates)
+ guint nwords,
+ struct rspamd_lang_detector *d,
+ GArray *ucs_tokens,
+ enum rspamd_language_category cat,
+ khash_t(rspamd_candidates_hash) *candidates)
{
guint cand_len = 0;
struct rspamd_lang_detector_res *cand;
@@ -1222,8 +1186,8 @@ rspamd_language_detector_try_ngramm (struct rspamd_task *task,
nwords,
d,
ucs_tokens,
- candidates,
- type);
+ cat,
+ candidates);
kh_foreach_value (candidates, cand, {
if (!isnan (cand->prob)) {
@@ -1320,117 +1284,429 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
return 0;
}
-GPtrArray *
+static void
+rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ const gchar *p = part->utf_stripped_content->data, *end;
+ guint i = 0;
+ end = p + part->utf_stripped_content->len;
+ gint32 uc, sc;
+ guint nlatin = 0, nchinese = 0, nspecial = 0;
+
+ while (p + i < end) {
+ U8_NEXT (p, i, part->utf_stripped_content->len, uc);
+
+ if (((gint32) uc) < 0) {
+ break;
+ }
+
+ if (u_isalpha (uc)) {
+ sc = ublock_getCode (uc);
+
+ switch (sc) {
+ case UBLOCK_BASIC_LATIN:
+ case UBLOCK_LATIN_1_SUPPLEMENT:
+ part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
+ nlatin ++;
+ break;
+ case UBLOCK_HEBREW:
+ part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
+ nspecial ++;
+ break;
+ case UBLOCK_GREEK:
+ part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
+ nspecial ++;
+ break;
+ case UBLOCK_CYRILLIC:
+ part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
+ nspecial ++;
+ break;
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
+ case UBLOCK_CJK_COMPATIBILITY:
+ case UBLOCK_CJK_RADICALS_SUPPLEMENT:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
+ part->unicode_scripts |= RSPAMD_UNICODE_CJK;
+ nchinese ++;
+ break;
+ case UBLOCK_HIRAGANA:
+ case UBLOCK_KATAKANA:
+ part->unicode_scripts |= RSPAMD_UNICODE_JP;
+ nspecial ++;
+ break;
+ case UBLOCK_HANGUL_JAMO:
+ case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
+ part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
+ nspecial ++;
+ break;
+ case UBLOCK_ARABIC:
+ part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
+ nspecial ++;
+ break;
+ case UBLOCK_DEVANAGARI:
+ part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
+ nspecial ++;
+ break;
+ case UBLOCK_ARMENIAN:
+ part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
+ nspecial ++;
+ break;
+ case UBLOCK_GEORGIAN:
+ part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
+ nspecial ++;
+ break;
+ case UBLOCK_GUJARATI:
+ part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
+ nspecial ++;
+ break;
+ case UBLOCK_TELUGU:
+ part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
+ nspecial ++;
+ break;
+ case UBLOCK_TAMIL:
+ part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
+ nspecial ++;
+ break;
+ case UBLOCK_THAI:
+ part->unicode_scripts |= RSPAMD_UNICODE_THAI;
+ nspecial ++;
+ break;
+ case RSPAMD_UNICODE_MALAYALAM:
+ part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
+ nspecial ++;
+ break;
+ case RSPAMD_UNICODE_SINHALA:
+ part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
+ nspecial ++;
+ break;
+ }
+ }
+
+ if (nspecial > 6 && nspecial > nlatin) {
+ break;
+ }
+ else if (nchinese > 6 && nchinese > nlatin) {
+ if (nspecial > 0) {
+ /* Likely japanese */
+ break;
+ }
+ }
+ }
+
+ msg_debug_lang_det ("stop after checking %d characters, "
+ "%d latin, %d special, %d chinese",
+ i, nlatin, nspecial, nchinese);
+}
+
+static inline void
+rspamd_language_detector_set_language (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ const gchar *code)
+{
+ struct rspamd_lang_detector_res *r;
+
+ r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r));
+ r->prob = 1.0;
+ r->lang = code;
+
+ part->languages = g_ptr_array_sized_new (1);
+ g_ptr_array_add (part->languages, r);
+ part->language = code;
+}
+
+static gboolean
+rspamd_language_detector_try_uniscript (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ guint i;
+
+ for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) {
+ if (unicode_langs[i].unicode_code & part->unicode_scripts) {
+ msg_debug_lang_det ("set language based on unicode script %s",
+ unicode_langs[i].lang);
+ rspamd_language_detector_set_language (task, part,
+ unicode_langs[i].lang);
+
+ return TRUE;
+ }
+ }
+
+ if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
+ rspamd_language_detector_set_language (task, part,
+ "zh-CN");
+
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+
+KHASH_MAP_INIT_STR (rspamd_sw_hash, int);
+
+struct rspamd_sw_cbdata {
+ khash_t (rspamd_sw_hash) *res;
+ GArray *ranges;
+};
+
+static gint
+rspamd_ranges_cmp (const void *k, const void *memb)
+{
+ gint pos = GPOINTER_TO_INT (k);
+ const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb;
+
+ if (pos >= r->start && pos < r->stop) {
+ return 0;
+ }
+ else if (pos < r->start) {
+ return -1;
+ }
+
+ return 1;
+}
+
+static gint
+rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
+{
+ /* Check if boundary */
+ const gchar *prev, *next;
+ struct rspamd_stop_word_range *r;
+ struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context;
+ khiter_t k;
+
+ if (match_start > 0) {
+ prev = text + match_start - 1;
+
+ if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) {
+ return 0;
+ }
+ }
+ else if (match_pos < len) {
+ next = text + match_pos + 1;
+
+ if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) {
+ return 0;
+ }
+ }
+
+ /* We have a word on the boundary, check range */
+ r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data,
+ cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp);
+
+ g_assert (r != NULL);
+
+ k = kh_get (rspamd_sw_hash, cbdata->res, r->elt->name);
+
+ if (k != kh_end (cbdata->res)) {
+ kh_value (cbdata->res, k) ++;
+ }
+ else {
+ gint tt;
+
+ k = kh_put (rspamd_sw_hash, cbdata->res, r->elt->name, &tt);
+ kh_value (cbdata->res, k) = 1;
+ }
+
+ return 0;
+}
+
+static gboolean
+rspamd_language_detector_try_stop_words (struct rspamd_task *task,
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_language_category cat)
+{
+ struct rspamd_stop_word_elt *elt;
+ struct rspamd_sw_cbdata cbdata;
+ gboolean ret = FALSE;
+
+ elt = &d->stop_words[cat];
+ cbdata.res = kh_init (rspamd_sw_hash);
+ cbdata.ranges = elt->ranges;
+
+ rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data,
+ part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
+ &cbdata, NULL);
+
+ if (kh_size (cbdata.res) > 0) {
+ gint max = G_MININT, cur_matches;
+ const gchar *sel = NULL, *cur_lang;
+
+ kh_foreach (cbdata.res, cur_lang, cur_matches, {
+ if (cur_matches > max) {
+ max = cur_matches;
+ sel = cur_lang;
+ }
+ });
+
+ if (max > 0 && sel) {
+ msg_debug_lang_det ("set language based on stop words script %s, %d found",
+ sel, max);
+ rspamd_language_detector_set_language (task, part,
+ sel);
+
+ ret = TRUE;
+ }
+ }
+
+ kh_destroy (rspamd_sw_hash, cbdata.res);
+
+ return ret;
+}
+
+gboolean
rspamd_language_detector_detect (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens, gsize words_len)
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part)
{
khash_t(rspamd_candidates_hash) *candidates;
GPtrArray *result;
gdouble mean, std, start_ticks, end_ticks;
guint cand_len;
+ enum rspamd_language_category cat;
struct rspamd_lang_detector_res *cand;
enum rspamd_language_detected_type r;
struct rspamd_frequency_sort_cbdata cbd;
/* Check if we have sorted candidates based on frequency */
- gboolean frequency_heuristic_applied = FALSE;
+ gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
- if (ucs_tokens->len == 0) {
- return g_ptr_array_new ();
+ if (!part->utf_stripped_content) {
+ return FALSE;
}
start_ticks = rspamd_get_ticks (TRUE);
- candidates = kh_init (rspamd_candidates_hash);
- kh_resize (rspamd_candidates_hash, candidates, 32);
- r = rspamd_language_detector_try_ngramm (task, default_words, d,
- ucs_tokens, rs_trigramm,
- candidates);
+ rspamd_language_detector_unicode_scripts (task, part);
+ /* Apply unicode scripts heuristic */
- if (r == rs_detect_none) {
- msg_debug_lang_det ("no trigramms found, switch to unigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words,
- d, ucs_tokens, rs_unigramm,
- candidates);
+ if (rspamd_language_detector_try_uniscript (task, part)) {
+ ret = TRUE;
}
- else if (r == rs_detect_multiple) {
- /* Check our guess */
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
+ cat = rspamd_language_detector_get_category (part->unicode_scripts);
+
+ if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) {
+ ret = TRUE;
+ }
- /* Check distirbution */
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- mean += cand->prob;
- cand_len ++;
+ if (!ret) {
+ if (part->utf_words->len < default_short_text_limit) {
+ r = rs_detect_none;
+ msg_debug_lang_det ("text is too short for trigramms detection: "
+ "%d words; at least %d words required",
+ (int)part->utf_words->len,
+ (int)default_short_text_limit);
+ rspamd_language_detector_set_language (task, part, "en");
+ candidates = kh_init (rspamd_candidates_hash);
+ }
+ else {
+ candidates = kh_init (rspamd_candidates_hash);
+ kh_resize (rspamd_candidates_hash, candidates, 32);
+
+ r = rspamd_language_detector_try_ngramm (task,
+ default_words,
+ d,
+ part->utf_words,
+ cat,
+ candidates);
+
+ if (r == rs_detect_none) {
+ msg_debug_lang_det ("no trigramms found, fallback to english");
+ rspamd_language_detector_set_language (task, part, "en");
+ } else if (r == rs_detect_multiple) {
+ /* Check our guess */
+
+ mean = 0.0;
+ std = 0.0;
+ cand_len = 0;
+
+ /* Check distirbution */
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ mean += cand->prob;
+ cand_len++;
+ }
+ });
+
+ if (cand_len > 0) {
+ mean /= cand_len;
+
+ kh_foreach_value (candidates, cand, {
+ gdouble err;
+ if (!isnan (cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs (err);
+ }
+ });
+
+ std /= cand_len;
+ }
+
+ msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
+
+ if (cand_len > 0 && std / fabs (mean) < 0.25) {
+ msg_debug_lang_det ("apply frequency heuristic sorting");
+ frequency_heuristic_applied = TRUE;
+ cbd.d = d;
+ cbd.mean = mean;
+ cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+ if (part->utf_words->len < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
+ }
}
- });
+ }
- if (cand_len > 0) {
- mean /= cand_len;
+ /* Now, convert hash to array and sort it */
+ if (r != rs_detect_none && kh_size (candidates) > 0) {
+ result = g_ptr_array_sized_new (kh_size (candidates));
kh_foreach_value (candidates, cand, {
- gdouble err;
if (!isnan (cand->prob)) {
- err = cand->prob - mean;
- std += fabs (err);
+ msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+ cand->prob);
+ g_ptr_array_add (result, cand);
}
});
- std /= cand_len;
- }
-
- msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
-
- if (cand_len > 0 && std / fabs (mean) < 0.25) {
- msg_debug_lang_det ("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
-
- if (ucs_tokens->len < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ if (frequency_heuristic_applied) {
+ g_ptr_array_sort_with_data (result,
+ rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+ } else {
+ g_ptr_array_sort (result, rspamd_language_detector_cmp);
}
- }
- }
- /* Now, convert hash to array and sort it */
- result = g_ptr_array_sized_new (kh_size (candidates));
+ if (result->len > 0 && !frequency_heuristic_applied) {
+ cand = g_ptr_array_index (result, 0);
+ cand->elt->occurencies++;
+ d->total_occurencies++;
+ }
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
- cand->prob);
- g_ptr_array_add (result, cand);
+ part->languages = result;
+ ret = TRUE;
+ }
+ else if (part->languages == NULL) {
+ rspamd_language_detector_set_language (task, part, "en");
}
- });
-
- if (frequency_heuristic_applied) {
- g_ptr_array_sort_with_data (result,
- rspamd_language_detector_cmp_heuristic, (gpointer)&cbd);
- }
- else {
- g_ptr_array_sort (result, rspamd_language_detector_cmp);
- }
-
- kh_destroy (rspamd_candidates_hash, candidates);
- if (result->len > 0 && !frequency_heuristic_applied) {
- cand = g_ptr_array_index (result, 0);
- cand->elt->occurencies ++;
- d->total_occurencies ++;
+ kh_destroy (rspamd_candidates_hash, candidates);
}
end_ticks = rspamd_get_ticks (TRUE);
msg_debug_lang_det ("detected languages in %.0f ticks",
(end_ticks - start_ticks));
- return result;
+ return ret;
}
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 2d28ec65a..50fe19b6e 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -20,11 +20,32 @@
#include "config.h"
#include "libserver/cfg_file.h"
#include "libstat/stat_api.h"
+#include "libmime/message.h"
struct rspamd_lang_detector;
struct rspamd_language_elt;
struct rspamd_task;
+/**
+ * Flags for unicode scripts. Each value is a distinct power of two, so
+ * several scripts can be OR-ed together into a single bitmask.
+ */
+enum rspamd_unicode_scripts {
+	RSPAMD_UNICODE_LATIN      = (1u << 0),
+	RSPAMD_UNICODE_GREEK      = (1u << 1),
+	RSPAMD_UNICODE_CYRILLIC   = (1u << 2),
+	RSPAMD_UNICODE_HEBREW     = (1u << 3),
+	RSPAMD_UNICODE_CJK        = (1u << 4),
+	RSPAMD_UNICODE_JP         = (1u << 5),
+	RSPAMD_UNICODE_ARABIC     = (1u << 6),
+	RSPAMD_UNICODE_DEVANAGARI = (1u << 7),
+	RSPAMD_UNICODE_THAI       = (1u << 8),
+	RSPAMD_UNICODE_ARMENIAN   = (1u << 9),
+	RSPAMD_UNICODE_GEORGIAN   = (1u << 10),
+	RSPAMD_UNICODE_GUJARATI   = (1u << 11),
+	RSPAMD_UNICODE_TAMIL      = (1u << 12),
+	RSPAMD_UNICODE_TELUGU     = (1u << 13),
+	RSPAMD_UNICODE_MALAYALAM  = (1u << 14),
+	RSPAMD_UNICODE_SINHALA    = (1u << 15),
+	RSPAMD_UNICODE_HANGUL     = (1u << 16),
+};
+
struct rspamd_lang_detector_res {
gdouble prob;
const gchar *lang;
@@ -59,8 +80,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
* @param words_len
* @return array of struct rspamd_lang_detector_res sorted by freq descending
*/
-GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
+gboolean rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
- GArray *ucs_tokens, gsize words_len);
+ struct rspamd_mime_text_part *part);
#endif
diff --git a/src/libmime/message.c b/src/libmime/message.c
index e6cb63504..0d4581ad7 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
guint i, nlen, total_len = 0, short_len = 0;
gdouble avg_len = 0;
- if (part->normalized_words) {
+ if (part->utf_words) {
#ifdef WITH_SNOWBALL
static GHashTable *stemmers = NULL;
@@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
#endif
- for (i = 0; i < part->normalized_words->len; i++) {
+ for (i = 0; i < part->utf_words->len; i++) {
guint64 h;
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
r = NULL;
#ifdef WITH_SNOWBALL
if (stem) {
@@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
}
}
- if (part->normalized_words && part->normalized_words->len) {
+ if (part->utf_words && part->utf_words->len) {
gdouble *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable (task->task_pool,
@@ -188,12 +188,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
}
}
-static guint
+static void
rspamd_mime_part_create_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
- rspamd_stat_token_t *w, ucs_w;
- guint i, ucs_len = 0;
enum rspamd_tokenize_type tok_type;
if (IS_PART_UTF (part)) {
@@ -203,69 +201,39 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
tok_type = RSPAMD_TOKENIZE_RAW;
}
- /* Ugly workaround */
- if (IS_PART_HTML (part)) {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, tok_type, task->cfg,
- part->exceptions,
- NULL);
- }
- else {
- part->normalized_words = rspamd_tokenize_text (
- part->stripped_content->data,
- part->stripped_content->len, tok_type, task->cfg,
- part->exceptions,
- NULL);
- }
-
- if (part->normalized_words) {
- part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
- sizeof (guint64), part->normalized_words->len);
-
- if (IS_PART_UTF (part) && task->lang_det) {
- part->ucs32_words = g_array_sized_new (FALSE, FALSE,
- sizeof (rspamd_stat_token_t), part->normalized_words->len);
- }
-
- if (part->ucs32_words) {
-
+ part->utf_words = rspamd_tokenize_text (
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len,
+ &part->utf_stripped_text,
+ tok_type, task->cfg,
+ part->exceptions,
+ NULL);
- for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
- i);
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
- rspamd_language_detector_to_ucs (task->lang_det,
- task->task_pool,
- w, &ucs_w);
- g_array_append_val (part->ucs32_words, ucs_w);
- ucs_len += ucs_w.len;
- }
- }
- }
+ if (part->utf_words) {
+ part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
+ sizeof (guint64), part->utf_words->len);
}
- return ucs_len;
}
static void
rspamd_mime_part_detect_language (struct rspamd_task *task,
- struct rspamd_mime_text_part *part, guint ucs_len)
+ struct rspamd_mime_text_part *part)
{
struct rspamd_lang_detector_res *lang;
- if (part->ucs32_words) {
- part->languages = rspamd_language_detector_detect (task,
- task->lang_det,
- part->ucs32_words, ucs_len);
-
- if (part->languages->len > 0) {
+ if (!IS_PART_EMPTY (part) && part->utf_words && part->utf_words->len > 0 &&
+ task->lang_det) {
+ if (rspamd_language_detector_detect (task, task->lang_det, part)) {
lang = g_ptr_array_index (part->languages, 0);
part->language = lang->lang;
msg_info_task ("detected part language: %s", part->language);
}
+ else {
+ part->language = "en"; /* Safe fallback */
+ }
}
}
@@ -289,7 +257,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
state = seen_cr;
if (p > c) {
last_c = *(p - 1);
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
}
@@ -299,11 +267,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
case seen_cr:
/* Double \r\r */
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
crlf_added = TRUE;
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines ++;
@@ -326,17 +294,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
if (p > c) {
last_c = *(p - 1);
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
}
c = p + 1;
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
crlf_added = TRUE;
}
else {
@@ -348,13 +316,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
/* \r\n */
if (!crlf_added) {
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *) " ", 1);
crlf_added = TRUE;
}
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
c = p + 1;
@@ -364,11 +332,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
case seen_lf:
/* Double \n\n */
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
crlf_added = TRUE;
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines++;
@@ -414,13 +382,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
if (!crlf_added) {
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
/* Skip initial spaces */
if (G_UNLIKELY (*p == ' ')) {
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
}
@@ -451,7 +419,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
switch (state) {
case normal_char:
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)c, p - c);
while (c < p) {
@@ -479,10 +447,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
default:
if (!crlf_added) {
- g_byte_array_append (part->stripped_content,
+ g_byte_array_append (part->utf_stripped_content,
(const guint8 *)" ", 1);
g_ptr_array_add (part->newlines,
- (((gpointer) (goffset) (part->stripped_content->len))));
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
}
part->nlines++;
@@ -495,34 +463,52 @@ static void
rspamd_normalize_text_part (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
-
const gchar *p, *end;
guint i;
goffset off;
struct rspamd_process_exception *ex;
+ UErrorCode uc_err = U_ZERO_ERROR;
- /* Strip newlines */
- part->stripped_content = g_byte_array_sized_new (part->content->len);
part->newlines = g_ptr_array_sized_new (128);
- p = (const gchar *)part->content->data;
- end = p + part->content->len;
-
- rspamd_strip_newlines_parse (p, end, part);
-
- for (i = 0; i < part->newlines->len; i ++) {
- ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
- off = (goffset)g_ptr_array_index (part->newlines, i);
- g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
- (part->stripped_content->data + off);
- ex->pos = off;
- ex->len = 0;
- ex->type = RSPAMD_EXCEPTION_NEWLINE;
- part->exceptions = g_list_prepend (part->exceptions, ex);
+
+ if (IS_PART_EMPTY (part)) {
+ part->utf_stripped_content = g_byte_array_new ();
+ }
+ else {
+ part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
+
+ p = (const gchar *)part->utf_content->data;
+ end = p + part->utf_content->len;
+
+ rspamd_strip_newlines_parse (p, end, part);
+
+ for (i = 0; i < part->newlines->len; i ++) {
+ ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+ off = (goffset)g_ptr_array_index (part->newlines, i);
+ g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
+ (part->utf_stripped_content->data + off);
+ ex->pos = off;
+ ex->len = 0;
+ ex->type = RSPAMD_EXCEPTION_NEWLINE;
+ part->exceptions = g_list_prepend (part->exceptions, ex);
+ }
+ }
+
+ if (IS_PART_UTF (part)) {
+ utext_openUTF8 (&part->utf_stripped_text,
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len,
+ &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ msg_warn_task ("cannot open text from utf content");
+ /* Probably, should be an assertion */
+ }
}
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
- part->stripped_content);
+ part->utf_stripped_content);
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
part->newlines);
@@ -615,10 +601,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
}
- if (part->content && part->content->len >= sizeof (gtube_pattern_reject) &&
- part->content->len <= max_check_size) {
- if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data,
- part->content->len,
+ if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
+ part->utf_content->len <= max_check_size) {
+ if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
+ part->utf_content->len,
rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) {
switch (ret) {
@@ -639,7 +625,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
msg_info_task (
"<%s>: gtube %s pattern has been found in part of length %ud",
task->message_id, rspamd_action_to_str (act),
- part->content->len);
+ part->utf_content->len);
}
}
}
@@ -655,9 +641,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b)
return ea->pos - eb->pos;
}
+/**
+ * Prepares a plain text part.
+ *
+ * A part without decoded data is flagged as empty and accepted as is.
+ * Otherwise the part is converted, and the converted raw content is reused
+ * as the processed content, both in utf8 and in unicode form.
+ *
+ * @param task task being processed
+ * @param text_part part to fill
+ * @return TRUE if the part is empty or has utf8 content after conversion,
+ * FALSE if utf_raw_content is still unset after conversion
+ */
+static gboolean
+rspamd_message_process_plain_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *text_part)
+{
+	if (text_part->parsed.len == 0) {
+		/* Nothing to convert */
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+		return TRUE;
+	}
+
+	rspamd_mime_text_part_maybe_convert (task, text_part);
+
+	if (text_part->utf_raw_content == NULL) {
+		/*
+		 * We ignore unconverted parts from now as it is dangerous
+		 * to treat them as text parts
+		 */
+		return FALSE;
+	}
+
+	/* Different from HTML, where we also parse HTML and strip tags */
+	text_part->unicode_content = text_part->unicode_raw_content;
+	text_part->utf_content = text_part->utf_raw_content;
+
+	return TRUE;
+}
+
+/**
+ * Prepares an HTML text part.
+ *
+ * The part is always flagged as HTML. A part without decoded data is
+ * flagged as empty and accepted as is. Otherwise the part is converted,
+ * passed through the HTML processor, and the processed content is stored
+ * both in utf8 and in unicode form; both arrays are released together
+ * with the task pool.
+ *
+ * @param task task being processed
+ * @param text_part part to fill
+ * @return TRUE if the part has been processed (or is empty), FALSE if
+ * utf_raw_content is still unset after conversion
+ */
+static gboolean
+rspamd_message_process_html_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *text_part)
+{
+	GByteArray *processed;
+	GArray *uchars;
+
+	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
+
+	if (text_part->parsed.len == 0) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+		return TRUE;
+	}
+
+	rspamd_mime_text_part_maybe_convert (task, text_part);
+
+	if (text_part->utf_raw_content == NULL) {
+		return FALSE;
+	}
+
+	text_part->html = rspamd_mempool_alloc0 (task->task_pool,
+			sizeof (*text_part->html));
+	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
+
+	processed = rspamd_html_process_part_full (task->task_pool,
+			text_part->html,
+			text_part->utf_raw_content,
+			&text_part->exceptions,
+			task->urls,
+			task->emails);
+	text_part->utf_content = processed;
+
+	if (processed->len == 0) {
+		/* Processing has left no content at all */
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+	}
+
+	/* Also add unicode content */
+	uchars = g_array_sized_new (FALSE, FALSE, sizeof (UChar),
+			processed->len + 1);
+	rspamd_utf_to_unicode (processed, uchars);
+	text_part->unicode_content = uchars;
+
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) free_byte_array_callback,
+			processed);
+	rspamd_mempool_add_destructor (task->task_pool,
+			rspamd_array_free_hard,
+			uchars);
+
+	return TRUE;
+}
+
static void
-rspamd_message_process_text_part (struct rspamd_task *task,
- struct rspamd_mime_part *mime_part)
+rspamd_message_process_text_part_maybe (struct rspamd_task *task,
+ struct rspamd_mime_part *mime_part)
{
struct rspamd_mime_text_part *text_part;
rspamd_ftok_t html_tok, xhtml_tok;
@@ -738,87 +801,32 @@ rspamd_message_process_text_part (struct rspamd_task *task,
debug_task ("skip attachments for checking as text parts");
return;
}
-
- if (found_html) {
- text_part = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_text_part));
- text_part->raw.begin = mime_part->raw_data.begin;
- text_part->raw.len = mime_part->raw_data.len;
- text_part->parsed.begin = mime_part->parsed_data.begin;
- text_part->parsed.len = mime_part->parsed_data.len;
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
- text_part->mime_part = mime_part;
-
- if (mime_part->parsed_data.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- g_ptr_array_add (task->text_parts, text_part);
- return;
- }
-
- rspamd_mime_text_part_maybe_convert (task, text_part);
-
- if (text_part->utf_raw_content == NULL) {
- return;
- }
-
- text_part->html = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (*text_part->html));
- text_part->mime_part = mime_part;
-
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
- text_part->content = rspamd_html_process_part_full (
- task->task_pool,
- text_part->html,
- text_part->utf_raw_content,
- &text_part->exceptions,
- task->urls,
- task->emails);
-
- if (text_part->content->len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- }
-
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- text_part->content);
- g_ptr_array_add (task->text_parts, text_part);
+ else if (!(found_txt || found_html)) {
+ /* Not a text part */
+ return;
}
- else if (found_txt) {
- text_part =
- rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_text_part));
- text_part->mime_part = mime_part;
- text_part->raw.begin = mime_part->raw_data.begin;
- text_part->raw.len = mime_part->raw_data.len;
- text_part->parsed.begin = mime_part->parsed_data.begin;
- text_part->parsed.len = mime_part->parsed_data.len;
- text_part->mime_part = mime_part;
-
- if (mime_part->parsed_data.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- g_ptr_array_add (task->text_parts, text_part);
- return;
- }
- rspamd_mime_text_part_maybe_convert (task, text_part);
+ text_part = rspamd_mempool_alloc0 (task->task_pool,
+ sizeof (struct rspamd_mime_text_part));
+ text_part->mime_part = mime_part;
+ text_part->raw.begin = mime_part->raw_data.begin;
+ text_part->raw.len = mime_part->raw_data.len;
+ text_part->parsed.begin = mime_part->parsed_data.begin;
+ text_part->parsed.len = mime_part->parsed_data.len;
+ text_part->utf_stripped_text = (UText)UTEXT_INITIALIZER;
- if (text_part->utf_raw_content != NULL) {
- /*
- * We ignore unconverted parts from now as it is dangerous
- * to treat them as text parts
- */
- text_part->content = text_part->utf_raw_content;
- g_ptr_array_add (task->text_parts, text_part);
- }
- else {
+ if (found_html) {
+ if (!rspamd_message_process_html_text_part (task, text_part)) {
return;
}
}
else {
- return;
+ if (!rspamd_message_process_plain_text_part (task, text_part)) {
+ return;
+ }
}
-
+ g_ptr_array_add (task->text_parts, text_part);
mime_part->flags |= RSPAMD_MIME_PART_TEXT;
mime_part->specific.txt = text_part;
@@ -867,7 +875,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
text_part->exceptions);
}
- text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
+ rspamd_mime_part_create_words (task, text_part);
}
/* Creates message from various data using libmagic to detect type */
@@ -1172,7 +1180,7 @@ rspamd_message_process (struct rspamd_task *task)
struct rspamd_mime_part *part;
part = g_ptr_array_index (task->parts, i);
- rspamd_message_process_text_part (task, part);
+ rspamd_message_process_text_part_maybe (task, part);
}
rspamd_images_process (task);
@@ -1207,7 +1215,7 @@ rspamd_message_process (struct rspamd_task *task)
sel = p2;
}
else {
- if (p1->ucs_len > p2->ucs_len) {
+ if (p1->unicode_content->len > p2->unicode_content->len) {
sel = p1;
}
else {
@@ -1215,7 +1223,7 @@ rspamd_message_process (struct rspamd_task *task)
}
}
- rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+ rspamd_mime_part_detect_language (task, sel);
if (sel->language && sel->language[0]) {
/* Propagate language */
@@ -1274,13 +1282,13 @@ rspamd_message_process (struct rspamd_task *task)
PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
if (!text_part->language) {
- rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+ rspamd_mime_part_detect_language (task, text_part);
}
rspamd_mime_part_extract_words (task, text_part);
- if (text_part->normalized_words) {
- total_words += text_part->normalized_words->len;
+ if (text_part->utf_words) {
+ total_words += text_part->utf_words->len;
}
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index baabb762a..205bf5bb2 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -14,6 +14,7 @@
#include "content_type.h"
#include <unicode/uchar.h>
+#include <unicode/utext.h>
struct rspamd_task;
struct controller_session;
@@ -86,20 +87,28 @@ struct rspamd_mime_text_part {
const gchar *language;
GPtrArray *languages;
const gchar *real_charset;
+
+ /* Raw data in native encoding */
rspamd_ftok_t raw;
rspamd_ftok_t parsed; /* decoded from mime encodings */
- GByteArray *content; /* utf8 encoded processed content */
- GArray *ucs_raw_content; /* unicode raw content (of UChar) */
+ /* UTF8 content */
+ GByteArray *utf_content; /* utf8 encoded processed content */
GByteArray *utf_raw_content; /* utf raw content */
- GByteArray *stripped_content; /* utf content with no newlines */
+ GByteArray *utf_stripped_content; /* utf content with no newlines */
+ GArray *normalized_hashes;
+ GArray *utf_words;
+ UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
+
+ /* Unicode content, used by libicu */
+ GArray *unicode_raw_content; /* unicode raw content (of UChar) */
+ GArray *unicode_content; /* unicode processed content (of UChar) */
+
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
struct html_content *html;
GList *exceptions; /**< list of offsets of urls */
struct rspamd_mime_part *mime_part;
- GArray *normalized_words;
- GArray *ucs32_words;
- GArray *normalized_hashes;
+
guint flags;
guint nlines;
guint spaces;
@@ -110,7 +119,7 @@ struct rspamd_mime_text_part {
guint empty_lines;
guint capital_letters;
guint numeric_characters;
- guint ucs_len;
+ guint unicode_scripts;
};
enum rspamd_received_type {
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index d3f255740..a0abb1bb0 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
rspamd_mime_utf8_conv_init ();
utf = text_part->utf_raw_content;
- text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
sizeof (UChar), utf->len + 1);
- text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter,
- (UChar *)text_part->ucs_raw_content->data,
+ text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
+ (UChar *)text_part->unicode_raw_content->data,
utf->len + 1,
utf->data,
utf->len,
&uc_err);
if (!U_SUCCESS (uc_err)) {
- g_array_free (text_part->ucs_raw_content, TRUE);
- text_part->ucs_raw_content = NULL;
+ g_array_free (text_part->unicode_raw_content, TRUE);
+ text_part->unicode_raw_content = NULL;
}
}
@@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
}
- if (!text_part->ucs_raw_content) {
+ if (!text_part->unicode_raw_content) {
return;
}
- src = (UChar *)text_part->ucs_raw_content->data;
- nsym = text_part->ucs_raw_content->len;
+ src = (UChar *)text_part->unicode_raw_content->data;
+ nsym = text_part->unicode_raw_content->len;
/* We can now check if we need to decompose */
end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
@@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
}
else {
/* Copy normalised back */
- memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
- text_part->ucs_raw_content->len = nsym;
+ memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
+ text_part->unicode_raw_content->len = nsym;
text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
}
@@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
rspamd_mime_utf8_conv_init ();
if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
- text_part->ucs_raw_content) {
+ text_part->unicode_raw_content) {
clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
clen);
g_byte_array_set_size (text_part->utf_raw_content, dlen);
r = ucnv_fromUChars (utf8_converter,
text_part->utf_raw_content->data,
dlen,
- (UChar *)text_part->ucs_raw_content->data,
- text_part->ucs_raw_content->len,
+ (UChar *)text_part->unicode_raw_content->data,
+ text_part->unicode_raw_content->len,
&uc_err);
text_part->utf_raw_content->len = r;
}
@@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
}
- text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+ text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
sizeof (UChar), input->len + 1);
r = ucnv_toUChars (conv,
- (UChar *)text_part->ucs_raw_content->data,
+ (UChar *)text_part->unicode_raw_content->data,
input->len + 1,
input->data,
input->len,
@@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
return FALSE;
}
- text_part->ucs_raw_content->len = r;
+ text_part->unicode_raw_content->len = r;
rspamd_mime_text_part_normalise (task, text_part);
/* Now, convert to utf8 */
@@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
d = rspamd_mempool_alloc (task->task_pool, dlen);
r = ucnv_fromUChars (utf8_converter, d, dlen,
- (UChar *)text_part->ucs_raw_content->data, r, &uc_err);
+ (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
if (!U_SUCCESS (uc_err)) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -750,3 +750,17 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
SET_PART_UTF (text_part);
}
+
+/**
+ * Converts utf8 data to libicu UChars.
+ *
+ * dest is resized to in->len + 1 elements, then its length is set to the
+ * number of UChars reported by the converter. If the conversion fails,
+ * dest is left with zero length instead of an unchecked value.
+ *
+ * @param in utf8 input
+ * @param dest output array of UChar
+ */
+void
+rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	int32_t nchars;
+
+	/*
+	 * Other users of utf8_converter in this file call the init function
+	 * first, so do the same here (presumably a no-op once initialised)
+	 */
+	rspamd_mime_utf8_conv_init ();
+
+	g_array_set_size (dest, in->len + 1);
+	nchars = ucnv_toUChars (utf8_converter,
+			(UChar *)dest->data,
+			in->len + 1,
+			(const char *)in->data,
+			in->len,
+			&uc_err);
+
+	if (U_SUCCESS (uc_err) && nchars >= 0 && (guint)nchars <= in->len + 1) {
+		dest->len = nchars;
+	}
+	else {
+		/* Never expose a length that was not actually written */
+		dest->len = 0;
+	}
+}
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 5e30efdae..0754bb348 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
+/**
+ * Converts utf8 to libicu unichars
+ * @param in
+ * @param dest
+ */
+void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);
+
#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index c47db5761..268376e4d 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
raw = TRUE;
}
- in = part->content->data;
- len = part->content->len;
+ in = part->utf_content->data;
+ len = part->utf_content->len;
}
}
@@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
- if (part->stripped_content) {
- scvec[i + 1] = (guchar *)part->stripped_content->data;
- lenvec[i + 1] = part->stripped_content->len;
+ if (part->utf_stripped_content) {
+ scvec[i + 1] = (guchar *)part->utf_stripped_content->data;
+ lenvec[i + 1] = part->utf_stripped_content->len;
}
else {
scvec[i + 1] = (guchar *)"";
diff --git a/src/libserver/task.c b/src/libserver/task.c
index bfeec990b..d77fc0145 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -242,20 +242,17 @@ rspamd_task_free (struct rspamd_task *task)
for (i = 0; i < task->text_parts->len; i ++) {
tp = g_ptr_array_index (task->text_parts, i);
- if (tp->normalized_words) {
- g_array_free (tp->normalized_words, TRUE);
+ if (tp->utf_words) {
+ g_array_free (tp->utf_words, TRUE);
}
if (tp->normalized_hashes) {
g_array_free (tp->normalized_hashes, TRUE);
}
- if (tp->ucs32_words) {
- g_array_free (tp->ucs32_words, TRUE);
- }
if (tp->languages) {
g_ptr_array_unref (tp->languages);
}
- if (tp->ucs_raw_content) {
- g_array_free (tp->ucs_raw_content, TRUE);
+ if (tp->unicode_raw_content) {
+ g_array_free (tp->unicode_raw_content, TRUE);
}
}
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 653cc3570..9e6ab72db 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
{
struct rspamd_url_mimepart_cbdata mcbd;
- if (part->stripped_content == NULL || part->stripped_content->len == 0) {
+ if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) {
msg_warn_task ("got empty text part");
return;
}
@@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
mcbd.task = task;
mcbd.part = part;
- rspamd_url_find_multiple (task->task_pool, part->stripped_content->data,
- part->stripped_content->len, is_html, part->newlines,
+ rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
+ part->utf_stripped_content->len, is_html, part->newlines,
rspamd_url_text_part_callback, &mcbd);
}
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 540a9e23f..6d34ba51c 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
- reserved_len += part->normalized_words->len;
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
+ reserved_len += part->utf_words->len;
}
/* XXX: normal window size */
reserved_len += 5;
@@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+ if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
- part->normalized_words, IS_PART_UTF (part),
+ part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
@@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+			UText utxt = UTEXT_INITIALIZER;
+			UErrorCode uc_err = U_ZERO_ERROR;
+			gsize slen = strlen (sub);
+
+			utext_openUTF8 (&utxt, sub, slen, &uc_err);
+
+			/* A NULL utxt makes the tokenizer use its raw path if the open failed */
+			words = rspamd_tokenize_text (sub, slen,
+					U_SUCCESS (uc_err) ? &utxt : NULL,
+					RSPAMD_TOKENIZE_UTF,
NULL, NULL, NULL);
+			utext_close (&utxt); /* release the UText; tokens reference sub itself */
if (words != NULL) {
for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index fce98c53f..ac7f8be85 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,8 +21,10 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "../../../contrib/mumhash/mum.h"
-#include "unicode/utf8.h"
-#include "unicode/uchar.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/uiter.h>
+#include <unicode/ubrk.h>
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
@@ -59,7 +61,7 @@ const gchar t_delimiters[255] = {
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
GList **exceptions, gsize *rl, gboolean unused)
{
@@ -148,187 +150,97 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
return TRUE;
}
-static gboolean
-rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
- gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gsize *rl,
- gboolean check_signature)
+static inline gboolean
+rspamd_tokenize_check_limit (gboolean decay,
+ guint word_decay,
+ guint nwords,
+ guint64 *hv,
+ guint64 *prob,
+ const rspamd_stat_token_t *token,
+ gssize remain,
+ gssize total)
{
- gint32 i, siglen = 0, remain;
- goffset pos;
- const gchar *p, *s, *sig = NULL;
- UChar32 uc;
- guint processed = 0;
- struct rspamd_process_exception *ex = NULL;
- enum {
- skip_delimiters = 0,
- feed_token,
- process_signature
- } state = skip_delimiters;
-
- if (buf == NULL) {
- return FALSE;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- g_assert (cur != NULL);
+ static const gdouble avg_word_len = 6.0;
- if (*cur == NULL) {
- *cur = buf->begin;
- }
-
- token->len = 0;
+ if (!decay) {
+ if (token->len >= sizeof (guint64)) {
+#ifdef _MUM_UNALIGNED_ACCESS
+ *hv = mum_hash_step (*hv, *(guint64 *)token->begin);
+#else
+ guint64 tmp;
+ memcpy (&tmp, token->begin, sizeof (tmp));
+ *hv = mum_hash_step (*hv, tmp);
+#endif
+ }
- pos = *cur - buf->begin;
- if (pos >= buf->len) {
- return FALSE;
- }
+ /* Check for decay */
+ if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
+ /* Start decay */
+ gdouble decay_prob;
- remain = buf->len - pos;
- s = *cur;
- p = s;
- token->begin = s;
+ *hv = mum_hash_finish (*hv);
- for (i = 0; i < remain; ) {
- p = &s[i];
- U8_NEXT (s, i, remain, uc); /* This also advances i */
+ /* We assume that word is 6 symbols length in average */
+ decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len);
- if (uc < 0) {
- if (i < remain) {
- uc = 0xFFFD;
+ if (decay_prob >= 1.0) {
+ *prob = G_MAXUINT64;
}
else {
- return FALSE;
+ *prob = decay_prob * G_MAXUINT64;
}
- }
- switch (state) {
- case skip_delimiters:
- if (ex != NULL && p - buf->begin == ex->pos) {
- goto process_exception;
- }
- else if (u_isgraph (uc)) {
- if (u_isalnum (uc)) {
- state = feed_token;
- token->begin = p;
- continue;
- }
- else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
- sig = p;
- siglen = remain - i;
- state = process_signature;
- continue;
- }
- }
- break;
- case feed_token:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto process_exception;
- }
- else if (!u_isalnum (uc)) {
- token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto set_token;
- }
- processed ++;
- break;
- case process_signature:
- if (*p == '\r' || *p == '\n') {
- msg_debug ("signature found: %*s", (gint)siglen, sig);
- return FALSE;
- }
- else if (*p != ' ' && *p != '-' && *p != '_') {
- state = skip_delimiters;
- continue;
- }
- break;
+ return TRUE;
}
}
+ else {
+ /* Decaying probability */
+ /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+ *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
- /* Last character */
- if (state == feed_token) {
- p = &s[i];
- goto set_token;
+ if (*hv > *prob) {
+ return TRUE;
+ }
}
return FALSE;
+}
-set_token:
- if (rl) {
- *rl = processed;
- }
+static inline gboolean
+rspamd_utf_word_valid (const gchar *text, const gchar *end,
+ gint32 start, gint32 finish)
+{
+ const gchar *st = text + start, *fin = text + finish;
+ UChar32 c;
- if (token->len == 0 && processed > 0) {
- token->len = p - token->begin;
- g_assert (token->len > 0);
+ if (st >= end || fin > end || st >= fin) {
+ return FALSE;
}
- *cur = &s[i];
-
- return TRUE;
-
-process_exception:
- if (token->len == 0 && processed > 0) {
- /*
- * We have processed something before the next exception, so
- * continue processing on next iteration of this function call
- */
- token->len = p - token->begin;
- g_assert (token->len > 0);
-
- *cur = p;
+ U8_NEXT (text, start, finish, c);
+ if (u_isalnum (c)) {
return TRUE;
}
- if (ex->type == RSPAMD_EXCEPTION_URL) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- processed = token->len;
- }
-
- p += ex->len;
-
- /* We need to skip all exceptions that are within this exception */
- *exceptions = g_list_next (*exceptions);
-
- while (*exceptions) {
- ex = (*exceptions)->data;
-
- if (ex->pos < p - buf->begin) {
- /* Nested exception */
- if (ex->pos + ex->len > p - buf->begin) {
- /*
- * We have somehow overlapping nesting exception,
- * extend current offset
- */
- p = buf->begin + ex->pos + ex->len;
- }
-
- *exceptions = g_list_next (*exceptions);
- }
- else {
- break;
- }
- }
-
- *cur = p;
-
- if (rl) {
- *rl = processed;
- }
-
- return TRUE;
+ return FALSE;
}
+/*
+ * Step the exception cursor: move `cur` to the next list node and point `ex`
+ * at that node's data, or at NULL once the list is exhausted. Expects `cur`
+ * and `ex` to be variables in scope where the macro is expanded.
+ */
+#define SHIFT_EX do { \
+	cur = g_list_next (cur); \
+	ex = (cur != NULL) ? (struct rspamd_process_exception *) cur->data : NULL; \
+} while(0)
GArray *
rspamd_tokenize_text (const gchar *text, gsize len,
+ const UText *utxt,
enum rspamd_tokenize_type how,
- struct rspamd_config *cfg, GList *exceptions,
+ struct rspamd_config *cfg,
+ GList *exceptions,
guint64 *hash)
{
rspamd_stat_token_t token, buf;
@@ -336,11 +248,11 @@ rspamd_tokenize_text (const gchar *text, gsize len,
gsize l = 0;
GArray *res;
GList *cur = exceptions;
- token_get_function func;
guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
guint64 hv = 0;
gboolean decay = FALSE;
guint64 prob;
+ static UBreakIterator* bi = NULL;
if (text == NULL) {
return NULL;
@@ -353,18 +265,6 @@ rspamd_tokenize_text (const gchar *text, gsize len,
token.len = 0;
token.flags = 0;
- switch (how) {
- case RSPAMD_TOKENIZE_RAW:
- func = rspamd_tokenizer_get_word_compat;
- break;
- case RSPAMD_TOKENIZE_UTF:
- func = rspamd_tokenizer_get_word;
- break;
- default:
- g_assert_not_reached ();
- break;
- }
-
if (cfg != NULL) {
min_len = cfg->min_word_len;
max_len = cfg->max_word_len;
@@ -375,56 +275,175 @@ rspamd_tokenize_text (const gchar *text, gsize len,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
initial_size);
- while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
- if (l == 0 || (min_len > 0 && l < min_len) ||
- (max_len > 0 && l > max_len)) {
+ if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
+ while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
+ if (l == 0 || (min_len > 0 && l < min_len) ||
+ (max_len > 0 && l > max_len)) {
+ token.begin = pos;
+ continue;
+ }
+
+ if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+ &hv, &prob, &token, pos - text, len)) {
+ if (!decay) {
+ decay = TRUE;
+ }
+ else {
+ token.begin = pos;
+ continue;
+ }
+ }
+
+ g_array_append_val (res, token);
token.begin = pos;
- continue;
}
+ }
+ else {
+ /* UTF8 boundaries */
+ UErrorCode uc_err = U_ZERO_ERROR;
+ int32_t last, p;
+ struct rspamd_process_exception *ex = NULL;
- if (!decay) {
- if (token.len >= sizeof (guint64)) {
-#ifdef _MUM_UNALIGNED_ACCESS
- hv = mum_hash_step (hv, *(guint64 *)token.begin);
-#else
- guint64 tmp;
- memcpy (&tmp, token.begin, sizeof (tmp));
- hv = mum_hash_step (hv, tmp);
-#endif
- }
+ if (bi == NULL) {
+ bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
- /* Check for decay */
- if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
- /* Start decay */
- gdouble decay_prob;
+ g_assert (U_SUCCESS (uc_err));
+ }
- decay = TRUE;
- hv = mum_hash_finish (hv);
+ ubrk_setUText (bi, (UText*)utxt, &uc_err);
+ last = ubrk_first (bi);
+ p = last;
- /* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+ if (cur) {
+ ex = (struct rspamd_process_exception *)cur->data;
+ }
- if (decay_prob >= 1.0) {
- prob = G_MAXUINT64;
+ while (p != UBRK_DONE) {
+start_over:
+ token.len = 0;
+
+ if (p > last) {
+ if (ex && cur) {
+ /* Check exception */
+ if (ex->pos >= last && ex->pos <= p) {
+ /* We have an exception within boundary */
+ /* First, start to drain exceptions from the start */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ last += ex->len;
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token.begin = "!!EX!!";
+ token.len = sizeof ("!!EX!!") - 1;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+ g_array_append_val (res, token);
+ token.flags = 0;
+ }
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ p = ubrk_next (bi);
+ }
+
+ /* We need to reset our scan with new p and last */
+ SHIFT_EX;
+ goto start_over;
+ }
+
+ SHIFT_EX;
+ }
+
+ /* Now, we can have an exception within boundary again */
+ if (cur && ex->pos >= last && ex->pos <= p) {
+ /* Append the first part */
+ if (rspamd_utf_word_valid (text, text + len, last,
+ ex->pos)) {
+ token.begin = text + last;
+ token.len = ex->pos - last;
+ token.flags = 0;
+ g_array_append_val (res, token);
+ }
+
+ /* Process the current exception */
+ last += ex->len + (ex->pos - last);
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token.begin = "!!EX!!";
+ token.len = sizeof ("!!EX!!") - 1;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+ g_array_append_val (res, token);
+ }
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ p = ubrk_next (bi);
+ }
+ /* We need to reset our scan with new p and last */
+ SHIFT_EX;
+ goto start_over;
+ }
+
+ SHIFT_EX;
+ }
+ else if (p > last) {
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ }
+ }
+ }
+ else if (ex->pos < last) {
+ /* Forward exceptions list */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ SHIFT_EX;
+ }
+
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ }
+ }
+ else {
+ /* No exceptions within boundary */
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ }
+ }
}
else {
- prob = decay_prob * G_MAXUINT64;
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ }
+ }
+
+				/* pos is not advanced on this path, so report progress as byte offset p */
+				if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+						&hv, &prob, &token, p, len)) {
+					if (decay) {
+						token.len = 0;
+					}
+					decay = TRUE;
+				}
}
- }
- else {
- /* Decaying probability */
- /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
- hv = 2862933555777941757ULL * hv + 3037000493ULL;
- if (hv > prob) {
- token.begin = pos;
- continue;
+ if (token.len > 0) {
+ g_array_append_val (res, token);
}
- }
- g_array_append_val (res, token);
- token.begin = pos;
+ last = p;
+ p = ubrk_next (bi);
+ }
}
if (!decay) {
@@ -438,6 +457,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
return res;
}
+#undef SHIFT_EX
+
/*
* vi:ts=4
*/
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 8be5f98a8..6c538eafc 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -7,6 +7,8 @@
#include "rspamd.h"
#include "stat_api.h"
+#include <unicode/utext.h>
+
#define RSPAMD_DEFAULT_TOKENIZER "osb"
struct rspamd_tokenizer_runtime;
@@ -28,7 +30,7 @@ struct rspamd_stat_tokenizer {
enum rspamd_tokenize_type {
RSPAMD_TOKENIZE_UTF = 0,
RSPAMD_TOKENIZE_RAW,
- RSPAMD_TOKENIZE_UCS
+ RSPAMD_TOKENIZE_UNICODE
};
/* Compare two token nodes */
@@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_stat_token_t type) */
GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+ const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
diff --git a/src/libutil/logger.c b/src/libutil/logger.c
index 027c21da1..cd624f831 100644
--- a/src/libutil/logger.c
+++ b/src/libutil/logger.c
@@ -273,13 +273,13 @@ rspamd_log_open_priv (rspamd_logger_t *rspamd_log, uid_t uid, gid_t gid)
S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH);
if (rspamd_log->fd == -1) {
fprintf (stderr,
- "open_log: cannot open desired log file: %s, %s my pid: %d",
- rspamd_log->log_file, strerror (errno), getpid ());
+ "open_log: cannot open desired log file: %s, %s\n",
+ rspamd_log->log_file, strerror (errno));
return -1;
}
if (fchown (rspamd_log->fd, uid, gid) == -1) {
fprintf (stderr,
- "open_log: cannot chown desired log file: %s, %s",
+ "open_log: cannot chown desired log file: %s, %s\n",
rspamd_log->log_file, strerror (errno));
close (rspamd_log->fd);
return -1;
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index bb3406e80..78c3e05b9 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L)
rspamd_lua_setclass (L, "rspamd{text}", -1);
if (!type) {
- start = part->content->data;
- len = part->content->len;
+ start = part->utf_content->data;
+ len = part->utf_content->len;
}
else if (strcmp (type, "content") == 0) {
- start = part->content->data;
- len = part->content->len;
+ start = part->utf_content->data;
+ len = part->utf_content->len;
}
else if (strcmp (type, "content_oneline") == 0) {
- start = part->stripped_content->data;
- len = part->stripped_content->len;
+ start = part->utf_stripped_content->data;
+ len = part->utf_stripped_content->len;
}
else if (strcmp (type, "raw_parsed") == 0) {
start = part->parsed.begin;
@@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L)
t = lua_newuserdata (L, sizeof (*t));
rspamd_lua_setclass (L, "rspamd{text}", -1);
- t->start = part->stripped_content->data;
- t->len = part->stripped_content->len;
+ t->start = part->utf_stripped_content->data;
+ t->len = part->utf_stripped_content->len;
t->flags = 0;
return 1;
@@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L)
return 1;
}
- if (IS_PART_EMPTY (part) || part->content == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_content == NULL) {
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->content->len);
+ lua_pushinteger (L, part->utf_content->len);
}
return 1;
@@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L)
return 1;
}
- if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->normalized_words->len);
+ lua_pushinteger (L, part->utf_words->len);
}
return 1;
@@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L)
return luaL_error (L, "invalid arguments");
}
- if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+ if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
lua_createtable (L, 0, 0);
}
else {
- lua_createtable (L, part->normalized_words->len, 0);
+ lua_createtable (L, part->utf_words->len, 0);
- for (i = 0; i < part->normalized_words->len; i ++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i ++) {
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
lua_pushlstring (L, w->begin, w->len);
lua_rawseti (L, -2, i + 1);
@@ -876,8 +876,8 @@ struct lua_shingle_data {
};
#define STORE_TOKEN(i, t) do { \
- if ((i) < part->normalized_words->len) { \
- word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \
+ if ((i) < part->utf_words->len) { \
+ word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
sd->t.begin = word->begin; \
sd->t.len = word->len; \
} \
@@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
/* Calculate direct hash */
rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES);
- for (i = 0; i < part->normalized_words->len; i ++) {
- word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i ++) {
+ word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
rspamd_cryptobox_hash_update (&st, word->begin, word->len);
}
@@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
sizeof (hexdigest));
lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1);
- sgl = rspamd_shingles_from_text (part->normalized_words, key,
+ sgl = rspamd_shingles_from_text (part->utf_words, key,
pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH);
if (sgl == NULL) {
diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c
index 16a8ace0c..e6a6052d4 100644
--- a/src/lua/lua_trie.c
+++ b/src/lua/lua_trie.c
@@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L)
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->content != NULL) {
- text = part->content->data;
- len = part->content->len;
+ if (!IS_PART_EMPTY (part) && part->utf_content != NULL) {
+ text = part->utf_content->data;
+ len = part->utf_content->len;
if (lua_trie_search_str (L, trie, text, len) != 0) {
found = TRUE;
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 3de68e60a..d6095ab52 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1078,6 +1078,7 @@ lua_util_tokenize_text (lua_State *L)
GList *exceptions = NULL, *cur;
struct rspamd_lua_text *t;
struct rspamd_process_exception *ex;
+ UText utxt = UTEXT_INITIALIZER;
GArray *res;
rspamd_stat_token_t *w;
@@ -1129,7 +1130,15 @@ lua_util_tokenize_text (lua_State *L)
exceptions = g_list_reverse (exceptions);
}
- res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	utext_openUTF8 (&utxt, in, len, &uc_err);
+
+	/* A NULL utxt makes the tokenizer use its raw path if the open failed */
+	res = rspamd_tokenize_text ((gchar *)in, len,
+			U_SUCCESS (uc_err) ? &utxt : NULL,
+			RSPAMD_TOKENIZE_UTF, NULL,
exceptions,
NULL);
+	utext_close (&utxt); /* release the UText; tokens reference the input itself */
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 987879258..f917c26c8 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task,
guint i, ncap = 0;
gdouble cur_score = 0.0;
- if (part == NULL || part->normalized_words == NULL ||
- part->normalized_words->len == 0) {
+ if (part == NULL || part->utf_words == NULL ||
+ part->utf_words->len == 0) {
return;
}
- for (i = 0; i < part->normalized_words->len; i++) {
- w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+ for (i = 0; i < part->utf_words->len; i++) {
+ w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
@@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
*/
part->capital_letters += ncap;
- cur_score /= (gdouble)part->normalized_words->len;
+ cur_score /= (gdouble)part->utf_words->len;
if (cur_score > 2.0) {
cur_score = 2.0;
@@ -619,7 +619,17 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
guint i;
gdouble cur_score = 0.0;
- words = rspamd_tokenize_text (task->subject, strlen (task->subject),
+	UText utxt = UTEXT_INITIALIZER;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	gsize slen = strlen (task->subject);
+
+	utext_openUTF8 (&utxt, task->subject, slen, &uc_err);
+
+	/* A NULL utxt makes the tokenizer use its raw path if the open failed. */
+	/* NOTE(review): utxt is never closed in the visible lines; add utext_close */
+	/* after this call, which completes past the end of this hunk. */
+	words = rspamd_tokenize_text (task->subject, slen,
+			U_SUCCESS (uc_err) ? &utxt : NULL,
RSPAMD_TOKENIZE_UTF,
NULL,
NULL,
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index c0fd8aa4c..bf08c0e46 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud)
static GArray *
fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
{
- return part->normalized_words;
+ return part->utf_words;
}
static void
@@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task,
rspamd_cryptobox_hash_init (&st, rule->hash_key->str,
rule->hash_key->len);
- rspamd_cryptobox_hash_update (&st, part->stripped_content->data,
- part->stripped_content->len);
+ rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data,
+ part->utf_stripped_content->len);
if (task->subject) {
/* We also include subject */
@@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
}
/* Check length of part */
- fac = rule->ctx->text_multiplier * part->content->len;
+ fac = rule->ctx->text_multiplier * part->utf_content->len;
if ((double)min_bytes > fac) {
if (!rule->short_text_direct_hash) {
msg_info_task (
@@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
"skip fuzzy check",
task->message_id, min_bytes,
fac,
- part->content->len,
+ part->utf_content->len,
rule->ctx->text_multiplier);
continue;
}
@@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
"use direct hash",
task->message_id, min_bytes,
fac,
- part->content->len,
+ part->utf_content->len,
rule->ctx->text_multiplier);
short_text = TRUE;
}
}
- if (part->normalized_words == NULL ||
- part->normalized_words->len == 0) {
+ if (part->utf_words == NULL ||
+ part->utf_words->len == 0) {
msg_info_task ("<%s>, part hash empty, skip fuzzy check",
task->message_id);
continue;
}
if (rule->ctx->min_hash_len != 0 &&
- part->normalized_words->len <
+ part->utf_words->len <
rule->ctx->min_hash_len) {
if (!rule->short_text_direct_hash) {
msg_info_task (
diff --git a/src/plugins/lua/antivirus.lua b/src/plugins/lua/antivirus.lua
index 37c58bcf8..46ea8c40d 100644
--- a/src/plugins/lua/antivirus.lua
+++ b/src/plugins/lua/antivirus.lua
@@ -889,10 +889,16 @@ if opts and type(opts) == 'table' then
for _, p in ipairs(m['patterns']) do
if type(p) == 'table' then
for sym in pairs(p) do
+ rspamd_logger.debugm(N, rspamd_config, 'registering: %1', {
+ type = 'virtual',
+ name = sym,
+ parent = m['symbol'],
+ parent_id = id,
+ })
rspamd_config:register_symbol({
type = 'virtual',
name = sym,
- parent = m['symbol']
+ parent = id
})
end
end
diff --git a/src/plugins/lua/arc.lua b/src/plugins/lua/arc.lua
index 30ae0cd19..53fb7466a 100644
--- a/src/plugins/lua/arc.lua
+++ b/src/plugins/lua/arc.lua
@@ -608,7 +608,8 @@ end
rspamd_config:register_symbol({
name = settings['sign_symbol'],
- callback = arc_signing_cb
+ callback = arc_signing_cb,
+ groups = {"policies", "arc"}
})
-- Do not sign unless valid
diff --git a/src/plugins/lua/dkim_signing.lua b/src/plugins/lua/dkim_signing.lua
index 99e1fca68..f9c6ecdb6 100644
--- a/src/plugins/lua/dkim_signing.lua
+++ b/src/plugins/lua/dkim_signing.lua
@@ -213,5 +213,6 @@ end
rspamd_config:register_symbol({
name = settings['symbol'],
- callback = dkim_signing_cb
+ callback = dkim_signing_cb,
+ groups = {"policies", "dkim"}
})
diff --git a/src/rspamadm/confighelp.c b/src/rspamadm/confighelp.c
index ff80341ea..d3461489e 100644
--- a/src/rspamadm/confighelp.c
+++ b/src/rspamadm/confighelp.c
@@ -71,7 +71,7 @@ rspamadm_confighelp_help (gboolean full_help, const struct rspamadm_command *cmd
"-P: use specific Lua plugins path\n"
"--no-color: disable coloured output\n"
"--short: show only option names\n"
- "--no-examples: do not show examples (impied by --short)\n"
+ "--no-examples: do not show examples (implied by --short)\n"
"--help: shows available options and commands";
}
else {