aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r--src/libmime/lang_detection.c1116
1 files changed, 696 insertions, 420 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 8763365af..d4237690d 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -17,6 +17,7 @@
#include "lang_detection.h"
#include "libutil/logger.h"
#include "libcryptobox/cryptobox.h"
+#include "libutil/multipattern.h"
#include "ucl.h"
#include "khash.h"
#include <glob.h>
@@ -26,7 +27,7 @@
#include <unicode/ustring.h>
#include <math.h>
-static const gsize default_short_text_limit = 200;
+static const gsize default_short_text_limit = 20;
static const gsize default_words = 80;
static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
@@ -42,28 +43,17 @@ struct rspamd_language_unicode_match {
* List of languages detected by unicode scripts
*/
static const struct rspamd_language_unicode_match unicode_langs[] = {
- {"el", UBLOCK_GREEK},
- {"ml", UBLOCK_MALAYALAM},
- {"te", UBLOCK_TELUGU},
- {"ta", UBLOCK_TAMIL},
- {"gu", UBLOCK_GUJARATI},
- {"th", UBLOCK_THAI},
- {"kn", UBLOCK_KANNADA},
- {"ka", UBLOCK_GEORGIAN},
- {"si", UBLOCK_SINHALA},
- {"hy", UBLOCK_ARMENIAN},
- {"lo", UBLOCK_LAO},
- {"km", UBLOCK_KHMER}
-};
-
-/*
- * List of languages to apply unigramms only
- */
-static const gchar *unigramms_langs[] = {
- "ja",
- "ko",
- "zh-CN",
- "zh-TW"
+ {"el", RSPAMD_UNICODE_GREEK},
+ {"ml", RSPAMD_UNICODE_MALAYALAM},
+ {"te", RSPAMD_UNICODE_TELUGU},
+ {"ta", RSPAMD_UNICODE_TAMIL},
+ {"gu", RSPAMD_UNICODE_GUJARATI},
+ {"th", RSPAMD_UNICODE_THAI},
+ {"ka", RSPAMD_UNICODE_GEORGIAN},
+ {"si", RSPAMD_UNICODE_SINHALA},
+ {"hy", RSPAMD_UNICODE_ARMENIAN},
+ {"ja", RSPAMD_UNICODE_JP},
+ {"ko", RSPAMD_UNICODE_HANGUL},
};
/*
@@ -73,24 +63,29 @@ static const gchar *tier0_langs[] = {
"en",
};
static const gchar *tier1_langs[] = {
- "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
- "ko", "pt", "ru", "pl", "tk", "th", "ar"
+ "fr", "it", "de", "es", "nl",
+ "pt", "ru", "pl", "tk", "th", "ar"
};
enum rspamd_language_elt_flags {
RS_LANGUAGE_DEFAULT = 0,
RS_LANGUAGE_LATIN = (1 << 0),
- RS_LANGUAGE_UNISCRIPT = (1 << 1),
- RS_LANGUAGE_UNIGRAMM = (1 << 2),
RS_LANGUAGE_TIER1 = (1 << 3),
RS_LANGUAGE_TIER0 = (1 << 4),
};
+enum rspamd_language_category {
+ RSPAMD_LANGUAGE_LATIN = 0,
+ RSPAMD_LANGUAGE_CYRILLIC,
+ RSPAMD_LANGUAGE_DEVANAGARI,
+ RSPAMD_LANGUAGE_ARAB,
+ RSPAMD_LANGUAGE_MAX,
+};
+
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
- guint ngramms_total;
- guint unigramms_words;
+ enum rspamd_language_category category;
guint trigramms_words;
gdouble mean;
gdouble std;
@@ -109,6 +104,17 @@ struct rspamd_ngramm_chain {
gchar *utf;
};
+struct rspamd_stop_word_range {
+ guint start;
+ guint stop;
+ struct rspamd_language_elt *elt;
+};
+
+struct rspamd_stop_word_elt {
+ struct rspamd_multipattern *mp;
+ GArray *ranges; /* of rspamd_stop_word_range */
+};
+
#define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \
rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
G_STRFUNC, \
@@ -149,18 +155,6 @@ rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
}
static guint
-rspamd_unigram_hash_func (gconstpointer key)
-{
- return rspamd_cryptobox_fast_hash (key, sizeof (UChar), rspamd_hash_seed ());
-}
-
-static gboolean
-rspamd_unigram_equal_func (gconstpointer v, gconstpointer v2)
-{
- return memcmp (v, v2, sizeof (UChar)) == 0;
-}
-
-static guint
rspamd_trigram_hash_func (gconstpointer key)
{
return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ());
@@ -172,8 +166,6 @@ rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2)
return memcmp (v, v2, 3 * sizeof (UChar)) == 0;
}
-KHASH_INIT (rspamd_unigram_hash, const UChar *, struct rspamd_ngramm_chain, true,
- rspamd_unigram_hash_func, rspamd_unigram_equal_func);
KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true,
rspamd_trigram_hash_func, rspamd_trigram_equal_func);
KHASH_INIT (rspamd_candidates_hash, const gchar *,
@@ -182,9 +174,8 @@ KHASH_INIT (rspamd_candidates_hash, const gchar *,
struct rspamd_lang_detector {
GPtrArray *languages;
- khash_t(rspamd_unigram_hash) *unigramms; /* unigramms frequencies */
- khash_t(rspamd_trigram_hash) *trigramms; /* trigramms frequencies */
- GHashTable *unicode_scripts; /* indexed by unicode script */
+ khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */
+ struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
UConverter *uchar_converter;
gsize short_text_limit;
gsize total_occurencies; /* number of all languages found */
@@ -226,9 +217,13 @@ struct rspamd_language_ucs_elt {
static void
rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
- struct rspamd_lang_detector *d,
- struct rspamd_language_elt *lelt,
- struct rspamd_language_ucs_elt *ucs, guint len, guint freq, guint total)
+ struct rspamd_lang_detector *d,
+ struct rspamd_language_elt *lelt,
+ struct rspamd_language_ucs_elt *ucs,
+ guint len,
+ guint freq,
+ guint total,
+ khash_t (rspamd_trigram_hash) *htb)
{
struct rspamd_ngramm_chain *chain = NULL, st_chain;
struct rspamd_ngramm_elt *elt;
@@ -238,18 +233,13 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
switch (len) {
case 1:
- k = kh_get (rspamd_unigram_hash, d->unigramms, ucs->s);
- if (k != kh_end (d->unigramms)) {
- chain = &kh_value (d->unigramms, k);
- }
- break;
case 2:
g_assert_not_reached ();
break;
case 3:
- k = kh_get (rspamd_trigram_hash, d->trigramms, ucs->s);
- if (k != kh_end (d->trigramms)) {
- chain = &kh_value (d->trigramms, k);
+ k = kh_get (rspamd_trigram_hash, htb, ucs->s);
+ if (k != kh_end (htb)) {
+ chain = &kh_value (htb, k);
}
break;
default:
@@ -270,14 +260,8 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
elt->prob = ((gdouble)freq) / ((gdouble)total);
g_ptr_array_add (chain->languages, elt);
- if (len == 1) {
- k = kh_put (rspamd_unigram_hash, d->unigramms, ucs->s, &i);
- kh_value (d->unigramms, k) = *chain;
- }
- else {
- k = kh_put (rspamd_trigram_hash, d->trigramms, ucs->s, &i);
- kh_value (d->trigramms, k) = *chain;
- }
+ k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i);
+ kh_value (htb, k) = *chain;
}
else {
/* Check sanity */
@@ -300,6 +284,23 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
}
}
+static inline enum rspamd_language_category
+rspamd_language_detector_get_category (guint uflags)
+{
+ enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
+
+ if (uflags & RSPAMD_UNICODE_CYRILLIC) {
+ cat = RSPAMD_LANGUAGE_CYRILLIC;
+ }
+ else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
+ cat = RSPAMD_LANGUAGE_DEVANAGARI;
+ }
+ else if (uflags & RSPAMD_UNICODE_ARABIC) {
+ cat = RSPAMD_LANGUAGE_ARAB;
+ }
+
+ return cat;
+}
static const gchar *
rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
@@ -307,9 +308,6 @@ rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
static gchar flags_buf[256];
goffset r = 0;
- if (elt->flags & RS_LANGUAGE_UNIGRAMM) {
- r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "unigrams,");
- }
if (elt->flags & RS_LANGUAGE_TIER1) {
r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,");
}
@@ -342,19 +340,22 @@ rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
static void
rspamd_language_detector_read_file (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
- const gchar *path)
+ const gchar *path,
+ const ucl_object_t *stop_words)
{
struct ucl_parser *parser;
ucl_object_t *top;
- const ucl_object_t *freqs, *n_words, *cur;
+ const ucl_object_t *freqs, *n_words, *cur, *type;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
- const struct rspamd_language_unicode_match *uc_match;
struct rspamd_language_ucs_elt *ucs_elt;
+ khash_t (rspamd_trigram_hash) *htb = NULL;
gchar *pos;
- guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, loaded;
+ guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
+ loaded, nstop = 0;
gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
+ enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
if (!ucl_parser_add_file (parser, path)) {
@@ -396,141 +397,181 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
return;
}
else {
- nelt->unigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
- 0));
nelt->trigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
2));
}
- if ((uc_match = rspamd_language_search_unicode_match (nelt->name, unicode_langs,
- G_N_ELEMENTS (unicode_langs))) != NULL) {
- g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code,
- nelt);
- nelt->flags |= RS_LANGUAGE_UNISCRIPT;
- msg_info_config ("loaded unicode script only %s language: %d",
- nelt->name,
- uc_match->unicode_code);
+ type = ucl_object_lookup (top, "type");
+
+ if (type == NULL || ucl_object_type (type) != UCL_STRING) {
+ msg_warn_config ("cannot find type in language %s", nelt->name);
+ ucl_object_unref (top);
+
+ return;
}
else {
- GPtrArray *ngramms;
- guint nsym;
+ const gchar *stype = ucl_object_tostring (type);
- if (rspamd_language_search_str (nelt->name, unigramms_langs,
- G_N_ELEMENTS (unigramms_langs))) {
- nelt->flags |= RS_LANGUAGE_UNIGRAMM;
+ if (strcmp (stype, "latin") == 0) {
+ cat = RSPAMD_LANGUAGE_LATIN;
+ }
+ else if (strcmp (stype, "cyrillic") == 0) {
+ cat = RSPAMD_LANGUAGE_CYRILLIC;
}
+ else if (strcmp (stype, "arab") == 0) {
+ cat = RSPAMD_LANGUAGE_ARAB;
+ }
+ else if (strcmp (stype, "devanagari") == 0) {
+ cat = RSPAMD_LANGUAGE_DEVANAGARI;
+ }
+ else {
+ msg_warn_config ("unknown type %s of language %s", stype, nelt->name);
+ ucl_object_unref (top);
- if (rspamd_language_search_str (nelt->name, tier1_langs,
- G_N_ELEMENTS (tier1_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER1;
+ return;
}
+ }
+
+ if (stop_words) {
+ const ucl_object_t *specific_stop_words;
+
+ specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
+
+ if (specific_stop_words) {
+ it = NULL;
+ const ucl_object_t *w;
+ guint start, stop;
+
+ start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
+
+ while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
+ rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
+ ucl_object_tostring (w), 0);
+ nstop ++;
+ }
- if (rspamd_language_search_str (nelt->name, tier0_langs,
- G_N_ELEMENTS (tier0_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER0;
+ stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
+
+ struct rspamd_stop_word_range r;
+
+ r.start = start;
+ r.stop = stop;
+ r.elt = nelt;
+
+ g_array_append_val (d->stop_words[cat].ranges, r);
+ it = NULL;
}
+ }
- it = NULL;
- ngramms = g_ptr_array_sized_new (freqs->len);
- i = 0;
- skipped = 0;
- loaded = 0;
+ nelt->category = cat;
+ htb = d->trigramms[cat];
- while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
- const gchar *key;
- gsize keylen;
- guint freq;
+ GPtrArray *ngramms;
+ guint nsym;
- key = ucl_object_keyl (cur, &keylen);
- freq = ucl_object_toint (cur);
+ if (rspamd_language_search_str (nelt->name, tier1_langs,
+ G_N_ELEMENTS (tier1_langs))) {
+ nelt->flags |= RS_LANGUAGE_TIER1;
+ }
- i ++;
- delta = freq - mean;
- mean += delta / i;
- delta2 = freq - mean;
- m2 += delta * delta2;
+ if (rspamd_language_search_str (nelt->name, tier0_langs,
+ G_N_ELEMENTS (tier0_langs))) {
+ nelt->flags |= RS_LANGUAGE_TIER0;
+ }
- if (key != NULL) {
- ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
- sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
+ it = NULL;
+ ngramms = g_ptr_array_sized_new (freqs->len);
+ i = 0;
+ skipped = 0;
+ loaded = 0;
- nsym = ucnv_toUChars (d->uchar_converter,
- ucs_elt->s, keylen + 1,
- key,
- keylen, &uc_err);
- ucs_elt->utf = key;
+ while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
+ const gchar *key;
+ gsize keylen;
+ guint freq;
- if (uc_err != U_ZERO_ERROR) {
- msg_warn_config ("cannot convert key to unicode: %s",
- u_errorName (uc_err));
+ key = ucl_object_keyl (cur, &keylen);
+ freq = ucl_object_toint (cur);
- continue;
- }
+ i ++;
+ delta = freq - mean;
+ mean += delta / i;
+ delta2 = freq - mean;
+ m2 += delta * delta2;
- rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
+ if (key != NULL) {
+ ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
+ sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
- if (nsym == 3 && !(nelt->flags & RS_LANGUAGE_UNIGRAMM)) {
- g_ptr_array_add (ngramms, ucs_elt);
- }
- else if (nsym == 1 && nelt->flags & RS_LANGUAGE_UNIGRAMM) {
- g_ptr_array_add (ngramms, ucs_elt);
- }
- else {
- continue;
- }
+ nsym = ucnv_toUChars (d->uchar_converter,
+ ucs_elt->s, keylen + 1,
+ key,
+ keylen, &uc_err);
+ ucs_elt->utf = key;
- if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
- total_latin++;
- }
+ if (uc_err != U_ZERO_ERROR) {
+ msg_warn_config ("cannot convert key to unicode: %s",
+ u_errorName (uc_err));
+
+ continue;
+ }
- ucs_elt->freq = freq;
+ rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
- total_ngramms++;
+ if (nsym == 3) {
+ g_ptr_array_add (ngramms, ucs_elt);
+ }
+ else {
+ continue;
}
- }
- std = sqrt (m2 / (i - 1));
+ if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ total_latin++;
+ }
- if (total_latin >= total_ngramms / 3) {
- nelt->flags |= RS_LANGUAGE_LATIN;
- }
+ ucs_elt->freq = freq;
- if (nelt->flags & RS_LANGUAGE_UNIGRAMM) {
- nsym = 1;
- }
- else {
- nsym = 3;
+ total_ngramms++;
}
+ }
- total = 0;
- PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
+ std = sqrt (m2 / (i - 1));
- if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
- rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
- ucs_elt->freq = 0;
- /* Skip latin ngramm for non-latin language to avoid garbadge */
- skipped ++;
- continue;
- }
+ if (total_latin >= total_ngramms / 3) {
+ nelt->flags |= RS_LANGUAGE_LATIN;
+ }
+
+ nsym = 3;
- /* Now, discriminate low frequency ngramms */
+ total = 0;
+ PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
- total += ucs_elt->freq;
- loaded ++;
+ if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
+ rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ ucs_elt->freq = 0;
+ /* Skip latin ngramm for non-latin language to avoid garbadge */
+ skipped ++;
+ continue;
}
- g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+ /* Now, discriminate low frequency ngramms */
- PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
- if (ucs_elt->freq > 0) {
- rspamd_language_detector_init_ngramm (cfg, d,
- nelt, ucs_elt, nsym,
- ucs_elt->freq, total);
- }
+ total += ucs_elt->freq;
+ loaded ++;
+ }
+
+ g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+
+ PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
+ if (ucs_elt->freq > 0) {
+ rspamd_language_detector_init_ngramm (cfg, d,
+ nelt, ucs_elt, nsym,
+ ucs_elt->freq, total, htb);
}
+ }
#ifdef EXTRA_LANGDET_DEBUG
- /* Useful for debug */
+ /* Useful for debug */
for (i = 0; i < 10; i ++) {
ucs_elt = g_ptr_array_index (ngramms, i);
@@ -539,22 +580,20 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
#endif
- g_ptr_array_free (ngramms, TRUE);
- nelt->mean = mean;
- nelt->std = std;
- nelt->ngramms_total = total;
- msg_info_config ("loaded %s language, %d unigramms, %d trigramms, "
- "%d ngramms loaded; "
- "std=%.2f, mean=%.2f, skipped=%d, loaded=%d; "
- "(%s)",
- nelt->name,
- (gint)nelt->unigramms_words,
- (gint)nelt->trigramms_words,
- total,
- std, mean,
- skipped, loaded,
- rspamd_language_detector_print_flags (nelt));
- }
+ g_ptr_array_free (ngramms, TRUE);
+ nelt->mean = mean;
+ nelt->std = std;
+
+ msg_info_config ("loaded %s language, %d trigramms, "
+ "%d ngramms loaded; "
+ "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
+ "(%s)",
+ nelt->name,
+ (gint)nelt->trigramms_words,
+ total,
+ std, mean,
+ skipped, loaded, nstop,
+ rspamd_language_detector_print_flags (nelt));
g_ptr_array_add (d->languages, nelt);
ucl_object_unref (top);
@@ -631,16 +670,10 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
ucnv_close (d->uchar_converter);
}
- if (d->unicode_scripts) {
- g_hash_table_unref (d->unicode_scripts);
- }
-
- if (d->unigramms) {
- kh_destroy (rspamd_unigram_hash, d->unigramms);
- }
-
- if (d->trigramms) {
- kh_destroy (rspamd_trigram_hash, d->trigramms);
+ for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ kh_destroy (rspamd_trigram_hash, d->trigramms[i]);
+ rspamd_multipattern_destroy (d->stop_words[i].mp);
+ g_array_free (d->stop_words[i].ranges, TRUE);
}
if (d->languages) {
@@ -656,12 +689,14 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
*languages_disable = NULL;
const gchar *languages_path = default_languages_path;
glob_t gl;
- size_t i, short_text_limit = default_short_text_limit;
+ size_t i, short_text_limit = default_short_text_limit, total = 0;
UErrorCode uc_err = U_ZERO_ERROR;
GString *languages_pattern;
struct rspamd_ngramm_chain *chain, schain;
gchar *fname;
struct rspamd_lang_detector *ret = NULL;
+ struct ucl_parser *parser;
+ ucl_object_t *stop_words;
section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
@@ -683,6 +718,22 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
}
languages_pattern = g_string_sized_new (PATH_MAX);
+ rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path);
+ parser = ucl_parser_new (UCL_PARSER_DEFAULT);
+
+ if (ucl_parser_add_file (parser, languages_pattern->str)) {
+ stop_words = ucl_parser_get_object (parser);
+ }
+ else {
+ msg_err_config ("cannot read stop words from %s: %s",
+ languages_pattern->str,
+ ucl_parser_get_error (parser));
+ stop_words = NULL;
+ }
+
+ ucl_parser_free (parser);
+ languages_pattern->len = 0;
+
rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path);
memset (&gl, 0, sizeof (gl));
@@ -696,9 +747,13 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
ret->short_text_limit = short_text_limit;
/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
- ret->unigramms = kh_init (rspamd_unigram_hash);
- ret->trigramms = kh_init (rspamd_trigram_hash);
- ret->unicode_scripts = g_hash_table_new (g_int_hash, g_int_equal);
+ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ ret->trigramms[i] = kh_init (rspamd_trigram_hash);
+ ret->stop_words[i].mp = rspamd_multipattern_create (
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+ ret->stop_words[i].ranges = g_array_new (FALSE, FALSE,
+ sizeof (struct rspamd_stop_word_range));
+ }
g_assert (uc_err == U_ZERO_ERROR);
@@ -708,7 +763,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
if (!rspamd_ucl_array_find_str (fname, languages_disable) ||
(languages_enable == NULL ||
rspamd_ucl_array_find_str (fname, languages_enable))) {
- rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]);
+ rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i],
+ stop_words);
}
else {
msg_info_config ("skip language file %s: disabled", fname);
@@ -717,18 +773,27 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
g_free (fname);
}
- kh_foreach_value (ret->trigramms, schain, {
- chain = &schain;
- rspamd_language_detector_process_chain (cfg, chain);
- });
+ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
+ GError *err = NULL;
+
+ kh_foreach_value (ret->trigramms[i], schain, {
+ chain = &schain;
+ rspamd_language_detector_process_chain (cfg, chain);
+ });
+
+ if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) {
+ msg_err_config ("cannot compile stop words for %d language group: %e",
+ i, err);
+ g_error_free (err);
+ }
+
+ total += kh_size (ret->trigramms[i]);
+ }
- msg_info_config ("loaded %d languages, %d unicode only languages, "
- "%d unigramms, "
+ msg_info_config ("loaded %d languages, "
"%d trigramms",
(gint)ret->languages->len,
- (gint)g_hash_table_size (ret->unicode_scripts),
- (gint)kh_size (ret->unigramms),
- (gint)kh_size (ret->trigramms));
+ (gint)total);
REF_INIT_RETAIN (ret, rspamd_language_detector_dtor);
rspamd_mempool_add_destructor (cfg->cfg_pool,
@@ -859,11 +924,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
#endif
}
-enum rspamd_language_gramm_type {
- rs_unigramm = 0,
- rs_trigramm
-};
-
static goffset
rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
guint wlen, goffset cur_off)
@@ -914,9 +974,10 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
*/
static void
rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- UChar *window, enum rspamd_language_gramm_type type,
- khash_t(rspamd_candidates_hash) *candidates)
+ struct rspamd_lang_detector *d,
+ UChar *window,
+ khash_t(rspamd_candidates_hash) *candidates,
+ khash_t(rspamd_trigram_hash) *trigramms)
{
guint i;
gint ret;
@@ -926,19 +987,9 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
khiter_t k;
gdouble prob;
- switch (type) {
- case rs_unigramm:
- k = kh_get (rspamd_unigram_hash, d->unigramms, window);
- if (k != kh_end (d->unigramms)) {
- chain = &kh_value (d->unigramms, k);
- }
- break;
- case rs_trigramm:
- k = kh_get (rspamd_trigram_hash, d->trigramms, window);
- if (k != kh_end (d->trigramms)) {
- chain = &kh_value (d->trigramms, k);
- }
- break;
+ k = kh_get (rspamd_trigram_hash, trigramms, window);
+ if (k != kh_end (trigramms)) {
+ chain = &kh_value (trigramms, k);
}
if (chain) {
@@ -980,29 +1031,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
static void
rspamd_language_detector_detect_word (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- rspamd_stat_token_t *tok,
- khash_t(rspamd_candidates_hash) *candidates,
- enum rspamd_language_gramm_type type)
+ struct rspamd_lang_detector *d,
+ rspamd_stat_token_t *tok,
+ khash_t(rspamd_candidates_hash) *candidates,
+ khash_t(rspamd_trigram_hash) *trigramms)
{
- guint wlen;
+ const guint wlen = 3;
UChar window[3];
goffset cur = 0;
- switch (type) {
- case rs_unigramm:
- wlen = 1;
- break;
- case rs_trigramm:
- wlen = 3;
- break;
- }
-
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
rspamd_language_detector_process_ngramm_full (task,
- d, window, type, candidates);
+ d, window, candidates, trigramms);
}
}
@@ -1074,113 +1116,35 @@ rspamd_language_detector_filter_negligible (struct rspamd_task *task,
msg_debug_lang_det ("removed %d languages", filtered);
}
-static gboolean
-rspamd_language_detector_is_unicode (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- goffset *selected_words,
- gsize nparts,
- khash_t(rspamd_candidates_hash) *candidates)
-{
- guint i, j, total_found = 0, total_checked = 0;
- rspamd_stat_token_t *tok;
- UChar t;
- gint uc_script, ret;
- khint_t k;
- struct rspamd_language_elt *elt;
- struct rspamd_lang_detector_res *cand;
-
- for (i = 0; i < nparts; i++) {
- tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
- selected_words[i]);
-
- for (j = 0; j < tok->len; j ++) {
- t = *(((UChar *)tok->begin) + j);
-
- uc_script = ublock_getCode (t);
- elt = g_hash_table_lookup (d->unicode_scripts, &uc_script);
-
- if (elt) {
- k = kh_get (rspamd_candidates_hash, candidates, elt->name);
- if (k != kh_end (candidates)) {
- cand = kh_value (candidates, k);
- }
- else {
- cand = NULL;
- }
-
- if (cand == NULL) {
- cand = rspamd_mempool_alloc (task->task_pool,
- sizeof (*cand));
- cand->elt = elt;
- cand->lang = elt->name;
- cand->prob = 1;
-
- k = kh_put (rspamd_candidates_hash, candidates, elt->name, &ret);
- kh_value (candidates, k) = cand;
- } else {
- /* Update guess */
- cand->prob ++;
- }
-
- total_found ++;
- }
-
- total_checked ++;
- }
-
- if (i >= nparts / 2 && total_found == 0) {
- /* No special scripts found, stop processing */
- return FALSE;
- }
- }
-
- if (total_found < total_checked / 2) {
- /* Not enough confidence */
- return FALSE;
- }
- else {
- /* Filter candidates */
- kh_foreach_value (candidates, cand, {
- cand->prob = cand->prob / total_checked;
- });
- }
-
- return TRUE;
-}
-
static void
rspamd_language_detector_detect_type (struct rspamd_task *task,
- guint nwords,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- khash_t(rspamd_candidates_hash) *candidates,
- enum rspamd_language_gramm_type type) {
- guint nparts = MIN (ucs_tokens->len, nwords);
+ guint nwords,
+ struct rspamd_lang_detector *d,
+ GArray *words,
+ enum rspamd_language_category cat,
+ khash_t(rspamd_candidates_hash) *candidates)
+{
+ guint nparts = MIN (words->len, nwords);
goffset *selected_words;
- rspamd_stat_token_t *tok;
+ rspamd_stat_token_t *tok, ucs_w;
guint i;
selected_words = g_new0 (goffset, nparts);
- rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
+ rspamd_language_detector_random_select (words, nparts, selected_words);
msg_debug_lang_det ("randomly selected %d words", nparts);
- /* Check unicode scripts */
- if (kh_size (candidates) != 0 ||
- !rspamd_language_detector_is_unicode (task, d, ucs_tokens,
- selected_words, nparts, candidates)) {
-
- for (i = 0; i < nparts; i++) {
- tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
- selected_words[i]);
- rspamd_language_detector_detect_word (task, d, tok, candidates,
- type);
- }
-
- /* Filter negligible candidates */
- rspamd_language_detector_filter_negligible (task, candidates);
+ for (i = 0; i < nparts; i++) {
+ tok = &g_array_index (words, rspamd_stat_token_t,
+ selected_words[i]);
+ rspamd_language_detector_to_ucs (task->lang_det,
+ task->task_pool,
+ tok, &ucs_w);
+ rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+ d->trigramms[cat]);
}
+ /* Filter negligible candidates */
+ rspamd_language_detector_filter_negligible (task, candidates);
g_free (selected_words);
}
@@ -1209,11 +1173,11 @@ enum rspamd_language_detected_type {
static enum rspamd_language_detected_type
rspamd_language_detector_try_ngramm (struct rspamd_task *task,
- guint nwords,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- enum rspamd_language_gramm_type type,
- khash_t(rspamd_candidates_hash) *candidates)
+ guint nwords,
+ struct rspamd_lang_detector *d,
+ GArray *ucs_tokens,
+ enum rspamd_language_category cat,
+ khash_t(rspamd_candidates_hash) *candidates)
{
guint cand_len = 0;
struct rspamd_lang_detector_res *cand;
@@ -1222,8 +1186,8 @@ rspamd_language_detector_try_ngramm (struct rspamd_task *task,
nwords,
d,
ucs_tokens,
- candidates,
- type);
+ cat,
+ candidates);
kh_foreach_value (candidates, cand, {
if (!isnan (cand->prob)) {
@@ -1320,117 +1284,429 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
return 0;
}
-GPtrArray *
+static void
+rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ const gchar *p = part->utf_stripped_content->data, *end;
+ guint i = 0;
+ end = p + part->utf_stripped_content->len;
+ gint32 uc, sc;
+ guint nlatin = 0, nchinese = 0, nspecial = 0;
+
+ while (p + i < end) {
+ U8_NEXT (p, i, part->utf_stripped_content->len, uc);
+
+ if (((gint32) uc) < 0) {
+ break;
+ }
+
+ if (u_isalpha (uc)) {
+ sc = ublock_getCode (uc);
+
+ switch (sc) {
+ case UBLOCK_BASIC_LATIN:
+ case UBLOCK_LATIN_1_SUPPLEMENT:
+ part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
+ nlatin ++;
+ break;
+ case UBLOCK_HEBREW:
+ part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
+ nspecial ++;
+ break;
+ case UBLOCK_GREEK:
+ part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
+ nspecial ++;
+ break;
+ case UBLOCK_CYRILLIC:
+ part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
+ nspecial ++;
+ break;
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
+ case UBLOCK_CJK_COMPATIBILITY:
+ case UBLOCK_CJK_RADICALS_SUPPLEMENT:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
+ part->unicode_scripts |= RSPAMD_UNICODE_CJK;
+ nchinese ++;
+ break;
+ case UBLOCK_HIRAGANA:
+ case UBLOCK_KATAKANA:
+ part->unicode_scripts |= RSPAMD_UNICODE_JP;
+ nspecial ++;
+ break;
+ case UBLOCK_HANGUL_JAMO:
+ case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
+ part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
+ nspecial ++;
+ break;
+ case UBLOCK_ARABIC:
+ part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
+ nspecial ++;
+ break;
+ case UBLOCK_DEVANAGARI:
+ part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
+ nspecial ++;
+ break;
+ case UBLOCK_ARMENIAN:
+ part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
+ nspecial ++;
+ break;
+ case UBLOCK_GEORGIAN:
+ part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
+ nspecial ++;
+ break;
+ case UBLOCK_GUJARATI:
+ part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
+ nspecial ++;
+ break;
+ case UBLOCK_TELUGU:
+ part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
+ nspecial ++;
+ break;
+ case UBLOCK_TAMIL:
+ part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
+ nspecial ++;
+ break;
+ case UBLOCK_THAI:
+ part->unicode_scripts |= RSPAMD_UNICODE_THAI;
+ nspecial ++;
+ break;
+ case RSPAMD_UNICODE_MALAYALAM:
+ part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
+ nspecial ++;
+ break;
+ case RSPAMD_UNICODE_SINHALA:
+ part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
+ nspecial ++;
+ break;
+ }
+ }
+
+ if (nspecial > 6 && nspecial > nlatin) {
+ break;
+ }
+ else if (nchinese > 6 && nchinese > nlatin) {
+ if (nspecial > 0) {
+ /* Likely japanese */
+ break;
+ }
+ }
+ }
+
+ msg_debug_lang_det ("stop after checking %d characters, "
+ "%d latin, %d special, %d chinese",
+ i, nlatin, nspecial, nchinese);
+}
+
+static inline void
+rspamd_language_detector_set_language (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ const gchar *code)
+{
+ struct rspamd_lang_detector_res *r;
+
+ r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r));
+ r->prob = 1.0;
+ r->lang = code;
+
+ part->languages = g_ptr_array_sized_new (1);
+ g_ptr_array_add (part->languages, r);
+ part->language = code;
+}
+
+static gboolean
+rspamd_language_detector_try_uniscript (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ guint i;
+
+ for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) {
+ if (unicode_langs[i].unicode_code & part->unicode_scripts) {
+ msg_debug_lang_det ("set language based on unicode script %s",
+ unicode_langs[i].lang);
+ rspamd_language_detector_set_language (task, part,
+ unicode_langs[i].lang);
+
+ return TRUE;
+ }
+ }
+
+ if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
+ rspamd_language_detector_set_language (task, part,
+ "zh-CN");
+
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+
+KHASH_MAP_INIT_STR (rspamd_sw_hash, int);
+
+struct rspamd_sw_cbdata {
+ khash_t (rspamd_sw_hash) *res;
+ GArray *ranges;
+};
+
+static gint
+rspamd_ranges_cmp (const void *k, const void *memb)
+{
+ gint pos = GPOINTER_TO_INT (k);
+ const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb;
+
+ if (pos >= r->start && pos < r->stop) {
+ return 0;
+ }
+ else if (pos < r->start) {
+ return -1;
+ }
+
+ return 1;
+}
+
+static gint
+rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
+{
+ /* Check if boundary */
+ const gchar *prev, *next;
+ struct rspamd_stop_word_range *r;
+ struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context;
+ khiter_t k;
+
+ if (match_start > 0) {
+ prev = text + match_start - 1;
+
+ if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) {
+ return 0;
+ }
+ }
+ else if (match_pos < len) {
+ next = text + match_pos + 1;
+
+ if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) {
+ return 0;
+ }
+ }
+
+ /* We have a word on the boundary, check range */
+ r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data,
+ cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp);
+
+ g_assert (r != NULL);
+
+ k = kh_get (rspamd_sw_hash, cbdata->res, r->elt->name);
+
+ if (k != kh_end (cbdata->res)) {
+ kh_value (cbdata->res, k) ++;
+ }
+ else {
+ gint tt;
+
+ k = kh_put (rspamd_sw_hash, cbdata->res, r->elt->name, &tt);
+ kh_value (cbdata->res, k) = 1;
+ }
+
+ return 0;
+}
+
+static gboolean
+rspamd_language_detector_try_stop_words (struct rspamd_task *task,
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_language_category cat)
+{
+ struct rspamd_stop_word_elt *elt;
+ struct rspamd_sw_cbdata cbdata;
+ gboolean ret = FALSE;
+
+ elt = &d->stop_words[cat];
+ cbdata.res = kh_init (rspamd_sw_hash);
+ cbdata.ranges = elt->ranges;
+
+ rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data,
+ part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
+ &cbdata, NULL);
+
+ if (kh_size (cbdata.res) > 0) {
+ gint max = G_MININT, cur_matches;
+ const gchar *sel = NULL, *cur_lang;
+
+ kh_foreach (cbdata.res, cur_lang, cur_matches, {
+ if (cur_matches > max) {
+ max = cur_matches;
+ sel = cur_lang;
+ }
+ });
+
+ if (max > 0 && sel) {
+ msg_debug_lang_det ("set language based on stop words script %s, %d found",
+ sel, max);
+ rspamd_language_detector_set_language (task, part,
+ sel);
+
+ ret = TRUE;
+ }
+ }
+
+ kh_destroy (rspamd_sw_hash, cbdata.res);
+
+ return ret;
+}
+
+gboolean
rspamd_language_detector_detect (struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens, gsize words_len)
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part)
{
khash_t(rspamd_candidates_hash) *candidates;
GPtrArray *result;
gdouble mean, std, start_ticks, end_ticks;
guint cand_len;
+ enum rspamd_language_category cat;
struct rspamd_lang_detector_res *cand;
enum rspamd_language_detected_type r;
struct rspamd_frequency_sort_cbdata cbd;
/* Check if we have sorted candidates based on frequency */
- gboolean frequency_heuristic_applied = FALSE;
+ gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
- if (ucs_tokens->len == 0) {
- return g_ptr_array_new ();
+ if (!part->utf_stripped_content) {
+ return FALSE;
}
start_ticks = rspamd_get_ticks (TRUE);
- candidates = kh_init (rspamd_candidates_hash);
- kh_resize (rspamd_candidates_hash, candidates, 32);
- r = rspamd_language_detector_try_ngramm (task, default_words, d,
- ucs_tokens, rs_trigramm,
- candidates);
+ rspamd_language_detector_unicode_scripts (task, part);
+ /* Apply unicode scripts heuristic */
- if (r == rs_detect_none) {
- msg_debug_lang_det ("no trigramms found, switch to unigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words,
- d, ucs_tokens, rs_unigramm,
- candidates);
+ if (rspamd_language_detector_try_uniscript (task, part)) {
+ ret = TRUE;
}
- else if (r == rs_detect_multiple) {
- /* Check our guess */
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
+ cat = rspamd_language_detector_get_category (part->unicode_scripts);
+
+ if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) {
+ ret = TRUE;
+ }
- /* Check distirbution */
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- mean += cand->prob;
- cand_len ++;
+ if (!ret) {
+ if (part->utf_words->len < default_short_text_limit) {
+ r = rs_detect_none;
+ msg_debug_lang_det ("text is too short for trigramms detection: "
+ "%d words; at least %d words required",
+ (int)part->utf_words->len,
+ (int)default_short_text_limit);
+ rspamd_language_detector_set_language (task, part, "en");
+ candidates = kh_init (rspamd_candidates_hash);
+ }
+ else {
+ candidates = kh_init (rspamd_candidates_hash);
+ kh_resize (rspamd_candidates_hash, candidates, 32);
+
+ r = rspamd_language_detector_try_ngramm (task,
+ default_words,
+ d,
+ part->utf_words,
+ cat,
+ candidates);
+
+ if (r == rs_detect_none) {
+ msg_debug_lang_det ("no trigramms found, fallback to english");
+ rspamd_language_detector_set_language (task, part, "en");
+ } else if (r == rs_detect_multiple) {
+ /* Check our guess */
+
+ mean = 0.0;
+ std = 0.0;
+ cand_len = 0;
+
+ /* Check distirbution */
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ mean += cand->prob;
+ cand_len++;
+ }
+ });
+
+ if (cand_len > 0) {
+ mean /= cand_len;
+
+ kh_foreach_value (candidates, cand, {
+ gdouble err;
+ if (!isnan (cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs (err);
+ }
+ });
+
+ std /= cand_len;
+ }
+
+ msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
+
+ if (cand_len > 0 && std / fabs (mean) < 0.25) {
+ msg_debug_lang_det ("apply frequency heuristic sorting");
+ frequency_heuristic_applied = TRUE;
+ cbd.d = d;
+ cbd.mean = mean;
+ cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+ if (part->utf_words->len < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
+ }
}
- });
+ }
- if (cand_len > 0) {
- mean /= cand_len;
+ /* Now, convert hash to array and sort it */
+ if (r != rs_detect_none && kh_size (candidates) > 0) {
+ result = g_ptr_array_sized_new (kh_size (candidates));
kh_foreach_value (candidates, cand, {
- gdouble err;
if (!isnan (cand->prob)) {
- err = cand->prob - mean;
- std += fabs (err);
+ msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+ cand->prob);
+ g_ptr_array_add (result, cand);
}
});
- std /= cand_len;
- }
-
- msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
-
- if (cand_len > 0 && std / fabs (mean) < 0.25) {
- msg_debug_lang_det ("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
-
- if (ucs_tokens->len < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ if (frequency_heuristic_applied) {
+ g_ptr_array_sort_with_data (result,
+ rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+ } else {
+ g_ptr_array_sort (result, rspamd_language_detector_cmp);
}
- }
- }
- /* Now, convert hash to array and sort it */
- result = g_ptr_array_sized_new (kh_size (candidates));
+ if (result->len > 0 && !frequency_heuristic_applied) {
+ cand = g_ptr_array_index (result, 0);
+ cand->elt->occurencies++;
+ d->total_occurencies++;
+ }
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
- cand->prob);
- g_ptr_array_add (result, cand);
+ part->languages = result;
+ ret = TRUE;
+ }
+ else if (part->languages == NULL) {
+ rspamd_language_detector_set_language (task, part, "en");
}
- });
-
- if (frequency_heuristic_applied) {
- g_ptr_array_sort_with_data (result,
- rspamd_language_detector_cmp_heuristic, (gpointer)&cbd);
- }
- else {
- g_ptr_array_sort (result, rspamd_language_detector_cmp);
- }
-
- kh_destroy (rspamd_candidates_hash, candidates);
- if (result->len > 0 && !frequency_heuristic_applied) {
- cand = g_ptr_array_index (result, 0);
- cand->elt->occurencies ++;
- d->total_occurencies ++;
+ kh_destroy (rspamd_candidates_hash, candidates);
}
end_ticks = rspamd_get_ticks (TRUE);
msg_debug_lang_det ("detected languages in %.0f ticks",
(end_ticks - start_ticks));
- return result;
+ return ret;
}