about | summary | refs | log | tree | commit | diff | stats
path: root/src/libmime
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-27 12:25:57 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-27 12:25:57 +0000
commit cd56be702682fdf4b9b0a383a5482d388962ce97 (patch)
tree 529eefa651521163f140b980a9649dc4590710e2 /src/libmime
parent d17046ff44a90fa6766fc1da4a546685ff992636 (diff)
download rspamd-cd56be702682fdf4b9b0a383a5482d388962ce97.tar.gz
rspamd-cd56be702682fdf4b9b0a383a5482d388962ce97.zip
[Project] Start rework of language detection to improve quality
Diffstat (limited to 'src/libmime')
-rw-r--r-- src/libmime/lang_detection.c 89
1 files changed, 46 insertions, 43 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index ed2aa1681..75be74f25 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -31,14 +31,49 @@ static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
enum rspamd_language_elt_flags {
RS_LANGUAGE_DEFAULT = 0,
- RS_LANGUAGE_LATIN = (1 <<0),
+ RS_LANGUAGE_LATIN = (1 << 0),
+ RS_LANGUAGE_TIER1 = (1 << 1),
+ RS_LANGUAGE_TIER2 = (1 << 2),
};
+struct rspamd_language_unicode_match {
+ const gchar *lang;
+ gint unicode_code;
+};
+
+/*
+ * List of languages detected by unicode scripts
+ */
+static const struct rspamd_language_unicode_match unicode_langs[] = {
+ {"el", UBLOCK_GREEK},
+ {"ml", UBLOCK_MALAYALAM},
+ {"te", UBLOCK_TELUGU},
+ {"ta", UBLOCK_TAMIL},
+ {"gu", UBLOCK_GUJARATI},
+ {"th", UBLOCK_THAI},
+ {"kn", UBLOCK_KANNADA},
+ {"ka", UBLOCK_GEORGIAN},
+ {"si", UBLOCK_SINHALA},
+ {"hy", UBLOCK_ARMENIAN},
+ {"lo", UBLOCK_LAO},
+ {"km", UBLOCK_KHMER}
+};
+
+/*
+ * List of languages to apply unigramms only
+ */
+static const gchar *unigramms_langs[] = {
+ "ja",
+ "ko",
+ "zh-CN",
+ "zh-TW"
+};
+
+
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
guint unigramms_total; /* total frequencies for unigramms */
- guint bigramms_total; /* total frequencies for bigramms */
guint trigramms_total; /* total frequencies for trigramms */
guint occurencies; /* total number of parts with this language */
};
@@ -51,8 +86,8 @@ struct rspamd_ngramm_elt {
struct rspamd_lang_detector {
GPtrArray *languages;
GHashTable *unigramms; /* unigramms frequencies */
- GHashTable *bigramms; /* bigramms frequencies */
GHashTable *trigramms; /* trigramms frequencies */
+ GHashTable *unicode_scripts; /* indexed by unicode script */
UConverter *uchar_converter;
gsize short_text_limit;
gsize total_occurencies; /* number of all languages found */
@@ -78,18 +113,6 @@ rspamd_unigram_equal (gconstpointer v, gconstpointer v2)
}
static guint
-rspamd_bigram_hash (gconstpointer key)
-{
- return rspamd_cryptobox_fast_hash (key, 2 * sizeof (UChar), rspamd_hash_seed ());
-}
-
-static gboolean
-rspamd_bigram_equal (gconstpointer v, gconstpointer v2)
-{
- return memcmp (v, v2, 2 * sizeof (UChar)) == 0;
-}
-
-static guint
rspamd_trigram_hash (gconstpointer key)
{
return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ());
@@ -144,7 +167,7 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
target = d->unigramms;
break;
case 2:
- target = d->bigramms;
+ /* Ignore */
break;
case 3:
target = d->trigramms;
@@ -243,8 +266,6 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
else {
nelt->unigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
0));
- nelt->bigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
- 1));
nelt->trigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
2));
}
@@ -275,7 +296,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
rspamd_language_detector_ucs_lowercase (ucs_key, nsym);
if (nsym == 2) {
/* We have a digraph */
- total = nelt->bigramms_total;
+ continue;
}
else if (nsym == 3) {
total = nelt->trigramms_total;
@@ -303,10 +324,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
nelt->flags |= RS_LANGUAGE_LATIN;
}
- msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
+ msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
nelt->name,
(gint)nelt->unigramms_total,
- (gint)nelt->bigramms_total,
(gint)nelt->trigramms_total);
g_ptr_array_add (d->languages, nelt);
@@ -381,8 +401,6 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
ret->unigramms = g_hash_table_new_full (rspamd_unigram_hash,
rspamd_unigram_equal, NULL, rspamd_ptr_array_free_hard);
- ret->bigramms = g_hash_table_new_full (rspamd_bigram_hash,
- rspamd_bigram_equal, NULL, rspamd_ptr_array_free_hard);
ret->trigramms = g_hash_table_new_full (rspamd_trigram_hash,
rspamd_trigram_equal, NULL, rspamd_ptr_array_free_hard);
@@ -403,11 +421,10 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
g_free (fname);
}
- msg_info_config ("loaded %d languages, %d unigramms, %d bigramms, "
+ msg_info_config ("loaded %d languages, %d unigramms, "
"%d trigramms",
(gint)ret->languages->len,
g_hash_table_size (ret->unigramms),
- g_hash_table_size (ret->bigramms),
g_hash_table_size (ret->trigramms));
end:
if (gl.gl_pathc > 0) {
@@ -506,7 +523,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
enum rspamd_language_gramm_type {
rs_unigramm = 0,
- rs_bigramm,
rs_trigramm
};
@@ -573,9 +589,6 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
case rs_unigramm:
ngramms = d->unigramms;
break;
- case rs_bigramm:
- ngramms = d->bigramms;
- break;
case rs_trigramm:
ngramms = d->trigramms;
break;
@@ -617,9 +630,6 @@ rspamd_language_detector_detect_word (struct rspamd_task *task,
case rs_unigramm:
wlen = 1;
break;
- case rs_bigramm:
- wlen = 2;
- break;
case rs_trigramm:
wlen = 3;
break;
@@ -848,18 +858,11 @@ rspamd_language_detector_detect (struct rspamd_task *task,
candidates);
if (r == rs_detect_none) {
- msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words, d,
- ucs_tokens, rs_bigramm,
+ msg_debug_lang_det ("short mode; no trigramms found, "
+ "switch to unigramms");
+ r = rspamd_language_detector_try_ngramm (task, default_words,
+ d, ucs_tokens, rs_unigramm,
candidates);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det ("short mode; no trigramms found, "
- "switch to unigramms");
- r = rspamd_language_detector_try_ngramm (task, default_words,
- d, ucs_tokens, rs_unigramm,
- candidates);
- }
}
else if (r == rs_detect_multiple) {
/* Check our guess */