summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-01-27 13:17:50 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-01-27 13:17:50 +0000
commitbb70d081392c8bd57dd456f57b8c1aa0f58f0f71 (patch)
tree9debd30d7000dbdec2858d6efe0f9b4223eac481
parentcd56be702682fdf4b9b0a383a5482d388962ce97 (diff)
downloadrspamd-bb70d081392c8bd57dd456f57b8c1aa0f58f0f71.tar.gz
rspamd-bb70d081392c8bd57dd456f57b8c1aa0f58f0f71.zip
[Project] Add more flags to languages
-rw-r--r--src/libmime/lang_detection.c189
1 files changed, 141 insertions, 48 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 75be74f25..64e820b31 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -29,12 +29,6 @@ static const gsize default_words = 30;
static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
-enum rspamd_language_elt_flags {
- RS_LANGUAGE_DEFAULT = 0,
- RS_LANGUAGE_LATIN = (1 << 0),
- RS_LANGUAGE_TIER1 = (1 << 1),
- RS_LANGUAGE_TIER2 = (1 << 2),
-};
struct rspamd_language_unicode_match {
const gchar *lang;
@@ -69,6 +63,22 @@ static const gchar *unigramms_langs[] = {
"zh-TW"
};
+/*
+ * Top languages
+ */
+static const gchar *tier1_langs[] = {
+ "en", "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
+ "ko", "pt", "ru", "pl", "tk", "th", "ar"
+};
+
+enum rspamd_language_elt_flags {
+ RS_LANGUAGE_DEFAULT = 0,
+ RS_LANGUAGE_LATIN = (1 << 0),
+ RS_LANGUAGE_UNISCRIPT = (1 << 1),
+ RS_LANGUAGE_UNIGRAMM = (1 << 2),
+ RS_LANGUAGE_TIER1 = (1 << 3),
+ RS_LANGUAGE_TIER2 = (1 << 4),
+};
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
@@ -100,6 +110,34 @@ struct rspamd_lang_detector {
INIT_LOG_MODULE(langdet)
+static const struct rspamd_language_unicode_match *
+rspamd_language_search_unicode_match (const gchar *key,
+ const struct rspamd_language_unicode_match *elts, size_t nelts)
+{
+ size_t i;
+
+ for (i = 0; i < nelts; i++) {
+ if (strcmp (elts[i].lang, key) == 0) {
+ return &elts[i];
+ }
+ }
+
+ return NULL;
+}
+
+static gboolean
+rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
+{
+ size_t i;
+
+ for (i = 0; i < nelts; i++) {
+ if (strcmp (elts[i], key) == 0) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
static guint
rspamd_unigram_hash (gconstpointer key)
{
@@ -210,6 +248,11 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
}
}
+struct rspamd_language_ucs_elt {
+ guint freq;
+ UChar s[0];
+};
+
static void
rspamd_language_detector_read_file (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
@@ -221,8 +264,10 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
+ const struct rspamd_language_unicode_match *uc_match;
+ struct rspamd_language_ucs_elt *ucs_elt;
gchar *pos;
- guint total = 0, total_latin = 0, total_ngramms = 0;
+ guint total = 0, total_latin = 0, total_ngramms = 0, i;
parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
if (!ucl_parser_add_file (parser, path)) {
@@ -270,58 +315,105 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
2));
}
- while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
- const gchar *key;
- gsize keylen;
- guint freq, nsym;
- UChar *ucs_key;
+ if ((uc_match = rspamd_language_search_unicode_match (nelt->name, unicode_langs,
+ G_N_ELEMENTS (unicode_langs))) != NULL) {
+ g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code,
+ nelt);
+ nelt->flags |= RS_LANGUAGE_UNISCRIPT;
+ }
+ else {
+ GPtrArray *ngramms;
+ guint nsym;
- key = ucl_object_keyl (cur, &keylen);
- freq = ucl_object_toint (cur);
+ if (rspamd_language_search_str (nelt->name, unigramms_langs,
+ G_N_ELEMENTS (unigramms_langs))) {
+ nelt->flags |= RS_LANGUAGE_UNIGRAMM;
+ total = nelt->unigramms_total;
+ }
+ else {
+ total = nelt->trigramms_total;
+ }
- if (key != NULL) {
- ucs_key = rspamd_mempool_alloc (cfg->cfg_pool,
- (keylen + 1) * sizeof (UChar));
+ it = NULL;
+ ngramms = g_ptr_array_sized_new (freqs->len);
+
+ while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
+ const gchar *key;
+ gsize keylen;
+ guint freq;
+
+ key = ucl_object_keyl (cur, &keylen);
+ freq = ucl_object_toint (cur);
+
+ if (key != NULL) {
+ ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
+ sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
+
+ nsym = ucnv_toUChars (d->uchar_converter,
+ ucs_elt->s, keylen + 1,
+ key,
+ keylen, &uc_err);
+
+ if (uc_err != U_ZERO_ERROR) {
+ msg_warn_config ("cannot convert key to unicode: %s",
+ u_errorName (uc_err));
+
+ continue;
+ }
+
+ rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
+
+ if (nsym == 2) {
+ /* We have a digraph */
+ continue;
+ }
+ else if (nsym == 3 && !(nelt->flags & RS_LANGUAGE_UNIGRAMM)) {
+ g_ptr_array_add (ngramms, ucs_elt);
+ }
+ else if (nsym == 1 && nelt->flags & RS_LANGUAGE_UNIGRAMM) {
+ g_ptr_array_add (ngramms, ucs_elt);
+ }
+ else if (nsym > 3) {
+ msg_warn_config ("have more than 3 characters in key: %d",
+ nsym);
+ continue;
+ }
+
+ if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ total_latin++;
+ }
+
+ ucs_elt->freq = freq;
+
+ total_ngramms++;
+ }
+ }
- nsym = ucnv_toUChars (d->uchar_converter, ucs_key, keylen + 1, key,
- keylen, &uc_err);
+ if (total_latin >= total_ngramms * 2 / 3) {
+ nelt->flags |= RS_LANGUAGE_LATIN;
+ }
- if (uc_err != U_ZERO_ERROR) {
- msg_warn_config ("cannot convert key to unicode: %s",
- u_errorName (uc_err));
+ if (nelt->flags & RS_LANGUAGE_UNIGRAMM) {
+ nsym = 1;
+ }
+ else {
+ nsym = 3;
+ }
- continue;
- }
+ PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
- rspamd_language_detector_ucs_lowercase (ucs_key, nsym);
- if (nsym == 2) {
- /* We have a digraph */
- continue;
- }
- else if (nsym == 3) {
- total = nelt->trigramms_total;
- }
- else if (nsym == 1) {
- total = nelt->unigramms_total;
- }
- else if (nsym > 3) {
- msg_warn_config ("have more than 3 characters in key: %d", nsym);
+ if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
+ rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+ /* Skip latin ngramm for non-latin language to avoid garbadge */
continue;
}
- rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_key, nsym,
- freq, total);
-
- if (rspamd_language_detector_ucs_is_latin (ucs_key, nsym)) {
- total_latin ++;
- }
-
- total_ngramms ++;
+ rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_elt->s,
+ nsym,
+ ucs_elt->freq, total);
}
- }
- if (total_latin >= total_ngramms * 2 / 3) {
- nelt->flags |= RS_LANGUAGE_LATIN;
+ g_ptr_array_free (ngramms, TRUE);
}
msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
@@ -403,6 +495,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
rspamd_unigram_equal, NULL, rspamd_ptr_array_free_hard);
ret->trigramms = g_hash_table_new_full (rspamd_trigram_hash,
rspamd_trigram_equal, NULL, rspamd_ptr_array_free_hard);
+ ret->unicode_scripts = g_hash_table_new (g_int_hash, g_int_equal);
g_assert (uc_err == U_ZERO_ERROR);