diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 17:44:16 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 17:44:16 +0100 |
commit | 68ea1140d77cd4dda13247ec300251563a28c176 (patch) | |
tree | cda81350ccf8e0e40ff3d0e43ff33582558185bf /src/libmime/lang_detection.c | |
parent | 070120ed1370ac7179cf4945195294df6a26b4dc (diff) | |
download | rspamd-68ea1140d77cd4dda13247ec300251563a28c176.tar.gz rspamd-68ea1140d77cd4dda13247ec300251563a28c176.zip |
[Project] Some further fixesvstakhov-fasttext-langdet
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 49 |
1 files changed, 33 insertions, 16 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index d8e81e075..62d04975c 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -174,8 +174,10 @@ KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *, char, false, rspamd_ftok_hash, rspamd_ftok_equal); +KHASH_INIT (rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true, + rspamd_str_hash, rspamd_str_equal); struct rspamd_lang_detector { - GPtrArray *languages; + khash_t(rspamd_languages_hash) *languages; khash_t(rspamd_trigram_hash) *trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */ struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; khash_t(rspamd_stopwords_hash) *stop_words_norm; @@ -686,7 +688,10 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, skipped, loaded, nelt->stop_words, rspamd_language_detector_print_flags (nelt)); - g_ptr_array_add (d->languages, nelt); + int ret; + khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret); + g_assert (ret > 0); /* must be unique */ + kh_value(d->languages, k) = nelt; ucl_object_unref (top); } @@ -764,7 +769,7 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d) } if (d->languages) { - g_ptr_array_free (d->languages, TRUE); + kh_destroy (rspamd_languages_hash, d->languages); } kh_destroy (rspamd_stopwords_hash, d->stop_words_norm); @@ -833,7 +838,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg) } ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret)); - ret->languages = g_ptr_array_sized_new (gl.gl_pathc); + ret->languages = kh_init(rspamd_languages_hash); + kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc); ret->uchar_converter = rspamd_get_utf8_converter (); ret->short_text_limit = short_text_limit; ret->stop_words_norm = kh_init (rspamd_stopwords_hash); @@ -894,7 +900,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg) msg_info_config ("loaded %d languages, " "%d trigrams; %s", - (gint)ret->languages->len, + (gint)kh_size(ret->languages), (gint)total, fasttext_status); g_free (fasttext_status); @@ -1810,25 +1816,28 @@ rspamd_language_detector_detect (struct rspamd_task *task, guint nchinese = 0, nspecial = 0; rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial); - /* Apply unicode scripts heuristic */ - if (rspamd_language_detector_try_uniscript (task, part, nchinese, nspecial)) { - ret = TRUE; - } + /* Disable internal language detection heuristics if we have fasttext */ + if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { + /* Apply unicode scripts heuristic */ + if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) { + ret = TRUE; + } - cat = rspamd_language_detector_get_category (part->unicode_scripts); + cat = rspamd_language_detector_get_category(part->unicode_scripts); - if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) { - ret = TRUE; + if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) { + ret = TRUE; + } } if (!ret) { unsigned ndetected = 0; if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { - rspamd_fasttext_predict_result_t fasttext_predict_result; - fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector, - part->utf_stripped_content->data, - part->utf_stripped_content->len, 4); + rspamd_fasttext_predict_result_t fasttext_predict_result = + rspamd_lang_detection_fasttext_detect(d->fasttext_detector, + part->utf_stripped_content->data, + part->utf_stripped_content->len, 4); ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); @@ -1851,6 +1860,12 @@ rspamd_language_detector_detect (struct rspamd_task *task, cand = kh_value(candidates, k); cand->lang = lang; cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + + /* Find the corresponding language elt */ + k = kh_get(rspamd_languages_hash, d->languages, lang); + if (k != kh_end(d->languages)) { + cand->elt = kh_value(d->languages, k); + } } } @@ -1864,6 +1879,8 @@ rspamd_language_detector_detect (struct rspamd_task *task, r = rs_detect_none; } } + + rspamd_fasttext_predict_result_destroy(fasttext_predict_result); } if (ndetected == 0) { if (part->utf_words->len < default_short_text_limit) { |