From 2426e04a9aa304ad1d24cbceb91493f205bf5b57 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 29 Apr 2023 14:46:55 +0100 Subject: [PATCH] [Project] Show fasttext info --- src/libmime/lang_detection.c | 11 +++++++++-- src/libmime/lang_detection_fasttext.cxx | 23 +++++++++++++++++++++++ src/libmime/lang_detection_fasttext.h | 7 +++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 57d2f301d..09591438e 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -15,6 +15,7 @@ */ #include "lang_detection.h" +#include "lang_detection_fasttext.h" #include "libserver/logger.h" #include "libcryptobox/cryptobox.h" #include "libutil/multipattern.h" @@ -181,6 +182,7 @@ struct rspamd_lang_detector { UConverter *uchar_converter; gsize short_text_limit; gsize total_occurrences; /* number of all languages found */ + gpointer fasttext_detector; ref_entry_t ref; }; @@ -766,6 +768,7 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d) } kh_destroy (rspamd_stopwords_hash, d->stop_words_norm); + rspamd_lang_detection_fasttext_destroy(d->fasttext_detector); } } @@ -886,10 +889,14 @@ rspamd_language_detector_init (struct rspamd_config *cfg) total += kh_size (ret->trigrams[i]); } + ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg); + char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector); + msg_info_config ("loaded %d languages, " - "%d trigrams", + "%d trigrams; %s", (gint)ret->languages->len, - (gint)total); + (gint)total, fasttext_status); + g_free (fasttext_status); if (stop_words) { ucl_object_unref (stop_words); diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx index cf6b5c852..d6bd96ca1 100644 --- a/src/libmime/lang_detection_fasttext.cxx +++ b/src/libmime/lang_detection_fasttext.cxx @@ -33,6 +33,7 @@ namespace rspamd::langdet { class fasttext_langdet { private: fasttext::FastText ft; + std::string model_fname; bool loaded; struct one_shot_buf : public std::streambuf { @@ -53,6 +54,7 @@ public: try { ft.loadModel(ucl_object_tostring(model)); loaded = true; + model_fname = std::string{ucl_object_tostring(model)}; } catch (std::exception &e) { auto err_message = fmt::format("cannot load fasttext model: {}", e.what()); @@ -93,6 +95,16 @@ public: return nullptr; } + + auto model_info(void) const -> std::string { + if (!loaded) { + return "fasttext model is not loaded"; + } + else { + return fmt::format("fasttext model {}: {} languages, {} tokens", model_fname, + ft.getDictionary()->nlabels(), ft.getDictionary()->ntokens()); + } + } }; } #endif @@ -112,6 +124,17 @@ void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg) #endif } +char *rspamd_lang_detection_fasttext_show_info(void *ud) +{ +#ifndef WITH_FASTTEXT + return g_strdup("fasttext is not compiled in"); +#else + auto model_info = FASTTEXT_MODEL_TO_C_API(ud)->model_info(); + + return g_strdup(model_info.c_str()); +#endif +} + rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, const char *in, size_t len, int k) { diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h index 44bc8bf71..71e253940 100644 --- a/src/libmime/lang_detection_fasttext.h +++ b/src/libmime/lang_detection_fasttext.h @@ -27,6 +27,13 @@ struct rspamd_config; */ void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg); +/** + * Show info about fasttext language detector + * @param ud + * @return + */ +char *rspamd_lang_detection_fasttext_show_info(void *ud); + typedef void * rspamd_fasttext_predict_result_t; /** -- 2.39.5