diff options
Diffstat (limited to 'src/libmime/lang_detection_fasttext.cxx')
-rw-r--r-- | src/libmime/lang_detection_fasttext.cxx | 21 |
1 files changed, 14 insertions, 7 deletions
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx index 8ea2706e6..983ff78de 100644 --- a/src/libmime/lang_detection_fasttext.cxx +++ b/src/libmime/lang_detection_fasttext.cxx @@ -22,6 +22,7 @@ #include "libserver/logger.h" #include "contrib/fmt/include/fmt/base.h" #include "stat_api.h" +#include "libserver/word.h" #include <exception> #include <string_view> #include <vector> @@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud) rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, struct rspamd_task *task, - GArray *utf_words, + rspamd_words_t *utf_words, int k) { #ifndef WITH_FASTTEXT return nullptr; #else /* Avoid too long inputs */ - static const unsigned int max_fasttext_input_len = 1024 * 1024; + static const size_t max_fasttext_input_len = 1024 * 1024; auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); std::vector<std::int32_t> words_vec; - words_vec.reserve(utf_words->len); - for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) { - const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i); + if (!utf_words || !utf_words->a) { + return nullptr; + } + + auto words_count = kv_size(*utf_words); + words_vec.reserve(words_count); + + for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) { + const auto *w = &kv_A(*utf_words, i); if (w->original.len > 0) { real_model->word2vec(w->original.begin, w->original.len, words_vec); } } - msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len); + msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count); auto *res = real_model->detect_language(words_vec, k); @@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res #endif } -G_END_DECLS
\ No newline at end of file +G_END_DECLS |