about summary refs log tree commit diff stats
path: root/src/libmime/lang_detection_fasttext.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/libmime/lang_detection_fasttext.cxx')
-rw-r--r-- src/libmime/lang_detection_fasttext.cxx 21
1 files changed, 14 insertions, 7 deletions
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
index 8ea2706e6..983ff78de 100644
--- a/src/libmime/lang_detection_fasttext.cxx
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -22,6 +22,7 @@
#include "libserver/logger.h"
#include "contrib/fmt/include/fmt/base.h"
#include "stat_api.h"
+#include "libserver/word.h"
#include <exception>
#include <string_view>
#include <vector>
@@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud)
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
struct rspamd_task *task,
- GArray *utf_words,
+ rspamd_words_t *utf_words,
int k)
{
#ifndef WITH_FASTTEXT
return nullptr;
#else
/* Avoid too long inputs */
- static const unsigned int max_fasttext_input_len = 1024 * 1024;
+ static const size_t max_fasttext_input_len = 1024 * 1024;
auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
std::vector<std::int32_t> words_vec;
- words_vec.reserve(utf_words->len);
- for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) {
- const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i);
+ if (!utf_words || !utf_words->a) {
+ return nullptr;
+ }
+
+ auto words_count = kv_size(*utf_words);
+ words_vec.reserve(words_count);
+
+ for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) {
+ const auto *w = &kv_A(*utf_words, i);
if (w->original.len > 0) {
real_model->word2vec(w->original.begin, w->original.len, words_vec);
}
}
- msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len);
+ msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count);
auto *res = real_model->detect_language(words_vec, k);
@@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res
#endif
}
-G_END_DECLS
\ No newline at end of file
+G_END_DECLS