From 4627303717edb6c620b6d7855c5fce50a6c84577 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 29 Apr 2023 14:22:41 +0100 Subject: [PATCH] [Project] Add preliminary language detector based on fasttext library --- CMakeLists.txt | 1 + config.h.in | 1 + src/libmime/CMakeLists.txt | 1 + src/libmime/lang_detection_fasttext.cxx | 170 ++++++++++++++++++++++++ src/libmime/lang_detection_fasttext.h | 70 ++++++++++ 5 files changed, 243 insertions(+) create mode 100644 src/libmime/lang_detection_fasttext.cxx create mode 100644 src/libmime/lang_detection_fasttext.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ab5658d3..ac2585669 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,6 +247,7 @@ if(ENABLE_FASTTEXT MATCHES "ON") ProcessPackage(FASTTEXT LIBRARY fasttext INCLUDE fasttext.h INCLUDE_SUFFIXES include/fasttext ROOT ${FASTTEXT_ROOT_DIR} MODULES fasttext) + SET(WITH_FASTTEXT "1") endif() include (CompilerWarnings) diff --git a/config.h.in b/config.h.in index 4fedba724..b70308331 100644 --- a/config.h.in +++ b/config.h.in @@ -116,6 +116,7 @@ #cmakedefine WITH_LIBUNWIND 1 #cmakedefine WITH_LUA_TRACE 1 #cmakedefine WITH_LUA_REPL 1 +#cmakedefine WITH_FASTTEXT 1 #cmakedefine DISABLE_PTHREAD_MUTEX 1 diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index 4a64aac58..09e5dbfca 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx ) diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx new file mode 100644 index 000000000..cf6b5c852 --- /dev/null +++ b/src/libmime/lang_detection_fasttext.cxx @@ -0,0 +1,170 @@ +/*- + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the 
"License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection_fasttext.h"
+
+#ifdef WITH_FASTTEXT
+#include "fasttext/fasttext.h"
+#include "libserver/cfg_file.h"
+#include "libserver/logger.h"
+#include "fmt/core.h"
+#include <exception>
+#include <istream>
+#include <memory>
+#include <streambuf>
+#include <string>
+#include <utility>
+#include <vector>
+#endif
+
+#ifdef WITH_FASTTEXT
+namespace rspamd::langdet {
+/* Prediction list filled by fasttext: (score, label) pairs */
+using predictions_vector = std::vector<std::pair<float, std::string>>;
+
+/**
+ * Language detector backed by a fasttext model.
+ * The model path is taken from the `fasttext_langdet_model` key of the `options`
+ * configuration section; without a successfully loaded model every detection
+ * request yields no result.
+ */
+class fasttext_langdet {
+private:
+	fasttext::FastText ft;
+	/* Must start as false: the constructor only sets it once a model has been loaded */
+	bool loaded = false;
+
+	/* Input-only streambuf over caller memory, so std::istream can read it without a copy */
+	struct one_shot_buf : public std::streambuf {
+		explicit one_shot_buf(const char *in, std::size_t sz) {
+			/* setg() takes non-const pointers; the range is installed as a get area only */
+			auto deconst_in = const_cast<char *>(in);
+			setg(deconst_in, deconst_in, deconst_in + sz);
+		}
+	};
+public:
+	/**
+	 * Try to load the configured model
+	 * @param cfg configuration object whose `rcl_obj` holds the `options` section
+	 */
+	explicit fasttext_langdet(struct rspamd_config *cfg) {
+		const auto *ucl_obj = cfg->rcl_obj;
+		const auto *opts_section = ucl_object_find_key(ucl_obj, "options");
+
+		if (opts_section) {
+			const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model");
+
+			if (model) {
+				/* ucl_object_tostring() gives NULL for a non-string value; never build a std::string from it */
+				const char *model_path = ucl_object_tostring(model);
+
+				if (model_path) {
+					try {
+						ft.loadModel(model_path);
+						loaded = true;
+					}
+					catch (const std::exception &e) {
+						auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
+						msg_err_config("%s", err_message.c_str());
+						loaded = false;
+					}
+				}
+				else {
+					msg_err_config("cannot load fasttext model: fasttext_langdet_model is not a string");
+				}
+			}
+		}
+	}
+
+	/* Disallow multiple initialisation */
+	fasttext_langdet() = delete;
+	fasttext_langdet(const fasttext_langdet &) = delete;
+	fasttext_langdet(fasttext_langdet &&) = delete;
+
+	~fasttext_langdet() = default;
+
+	/**
+	 * Run the model over a text buffer
+	 * @param in input text (not required to be NUL terminated)
+	 * @param len length of `in` in bytes
+	 * @param k maximum number of predictions requested from fasttext
+	 * @return heap allocated prediction list owned by the caller, or nullptr when no
+	 * model is loaded, the arguments are unusable or fasttext reports no prediction
+	 */
+	auto detect_language(const char *in, size_t len, int k) -> predictions_vector *
+	{
+		/* A non-positive k would be converted to a huge size_t by reserve() below */
+		if (!loaded || in == nullptr || k <= 0) {
+			return nullptr;
+		}
+
+		/* Hack to deal with streams without copies */
+		one_shot_buf buf{in, len};
+		std::istream stream{&buf};
+		/* Owned by a unique_ptr until handed out, so nothing leaks if fasttext throws */
+		auto predictions = std::make_unique<predictions_vector>();
+		predictions->reserve(k);
+
+		if (ft.predictLine(stream, *predictions, k, 0.0f)) {
+			return predictions.release();
+		}
+
+		return nullptr;
+	}
+};
+}
+#endif
+
+/* C API part */
+G_BEGIN_DECLS
+
+#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
+#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<rspamd::langdet::predictions_vector *>(res)
+
+/* Creates the detector; yields NULL when compiled without fasttext support */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
+{
+#ifndef WITH_FASTTEXT
+	return nullptr;
+#else
+	return static_cast<void *>(new rspamd::langdet::fasttext_langdet(cfg));
+#endif
+}
+
+/* Returns an opaque prediction list (to be destroyed by the caller) or NULL */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+	const char *in, size_t len, int k)
+{
+#ifndef WITH_FASTTEXT
+	return nullptr;
+#else
+	auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
+
+	/* Report "no result" for a NULL detector instead of dereferencing it */
+	if (real_model == nullptr) {
+		return nullptr;
+	}
+
+	return static_cast<rspamd_fasttext_predict_result_t>(real_model->detect_language(in, len, k));
+#endif
+}
+
+/* Destroys a detector created by rspamd_lang_detection_fasttext_init; NULL is a no-op */
+void rspamd_lang_detection_fasttext_destroy(void *ud)
+{
+#ifdef WITH_FASTTEXT
+	delete FASTTEXT_MODEL_TO_C_API(ud);
+#endif
+}
+
+/* Label of the first prediction; the string stays owned by the result object */
+const char *
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (real_res && !real_res->empty()) {
+		return real_res->front().second.c_str();
+	}
+#endif
+	return nullptr;
+}
+
+/* Score paired with the first prediction, 0 when there is none */
+float
+rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (real_res && !real_res->empty()) {
+		return real_res->front().first;
+	}
+#endif
+	return 0.0f;
+}
+
+/* Frees a prediction list returned by rspamd_lang_detection_fasttext_detect; NULL is a no-op */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
+
+	delete real_res;
+#endif
+}
+
+G_END_DECLS
diff --git
a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644
index 000000000..44bc8bf71
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+/**
+ * Initialize fasttext language detector
+ * @param cfg configuration; the model path is read from its `options.fasttext_langdet_model` key
+ * @return opaque detector pointer (NULL when built without fasttext); free it with rspamd_lang_detection_fasttext_destroy()
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+
+typedef void * rspamd_fasttext_predict_result_t;
+/**
+ * Detect language using fasttext
+ * @param ud opaque detector pointer returned by rspamd_lang_detection_fasttext_init()
+ * @param in input text
+ * @param len length of input text
+ * @param k number of results to return
+ * @return opaque result to be freed with rspamd_fasttext_predict_result_destroy(), or NULL if no prediction is available
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+	const char *in, size_t len, int k);
+
+/**
+ * Get language from fasttext result
+ * @param res opaque result returned by rspamd_lang_detection_fasttext_detect(), may be NULL
+ * @return label of the first prediction (memory owned by `res`), or NULL if `res` is NULL or holds no predictions
+ */
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Get probability from fasttext result
+ * @param res opaque result returned by rspamd_lang_detection_fasttext_detect(), may be NULL
+ * @return value paired with the first prediction, or 0 if `res` is NULL or holds no predictions
+ */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext result
+ * @param res opaque result returned by rspamd_lang_detection_fasttext_detect(); NULL is a no-op
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector `ud` obtained from rspamd_lang_detection_fasttext_init(); NULL is a no-op
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */
-- 
2.39.5