diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 14:22:41 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 14:22:41 +0100 |
commit | 4627303717edb6c620b6d7855c5fce50a6c84577 (patch) | |
tree | 81a6309522a0f3f19fac2e075200372c899b30dd /src/libmime | |
parent | 529a1f8b4767ee8a84daaea6f6d825bb4793e1f3 (diff) | |
download | rspamd-4627303717edb6c620b6d7855c5fce50a6c84577.tar.gz rspamd-4627303717edb6c620b6d7855c5fce50a6c84577.zip |
[Project] Add preliminary language detector based on fasttext library
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.cxx | 170 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.h | 70 |
3 files changed, 241 insertions, 0 deletions
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index 4a64aac58..09e5dbfca 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx ) diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx new file mode 100644 index 000000000..cf6b5c852 --- /dev/null +++ b/src/libmime/lang_detection_fasttext.cxx @@ -0,0 +1,170 @@ +/*- + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "lang_detection_fasttext.h" + +#ifdef WITH_FASTTEXT +#include "fasttext/fasttext.h" +#include "libserver/cfg_file.h" +#include "libserver/logger.h" +#include "fmt/core.h" +#include <exception> +#include <string> +#include <vector> +#include <sstream> +#include <streambuf> +#endif + +#ifdef WITH_FASTTEXT +namespace rspamd::langdet { +class fasttext_langdet { +private: + fasttext::FastText ft; + bool loaded; + + struct one_shot_buf : public std::streambuf { + explicit one_shot_buf(const char *in, std::size_t sz) { + auto deconst_in = const_cast<char *>(in); + setg(deconst_in, deconst_in, deconst_in + sz); + } + }; +public: + explicit fasttext_langdet(struct rspamd_config *cfg) { + const auto *ucl_obj = cfg->rcl_obj; + const auto *opts_section = ucl_object_find_key(ucl_obj, "options"); + + if (opts_section) { + const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model"); + + if (model) { + try { + ft.loadModel(ucl_object_tostring(model)); + loaded = true; + } + catch (std::exception &e) { + auto err_message = fmt::format("cannot load fasttext model: {}", e.what()); + msg_err_config("%s", err_message.c_str()); + loaded = false; + } + } + } + } + + /* Disallow multiple initialisation */ + fasttext_langdet() = delete; + fasttext_langdet(const fasttext_langdet &) = delete; + fasttext_langdet(fasttext_langdet &&) = delete; + + ~fasttext_langdet() = default; + + + auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> * + { + if (!loaded) { + return nullptr; + } + + /* Hack to deal with streams without copies */ + one_shot_buf buf{in, len}; + auto stream = std::istream{&buf}; + auto predictions = new std::vector<std::pair<fasttext::real, std::string>>; + predictions->reserve(k); + auto res = ft.predictLine(stream, *predictions, k, 0.0f); + + if (res) { + return predictions; + } + else { + delete predictions; + } + + return nullptr; + } +}; +} +#endif + +/* C API part */ 
+G_BEGIN_DECLS + +#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p) +#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res) + +void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg) +{ +#ifndef WITH_FASTTEXT + return nullptr; +#else + return (void *)new rspamd::langdet::fasttext_langdet(cfg); +#endif +} + +rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, + const char *in, size_t len, int k) +{ +#ifndef WITH_FASTTEXT + return nullptr; +#else + auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); + auto *res = real_model->detect_language(in, len, k); + + return (rspamd_fasttext_predict_result_t)res; +#endif +} + +void rspamd_lang_detection_fasttext_destroy(void *ud) +{ +#ifdef WITH_FASTTEXT + delete FASTTEXT_MODEL_TO_C_API(ud); +#endif +} + +const char * +rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + if (real_res && !real_res->empty()) { + return real_res->front().second.c_str(); + } +#endif + return nullptr; +} + +float +rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + if (real_res && !real_res->empty()) { + return real_res->front().first; + } +#endif + return 0.0f; +} + +void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + delete real_res; +#endif +} + +G_END_DECLS
\ No newline at end of file
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644
index 000000000..44bc8bf71
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+/**
+ * Initialize fasttext language detector
+ * @param cfg rspamd configuration; the model file name is taken from the
+ * `fasttext_langdet_model` key of its `options` section
+ * @return opaque detector handle to pass to rspamd_lang_detection_fasttext_detect
+ * and to release with rspamd_lang_detection_fasttext_destroy; NULL when rspamd
+ * is built without fasttext support
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+
+/** Opaque handle for the predictions produced by a single detect call */
+typedef void * rspamd_fasttext_predict_result_t;
+/**
+ * Detect language using fasttext
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init
+ * @param in input text
+ * @param len length of input text
+ * @param k number of results to return
+ * @return opaque prediction result, or NULL if nothing has been detected (no
+ * model loaded, no prediction produced, or no fasttext support compiled in);
+ * a non-NULL result must be released with rspamd_fasttext_predict_result_destroy
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+ const char *in, size_t len, int k);
+
+/**
+ * Get language from fasttext result
+ * @param res result of rspamd_lang_detection_fasttext_detect, may be NULL
+ * @return label of the first prediction in the result, or NULL if the result
+ * is NULL or empty; the string is owned by `res` and is valid until `res` is
+ * destroyed
+ */
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Get probability from fasttext result
+ * @param res result of rspamd_lang_detection_fasttext_detect, may be NULL
+ * @return value attached to the first prediction in the result, or 0 if the
+ * result is NULL or empty
+ */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext result
+ * @param res result of rspamd_lang_detection_fasttext_detect; NULL is accepted
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */ |