]> source.dussan.org Git - rspamd.git/commitdiff
[Project] Add preliminary language detector based on fasttext library
authorVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 29 Apr 2023 13:22:41 +0000 (14:22 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 29 Apr 2023 13:22:41 +0000 (14:22 +0100)
CMakeLists.txt
config.h.in
src/libmime/CMakeLists.txt
src/libmime/lang_detection_fasttext.cxx [new file with mode: 0644]
src/libmime/lang_detection_fasttext.h [new file with mode: 0644]

index 8ab5658d36b8b8abaebc8e06eedb20a4c777c764..ac25856699d221f90fc4a7dc301bf8ba57546ed3 100644 (file)
@@ -247,6 +247,7 @@ if(ENABLE_FASTTEXT MATCHES "ON")
        ProcessPackage(FASTTEXT LIBRARY fasttext INCLUDE fasttext.h
                        INCLUDE_SUFFIXES include/fasttext
                        ROOT ${FASTTEXT_ROOT_DIR} MODULES fasttext)
+       SET(WITH_FASTTEXT "1")
 endif()
 
 include (CompilerWarnings)
index 4fedba7245167346e1365458bf183412bd8e5ba2..b70308331ab4160c79e021b8824b846d77f05e3f 100644 (file)
 #cmakedefine WITH_LIBUNWIND      1
 #cmakedefine WITH_LUA_TRACE      1
 #cmakedefine WITH_LUA_REPL       1
+#cmakedefine WITH_FASTTEXT       1
 
 #cmakedefine DISABLE_PTHREAD_MUTEX 1
 
index 4a64aac58cc883ff49fd48b2d7baa122801bf0c0..09e5dbfcafabeae97aa7500152843064a27852db 100644 (file)
@@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
+               ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx
                ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
                )
 
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
new file mode 100644 (file)
index 0000000..cf6b5c8
--- /dev/null
@@ -0,0 +1,170 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection_fasttext.h"
+
+#ifdef WITH_FASTTEXT
+#include "fasttext/fasttext.h"
+#include "libserver/cfg_file.h"
+#include "libserver/logger.h"
+#include "fmt/core.h"
+#include <exception>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <streambuf>
+#endif
+
+#ifdef WITH_FASTTEXT
+namespace rspamd::langdet {
+class fasttext_langdet {
+private:
+       fasttext::FastText ft;
+       bool loaded;
+
+       struct one_shot_buf : public std::streambuf {
+               explicit one_shot_buf(const char *in, std::size_t sz) {
+                       auto deconst_in = const_cast<char *>(in);
+                       setg(deconst_in, deconst_in, deconst_in + sz);
+               }
+       };
+public:
+       explicit fasttext_langdet(struct rspamd_config *cfg) {
+               const auto *ucl_obj = cfg->rcl_obj;
+               const auto *opts_section = ucl_object_find_key(ucl_obj, "options");
+
+               if (opts_section) {
+                       const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model");
+
+                       if (model) {
+                               try {
+                                       ft.loadModel(ucl_object_tostring(model));
+                                       loaded = true;
+                               }
+                               catch (std::exception &e) {
+                                       auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
+                                       msg_err_config("%s", err_message.c_str());
+                                       loaded = false;
+                               }
+                       }
+               }
+       }
+
+       /* Disallow multiple initialisation */
+       fasttext_langdet() = delete;
+       fasttext_langdet(const fasttext_langdet &) = delete;
+       fasttext_langdet(fasttext_langdet &&) = delete;
+
+       ~fasttext_langdet() = default;
+
+
+       auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> *
+       {
+               if (!loaded) {
+                       return nullptr;
+               }
+
+               /* Hack to deal with streams without copies */
+               one_shot_buf buf{in, len};
+               auto stream = std::istream{&buf};
+               auto predictions = new std::vector<std::pair<fasttext::real, std::string>>;
+               predictions->reserve(k);
+               auto res = ft.predictLine(stream, *predictions, k, 0.0f);
+
+               if (res) {
+                       return predictions;
+               }
+               else {
+                       delete predictions;
+               }
+
+               return nullptr;
+       }
+};
+}
+#endif
+
+/* C API part
+ *
+ * The functions below are the C-callable wrappers around
+ * rspamd::langdet::fasttext_langdet.
+ *
+ * NOTE: despite their names, the two macros below convert *from* the opaque
+ * `void *` handles used by the C API back to the underlying C++ objects:
+ *  - FASTTEXT_MODEL_TO_C_API: detector handle -> fasttext_langdet pointer
+ *  - FASTTEXT_RESULT_TO_C_API: result handle -> vector of (real, string) pairs
+ * FASTTEXT_RESULT_TO_C_API names fasttext::real, so both macros are only
+ * expanded inside WITH_FASTTEXT guarded code.
+ */
+G_BEGIN_DECLS
+
+#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
+#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)
+
+/*
+ * Creates the detector object for `cfg` and returns it as an opaque handle.
+ * Builds without fasttext support have nothing to create and return nullptr.
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
+{
+#ifdef WITH_FASTTEXT
+       auto *detector = new rspamd::langdet::fasttext_langdet(cfg);
+
+       return static_cast<void *>(detector);
+#else
+       return nullptr;
+#endif
+}
+
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+                                                                                          const char *in, size_t len, int k)
+{
+#ifndef WITH_FASTTEXT
+       return nullptr;
+#else
+       auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
+       auto *res = real_model->detect_language(in, len, k);
+
+       return (rspamd_fasttext_predict_result_t)res;
+#endif
+}
+
+/*
+ * Releases a detector handle. Does nothing when built without fasttext
+ * support.
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud)
+{
+#ifdef WITH_FASTTEXT
+       auto *detector = FASTTEXT_MODEL_TO_C_API(ud);
+
+       delete detector;
+#endif
+}
+
+/*
+ * Returns the label string of the first prediction stored in `res`.
+ * The string is owned by the result object. nullptr is returned for a NULL
+ * or empty result, and always when built without fasttext support.
+ */
+const char *
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+       auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+       if (predictions == nullptr || predictions->empty()) {
+               return nullptr;
+       }
+
+       return predictions->front().second.c_str();
+#else
+       return nullptr;
+#endif
+}
+
+/*
+ * Returns the numeric part of the first prediction stored in `res`.
+ * 0.0f is returned for a NULL or empty result, and always when built without
+ * fasttext support.
+ */
+float
+rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+       auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+       if (predictions == nullptr || predictions->empty()) {
+               return 0.0f;
+       }
+
+       return predictions->front().first;
+#else
+       return 0.0f;
+#endif
+}
+
+/*
+ * Releases a prediction result handle. Does nothing when built without
+ * fasttext support.
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+       delete FASTTEXT_RESULT_TO_C_API(res);
+#endif
+}
+
+G_END_DECLS
\ No newline at end of file
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644 (file)
index 0000000..44bc8bf
--- /dev/null
@@ -0,0 +1,70 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+/**
+ * Initialize fasttext language detector
+ *
+ * The model path is looked up in the `fasttext_langdet_model` key of the
+ * `options` section of the configuration. If loading the model fails, the
+ * error is logged and a detector is still returned; detection on it yields
+ * NULL.
+ * @param cfg configuration to read the model path from
+ * @return opaque pointer to the detector, to be released with
+ * rspamd_lang_detection_fasttext_destroy(); NULL if rspamd is built without
+ * fasttext support
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+
+typedef  void * rspamd_fasttext_predict_result_t;
+/**
+ * Detect language using fasttext
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init()
+ * @param in input text; exactly `len` bytes are read from it
+ * @param len length of input text
+ * @param k number of results to request from fasttext
+ * @return opaque prediction result (rspamd_fasttext_predict_result_t) that
+ * must be released with rspamd_fasttext_predict_result_destroy(), or NULL if
+ * rspamd is built without fasttext support, no model is loaded, or fasttext
+ * could not produce a prediction
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+               const char *in, size_t len, int k);
+
+/**
+ * Get language from fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect(),
+ * may be NULL
+ * @return label string of the first prediction in the result; the string is
+ * owned by `res` and must not be used after the result is destroyed. NULL if
+ * `res` is NULL or holds no predictions, or if rspamd is built without
+ * fasttext support
+ */
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Get probability from fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect(),
+ * may be NULL
+ * @return probability of the first prediction in the result; 0.0 if `res` is
+ * NULL or holds no predictions, or if rspamd is built without fasttext
+ * support
+ */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect();
+ * strings previously obtained from it via
+ * rspamd_lang_detection_fasttext_get_lang() become invalid
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init()
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */