Browse Source

[Project] Add preliminary language detector based on fasttext library

tags/3.6
Vsevolod Stakhov 1 year ago
parent
commit
4627303717
No account linked to committer's email address

+ 1
- 0
CMakeLists.txt View File

@@ -247,6 +247,7 @@ if(ENABLE_FASTTEXT MATCHES "ON")
ProcessPackage(FASTTEXT LIBRARY fasttext INCLUDE fasttext.h
INCLUDE_SUFFIXES include/fasttext
ROOT ${FASTTEXT_ROOT_DIR} MODULES fasttext)
SET(WITH_FASTTEXT "1")
endif()

include (CompilerWarnings)

+ 1
- 0
config.h.in View File

@@ -116,6 +116,7 @@
#cmakedefine WITH_LIBUNWIND 1
#cmakedefine WITH_LUA_TRACE 1
#cmakedefine WITH_LUA_REPL 1
#cmakedefine WITH_FASTTEXT 1

#cmakedefine DISABLE_PTHREAD_MUTEX 1


+ 1
- 0
src/libmime/CMakeLists.txt View File

@@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx
${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
)


+ 170
- 0
src/libmime/lang_detection_fasttext.cxx View File

@@ -0,0 +1,170 @@
/*-
* Copyright 2023 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "lang_detection_fasttext.h"

#ifdef WITH_FASTTEXT
#include "fasttext/fasttext.h"
#include "libserver/cfg_file.h"
#include "libserver/logger.h"
#include "fmt/core.h"
#include <exception>
#include <string>
#include <vector>
#include <sstream>
#include <streambuf>
#endif

#ifdef WITH_FASTTEXT
namespace rspamd::langdet {
class fasttext_langdet {
private:
fasttext::FastText ft;
bool loaded;

struct one_shot_buf : public std::streambuf {
explicit one_shot_buf(const char *in, std::size_t sz) {
auto deconst_in = const_cast<char *>(in);
setg(deconst_in, deconst_in, deconst_in + sz);
}
};
public:
explicit fasttext_langdet(struct rspamd_config *cfg) {
const auto *ucl_obj = cfg->rcl_obj;
const auto *opts_section = ucl_object_find_key(ucl_obj, "options");

if (opts_section) {
const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model");

if (model) {
try {
ft.loadModel(ucl_object_tostring(model));
loaded = true;
}
catch (std::exception &e) {
auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
msg_err_config("%s", err_message.c_str());
loaded = false;
}
}
}
}

/* Disallow multiple initialisation */
fasttext_langdet() = delete;
fasttext_langdet(const fasttext_langdet &) = delete;
fasttext_langdet(fasttext_langdet &&) = delete;

~fasttext_langdet() = default;


auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> *
{
if (!loaded) {
return nullptr;
}

/* Hack to deal with streams without copies */
one_shot_buf buf{in, len};
auto stream = std::istream{&buf};
auto predictions = new std::vector<std::pair<fasttext::real, std::string>>;
predictions->reserve(k);
auto res = ft.predictLine(stream, *predictions, k, 0.0f);

if (res) {
return predictions;
}
else {
delete predictions;
}

return nullptr;
}
};
}
#endif

/* C API part */
G_BEGIN_DECLS

/*
 * Helpers that turn the opaque pointers used by the C API back into the
 * C++ objects behind them: the detector object and the predictions vector.
 * They reference fasttext types, so they are only expanded in code that is
 * compiled under WITH_FASTTEXT.
 */
#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)

/*
 * Creates a detector object for the given configuration and hands it out as
 * an opaque pointer; yields a null pointer when built without fasttext.
 */
void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
{
#ifdef WITH_FASTTEXT
	auto *detector = new rspamd::langdet::fasttext_langdet(cfg);

	return static_cast<void *>(detector);
#else
	return nullptr;
#endif
}

/*
 * Runs language detection on `in` (`len` bytes) asking for at most `k`
 * predictions. `ud` is the opaque pointer obtained from
 * rspamd_lang_detection_fasttext_init().
 *
 * Returns an opaque result handle, or a null pointer when built without
 * fasttext, when `ud` is null, or when the detector yields no result.
 */
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
	const char *in, size_t len, int k)
{
#ifndef WITH_FASTTEXT
	return nullptr;
#else
	/* Tolerate a missing detector, like the other entry points do */
	if (ud == nullptr) {
		return nullptr;
	}

	auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);

	try {
		auto *res = real_model->detect_language(in, len, k);

		return static_cast<rspamd_fasttext_predict_result_t>(res);
	}
	catch (...) {
		/* This function is called from C: no exception may leave it */
		return nullptr;
	}
#endif
}

/*
 * Destroys the detector behind the opaque pointer `ud`.
 * Does nothing when built without fasttext.
 */
void rspamd_lang_detection_fasttext_destroy(void *ud)
{
#ifdef WITH_FASTTEXT
	auto *detector = FASTTEXT_MODEL_TO_C_API(ud);

	delete detector;
#endif
}

/*
 * Returns the label of the first prediction stored in `res`, or a null
 * pointer if `res` is null, holds no predictions, or fasttext support is
 * compiled out. The string belongs to the result object.
 */
const char *
rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res)
{
#ifdef WITH_FASTTEXT
	auto *predictions = FASTTEXT_RESULT_TO_C_API(res);

	if (predictions == nullptr || predictions->empty()) {
		return nullptr;
	}

	return predictions->front().second.c_str();
#else
	return nullptr;
#endif
}

/*
 * Returns the probability of the first prediction stored in `res`, or 0
 * if `res` is null, holds no predictions, or fasttext support is compiled
 * out.
 */
float
rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res)
{
#ifdef WITH_FASTTEXT
	auto *predictions = FASTTEXT_RESULT_TO_C_API(res);

	if (predictions == nullptr || predictions->empty()) {
		return 0.0f;
	}

	return predictions->front().first;
#else
	return 0.0f;
#endif
}

/*
 * Frees a result handle produced by rspamd_lang_detection_fasttext_detect().
 * Does nothing when built without fasttext.
 */
void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
{
#ifdef WITH_FASTTEXT
	delete FASTTEXT_RESULT_TO_C_API(res);
#endif
}

G_END_DECLS

+ 70
- 0
src/libmime/lang_detection_fasttext.h View File

@@ -0,0 +1,70 @@
/*-
* Copyright 2023 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
#define RSPAMD_LANG_DETECTION_FASTTEXT_H

#include "config.h"

G_BEGIN_DECLS
struct rspamd_config;
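/**
 * Initialize fasttext language detector
 * @param cfg configuration; the model path is read from the
 *            `fasttext_langdet_model` key of its `options` section
 * @return opaque pointer to the detector (NULL if built without fasttext)
 */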
void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);


typedef void * rspamd_fasttext_predict_result_t;
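/**
 * Detect language using fasttext
 * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init
 * @param in input text
 * @param len length of input text
 * @param k number of results to return
 * @return opaque result handle, or NULL if nothing has been detected; a
 *         non-NULL handle must be released with
 *         rspamd_fasttext_predict_result_destroy
 */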
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
const char *in, size_t len, int k);

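/**
 * Get language from fasttext result
 * @param res result handle returned by rspamd_lang_detection_fasttext_detect
 * @return label of the first prediction, or NULL if `res` is NULL or holds
 *         no predictions; the string is owned by `res`
 */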
const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);

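/**
 * Get probability from fasttext result
 * @param res result handle returned by rspamd_lang_detection_fasttext_detect
 * @return probability of the first prediction, or 0 if `res` is NULL or
 *         holds no predictions
 */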
float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);

/**
 * Destroy fasttext result
 * @param res result handle returned by rspamd_lang_detection_fasttext_detect
 *            (may be NULL)
 */
void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);

/**
 * Destroy fasttext language detector
 * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init
 *           (may be NULL)
 */
void rspamd_lang_detection_fasttext_destroy(void *ud);


G_END_DECLS
#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */

Loading…
Cancel
Save