From 9fd0dc23765c38f315d3bf67516bbe52a1b61bbd Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 30 Dec 2017 19:52:53 +0000 Subject: [PATCH] [Project] Start language detection project --- src/libmime/CMakeLists.txt | 3 +- src/libmime/lang_detection.c | 204 +++++++++++++++++++++++++++++++++++ src/libmime/lang_detection.h | 27 +++++ 3 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 src/libmime/lang_detection.c create mode 100644 src/libmime/lang_detection.h diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index 78ada58e1..81d75d382 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -9,6 +9,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c - ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c) + ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c) SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c new file mode 100644 index 000000000..730733dfb --- /dev/null +++ b/src/libmime/lang_detection.c @@ -0,0 +1,204 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lang_detection.h" +#include "libutil/logger.h" +#include "libcryptobox/cryptobox.h" +#include "ucl.h" +#include +#include +#include + +struct rspamd_language_elt { + const gchar *name; /* e.g. "en" or "ru" */ + guint bigramms_total; /* total frequencies for bigramms */ + GHashTable *bigramms; /* bigrams frequencies */ + guint trigramms_total; /* total frequencies for trigramms */ + GHashTable *trigramms; /* trigramms frequencies */ +}; + +struct rspamd_lang_detector { + GPtrArray *languages; + UConverter *uchar_converter; +}; + +static guint +rspamd_bigram_hash (gconstpointer key) +{ + return rspamd_cryptobox_fast_hash (key, 2 * sizeof (UChar), rspamd_hash_seed ()); +} + +static gboolean +rspamd_bigram_equal (gconstpointer v, gconstpointer v2) +{ + return memcmp (v, v2, 2 * sizeof (UChar)) == 0; +} + +static guint +rspamd_trigram_hash (gconstpointer key) +{ + return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ()); +} + +static gboolean +rspamd_trigram_equal (gconstpointer v, gconstpointer v2) +{ + return memcmp (v, v2, 3 * sizeof (UChar)) == 0; +} + +static void +rspamd_language_detector_read_file (struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + const gchar *path) +{ + struct ucl_parser *parser; + ucl_object_t *top; + const ucl_object_t *freqs, *cur; + ucl_object_iter_t it = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + struct rspamd_language_elt *nelt; + gchar *pos; + + parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS); + if (!ucl_parser_add_file (parser, path)) { + msg_warn_config ("cannot parse file %s: %s", path, + ucl_parser_get_error (parser)); + ucl_parser_free (parser); + + return; + } + + top = ucl_parser_get_object (parser); + ucl_parser_free (parser); + + freqs = ucl_object_lookup (top, "freq"); + + if (freqs == NULL) { + msg_warn_config ("file %s has no 'freq' key", path); + ucl_object_unref (top); + + return; + } + + pos = strrchr (path, '/'); + g_assert (pos != NULL); + nelt = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*nelt)); + nelt->name = rspamd_mempool_strdup (cfg->cfg_pool, pos + 1); + /* Remove extension */ + pos = strchr (nelt->name, '.'); + g_assert (pos != NULL); + *pos = '\0'; + nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal); + nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal); + + while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) { + const gchar *key; + gsize keylen, nsym; + guint freq; + UChar *ucs_key; + + key = ucl_object_keyl (cur, &keylen); + freq = ucl_object_toint (cur); + + if (key != NULL) { + ucs_key = rspamd_mempool_alloc (cfg->cfg_pool, + (keylen + 1) * sizeof (UChar)); + + nsym = ucnv_toUChars (d->uchar_converter, ucs_key, keylen + 1, key, + keylen, &uc_err); + + if (uc_err != U_ZERO_ERROR) { + msg_warn_config ("cannot convert key to unicode: %s", + u_errorName (uc_err)); + + continue; + } + + if (nsym == 2) { + /* We have a digraph */ + g_hash_table_insert (nelt->bigramms, ucs_key, + GUINT_TO_POINTER (freq)); + nelt->bigramms_total += freq; + } + else if (nsym == 3) { + g_hash_table_insert (nelt->trigramms, ucs_key, + GUINT_TO_POINTER (freq)); + nelt->trigramms_total += freq; + } + else if (nsym > 3) { + msg_warn_config ("have more than 3 characters in key: %d", nsym); + } + } + } + + msg_info_config ("loaded %s language, %d digramms, %d trigramms", + nelt->name, (gint)g_hash_table_size (nelt->bigramms), + (gint)g_hash_table_size (nelt->trigramms)); + + g_ptr_array_add (d->languages, nelt); + ucl_object_unref (top); +} + +struct rspamd_lang_detector* +rspamd_language_detector_init (struct rspamd_config *cfg) +{ + const ucl_object_t *section, *elt; + const gchar *languages_path = RSPAMD_PLUGINSDIR "/languages"; + glob_t gl; + size_t i; + UErrorCode uc_err = U_ZERO_ERROR; + GString *languages_pattern; + struct rspamd_lang_detector *ret = NULL; + + section = ucl_object_lookup (cfg->rcl_obj, "lang_detection"); + + if (section != NULL) { + elt = ucl_object_lookup (section, "languages"); + + if (elt) { + languages_path = ucl_object_tostring (elt); + } + } + + languages_pattern = g_string_sized_new (PATH_MAX); + rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path); + memset (&gl, 0, sizeof (gl)); + + if (glob (languages_pattern->str, 0, NULL, &gl) != 0) { + msg_err_config ("cannot read any files matching %v", languages_pattern); + goto end; + } + + ret = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*ret)); + ret->languages = g_ptr_array_sized_new (gl.gl_pathc); + ret->uchar_converter = ucnv_open ("UTF-8", &uc_err); + + g_assert (uc_err == U_ZERO_ERROR); + + for (i = 0; i < gl.gl_pathc; i ++) { + rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]); + } + + msg_info_config ("loaded %d languages", (gint)ret->languages->len); +end: + if (gl.gl_pathc > 0) { + globfree (&gl); + } + + g_string_free (languages_pattern, TRUE); + + return ret; +} diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h new file mode 100644 index 000000000..aa1bf5494 --- /dev/null +++ b/src/libmime/lang_detection.h @@ -0,0 +1,27 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_LANG_DETECTION_H +#define RSPAMD_LANG_DETECTION_H + +#include "config.h" +#include "libserver/cfg_file.h" + +struct rspamd_lang_detector; + +struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg); + +#endif -- 2.39.5