]> source.dussan.org Git - rspamd.git/commitdiff
[Project] Start language detection project
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 30 Dec 2017 19:52:53 +0000 (19:52 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 30 Dec 2017 19:52:53 +0000 (19:52 +0000)
src/libmime/CMakeLists.txt
src/libmime/lang_detection.c [new file with mode: 0644]
src/libmime/lang_detection.h [new file with mode: 0644]

index 78ada58e1ffece560acf16c51de6b2abd3d9809a..81d75d382780e6e75e47b9b6cb9211d4da903be9 100644 (file)
@@ -9,6 +9,7 @@ SET(LIBRSPAMDMIMESRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
-                               ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c)
+                               ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
+                               ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c)
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
new file mode 100644 (file)
index 0000000..730733d
--- /dev/null
@@ -0,0 +1,204 @@
+/*-
+ * Copyright 2017 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection.h"
+#include "libutil/logger.h"
+#include "libcryptobox/cryptobox.h"
+#include "ucl.h"
+#include <glob.h>
+#include <unicode/utf8.h>
+#include <unicode/ucnv.h>
+
+struct rspamd_language_elt {
+       const gchar *name; /* e.g. "en" or "ru" */
+       guint bigramms_total; /* total frequencies for bigramms */
+       GHashTable *bigramms; /* bigrams frequencies */
+       guint trigramms_total; /* total frequencies for trigramms */
+       GHashTable *trigramms; /* trigramms frequencies */
+};
+
+struct rspamd_lang_detector {
+       GPtrArray *languages;
+       UConverter *uchar_converter;
+};
+
+static guint
+rspamd_bigram_hash (gconstpointer key)
+{
+       return rspamd_cryptobox_fast_hash (key, 2 * sizeof (UChar), rspamd_hash_seed ());
+}
+
+static gboolean
+rspamd_bigram_equal (gconstpointer v, gconstpointer v2)
+{
+       return memcmp (v, v2, 2 * sizeof (UChar)) == 0;
+}
+
+static guint
+rspamd_trigram_hash (gconstpointer key)
+{
+       return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar), rspamd_hash_seed ());
+}
+
+static gboolean
+rspamd_trigram_equal (gconstpointer v, gconstpointer v2)
+{
+       return memcmp (v, v2, 3 * sizeof (UChar)) == 0;
+}
+
+static void
+rspamd_language_detector_read_file (struct rspamd_config *cfg,
+               struct rspamd_lang_detector *d,
+               const gchar *path)
+{
+       struct ucl_parser *parser;
+       ucl_object_t *top;
+       const ucl_object_t *freqs, *cur;
+       ucl_object_iter_t it = NULL;
+       UErrorCode uc_err = U_ZERO_ERROR;
+       struct rspamd_language_elt *nelt;
+       gchar *pos;
+
+       parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
+       if (!ucl_parser_add_file (parser, path)) {
+               msg_warn_config ("cannot parse file %s: %s", path,
+                               ucl_parser_get_error (parser));
+               ucl_parser_free (parser);
+
+               return;
+       }
+
+       top = ucl_parser_get_object (parser);
+       ucl_parser_free (parser);
+
+       freqs = ucl_object_lookup (top, "freq");
+
+       if (freqs == NULL) {
+               msg_warn_config ("file %s has no 'freq' key", path);
+               ucl_object_unref (top);
+
+               return;
+       }
+
+       pos = strrchr (path, '/');
+       g_assert (pos != NULL);
+       nelt = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*nelt));
+       nelt->name = rspamd_mempool_strdup (cfg->cfg_pool, pos + 1);
+       /* Remove extension */
+       pos = strchr (nelt->name, '.');
+       g_assert (pos != NULL);
+       *pos = '\0';
+       nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal);
+       nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal);
+
+       while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
+               const gchar *key;
+               gsize keylen, nsym;
+               guint freq;
+               UChar *ucs_key;
+
+               key = ucl_object_keyl (cur, &keylen);
+               freq = ucl_object_toint (cur);
+
+               if (key != NULL) {
+                       ucs_key = rspamd_mempool_alloc (cfg->cfg_pool,
+                                       (keylen + 1) * sizeof (UChar));
+
+                       nsym = ucnv_toUChars (d->uchar_converter, ucs_key, keylen + 1, key,
+                                       keylen, &uc_err);
+
+                       if (uc_err != U_ZERO_ERROR) {
+                               msg_warn_config ("cannot convert key to unicode: %s",
+                                               u_errorName (uc_err));
+
+                               continue;
+                       }
+
+                       if (nsym == 2) {
+                               /* We have a digraph */
+                               g_hash_table_insert (nelt->bigramms, ucs_key,
+                                               GUINT_TO_POINTER (freq));
+                               nelt->bigramms_total += freq;
+                       }
+                       else if (nsym == 3) {
+                               g_hash_table_insert (nelt->trigramms, ucs_key,
+                                               GUINT_TO_POINTER (freq));
+                               nelt->trigramms_total += freq;
+                       }
+                       else if (nsym > 3) {
+                               msg_warn_config ("have more than 3 characters in key: %d", nsym);
+                       }
+               }
+       }
+
+       msg_info_config ("loaded %s language, %d digramms, %d trigramms",
+                       nelt->name, (gint)g_hash_table_size (nelt->bigramms),
+                       (gint)g_hash_table_size (nelt->trigramms));
+
+       g_ptr_array_add (d->languages, nelt);
+       ucl_object_unref (top);
+}
+
+struct rspamd_lang_detector*
+rspamd_language_detector_init (struct rspamd_config *cfg)
+{
+       const ucl_object_t *section, *elt;
+       const gchar *languages_path = RSPAMD_PLUGINSDIR "/languages";
+       glob_t gl;
+       size_t i;
+       UErrorCode uc_err = U_ZERO_ERROR;
+       GString *languages_pattern;
+       struct rspamd_lang_detector *ret = NULL;
+
+       section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
+
+       if (section != NULL) {
+               elt = ucl_object_lookup (section, "languages");
+
+               if (elt) {
+                       languages_path = ucl_object_tostring (elt);
+               }
+       }
+
+       languages_pattern = g_string_sized_new (PATH_MAX);
+       rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path);
+       memset (&gl, 0, sizeof (gl));
+
+       if (glob (languages_pattern->str, 0, NULL, &gl) != 0) {
+               msg_err_config ("cannot read any files matching %v", languages_pattern);
+               goto end;
+       }
+
+       ret = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*ret));
+       ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
+       ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
+
+       g_assert (uc_err == U_ZERO_ERROR);
+
+       for (i = 0; i < gl.gl_pathc; i ++) {
+               rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]);
+       }
+
+       msg_info_config ("loaded %d languages", (gint)ret->languages->len);
+end:
+       if (gl.gl_pathc > 0) {
+               globfree (&gl);
+       }
+
+       g_string_free (languages_pattern, TRUE);
+
+       return ret;
+}
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
new file mode 100644 (file)
index 0000000..aa1bf54
--- /dev/null
@@ -0,0 +1,27 @@
+/*-
+ * Copyright 2017 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LANG_DETECTION_H
+#define RSPAMD_LANG_DETECTION_H
+
+#include "config.h"
+#include "libserver/cfg_file.h"
+
+struct rspamd_lang_detector;
+
+struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg);
+
+#endif