about summary refs log tree commit diff stats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-04 10:32:57 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-04 10:32:57 +0000
commit 9cf530bdb68ec3bfbfec643bf89fb279983e41a9 (patch)
tree cfef69a21fb516fc65e65d084de83aa797e73c52 /src/libmime/lang_detection.c
parent 5ebcabcff378c0c56ed187c60f941b33d06013d7 (diff)
download rspamd-9cf530bdb68ec3bfbfec643bf89fb279983e41a9.tar.gz
rspamd-9cf530bdb68ec3bfbfec643bf89fb279983e41a9.zip
[Minor] Add diacritics flag for language detector
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- src/libmime/lang_detection.c 44
1 files changed, 35 insertions, 9 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 9ccd7bef5..af646d7cf 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -70,13 +70,6 @@ static const gchar *tier1_langs[] = {
"pt", "ru", "pl", "tk", "th", "ar"
};
-enum rspamd_language_elt_flags {
- RS_LANGUAGE_DEFAULT = 0,
- RS_LANGUAGE_LATIN = (1 << 0),
- RS_LANGUAGE_TIER1 = (1 << 3),
- RS_LANGUAGE_TIER0 = (1 << 4),
-};
-
enum rspamd_language_category {
RSPAMD_LANGUAGE_LATIN = 0,
RSPAMD_LANGUAGE_CYRILLIC,
@@ -87,7 +80,7 @@ enum rspamd_language_category {
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
- enum rspamd_language_elt_flags flags;
+ gint flags; /* enum rspamd_language_elt_flags */
enum rspamd_language_category category;
guint trigramms_words;
guint stop_words;
@@ -353,7 +346,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
{
struct ucl_parser *parser;
ucl_object_t *top;
- const ucl_object_t *freqs, *n_words, *cur, *type;
+ const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
@@ -440,6 +433,29 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
}
+	/* Optional "flags" key: an array of flag names for this language. */
+	flags = ucl_object_lookup (top, "flags");
+	if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) {
+		ucl_object_iter_t flags_it = NULL;
+		const ucl_object_t *flag_obj;
+
+		while ((flag_obj = ucl_object_iterate (flags, &flags_it, true)) != NULL) {
+			/* NULL when the array element is not a string. */
+			const gchar *fl = ucl_object_tostring (flag_obj);
+			if (fl) {
+				if (strcmp (fl, "diacritics") == 0) {
+					nelt->flags |= RS_LANGUAGE_DIACRITICS;
+				}
+				else {
+					msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
+				}
+			}
+			else {
+				msg_debug_config ("unknown flags type of language %s", nelt->name);
+			}
+		}
+	}
+
if (stop_words) {
const ucl_object_t *specific_stop_words;
@@ -1902,4 +1918,14 @@ rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
}
return FALSE;
+}
+
+/* Returns the flags bitmask stored in elt, or 0 when elt is NULL. */
+gint
+rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt)
+{
+	if (elt == NULL) {
+		return 0;
+	}
+	return elt->flags;
} \ No newline at end of file