summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/lang_detection.c 44
-rw-r--r-- src/libmime/lang_detection.h 15
2 files changed, 49 insertions, 10 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 9ccd7bef5..af646d7cf 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -70,13 +70,6 @@ static const gchar *tier1_langs[] = {
"pt", "ru", "pl", "tk", "th", "ar"
};
-enum rspamd_language_elt_flags {
- RS_LANGUAGE_DEFAULT = 0,
- RS_LANGUAGE_LATIN = (1 << 0),
- RS_LANGUAGE_TIER1 = (1 << 3),
- RS_LANGUAGE_TIER0 = (1 << 4),
-};
-
enum rspamd_language_category {
RSPAMD_LANGUAGE_LATIN = 0,
RSPAMD_LANGUAGE_CYRILLIC,
@@ -87,7 +80,7 @@ enum rspamd_language_category {
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
- enum rspamd_language_elt_flags flags;
+ gint flags; /* enum rspamd_language_elt_flags */
enum rspamd_language_category category;
guint trigramms_words;
guint stop_words;
@@ -353,7 +346,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
{
struct ucl_parser *parser;
ucl_object_t *top;
- const ucl_object_t *freqs, *n_words, *cur, *type;
+ const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
@@ -440,6 +433,29 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
}
+ flags = ucl_object_lookup (top, "flags");
+ /* Optional "flags" array of strings: unknown names and non-string entries are only logged */
+ if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) {
+ ucl_object_iter_t flags_it = NULL; /* dedicated iterator, does not shadow the outer `it` */
+ const ucl_object_t *flag_obj;
+
+ while ((flag_obj = ucl_object_iterate (flags, &flags_it, true)) != NULL) {
+ const gchar *fl = ucl_object_tostring (flag_obj); /* NULL when the element is not a string */
+
+ if (fl) {
+ if (strcmp (fl, "diacritics") == 0) {
+ nelt->flags |= RS_LANGUAGE_DIACRITICS;
+ }
+ else {
+ msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
+ }
+ }
+ else {
+ msg_debug_config ("unknown flags type of language %s", nelt->name);
+ }
+ }
+ }
+
if (stop_words) {
const ucl_object_t *specific_stop_words;
@@ -1902,4 +1918,14 @@ rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
}
return FALSE;
+}
+
+gint
+rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt)
+{
+ /* A missing element reports an empty flag set */
+ if (!elt) {
+ return 0;
+ }
+ return elt->flags;
} \ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 3eaa4e286..b1382e6ad 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -50,6 +50,14 @@ enum rspamd_unicode_scripts {
RSPAMD_UNICODE_HANGUL = (1 << 16),
};
+enum rspamd_language_elt_flags { /* bit mask; struct rspamd_language_elt keeps it in a gint `flags` field */
+ RS_LANGUAGE_DEFAULT = 0, /* no flag bits set */
+ RS_LANGUAGE_LATIN = (1 << 0),
+ RS_LANGUAGE_TIER1 = (1 << 3), /* NOTE(review): bits 1 and 2 are not assigned in this enum */
+ RS_LANGUAGE_TIER0 = (1 << 4),
+ RS_LANGUAGE_DIACRITICS = (1 << 5), /* set when a language's "flags" array lists "diacritics" */
+};
+
struct rspamd_lang_detector_res {
gdouble prob;
const gchar *lang;
@@ -88,7 +96,12 @@ gboolean rspamd_language_detector_detect (struct rspamd_task *task,
gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
const gchar *word, gsize wlen);
-
+/**
+ * Return language flags for a specific language elt
+ * @param elt language element to query; may be NULL
+ * @return bitwise OR of enum rspamd_language_elt_flags values, or 0 when elt is NULL
+ */
+gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt);
#ifdef __cplusplus
}
#endif