Browse Source

[Minor] Add diacritics flag for language detector

tags/2.3
Vsevolod Stakhov 4 years ago
parent
commit
9cf530bdb6

+ 1
- 1
contrib/languages-data/cs.json
File diff suppressed because it is too large
View File


+ 1
- 1
contrib/languages-data/fr.json
File diff suppressed because it is too large
View File


+ 1
- 1
contrib/languages-data/pl.json
File diff suppressed because it is too large
View File


+ 35
- 9
src/libmime/lang_detection.c View File

@@ -70,13 +70,6 @@ static const gchar *tier1_langs[] = {
"pt", "ru", "pl", "tk", "th", "ar"
};

/*
 * Flag bits describing a language element.
 * Each non-default member is a distinct single bit; bits 1 and 2 are not
 * assigned by this enum.
 */
enum rspamd_language_elt_flags {
	RS_LANGUAGE_DEFAULT = 0x00, /* no flags set */
	RS_LANGUAGE_LATIN = 0x01,   /* bit 0 */
	RS_LANGUAGE_TIER1 = 0x08,   /* bit 3 */
	RS_LANGUAGE_TIER0 = 0x10,   /* bit 4 */
};

enum rspamd_language_category {
RSPAMD_LANGUAGE_LATIN = 0,
RSPAMD_LANGUAGE_CYRILLIC,
@@ -87,7 +80,7 @@ enum rspamd_language_category {

struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
gint flags; /* enum rspamd_language_elt_flags */
enum rspamd_language_category category;
guint trigramms_words;
guint stop_words;
@@ -353,7 +346,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
{
struct ucl_parser *parser;
ucl_object_t *top;
const ucl_object_t *freqs, *n_words, *cur, *type;
const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
@@ -440,6 +433,29 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
}

/*
 * Optional per-language "flags" key: an array of strings, each naming a
 * flag to set on the element being loaded (currently only "diacritics").
 */
flags = ucl_object_lookup (top, "flags");

/* Validate the object we have just looked up, not the unrelated `type` key */
if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) {
	/* Distinct names: do not shadow the function-level `it` and `cur` */
	ucl_object_iter_t flags_it = NULL;
	const ucl_object_t *cur_flag;

	while ((cur_flag = ucl_object_iterate (flags, &flags_it, true)) != NULL) {
		/* NULL when the array element is not a string */
		const gchar *fl = ucl_object_tostring (cur_flag);

		if (fl) {
			if (strcmp (fl, "diacritics") == 0) {
				nelt->flags |= RS_LANGUAGE_DIACRITICS;
			}
			else {
				msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
			}
		}
		else {
			/* Never hand a NULL string to strcmp; just report the bad element */
			msg_debug_config ("unknown flags type of language %s", nelt->name);
		}
	}
}

if (stop_words) {
const ucl_object_t *specific_stop_words;

@@ -1902,4 +1918,14 @@ rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
}

return FALSE;
}

gint
rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt)
{
if (elt) {
return elt->flags;
}

return 0;
}

+ 14
- 1
src/libmime/lang_detection.h View File

@@ -50,6 +50,14 @@ enum rspamd_unicode_scripts {
RSPAMD_UNICODE_HANGUL = (1 << 16),
};

/**
 * Flag bits describing a language element.
 * Each non-default member is a distinct single bit, so values can be
 * combined with bitwise OR; bits 1 and 2 are not assigned by this enum.
 */
enum rspamd_language_elt_flags {
	RS_LANGUAGE_DEFAULT = 0x00,    /* no flags set */
	RS_LANGUAGE_LATIN = 0x01,      /* bit 0 */
	RS_LANGUAGE_TIER1 = 0x08,      /* bit 3 */
	RS_LANGUAGE_TIER0 = 0x10,      /* bit 4 */
	RS_LANGUAGE_DIACRITICS = 0x20, /* bit 5: "diacritics" flag from the language data */
};

struct rspamd_lang_detector_res {
gdouble prob;
const gchar *lang;
@@ -88,7 +96,12 @@ gboolean rspamd_language_detector_detect (struct rspamd_task *task,
gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
const gchar *word, gsize wlen);


/**
 * Return language flags for a specific language elt
 * @param elt language element to query; may be NULL
 * @return the element's flags (bits of enum rspamd_language_elt_flags),
 * or 0 if elt is NULL
 */
gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt);
#ifdef __cplusplus
}
#endif

Loading…
Cancel
Save