summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2018-02-03 15:25:23 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2018-02-03 15:27:11 +0000
commit: 891590d6093196348f63655a6c308fdca79ec061 (patch)
tree: d6ac5f8781788ba7d142e6a128c432d241c20c20
parent: 85631ea2677e0a95679c38b3103af03fff161d9c (diff)
download: rspamd-891590d6093196348f63655a6c308fdca79ec061.tar.gz
download: rspamd-891590d6093196348f63655a6c308fdca79ec061.zip
[Project] Further improvements to language detector
-rw-r--r-- src/libmime/lang_detection.c 62
1 file changed, 47 insertions, 15 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 2420e1c5f..e15f99f76 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -22,7 +22,7 @@
#include <unicode/utf8.h>
#include <unicode/ucnv.h>
#include <unicode/uchar.h>
-#include <unicode/uscript.h>
+#include <unicode/ustring.h>
#include <math.h>
static const gsize default_short_text_limit = 200;
@@ -647,7 +647,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
utf_token->begin, utf_token->len, &uc_err);
- if (nsym >= 0) {
+ if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
rspamd_language_detector_ucs_lowercase (out, nsym);
ucs_token->begin = (const gchar *) out;
ucs_token->len = nsym;
@@ -664,6 +664,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
guint step_len, remainder, i, out_idx;
guint64 coin, sel;
goffset tmp;
+ rspamd_stat_token_t *tok;
g_assert (nwords != 0);
g_assert (offsets_out != NULL);
@@ -694,17 +695,45 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
for (i = step_len + remainder; i < ucs_tokens->len;
i += step_len, out_idx ++) {
+ guint ntries = 0;
coin = rspamd_random_uint64_fast ();
sel = (coin % step_len) + i;
- offsets_out[out_idx] = sel;
+
+ for (;;) {
+ tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
+ /* Filter bad tokens */
+ if (tok->len >= 2 && u_isalpha (*(UChar *)tok->begin)
+ && u_isalpha (*(((UChar *)tok->begin) + (tok->len - 1)))) {
+ offsets_out[out_idx] = sel;
+ break;
+ }
+ else {
+ ntries ++;
+ coin = rspamd_random_uint64_fast ();
+
+ if (ntries < step_len) {
+ sel = (coin % step_len) + i;
+ }
+ else if (ntries < ucs_tokens->len) {
+ sel = coin % ucs_tokens->len;
+ }
+ else {
+ offsets_out[out_idx] = sel;
+ break;
+ }
+ }
+ }
}
+
+
/*
* Fisher-Yates algorithm:
* for i from 0 to n−2 do
- * j ← random integer such that i ≤ j < n
- * exchange a[i] and a[j]
- */
+ * j ← random integer such that i ≤ j < n
+ * exchange a[i] and a[j]
+ */
+#if 0
if (out_idx > 2) {
for (i = 0; i < out_idx - 2; i++) {
coin = rspamd_random_uint64_fast ();
@@ -715,6 +744,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
offsets_out[sel] = tmp;
}
}
+#endif
}
enum rspamd_language_gramm_type {
@@ -749,10 +779,11 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
/* No more fun */
return -1;
}
-
- /* Normal case */
- for (i = 0; i < wlen; i ++) {
- window[i] = *(((UChar *)tok->begin) + cur_off + i);
+ else {
+ /* Normal case */
+ for (i = 0; i < wlen; i++) {
+ window[i] = *(((UChar *) tok->begin) + cur_off + i);
+ }
}
}
else {
@@ -780,8 +811,7 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
struct rspamd_ngramm_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
- /* Ignore if ngramm is found in that amount of languages */
- static const guint languages_cutoff = 10;
+ gdouble mult = 1.0, prob;
switch (type) {
case rs_unigramm:
@@ -795,20 +825,22 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
ar = g_hash_table_lookup (ngramms, window);
- if (ar && ar->len < languages_cutoff) {
+ if (ar) {
PTR_ARRAY_FOREACH (ar, i, elt) {
cand = g_hash_table_lookup (candidates, elt->elt->name);
+ prob = elt->prob * mult;
+
if (cand == NULL) {
cand = g_malloc (sizeof (*cand));
cand->elt = elt->elt;
cand->lang = elt->elt->name;
- cand->prob = elt->prob;
+ cand->prob = prob;
g_hash_table_insert (candidates, (gpointer)cand->lang, cand);
} else {
/* Update guess */
- cand->prob += elt->prob;
+ cand->prob += prob;
}
}
}