Browse Source

[Minor] Improve language detection debug logging

tags/1.7.0
Vsevolod Stakhov 6 years ago
parent
commit
98f063228f
3 changed files with 72 additions and 29 deletions
  1. 67
    27
      src/libmime/lang_detection.c
  2. 3
    1
      src/libmime/lang_detection.h
  3. 2
    1
      src/libmime/message.c

+ 67
- 27
src/libmime/lang_detection.c View File

@@ -45,6 +45,11 @@ struct rspamd_lang_detector {
gsize short_text_limit;
};

/*
 * Debug logging helper for the language detector.
 *
 * Forwards a printf-style format string and its arguments to
 * rspamd_default_log_function () at G_LOG_LEVEL_DEBUG, together with the
 * literal "langdet", task->task_pool->tag.uid and the name of the calling
 * function (G_STRFUNC).
 *
 * The macro does not take the task as an argument: it expands to an
 * expression that references an identifier called `task`, so it may only be
 * used where a pointer named `task` with a reachable task_pool->tag.uid is in
 * scope.
 *
 * NOTE(review): "langdet" is presumably the log module name and tag.uid the
 * per-task id attached to the log line - confirm against the declaration of
 * rspamd_default_log_function ().
 */
#define msg_debug_lang_det(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \
"langdet", task->task_pool->tag.uid, \
G_STRFUNC, \
__VA_ARGS__)

static guint
rspamd_unigram_hash (gconstpointer key)
{
@@ -406,7 +411,8 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
* Do full guess for a specific ngramm, checking all languages defined
*/
static void
rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
@@ -459,7 +465,8 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
* Check only candidates, if none found, switch to full version
*/
static gboolean
rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
rspamd_language_detector_process_ngramm_update (struct rspamd_task *task,
struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
@@ -500,7 +507,8 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,

if (total_freq == 0) {
/* Nothing found , do full scan which will also update candidates */
rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
rspamd_language_detector_process_ngramm_full (task, d, window,
type, candidates);

return FALSE;
}
@@ -509,7 +517,8 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
}

static gboolean
rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
rspamd_language_detector_update_guess (struct rspamd_task *task,
struct rspamd_lang_detector *d,
rspamd_stat_token_t *tok, GHashTable *candidates,
enum rspamd_language_gramm_type type)
{
@@ -535,14 +544,14 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
!= -1) {

if (rspamd_random_double_fast () > update_prob) {
if (!rspamd_language_detector_process_ngramm_update (d, window,
if (!rspamd_language_detector_process_ngramm_update (task, d, window,
type, candidates)) {
ret = FALSE;
}
}
else {
/* Try to do full update in case if we are missing some candidates */
rspamd_language_detector_process_ngramm_full (d, window, type,
rspamd_language_detector_process_ngramm_full (task, d, window, type,
candidates);
}
}
@@ -551,7 +560,8 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
}

static void
rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
rspamd_language_detector_detect_word (struct rspamd_task *task,
struct rspamd_lang_detector *d,
rspamd_stat_token_t *tok, GHashTable *candidates,
enum rspamd_language_gramm_type type)
{
@@ -574,7 +584,8 @@ rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
rspamd_language_detector_process_ngramm_full (task,
d, window, type, candidates);
}
}

@@ -583,11 +594,13 @@ rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
* has the lowest probabilities
*/
static void
rspamd_language_detector_filter_negligible (GHashTable *candidates)
rspamd_language_detector_filter_negligible (struct rspamd_task *task,
GHashTable *candidates)
{
GHashTableIter it;
gpointer k, v;
struct rspamd_lang_detector_res *cand;
guint filtered = 0;
gdouble max_prob = -(G_MAXDOUBLE);

/* Normalize step */
@@ -618,43 +631,51 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
* prob2 is 2^4 less than prob1
*/
if (max_prob - cand->prob > 1.5) {
msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)",
cand->lang, cand->prob, max_prob);
g_hash_table_iter_remove (&it);
filtered ++;
}
}

msg_debug_lang_det ("removed %d languages", filtered);
}

static void
rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
rspamd_language_detector_detect_type (struct rspamd_task *task,
guint nwords,
struct rspamd_lang_detector *d,
GArray *ucs_tokens,
GHashTable *candidates,
enum rspamd_language_gramm_type type,
gboolean start_over)
{
guint nparts = MIN (ucs_tokens->len, default_words);
guint nparts = MIN (ucs_tokens->len, nwords);
goffset *selected_words;
rspamd_stat_token_t *tok;
guint i;

selected_words = g_new0 (goffset, nparts);
rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
msg_debug_lang_det ("randomly selected %d words", nparts);

/* Deal with the first word in a special case */
tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]);

if (start_over) {
rspamd_language_detector_detect_word (d, tok, candidates, type);
rspamd_language_detector_detect_word (task, d, tok, candidates, type);
}
else {
rspamd_language_detector_update_guess (d, tok, candidates, type);
rspamd_language_detector_update_guess (task, d, tok, candidates, type);
}

for (i = 1; i < nparts; i ++) {
tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]);
rspamd_language_detector_update_guess (d, tok, candidates, type);
rspamd_language_detector_update_guess (task, d, tok, candidates, type);
}

/* Filter negligible candidates */
rspamd_language_detector_filter_negligible (candidates);
rspamd_language_detector_filter_negligible (task, candidates);
}

static gint
@@ -681,14 +702,16 @@ enum rspamd_language_detected_type {
};

static enum rspamd_language_detected_type
rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d,
rspamd_language_detector_try_ngramm (struct rspamd_task *task,
guint nwords,
struct rspamd_lang_detector *d,
GArray *ucs_tokens,
enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
guint cand_len;

rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
rspamd_language_detector_detect_type (task, nwords, d, ucs_tokens, candidates,
type, TRUE);

cand_len = g_hash_table_size (candidates);
@@ -704,7 +727,8 @@ rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d,
}

GPtrArray *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
GArray *ucs_tokens, gsize words_len)
{
GHashTable *candidates, *tcandidates;
@@ -724,34 +748,46 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,

if (words_len < d->short_text_limit) {
/* For short text, start directly from trigramms */
r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
msg_debug_lang_det ("text is less than %z words: %z, start with trigramms",
d->short_text_limit, words_len);
r = rspamd_language_detector_try_ngramm (task, default_words, d,
ucs_tokens, rs_trigramm,
candidates);

if (r == rs_detect_none) {
r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms");
r = rspamd_language_detector_try_ngramm (task, default_words, d,
ucs_tokens, rs_bigramm,
candidates);

if (r == rs_detect_none) {
r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
msg_debug_lang_det ("short mode; no trigramms found, "
"switch to unigramms");
r = rspamd_language_detector_try_ngramm (task, default_words,
d, ucs_tokens, rs_unigramm,
candidates);
}
}
}
else {
/* Start with unigramms */
r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
r = rspamd_language_detector_try_ngramm (task, default_words,
d, ucs_tokens, rs_unigramm,
candidates);

switch (r) {
case rs_detect_none:
case rs_detect_single:
/* No unigramms found or single set found, no reason to continue */;
msg_debug_lang_det ("no unigramms found, try bigramms");
break;
case rs_detect_multiple:
/* Try to improve guess */
msg_debug_lang_det ("unigramms pass finished, found %d candidates",
(gint)g_hash_table_size (candidates));
tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
NULL, g_free);
r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
r = rspamd_language_detector_try_ngramm (task, default_words,
d, ucs_tokens, rs_trigramm,
tcandidates);

switch (r) {
@@ -789,7 +825,8 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
g_hash_table_unref (candidates);
candidates = tcandidates;

msg_err ("trigramms checked, %.3f mean, %.4f stddev", mean, std);
msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev",
mean, std);

if (std / fabs (mean) < 0.01) {
/* Try trigramms */
@@ -797,7 +834,10 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
rspamd_str_equal,
NULL, g_free);

r = rspamd_language_detector_try_ngramm (d, ucs_tokens,
r = rspamd_language_detector_try_ngramm (task,
default_words * 2,
d,
ucs_tokens,
rs_trigramm,
tcandidates);

@@ -819,7 +859,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,

while (g_hash_table_iter_next (&it, &k, &v)) {
cand = (struct rspamd_lang_detector_res *) v;
msg_err ("%s -> %.2f", cand->lang, cand->prob);
msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, cand->prob);
g_ptr_array_add (result, cand);
g_hash_table_iter_steal (&it);
}

+ 3
- 1
src/libmime/lang_detection.h View File

@@ -23,6 +23,7 @@

struct rspamd_lang_detector;
struct rspamd_language_elt;
struct rspamd_task;

struct rspamd_lang_detector_res {
gdouble prob;
@@ -54,7 +55,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
* @param words_len
* @return array of struct rspamd_lang_detector_res sorted by freq descending
*/
GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
GArray *ucs_tokens, gsize words_len);

#endif

+ 2
- 1
src/libmime/message.c View File

@@ -107,7 +107,8 @@ rspamd_extract_words (struct rspamd_task *task,
}
}

part->languages = rspamd_language_detector_detect (task->lang_det,
part->languages = rspamd_language_detector_detect (task,
task->lang_det,
part->ucs32_words, ucs_len);

if (part->languages->len > 0) {

Loading…
Cancel
Save