diff options

| field | value | date |
|---|---|---|
| author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
| committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
| commit | e180ef2ce601b4118dab29ab074712c0a58244e4 (patch) | |
| tree | b3da007cff3ce5964f86e868301fc9cb897ffb70 | |
| parent | 63ef123b048d5f1f2f6a5d172be6dc1a2629e2d7 (diff) | |
| download | rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.tar.gz rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.zip | |

[Project] Finish basic tasks in new unicode project

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/libmime/lang_detection.c | 10 |
| -rw-r--r-- | src/libserver/re_cache.c | 10 |
| -rw-r--r-- | src/libstat/backends/redis_backend.c | 12 |
| -rw-r--r-- | src/libstat/classifiers/bayes.c | 17 |
| -rw-r--r-- | src/libstat/stat_process.c | 10 |
| -rw-r--r-- | src/libstat/tokenizers/osb.c | 30 |
| -rw-r--r-- | src/libutil/shingles.c | 17 |
| -rw-r--r-- | src/lua/lua_mimepart.c | 5 |
| -rw-r--r-- | src/lua/lua_task.c | 4 |
| -rw-r--r-- | src/lua/lua_util.c | 2 |
| -rw-r--r-- | src/plugins/chartable.c | 44 |
| -rw-r--r-- | src/plugins/fuzzy_check.c | 3 |

12 files changed, 96 insertions, 68 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index dbe9dbe95..e80a13e29 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -560,8 +560,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, cur_ucs = ucs_elt->s; nsym = 0; + uc_err = U_ZERO_ERROR; - while (keylen > 0) { + while (cur_utf < end) { *cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf, end, &uc_err); if (!U_SUCCESS (uc_err)) { @@ -569,12 +570,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, } nsym ++; - keylen --; } if (!U_SUCCESS (uc_err)) { - msg_warn_config ("cannot convert key to unicode: %s", - u_errorName (uc_err)); + msg_warn_config ("cannot convert key %*s to unicode: %s", + (gint)keylen, key, u_errorName (uc_err)); continue; } @@ -1178,7 +1178,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task, { guint nparts = MIN (words->len, nwords); goffset *selected_words; - rspamd_stat_token_t *tok, ucs_w; + rspamd_stat_token_t *tok; guint i; selected_words = g_new0 (goffset, nparts); diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 7b7cabb69..e43de2c64 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1223,9 +1223,10 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, if (part->utf_words) { for (j = 0; j < part->utf_words->len; j ++) { - tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j); - scvec[cnt] = tok->begin; - lenvec[cnt++] = tok->len; + tok = &g_array_index (part->utf_words, + rspamd_stat_token_t, j); + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; } } } @@ -1433,6 +1434,9 @@ rspamd_re_cache_type_to_string (enum rspamd_re_type type) case RSPAMD_RE_SELECTOR: ret = "selector"; break; + case RSPAMD_RE_WORDS: + ret = "words"; + break; case RSPAMD_RE_MAX: ret = "invalid class"; break; diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c index 00441a7a6..b003d5a27 
100644 --- a/src/libstat/backends/redis_backend.c +++ b/src/libstat/backends/redis_backend.c @@ -527,14 +527,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b_tokens %b %b:%b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->begin, tok->t1->len, - tok->t2->begin, tok->t2->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len, + tok->t2->stemmed.begin, tok->t2->stemmed.len); } else if (tok->t1) { redisAsyncCommand (rt->redis, NULL, NULL, "HSET %b_tokens %b %b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->begin, tok->t1->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len); } } else { @@ -548,14 +548,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b %s %b:%b", n0, (size_t) l0, "tokens", - tok->t1->begin, tok->t1->len, - tok->t2->begin, tok->t2->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len, + tok->t2->stemmed.begin, tok->t2->stemmed.len); } else if (tok->t1) { redisAsyncCommand (rt->redis, NULL, NULL, "HSET %b %s %b", n0, (size_t) l0, "tokens", - tok->t1->begin, tok->t1->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len); } } diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 934c8d941..2b0cf21e8 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -147,8 +147,8 @@ bayes_classify_token (struct rspamd_classifier *ctx, msg_debug_bayes ( "token(meta) %uL <%*s:%*s> probabilistically skipped", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin); + (int) tok->t1->original.len, tok->t1->original.begin, + (int) tok->t2->original.len, tok->t2->original.begin); } return; @@ -199,8 +199,9 @@ bayes_classify_token (struct rspamd_classifier *ctx, msg_debug_bayes ( "token %uL <%*s:%*s> skipped, prob not in range: %f", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, bayes_spam_prob); + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, 
tok->t2->stemmed.begin, + bayes_spam_prob); return; } @@ -227,8 +228,8 @@ bayes_classify_token (struct rspamd_classifier *ctx, "current spam prob: %.3f, current ham prob: %.3f", token_type, tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, fw, w, total_count, spam_count, ham_count, spam_prob, ham_prob, bayes_spam_prob, bayes_ham_prob, @@ -541,8 +542,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx, msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, " "spam_count: %d, ham_count: %d", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, tok->window_idx, total_cnt, spam_cnt, ham_cnt); } else { diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 0465f0c3c..ed3f78fde 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -41,6 +41,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, lua_State *L = task->cfg->lua_state; ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); + memset (&elt, 0, sizeof (elt)); elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; if (st_ctx->lua_stat_tokens_ref != -1) { @@ -82,8 +83,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, tok.begin = lua_tolstring (L, -1, &tok.len); if (tok.begin && tok.len > 0) { - elt.begin = rspamd_mempool_ftokdup (task->task_pool, &tok); - elt.len = tok.len; + elt.original.begin = + rspamd_mempool_ftokdup (task->task_pool, &tok); + elt.original.len = tok.len; + elt.stemmed.begin = elt.original.begin; + elt.stemmed.len = elt.original.len; + elt.normalized.begin = elt.original.begin; + elt.normalized.len = elt.original.len; g_array_append_val (ar, elt); } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c 
index a19217a89..0b53f8af9 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -304,30 +304,40 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, for (w = 0; w < words->len; w ++) { token = &g_array_index (words, rspamd_stat_token_t, w); token_flags = token->flags; + const gchar *begin; + gsize len; - if (task->lang_det) { - if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) { - /* Skip stop word */ - continue; - } + if (token->flags & + (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + /* Skip stop/skipped words */ + continue; + } + + if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + begin = token->stemmed.begin; + len = token->stemmed.len; + } + else { + begin = token->original.begin; + len = token->original.len; } if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { rspamd_ftok_t ftok; - ftok.begin = token->begin; - ftok.len = token->len; + ftok.begin = begin; + ftok.len = len; cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) { cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, - token->begin, token->len, osb_cf->seed); + begin, len, osb_cf->seed); } else { - rspamd_cryptobox_siphash ((guchar *)&cur, token->begin, - token->len, osb_cf->sk); + rspamd_cryptobox_siphash ((guchar *)&cur, begin, + len, osb_cf->sk); if (prefix) { cur ^= seed; diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index 240facc4a..87099a6e7 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -154,7 +154,8 @@ rspamd_shingles_from_text (GArray *input, if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { for (j = beg; j < i; j ++) { word = &g_array_index (input, rspamd_stat_token_t, j); - row = rspamd_fstring_append (row, word->begin, word->len); + row = rspamd_fstring_append (row, word->stemmed.begin, + word->stemmed.len); } /* Now we need to create a new row here */ @@ -172,7 +173,7 @@ 
rspamd_shingles_from_text (GArray *input, } } else { - guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed; + guint64 window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed; switch (alg) { case RSPAMD_SHINGLES_XXHASH: @@ -186,27 +187,27 @@ rspamd_shingles_from_text (GArray *input, break; } - memset (res, 0, sizeof (res)); + memset (window, 0, sizeof (window)); for (i = 0; i <= (gint)input->len; i ++) { if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { /* Shift hashes window to right */ for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { - res[j * SHINGLES_WINDOW + k] = - res[j * SHINGLES_WINDOW + k + 1]; + window[j * SHINGLES_WINDOW + k] = + window[j * SHINGLES_WINDOW + k + 1]; } word = &g_array_index (input, rspamd_stat_token_t, beg); /* Insert the last element to the pipe */ memcpy (&seed, keys[j], sizeof (seed)); - res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = + window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = rspamd_cryptobox_fast_hash_specific (ht, - word->begin, word->len, + word->stemmed.begin, word->stemmed.len, seed); val = 0; for (k = 0; k < SHINGLES_WINDOW; k ++) { - val ^= res[j * SHINGLES_WINDOW + k] >> + val ^= window[j * SHINGLES_WINDOW + k] >> (8 * (SHINGLES_WINDOW - k - 1)); } diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 9e74c87c0..d2ff7e8e4 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -775,7 +775,7 @@ lua_textpart_get_words (lua_State *L) for (i = 0; i < part->utf_words->len; i ++) { w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); - lua_pushlstring (L, w->begin, w->len); + lua_pushlstring (L, w->stemmed.begin, w->stemmed.len); lua_rawseti (L, -2, i + 1); } } @@ -983,7 +983,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) for (i = 0; i < part->utf_words->len; i ++) { word = &g_array_index (part->utf_words, rspamd_stat_token_t, i); - rspamd_cryptobox_hash_update (&st, word->begin, word->len); + rspamd_cryptobox_hash_update (&st, + 
word->stemmed.begin, word->stemmed.len); } rspamd_cryptobox_hash_final (&st, digest); diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index b8ac864df..4f28e9492 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -4757,13 +4757,13 @@ lua_push_stat_token (lua_State *L, rspamd_token_t *tok) if (tok->t1) { lua_pushstring (L, "t1"); - lua_pushlstring (L, tok->t1->begin, tok->t1->len); + lua_pushlstring (L, tok->t1->stemmed.begin, tok->t1->stemmed.len); lua_settable (L, -3); } if (tok->t2) { lua_pushstring (L, "t2"); - lua_pushlstring (L, tok->t2->begin, tok->t2->len); + lua_pushlstring (L, tok->t2->stemmed.begin, tok->t2->stemmed.len); lua_settable (L, -3); } diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 9ed095c34..1f9b84c85 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -1154,7 +1154,7 @@ lua_util_tokenize_text (lua_State *L) for (i = 0; i < res->len; i ++) { w = &g_array_index (res, rspamd_stat_token_t, i); - lua_pushlstring (L, w->begin, w->len); + lua_pushlstring (L, w->stemmed.begin, w->stemmed.len); lua_rawseti (L, -2, i + 1); } } diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 0efbe55ca..c566cc517 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -358,12 +358,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, guint *ncap, struct chartable_ctx *chartable_module_ctx) { - const gchar *p, *end; + const UChar32 *p, *end; gdouble badness = 0.0; UChar32 uc; UBlockCode sc; gint last_is_latin = -1; - guint same_script_count = 0, nsym = 0, i = 0; + guint same_script_count = 0, nsym = 0; enum { start_process = 0, got_alpha, @@ -371,13 +371,13 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, got_unknown, } state = start_process, prev_state = start_process; - p = w->begin; - end = p + w->len; + p = w->unicode.begin; + end = p + w->unicode.len; /* We assume that w is normalized */ - while (p + i < end) { - U8_NEXT (p, i, w->len, uc); + while (p < end) { + uc = *p++; 
if (((gint32)uc) < 0) { break; @@ -464,7 +464,8 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, } } - msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin, + msg_debug_chartable ("word %*s, badness: %.2f", + (gint)w->normalized.len, w->normalized.begin, badness); return badness; @@ -490,11 +491,11 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task, got_unknown, } state = start_process; - p = w->begin; - end = p + w->len; + p = w->normalized.begin; + end = p + w->normalized.len; last_sc = 0; - if (w->len > chartable_module_ctx->max_word_len) { + if (w->normalized.len > chartable_module_ctx->max_word_len) { return 0.0; } @@ -549,7 +550,8 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task, badness = 4.0; } - msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin, + msg_debug_chartable ("word %*s, badness: %.2f", + (gint)w->normalized.len, w->normalized.begin, badness); return badness; @@ -572,9 +574,9 @@ rspamd_chartable_process_part (struct rspamd_task *task, for (i = 0; i < part->utf_words->len; i++) { w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); - if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { + if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { - if (IS_PART_UTF (part)) { + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, &ncap, chartable_module_ctx); } @@ -659,6 +661,8 @@ chartable_url_symbol_callback (struct rspamd_task *task, struct rspamd_symcache_item *item, void *unused) { + /* XXX: TODO: unbreak module once URLs unicode project is over */ +#if 0 struct rspamd_url *u; GHashTableIter it; gpointer k, v; @@ -677,10 +681,10 @@ chartable_url_symbol_callback (struct rspamd_task *task, } if (u->hostlen > 0) { - w.begin = u->host; - w.len = u->hostlen; + w.stemmed.begin = u->host; + w.stemmed.len = u->hostlen; - if (g_utf8_validate (w.begin, w.len, NULL)) { + if (g_utf8_validate (w.stemmed.begin, 
w.stemmed.len, NULL)) { cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL, chartable_module_ctx); } @@ -702,10 +706,10 @@ chartable_url_symbol_callback (struct rspamd_task *task, } if (u->hostlen > 0) { - w.begin = u->host; - w.len = u->hostlen; + w.stemmed.begin = u->host; + w.stemmed.len = u->hostlen; - if (g_utf8_validate (w.begin, w.len, NULL)) { + if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) { cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL, chartable_module_ctx); } @@ -721,6 +725,6 @@ chartable_url_symbol_callback (struct rspamd_task *task, cur_score, NULL); } - +#endif rspamd_symcache_finalize_item (task, item); } diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index fa9e9191c..dd59fc542 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1459,7 +1459,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task, for (i = 0; i < words->len; i ++) { word = &g_array_index (words, rspamd_stat_token_t, i); - rspamd_cryptobox_hash_update (&st, word->begin, word->len); + rspamd_cryptobox_hash_update (&st, word->stemmed.begin, + word->stemmed.len); } rspamd_cryptobox_hash_final (&st, shcmd->basic.digest); |