diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/lang_detection.c | 33 | ||||
-rw-r--r-- | src/libmime/message.c | 2 | ||||
-rw-r--r-- | src/libstat/stat_api.h | 1 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 5 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 100 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 9 |
6 files changed, 124 insertions, 26 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index e2651b63c..b2a2f1f6c 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -884,17 +884,17 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, UErrorCode uc_err = U_ZERO_ERROR; ucs_token->flags = utf_token->flags; - out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1)); - nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1), - utf_token->begin, utf_token->len, &uc_err); + out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1)); + nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1), + utf_token->normalized.begin, utf_token->normalized.len, &uc_err); if (nsym >= 0 && uc_err == U_ZERO_ERROR) { rspamd_language_detector_ucs_lowercase (out, nsym); - ucs_token->begin = (const gchar *) out; - ucs_token->len = nsym; + ucs_token->normalized.begin = (const gchar *) out; + ucs_token->normalized.len = nsym; } else { - ucs_token->len = 0; + ucs_token->normalized.len = 0; } } @@ -942,8 +942,9 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, for (;;) { tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel); /* Filter bad tokens */ - if (tok->len >= 2 && u_isalpha (*(UChar *)tok->begin) - && u_isalpha (*(((UChar *)tok->begin) + (tok->len - 1)))) { + if (tok->normalized.len >= 2 && + u_isalpha (*(UChar *)tok->normalized.begin) && + u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) { offsets_out[out_idx] = sel; break; } @@ -1000,33 +1001,33 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, window[0] = (UChar)' '; for (i = 0; i < wlen - 1; i ++) { - window[i + 1] = *(((UChar *)tok->begin) + i); + window[i + 1] = *(((UChar *)tok->normalized.begin) + i); } } - else if (cur_off + wlen == tok->len + 1) { + else if (cur_off + wlen == tok->normalized.len + 1) { /* Add trailing space */ for (i = 
0; i < wlen - 1; i ++) { - window[i] = *(((UChar *)tok->begin) + cur_off + i); + window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i); } window[wlen - 1] = (UChar)' '; } - else if (cur_off + wlen > tok->len + 1) { + else if (cur_off + wlen > tok->normalized.len + 1) { /* No more fun */ return -1; } else { /* Normal case */ for (i = 0; i < wlen; i++) { - window[i] = *(((UChar *) tok->begin) + cur_off + i); + window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i); } } } else { - if (tok->len <= cur_off) { + if (tok->normalized.len <= cur_off) { return -1; } - window[0] = *(((UChar *)tok->begin) + cur_off); + window[0] = *(((UChar *)tok->normalized.begin) + cur_off); } return cur_off + 1; @@ -1810,7 +1811,7 @@ rspamd_language_detector_unref (struct rspamd_lang_detector* d) gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d, - const gchar *word, gsize wlen) + const gchar *word, gsize wlen) { khiter_t k; rspamd_ftok_t search; diff --git a/src/libmime/message.c b/src/libmime/message.c index 5f9373a9a..7572a4178 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -72,7 +72,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, if (part->utf_words) { #ifdef WITH_SNOWBALL - static GHashTable *stemmers = NULL; + if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) { diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index c046dd227..8ab3332b9 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -36,6 +36,7 @@ #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9) +#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) /* own bit: 9 is BROKEN_UNICODE */ typedef struct rspamd_stat_token_s { rspamd_ftok_t original; diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index d68e3bc60..a19217a89 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -306,9 +306,8 @@ 
rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, token_flags = token->flags; if (task->lang_det) { - if (rspamd_language_detector_is_stop_word (task->lang_det, - token->begin, token->len)) { - /* Skip it */ + if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) { + /* Skip stop word */ continue; } } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 247c24dbd..9bbe899fb 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -21,6 +21,8 @@ #include "tokenizers.h" #include "stat_internal.h" #include "contrib/mumhash/mum.h" +#include "libmime/lang_detection.h" +#include "libstemmer.h" #include <unicode/utf8.h> #include <unicode/uchar.h> @@ -664,5 +666,99 @@ rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) } } -void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, - const gchar *language);
\ No newline at end of file
+/**
+ * Fills the `stemmed` field of the tokens in `words`.
+ * UTF tokens get a lowercased copy of their stem when a Snowball stemmer is
+ * available for `language`, otherwise a lowercased copy of the normalized
+ * form; those that `d` reports as stop words are flagged as such. Other
+ * tokens flagged as raw text get a lowercased copy of the original form.
+ * All copies are allocated from `pool`. Stemmers are cached per language in
+ * a static table, failed creations are remembered as (-1).
+ */
+void
+rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+		const gchar *language,
+		struct rspamd_lang_detector *d)
+{
+	static GHashTable *stemmers = NULL;
+	struct sb_stemmer *stem = NULL;
+	guint i;
+	rspamd_stat_token_t *tok;
+	gchar *dest;
+	gsize dlen;
+
+	if (!stemmers) {
+		stemmers = g_hash_table_new (rspamd_strcase_hash,
+				rspamd_strcase_equal);
+	}
+
+	if (language && language[0] != '\0') {
+		stem = g_hash_table_lookup (stemmers, language);
+
+		if (stem == NULL) {
+			stem = sb_stemmer_new (language, "UTF_8");
+
+			if (stem == NULL) {
+				/* One conversion for the one argument passed */
+				msg_debug_pool ("cannot create lemmatizer for %s language",
+						language);
+				g_hash_table_insert (stemmers, g_strdup (language),
+						GINT_TO_POINTER (-1));
+			}
+			else {
+				g_hash_table_insert (stemmers, g_strdup (language),
+						stem);
+			}
+		}
+		else if (stem == GINT_TO_POINTER (-1)) {
+			/* Negative cache */
+			stem = NULL;
+		}
+	}
+
+	for (i = 0; i < words->len; i++) {
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+			/* Fall back to the normalized form unless a stem is produced */
+			const gchar *src = tok->normalized.begin;
+
+			dlen = tok->normalized.len;
+			if (stem) {
+				const gchar *stemmed;
+
+				/* libstemmer documents a NULL return on out-of-memory */
+				stemmed = (const gchar *)sb_stemmer_stem (stem,
+						(const sb_symbol *)tok->normalized.begin,
+						tok->normalized.len);
+
+				if (stemmed != NULL && stemmed[0] != '\0') {
+					src = stemmed;
+					dlen = strlen (stemmed);
+					tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
+				}
+			}
+
+			dest = rspamd_mempool_alloc (pool, dlen);
+			memcpy (dest, src, dlen);
+			rspamd_str_lc_utf8 (dest, dlen);
+			tok->stemmed.len = dlen;
+			tok->stemmed.begin = dest;
+
+			/* Stop words can only be looked up if a detector was supplied */
+			if (d != NULL && tok->stemmed.len > 0 &&
+					rspamd_language_detector_is_stop_word (d,
+						tok->stemmed.begin, tok->stemmed.len)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+			}
+		}
+		else if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+			/* Raw text, lowercase */
+			dest = rspamd_mempool_alloc (pool, tok->original.len);
+			memcpy (dest, tok->original.begin, tok->original.len);
+			rspamd_str_lc (dest, tok->original.len);
+			tok->stemmed.len = tok->original.len;
+			tok->stemmed.begin = dest;
+		}
+	}
+}
\ No newline at end of file diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 9a5561671..eb4a285de 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -54,13 +54,14 @@ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, GPtrArray *result); gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, - struct rspamd_tokenizer_config *cf, - gsize *len); + struct rspamd_tokenizer_config *cf, + gsize *len); +struct rspamd_lang_detector; void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool); - void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, - const gchar *language); + const gchar *language, + struct rspamd_lang_detector *d); GArray * rspamd_tokenize_subject (struct rspamd_task *task); #endif |