about summary refs log tree commit diff stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 127
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 5
2 files changed, 82 insertions, 50 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 604fc070e..dcd556910 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -251,7 +251,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
- guint64 *hash)
+ guint64 *hash,
+ GArray *cur_words)
{
rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
@@ -265,7 +266,7 @@ rspamd_tokenize_text (const gchar *text, gsize len,
static UBreakIterator* bi = NULL;
if (text == NULL) {
- return NULL;
+ return cur_words;
}
buf.original.begin = text;
@@ -281,8 +282,13 @@ rspamd_tokenize_text (const gchar *text, gsize len,
initial_size = word_decay * 2;
}
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
- initial_size);
+ if (!cur_words) {
+ res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
+ initial_size);
+ }
+ else {
+ res = cur_words;
+ }
if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
@@ -474,71 +480,96 @@ start_over:
#undef SHIFT_EX
-GArray *
-rspamd_tokenize_subject (struct rspamd_task *task)
+static void
+rspamd_add_metawords_from_str (const gchar *beg, gsize len,
+ struct rspamd_task *task)
{
UText utxt = UTEXT_INITIALIZER;
UErrorCode uc_err = U_ZERO_ERROR;
- gsize slen;
- gboolean valid_utf = TRUE;
- GArray *words = NULL;
guint i = 0;
- gint32 uc;
- rspamd_stat_token_t *tok;
+ UChar32 uc;
+ gboolean valid_utf = TRUE;
- if (task->subject) {
- const gchar *p = task->subject;
+ while (i < len) {
+ U8_NEXT (beg, i, len, uc);
- slen = strlen (task->subject);
+ if (((gint32) uc) < 0) {
+ valid_utf = FALSE;
+ break;
+ }
- while (i < slen) {
- U8_NEXT (p, i, slen, uc);
+#if U_ICU_VERSION_MAJOR_NUM < 50
+ if (u_isalpha (uc)) {
+ gint32 sc = ublock_getCode (uc);
- if (((gint32) uc) < 0) {
+ if (sc == UBLOCK_THAI) {
valid_utf = FALSE;
+ msg_info_task ("enable workaround for Thai characters for old libicu");
break;
}
-#if U_ICU_VERSION_MAJOR_NUM < 50
- if (u_isalpha (uc)) {
- gint32 sc = ublock_getCode (uc);
-
- if (sc == UBLOCK_THAI) {
- valid_utf = FALSE;
- msg_info_task ("enable workaround for Thai characters for old libicu");
- break;
- }
- }
-#endif
}
+#endif
+ }
- if (valid_utf) {
- utext_openUTF8 (&utxt,
- task->subject,
- slen,
- &uc_err);
+ if (valid_utf) {
+ utext_openUTF8 (&utxt,
+ beg,
+ len,
+ &uc_err);
- words = rspamd_tokenize_text (task->subject, slen,
- &utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL);
+ task->meta_words = rspamd_tokenize_text (beg, len,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL, task->meta_words);
- utext_close (&utxt);
- }
- else {
- words = rspamd_tokenize_text (task->subject, slen,
- NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL);
- }
+ utext_close (&utxt);
}
+ else {
+ task->meta_words = rspamd_tokenize_text (beg, len,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL, task->meta_words);
+ }
+}
+
+void
+rspamd_tokenize_meta_words (struct rspamd_task *task)
+{
+ guint i = 0;
+ rspamd_stat_token_t *tok;
+
+ if (task->subject) {
+ rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task);
+ }
+
+ if (task->from_mime) {
+ struct rspamd_email_address *addr;
- if (words != NULL) {
+ addr = g_ptr_array_index (task->from_mime, 0);
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+ if (addr->name) {
+ rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
}
}
- return words;
+ if (task->meta_words != NULL) {
+ const gchar *language = NULL;
+
+ if (task->text_parts && task->text_parts->len > 0) {
+ struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0);
+
+ if (tp->language) {
+ language = tp->language;
+ }
+ }
+
+ rspamd_normalize_words (task->meta_words, task->task_pool);
+ rspamd_stem_words (task->meta_words, task->task_pool, language,
+ task->lang_det);
+
+ for (i = 0; i < task->meta_words->len; i++) {
+ tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
+ }
+ }
}
static inline void
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 683d728ed..784426d31 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -43,7 +43,8 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
- guint64 *hash);
+ guint64 *hash,
+ GArray *cur_words);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
@@ -64,7 +65,7 @@ void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
const gchar *language,
struct rspamd_lang_detector *d);
-GArray * rspamd_tokenize_subject (struct rspamd_task *task);
+void rspamd_tokenize_meta_words (struct rspamd_task *task);
#endif
/*
* vi:ts=4