/*- * Copyright 2016 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Common tokenization functions */ #include "rspamd.h" #include "tokenizers.h" #include "stat_internal.h" #include "contrib/mumhash/mum.h" #include "libmime/lang_detection.h" #include "libstemmer.h" #include #include #include #include #include #if U_ICU_VERSION_MAJOR_NUM >= 44 #include #endif #include typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, rspamd_stat_token_t * token, GList **exceptions, gsize *rl, gboolean check_signature); const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* Get next word from specified f_str_t buf */ static gboolean rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; const gchar *p; struct rspamd_process_exception *ex = NULL; if (buf == NULL) { return FALSE; } g_assert (cur != NULL); if (exceptions != NULL && *exceptions != NULL) { ex = (*exceptions)->data; } if (token->original.begin == NULL || *cur == NULL) { if (ex != NULL) { if (ex->pos == 0) { token->original.begin = buf->original.begin + ex->len; token->original.len = ex->len; token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; } else { token->original.begin = buf->original.begin; token->original.len = 0; } } else { token->original.begin = buf->original.begin; token->original.len = 0; } *cur = token->original.begin; } token->original.len = 0; pos = *cur - buf->original.begin; if (pos >= buf->original.len) { return FALSE; } remain = buf->original.len - pos; p = *cur; /* Skip non delimiters symbols */ do { if (ex != NULL && ex->pos == pos) { /* Go to the next exception */ *exceptions = g_list_next (*exceptions); *cur = p + ex->len; return TRUE; } pos++; p++; remain--; } while (remain > 0 && t_delimiters[(guchar)*p]); token->original.begin = p; while (remain > 0 && !t_delimiters[(guchar)*p]) { if (ex != NULL && ex->pos == pos) { *exceptions = g_list_next (*exceptions); *cur = p + ex->len; return TRUE; } token->original.len++; pos++; remain--; p++; } if (remain == 0) { return FALSE; } if (rl) { *rl = token->original.len; } token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; *cur = p; return TRUE; } static inline gboolean rspamd_tokenize_check_limit (gboolean decay, guint word_decay, guint nwords, guint64 *hv, guint64 *prob, const rspamd_stat_token_t *token, gssize remain, gssize total) { static const gdouble avg_word_len = 6.0; if (!decay) { if (token->original.len >= sizeof (guint64)) { #ifdef _MUM_UNALIGNED_ACCESS *hv = mum_hash_step (*hv, *(guint64 *)token->original.begin); #else guint64 tmp; memcpy (&tmp, token->original.begin, sizeof (tmp)); *hv = mum_hash_step (*hv, tmp); #endif } /* Check for decay */ if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) { /* Start decay */ gdouble decay_prob; *hv = mum_hash_finish (*hv); /* We assume that word is 6 symbols length in average */ decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10; decay_prob = floor (decay_prob) / 10.0; if (decay_prob >= 1.0) { *prob = G_MAXUINT64; } else { *prob = decay_prob * G_MAXUINT64; } return TRUE; } } else { /* Decaying probability */ /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL; if (*hv > *prob) { return TRUE; } } return FALSE; } static inline gboolean rspamd_utf_word_valid (const guchar *text, const guchar *end, gint32 start, gint32 finish) { const guchar *st = text + start, *fin = text + finish; UChar32 c; if (st >= end || fin > end || st >= fin) { return FALSE; } U8_NEXT (text, start, finish, c); if (u_isJavaIDPart (c)) { return TRUE; } return FALSE; } #define SHIFT_EX do { \ cur = g_list_next (cur); \ if (cur) { \ ex = (struct rspamd_process_exception *) cur->data; \ } \ else { \ ex = NULL; \ } \ } while(0) static inline void rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res) { rspamd_stat_token_t token; memset (&token, 0, sizeof (token)); if (ex->type == RSPAMD_EXCEPTION_GENERIC) { token.original.begin = "!!EX!!"; token.original.len = sizeof ("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; g_array_append_val (res, token); token.flags = 0; } else if (ex->type == RSPAMD_EXCEPTION_URL) { struct rspamd_url *uri; uri = ex->ptr; if (uri && uri->tldlen > 0) { token.original.begin = uri->tld; token.original.len = uri->tldlen; } else { token.original.begin = "!!EX!!"; token.original.len = sizeof ("!!EX!!") - 1; } token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; g_array_append_val (res, token); token.flags = 0; } } GArray * rspamd_tokenize_text (const gchar *text, gsize len, const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, guint64 *hash, GArray *cur_words) { rspamd_stat_token_t token, buf; const gchar *pos = NULL; gsize l = 0; GArray *res; GList *cur = exceptions; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; gboolean decay = FALSE; guint64 prob = 0; static UBreakIterator* bi = NULL; if (text == NULL) { return cur_words; } buf.original.begin = text; buf.original.len = len; buf.flags = 0; memset (&token, 0, sizeof (token)); if (cfg != NULL) { min_len = cfg->min_word_len; max_len = cfg->max_word_len; word_decay = cfg->words_decay; initial_size = word_decay * 2; } if (!cur_words) { res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), initial_size); } else { res = cur_words; } if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || (max_len > 0 && l > max_len)) { token.original.begin = pos; continue; } if (token.original.len > 0 && rspamd_tokenize_check_limit (decay, word_decay, res->len, &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; } else { token.original.begin = pos; continue; } } g_array_append_val (res, token); token.original.begin = pos; } } else { /* UTF8 boundaries */ UErrorCode uc_err = U_ZERO_ERROR; int32_t last, p; struct rspamd_process_exception *ex = NULL; if (bi == NULL) { bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err); g_assert (U_SUCCESS (uc_err)); } ubrk_setUText (bi, (UText*)utxt, &uc_err); last = ubrk_first (bi); p = last; if (cur) { ex = (struct rspamd_process_exception *)cur->data; } while (p != UBRK_DONE) { start_over: token.original.len = 0; if (p > last) { if (ex && cur) { /* Check exception */ if (ex->pos >= last && ex->pos <= p) { /* We have an exception within boundary */ /* First, start to drain exceptions from the start */ while (cur && ex->pos <= last) { /* We have an exception at the beginning, skip those */ last += ex->len; rspamd_tokenize_exception (ex, res); if (last > p) { /* Exception spread over the boundaries */ while (last > p && p != UBRK_DONE) { p = ubrk_next (bi); } /* We need to reset our scan with new p and last */ SHIFT_EX; goto start_over; } SHIFT_EX; } /* Now, we can have an exception within boundary again */ if (cur && ex->pos >= last && ex->pos <= p) { /* Append the first part */ if (rspamd_utf_word_valid (text, text + len, last, ex->pos)) { token.original.begin = text + last; token.original.len = ex->pos - last; token.flags = 0; g_array_append_val (res, token); } /* Process the current exception */ last += ex->len + (ex->pos - last); rspamd_tokenize_exception (ex, res); if (last > p) { /* Exception spread over the boundaries */ while (last > p && p != UBRK_DONE) { p = ubrk_next (bi); } /* We need to reset our scan with new p and last */ SHIFT_EX; goto start_over; } SHIFT_EX; } else if (p > last) { if (rspamd_utf_word_valid (text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; } } } else if (ex->pos < last) { /* Forward exceptions list */ while (cur && ex->pos <= last) { /* We have an exception at the beginning, skip those */ SHIFT_EX; } if (rspamd_utf_word_valid (text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; } } else { /* No exceptions within boundary */ if (rspamd_utf_word_valid (text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; } } } else { if (rspamd_utf_word_valid (text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; } } if (token.original.len > 0 && rspamd_tokenize_check_limit (decay, word_decay, res->len, &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; } else { token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED; } } } if (token.original.len > 0) { g_array_append_val (res, token); } last = p; p = ubrk_next (bi); } } if (!decay) { hv = mum_hash_finish (hv); } if (hash) { *hash = hv; } return res; } #undef SHIFT_EX static void rspamd_add_metawords_from_str (const gchar *beg, gsize len, struct rspamd_task *task) { UText utxt = UTEXT_INITIALIZER; UErrorCode uc_err = U_ZERO_ERROR; guint i = 0; UChar32 uc; gboolean valid_utf = TRUE; while (i < len) { U8_NEXT (beg, i, len, uc); if (((gint32) uc) < 0) { valid_utf = FALSE; break; } #if U_ICU_VERSION_MAJOR_NUM < 50 if (u_isalpha (uc)) { gint32 sc = ublock_getCode (uc); if (sc == UBLOCK_THAI) { valid_utf = FALSE; msg_info_task ("enable workaround for Thai characters for old libicu"); break; } } #endif } if (valid_utf) { utext_openUTF8 (&utxt, beg, len, &uc_err); task->meta_words = rspamd_tokenize_text (beg, len, &utxt, RSPAMD_TOKENIZE_UTF, task->cfg, NULL, NULL, task->meta_words); utext_close (&utxt); } else { task->meta_words = rspamd_tokenize_text (beg, len, NULL, RSPAMD_TOKENIZE_RAW, task->cfg, NULL, NULL, task->meta_words); } } void rspamd_tokenize_meta_words (struct rspamd_task *task) { guint i = 0; rspamd_stat_token_t *tok; if (task->subject) { rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task); } if (task->from_mime && task->from_mime->len > 0) { struct rspamd_email_address *addr; addr = g_ptr_array_index (task->from_mime, 0); if (addr->name) { rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task); } } if (task->meta_words != NULL) { const gchar *language = NULL; if (task->text_parts && task->text_parts->len > 0) { struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0); if (tp->language) { language = tp->language; } } rspamd_normalize_words (task->meta_words, task->task_pool); rspamd_stem_words (task->meta_words, task->task_pool, language, task->lang_det); for (i = 0; i < task->meta_words->len; i++) { tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; } } } static inline void rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, rspamd_stat_token_t *tok, rspamd_mempool_t *pool) { UChar32 *dest, t, *d; gint32 i = 0; dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32)); d = dest; while (i < srclen) { U16_NEXT_UNSAFE (src, i, t); if (u_isgraph (t)) { UCharCategory cat; cat = u_charType (t); #if U_ICU_VERSION_MAJOR_NUM >= 57 if (u_hasBinaryProperty (t, UCHAR_EMOJI)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI; } #endif if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || cat == U_MATH_SYMBOL || cat == U_CURRENCY_SYMBOL || cat == U_INITIAL_PUNCTUATION || cat == U_FINAL_PUNCTUATION) { *d++ = u_tolower (t); } } else { /* Invisible spaces ! */ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES; } } tok->unicode.begin = dest; tok->unicode.len = d - dest; } static inline void rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) { guint i, doff = 0; gsize utflen = 0; gchar *dest; UChar32 t; for (i = 0; i < tok->unicode.len; i ++) { utflen += U8_LENGTH (tok->unicode.begin[i]); } dest = rspamd_mempool_alloc (pool, utflen + 1); for (i = 0; i < tok->unicode.len; i ++) { t = tok->unicode.begin[i]; U8_APPEND_UNSAFE (dest, doff, t); } g_assert (doff <= utflen); dest[doff] = '\0'; tok->normalized.len = doff; tok->normalized.begin = dest; } void rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) { UErrorCode uc_err = U_ZERO_ERROR; UConverter *utf8_converter; UChar tmpbuf[1024]; /* Assume that we have no longer words... */ gsize ulen; utf8_converter = rspamd_get_utf8_converter (); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { ulen = ucnv_toUChars (utf8_converter, tmpbuf, G_N_ELEMENTS (tmpbuf), tok->original.begin, tok->original.len, &uc_err); /* Now, we need to understand if we need to normalise the word */ if (!U_SUCCESS (uc_err)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; tok->unicode.begin = NULL; tok->unicode.len = 0; tok->normalized.begin = NULL; tok->normalized.len = 0; } else { #if U_ICU_VERSION_MAJOR_NUM >= 44 const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); gint32 end; /* We can now check if we need to decompose */ end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err); if (!U_SUCCESS (uc_err)) { rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); tok->normalized.begin = NULL; tok->normalized.len = 0; tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; } else { if (end == ulen) { /* Already normalised, just lowercase */ rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); rspamd_ucs32_to_normalised (tok, pool); } else { /* Perform normalization */ UChar normbuf[1024]; g_assert (end < G_N_ELEMENTS (normbuf)); /* First part */ memcpy (normbuf, tmpbuf, end * sizeof (UChar)); /* Second part */ ulen = unorm2_normalizeSecondAndAppend (norm, normbuf, end, G_N_ELEMENTS (normbuf), tmpbuf + end, ulen - end, &uc_err); if (!U_SUCCESS (uc_err)) { if (uc_err != U_BUFFER_OVERFLOW_ERROR) { msg_warn_pool_check ("cannot normalise text '%*s': %s", (gint)tok->original.len, tok->original.begin, u_errorName (uc_err)); rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); rspamd_ucs32_to_normalised (tok, pool); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; } } else { /* Copy normalised back */ rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED; rspamd_ucs32_to_normalised (tok, pool); } } } #else /* Legacy version with no unorm2 interface */ rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); rspamd_ucs32_to_normalised (tok, pool); #endif } } else { if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { /* Simple lowercase */ gchar *dest; dest = rspamd_mempool_alloc (pool, tok->original.len + 1); rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1); rspamd_str_lc (dest, tok->original.len); tok->normalized.len = tok->original.len; tok->normalized.begin = dest; } } } void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) { rspamd_stat_token_t *tok; guint i; for (i = 0; i < words->len; i++) { tok = &g_array_index (words, rspamd_stat_token_t, i); rspamd_normalize_single_word (tok, pool); } } void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, const gchar *language, struct rspamd_lang_detector *d) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; guint i; rspamd_stat_token_t *tok; gchar *dest; gsize dlen; if (!stemmers) { stemmers = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal); } if (language && language[0] != '\0') { stem = g_hash_table_lookup (stemmers, language); if (stem == NULL) { stem = sb_stemmer_new (language, "UTF_8"); if (stem == NULL) { msg_debug_pool ( "<%s> cannot create lemmatizer for %s language", language); g_hash_table_insert (stemmers, g_strdup (language), GINT_TO_POINTER (-1)); } else { g_hash_table_insert (stemmers, g_strdup (language), stem); } } else if (stem == GINT_TO_POINTER (-1)) { /* Negative cache */ stem = NULL; } } for (i = 0; i < words->len; i++) { tok = &g_array_index (words, rspamd_stat_token_t, i); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { const gchar *stemmed = NULL; stemmed = sb_stemmer_stem (stem, tok->normalized.begin, tok->normalized.len); dlen = stemmed ? strlen (stemmed) : 0; if (dlen > 0) { dest = rspamd_mempool_alloc (pool, dlen + 1); memcpy (dest, stemmed, dlen); dest[dlen] = '\0'; tok->stemmed.len = dlen; tok->stemmed.begin = dest; tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED; } else { /* Fallback */ tok->stemmed.len = tok->normalized.len; tok->stemmed.begin = tok->normalized.begin; } } else { tok->stemmed.len = tok->normalized.len; tok->stemmed.begin = tok->normalized.begin; } if (tok->stemmed.len > 0 && d != NULL && rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD; } } else { if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { /* Raw text, lowercase */ tok->stemmed.len = tok->normalized.len; tok->stemmed.begin = tok->normalized.begin; } } } }