123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- * Common tokenization functions
- */
-
- #include "rspamd.h"
- #include "tokenizers.h"
- #include "stat_internal.h"
- #include "contrib/mumhash/mum.h"
- #include "libmime/lang_detection.h"
- #include "libstemmer.h"
-
- #include <unicode/utf8.h>
- #include <unicode/uchar.h>
- #include <unicode/uiter.h>
- #include <unicode/ubrk.h>
- #include <unicode/ucnv.h>
- #if U_ICU_VERSION_MAJOR_NUM >= 44
- #include <unicode/unorm2.h>
- #endif
-
- #include <math.h>
-
- typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
- rspamd_stat_token_t * token,
- GList **exceptions, gsize *rl, gboolean check_signature);
-
- const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
- };
-
- /* Get next word from specified f_str_t buf */
- static gboolean
- rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
- gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gsize *rl, gboolean unused)
- {
- gsize remain, pos;
- const gchar *p;
- struct rspamd_process_exception *ex = NULL;
-
- if (buf == NULL) {
- return FALSE;
- }
-
- g_assert (cur != NULL);
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- if (token->original.begin == NULL || *cur == NULL) {
- if (ex != NULL) {
- if (ex->pos == 0) {
- token->original.begin = buf->original.begin + ex->len;
- token->original.len = ex->len;
- token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- }
- else {
- token->original.begin = buf->original.begin;
- token->original.len = 0;
- }
- }
- else {
- token->original.begin = buf->original.begin;
- token->original.len = 0;
- }
- *cur = token->original.begin;
- }
-
- token->original.len = 0;
-
- pos = *cur - buf->original.begin;
- if (pos >= buf->original.len) {
- return FALSE;
- }
-
- remain = buf->original.len - pos;
- p = *cur;
-
- /* Skip non delimiters symbols */
- do {
- if (ex != NULL && ex->pos == pos) {
- /* Go to the next exception */
- *exceptions = g_list_next (*exceptions);
- *cur = p + ex->len;
- return TRUE;
- }
- pos++;
- p++;
- remain--;
- } while (remain > 0 && t_delimiters[(guchar)*p]);
-
- token->original.begin = p;
-
- while (remain > 0 && !t_delimiters[(guchar)*p]) {
- if (ex != NULL && ex->pos == pos) {
- *exceptions = g_list_next (*exceptions);
- *cur = p + ex->len;
- return TRUE;
- }
- token->original.len++;
- pos++;
- remain--;
- p++;
- }
-
- if (remain == 0) {
- return FALSE;
- }
-
- if (rl) {
- *rl = token->original.len;
- }
-
- token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-
- *cur = p;
-
- return TRUE;
- }
-
- static inline gboolean
- rspamd_tokenize_check_limit (gboolean decay,
- guint word_decay,
- guint nwords,
- guint64 *hv,
- guint64 *prob,
- const rspamd_stat_token_t *token,
- gssize remain,
- gssize total)
- {
- static const gdouble avg_word_len = 6.0;
-
- if (!decay) {
- if (token->original.len >= sizeof (guint64)) {
- #ifdef _MUM_UNALIGNED_ACCESS
- *hv = mum_hash_step (*hv, *(guint64 *)token->original.begin);
- #else
- guint64 tmp;
- memcpy (&tmp, token->original.begin, sizeof (tmp));
- *hv = mum_hash_step (*hv, tmp);
- #endif
- }
-
- /* Check for decay */
- if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
- /* Start decay */
- gdouble decay_prob;
-
- *hv = mum_hash_finish (*hv);
-
- /* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10;
- decay_prob = floor (decay_prob) / 10.0;
-
- if (decay_prob >= 1.0) {
- *prob = G_MAXUINT64;
- }
- else {
- *prob = decay_prob * G_MAXUINT64;
- }
-
- return TRUE;
- }
- }
- else {
- /* Decaying probability */
- /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
- *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
-
- if (*hv > *prob) {
- return TRUE;
- }
- }
-
- return FALSE;
- }
-
- static inline gboolean
- rspamd_utf_word_valid (const guchar *text, const guchar *end,
- gint32 start, gint32 finish)
- {
- const guchar *st = text + start, *fin = text + finish;
- UChar32 c;
-
- if (st >= end || fin > end || st >= fin) {
- return FALSE;
- }
-
- U8_NEXT (text, start, finish, c);
-
- if (u_isJavaIDPart (c)) {
- return TRUE;
- }
-
- return FALSE;
- }
- #define SHIFT_EX do { \
- cur = g_list_next (cur); \
- if (cur) { \
- ex = (struct rspamd_process_exception *) cur->data; \
- } \
- else { \
- ex = NULL; \
- } \
- } while(0)
-
- static inline void
- rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
- {
- rspamd_stat_token_t token;
-
- memset (&token, 0, sizeof (token));
-
- if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
- token.original.begin = "!!EX!!";
- token.original.len = sizeof ("!!EX!!") - 1;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
- g_array_append_val (res, token);
- token.flags = 0;
- }
- else if (ex->type == RSPAMD_EXCEPTION_URL) {
- struct rspamd_url *uri;
-
- uri = ex->ptr;
-
- if (uri && uri->tldlen > 0) {
- token.original.begin = uri->tld;
- token.original.len = uri->tldlen;
-
- }
- else {
- token.original.begin = "!!EX!!";
- token.original.len = sizeof ("!!EX!!") - 1;
- }
-
- token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val (res, token);
- token.flags = 0;
- }
- }
-
-
- GArray *
- rspamd_tokenize_text (const gchar *text, gsize len,
- const UText *utxt,
- enum rspamd_tokenize_type how,
- struct rspamd_config *cfg,
- GList *exceptions,
- guint64 *hash,
- GArray *cur_words)
- {
- rspamd_stat_token_t token, buf;
- const gchar *pos = NULL;
- gsize l = 0;
- GArray *res;
- GList *cur = exceptions;
- guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
- guint64 hv = 0;
- gboolean decay = FALSE;
- guint64 prob = 0;
- static UBreakIterator* bi = NULL;
-
- if (text == NULL) {
- return cur_words;
- }
-
- buf.original.begin = text;
- buf.original.len = len;
- buf.flags = 0;
-
- memset (&token, 0, sizeof (token));
-
- if (cfg != NULL) {
- min_len = cfg->min_word_len;
- max_len = cfg->max_word_len;
- word_decay = cfg->words_decay;
- initial_size = word_decay * 2;
- }
-
- if (!cur_words) {
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
- initial_size);
- }
- else {
- res = cur_words;
- }
-
- if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
- while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
- if (l == 0 || (min_len > 0 && l < min_len) ||
- (max_len > 0 && l > max_len)) {
- token.original.begin = pos;
- continue;
- }
-
- if (token.original.len > 0 &&
- rspamd_tokenize_check_limit (decay, word_decay, res->len,
- &hv, &prob, &token, pos - text, len)) {
- if (!decay) {
- decay = TRUE;
- }
- else {
- token.original.begin = pos;
- continue;
- }
- }
-
- g_array_append_val (res, token);
- token.original.begin = pos;
- }
- }
- else {
- /* UTF8 boundaries */
- UErrorCode uc_err = U_ZERO_ERROR;
- int32_t last, p;
- struct rspamd_process_exception *ex = NULL;
-
- if (bi == NULL) {
- bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
-
- g_assert (U_SUCCESS (uc_err));
- }
-
- ubrk_setUText (bi, (UText*)utxt, &uc_err);
- last = ubrk_first (bi);
- p = last;
-
- if (cur) {
- ex = (struct rspamd_process_exception *)cur->data;
- }
-
- while (p != UBRK_DONE) {
- start_over:
- token.original.len = 0;
-
- if (p > last) {
- if (ex && cur) {
- /* Check exception */
- if (ex->pos >= last && ex->pos <= p) {
- /* We have an exception within boundary */
- /* First, start to drain exceptions from the start */
- while (cur && ex->pos <= last) {
- /* We have an exception at the beginning, skip those */
- last += ex->len;
- rspamd_tokenize_exception (ex, res);
-
- if (last > p) {
- /* Exception spread over the boundaries */
- while (last > p && p != UBRK_DONE) {
- p = ubrk_next (bi);
- }
-
- /* We need to reset our scan with new p and last */
- SHIFT_EX;
- goto start_over;
- }
-
- SHIFT_EX;
- }
-
- /* Now, we can have an exception within boundary again */
- if (cur && ex->pos >= last && ex->pos <= p) {
- /* Append the first part */
- if (rspamd_utf_word_valid (text, text + len, last,
- ex->pos)) {
- token.original.begin = text + last;
- token.original.len = ex->pos - last;
- token.flags = 0;
- g_array_append_val (res, token);
- }
-
- /* Process the current exception */
- last += ex->len + (ex->pos - last);
-
- rspamd_tokenize_exception (ex, res);
-
- if (last > p) {
- /* Exception spread over the boundaries */
- while (last > p && p != UBRK_DONE) {
- p = ubrk_next (bi);
- }
- /* We need to reset our scan with new p and last */
- SHIFT_EX;
- goto start_over;
- }
-
- SHIFT_EX;
- }
- else if (p > last) {
- if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.original.begin = text + last;
- token.original.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
- RSPAMD_STAT_TOKEN_FLAG_UTF;
- }
- }
- }
- else if (ex->pos < last) {
- /* Forward exceptions list */
- while (cur && ex->pos <= last) {
- /* We have an exception at the beginning, skip those */
- SHIFT_EX;
- }
-
- if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.original.begin = text + last;
- token.original.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
- RSPAMD_STAT_TOKEN_FLAG_UTF;
- }
- }
- else {
- /* No exceptions within boundary */
- if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.original.begin = text + last;
- token.original.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
- RSPAMD_STAT_TOKEN_FLAG_UTF;
- }
- }
- }
- else {
- if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.original.begin = text + last;
- token.original.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
- RSPAMD_STAT_TOKEN_FLAG_UTF;
- }
- }
-
- if (token.original.len > 0 &&
- rspamd_tokenize_check_limit (decay, word_decay, res->len,
- &hv, &prob, &token, p, len)) {
- if (!decay) {
- decay = TRUE;
- } else {
- token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
- }
- }
- }
-
- if (token.original.len > 0) {
- if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
- /* Due to bug in glib ! */
- msg_err ("too many words found: %d, stop tokenization to avoid DoS",
- res->len);
-
- goto end;
- }
- g_array_append_val (res, token);
- }
-
- last = p;
- p = ubrk_next (bi);
- }
- }
-
- end:
- if (!decay) {
- hv = mum_hash_finish (hv);
- }
-
- if (hash) {
- *hash = hv;
- }
-
- return res;
- }
-
- #undef SHIFT_EX
-
- static void
- rspamd_add_metawords_from_str (const gchar *beg, gsize len,
- struct rspamd_task *task)
- {
- UText utxt = UTEXT_INITIALIZER;
- UErrorCode uc_err = U_ZERO_ERROR;
- guint i = 0;
- UChar32 uc;
- gboolean valid_utf = TRUE;
-
- while (i < len) {
- U8_NEXT (beg, i, len, uc);
-
- if (((gint32) uc) < 0) {
- valid_utf = FALSE;
- break;
- }
-
- #if U_ICU_VERSION_MAJOR_NUM < 50
- if (u_isalpha (uc)) {
- gint32 sc = ublock_getCode (uc);
-
- if (sc == UBLOCK_THAI) {
- valid_utf = FALSE;
- msg_info_task ("enable workaround for Thai characters for old libicu");
- break;
- }
- }
- #endif
- }
-
- if (valid_utf) {
- utext_openUTF8 (&utxt,
- beg,
- len,
- &uc_err);
-
- task->meta_words = rspamd_tokenize_text (beg, len,
- &utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL, task->meta_words);
-
- utext_close (&utxt);
- }
- else {
- task->meta_words = rspamd_tokenize_text (beg, len,
- NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL, task->meta_words);
- }
- }
-
- void
- rspamd_tokenize_meta_words (struct rspamd_task *task)
- {
- guint i = 0;
- rspamd_stat_token_t *tok;
-
- if (task->subject) {
- rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task);
- }
-
- if (task->from_mime && task->from_mime->len > 0) {
- struct rspamd_email_address *addr;
-
- addr = g_ptr_array_index (task->from_mime, 0);
-
- if (addr->name) {
- rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
- }
- }
-
- if (task->meta_words != NULL) {
- const gchar *language = NULL;
-
- if (task->text_parts && task->text_parts->len > 0) {
- struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0);
-
- if (tp->language) {
- language = tp->language;
- }
- }
-
- rspamd_normalize_words (task->meta_words, task->task_pool);
- rspamd_stem_words (task->meta_words, task->task_pool, language,
- task->lang_det);
-
- for (i = 0; i < task->meta_words->len; i++) {
- tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
- }
- }
- }
-
- static inline void
- rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
- rspamd_stat_token_t *tok,
- rspamd_mempool_t *pool)
- {
- UChar32 *dest, t, *d;
- gint32 i = 0;
-
- dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32));
- d = dest;
-
- while (i < srclen) {
- U16_NEXT_UNSAFE (src, i, t);
-
- if (u_isgraph (t)) {
- UCharCategory cat;
-
- cat = u_charType (t);
- #if U_ICU_VERSION_MAJOR_NUM >= 57
- if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
- }
- #endif
-
- if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
- cat == U_CONNECTOR_PUNCTUATION ||
- cat == U_MATH_SYMBOL ||
- cat == U_CURRENCY_SYMBOL) {
- *d++ = u_tolower (t);
- }
- }
- else {
- /* Invisible spaces ! */
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
- }
- }
-
- tok->unicode.begin = dest;
- tok->unicode.len = d - dest;
- }
-
- static inline void
- rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok,
- rspamd_mempool_t *pool)
- {
- guint i, doff = 0;
- gsize utflen = 0;
- gchar *dest;
- UChar32 t;
-
- for (i = 0; i < tok->unicode.len; i ++) {
- utflen += U8_LENGTH (tok->unicode.begin[i]);
- }
-
- dest = rspamd_mempool_alloc (pool, utflen + 1);
-
- for (i = 0; i < tok->unicode.len; i ++) {
- t = tok->unicode.begin[i];
- U8_APPEND_UNSAFE (dest, doff, t);
- }
-
- g_assert (doff <= utflen);
- dest[doff] = '\0';
-
- tok->normalized.len = doff;
- tok->normalized.begin = dest;
- }
-
- void
- rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
- {
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_converter;
- UChar tmpbuf[1024]; /* Assume that we have no longer words... */
- gsize ulen;
-
- utf8_converter = rspamd_get_utf8_converter ();
-
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- ulen = ucnv_toUChars (utf8_converter,
- tmpbuf,
- G_N_ELEMENTS (tmpbuf),
- tok->original.begin,
- tok->original.len,
- &uc_err);
-
- /* Now, we need to understand if we need to normalise the word */
- if (!U_SUCCESS (uc_err)) {
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
- tok->unicode.begin = NULL;
- tok->unicode.len = 0;
- tok->normalized.begin = NULL;
- tok->normalized.len = 0;
- }
- else {
- #if U_ICU_VERSION_MAJOR_NUM >= 44
- const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
- gint32 end;
-
- /* We can now check if we need to decompose */
- end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
- tok->normalized.begin = NULL;
- tok->normalized.len = 0;
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
- }
- else {
- if (end == ulen) {
- /* Already normalised, just lowercase */
- rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
- rspamd_ucs32_to_normalised (tok, pool);
- }
- else {
- /* Perform normalization */
- UChar normbuf[1024];
-
- g_assert (end < G_N_ELEMENTS (normbuf));
- /* First part */
- memcpy (normbuf, tmpbuf, end * sizeof (UChar));
- /* Second part */
- ulen = unorm2_normalizeSecondAndAppend (norm,
- normbuf, end,
- G_N_ELEMENTS (normbuf),
- tmpbuf + end,
- ulen - end,
- &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
- msg_warn_pool_check ("cannot normalise text '%*s': %s",
- (gint)tok->original.len, tok->original.begin,
- u_errorName (uc_err));
- rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
- rspamd_ucs32_to_normalised (tok, pool);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
- }
- }
- else {
- /* Copy normalised back */
- rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
- rspamd_ucs32_to_normalised (tok, pool);
- }
- }
- }
- #else
- /* Legacy version with no unorm2 interface */
- rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
- rspamd_ucs32_to_normalised (tok, pool);
- #endif
- }
- }
- else {
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
- /* Simple lowercase */
- gchar *dest;
-
- dest = rspamd_mempool_alloc (pool, tok->original.len + 1);
- rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1);
- rspamd_str_lc (dest, tok->original.len);
- tok->normalized.len = tok->original.len;
- tok->normalized.begin = dest;
- }
- }
- }
-
- void
- rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
- {
- rspamd_stat_token_t *tok;
- guint i;
-
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
- rspamd_normalize_single_word (tok, pool);
- }
- }
-
- void
- rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
- const gchar *language,
- struct rspamd_lang_detector *d)
- {
- static GHashTable *stemmers = NULL;
- struct sb_stemmer *stem = NULL;
- guint i;
- rspamd_stat_token_t *tok;
- gchar *dest;
- gsize dlen;
-
- if (!stemmers) {
- stemmers = g_hash_table_new (rspamd_strcase_hash,
- rspamd_strcase_equal);
- }
-
- if (language && language[0] != '\0') {
- stem = g_hash_table_lookup (stemmers, language);
-
- if (stem == NULL) {
-
- stem = sb_stemmer_new (language, "UTF_8");
-
- if (stem == NULL) {
- msg_debug_pool (
- "<%s> cannot create lemmatizer for %s language",
- language);
- g_hash_table_insert (stemmers, g_strdup (language),
- GINT_TO_POINTER (-1));
- }
- else {
- g_hash_table_insert (stemmers, g_strdup (language),
- stem);
- }
- }
- else if (stem == GINT_TO_POINTER (-1)) {
- /* Negative cache */
- stem = NULL;
- }
- }
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
-
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- if (stem) {
- const gchar *stemmed = NULL;
-
- stemmed = sb_stemmer_stem (stem,
- tok->normalized.begin, tok->normalized.len);
-
- dlen = stemmed ? strlen (stemmed) : 0;
-
- if (dlen > 0) {
- dest = rspamd_mempool_alloc (pool, dlen + 1);
- memcpy (dest, stemmed, dlen);
- dest[dlen] = '\0';
- tok->stemmed.len = dlen;
- tok->stemmed.begin = dest;
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
- }
- else {
- /* Fallback */
- tok->stemmed.len = tok->normalized.len;
- tok->stemmed.begin = tok->normalized.begin;
- }
- }
- else {
- tok->stemmed.len = tok->normalized.len;
- tok->stemmed.begin = tok->normalized.begin;
- }
-
- if (tok->stemmed.len > 0 && d != NULL &&
- rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) {
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
- }
- }
- else {
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
- /* Raw text, lowercase */
- tok->stemmed.len = tok->normalized.len;
- tok->stemmed.begin = tok->normalized.begin;
- }
- }
- }
- }
|