about summary refs log tree commit diff stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-06 19:49:44 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-06 19:50:18 +0100
commit c31f8bf12bff61c9422de9eeff0292c6ac339c5e (patch)
tree 224c38634f5d6f45218752ca3abb1b39bc7e4093 /src/libstat/tokenizers
parent af5f57916e4345d988802794c84460960ee47d0c (diff)
download rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.tar.gz
rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.zip
[Feature] Implement new text tokenizer based on libicu
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 418
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 3
2 files changed, 218 insertions, 203 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 5436430fe..9babfc8a1 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,8 +21,10 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "../../../contrib/mumhash/mum.h"
-#include "unicode/utf8.h"
-#include "unicode/uchar.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/uiter.h>
+#include <unicode/ubrk.h>
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
@@ -148,187 +150,88 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
return TRUE;
}
-static gboolean
-rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
- gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gsize *rl,
- gboolean check_signature)
+static inline gboolean
+rspamd_tokenize_check_limit (gboolean decay,
+ guint word_decay,
+ guint nwords,
+ guint64 *hv,
+ guint64 *prob,
+ const rspamd_stat_token_t *token,
+ gssize remain,
+ gssize total)
{
- gint32 i, siglen = 0, remain;
- goffset pos;
- const gchar *p, *s, *sig = NULL;
- UChar32 uc;
- guint processed = 0;
- struct rspamd_process_exception *ex = NULL;
- enum {
- skip_delimiters = 0,
- feed_token,
- process_signature
- } state = skip_delimiters;
-
- if (buf == NULL) {
- return FALSE;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- g_assert (cur != NULL);
-
- if (*cur == NULL) {
- *cur = buf->begin;
- }
+ static const gdouble avg_word_len = 6.0;
- token->len = 0;
+ if (!decay) {
+ if (token->len >= sizeof (guint64)) {
+#ifdef _MUM_UNALIGNED_ACCESS
+ *hv = mum_hash_step (*hv, *(guint64 *)token->begin);
+#else
+ guint64 tmp;
+ memcpy (&tmp, token->begin, sizeof (tmp));
+ *hv = mum_hash_step (*hv, tmp);
+#endif
+ }
- pos = *cur - buf->begin;
- if (pos >= buf->len) {
- return FALSE;
- }
+ /* Check for decay */
+ if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
+ /* Start decay */
+ gdouble decay_prob;
- remain = buf->len - pos;
- s = *cur;
- p = s;
- token->begin = s;
+ *hv = mum_hash_finish (*hv);
- for (i = 0; i < remain; ) {
- p = &s[i];
- U8_NEXT (s, i, remain, uc); /* This also advances i */
+ /* We assume that word is 6 symbols length in average */
+ decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len);
- if (uc < 0) {
- if (i < remain) {
- uc = 0xFFFD;
+ if (decay_prob >= 1.0) {
+ *prob = G_MAXUINT64;
}
else {
- return FALSE;
+ *prob = decay_prob * G_MAXUINT64;
}
- }
- switch (state) {
- case skip_delimiters:
- if (ex != NULL && p - buf->begin == ex->pos) {
- goto process_exception;
- }
- else if (u_isgraph (uc)) {
- if (u_isalnum (uc)) {
- state = feed_token;
- token->begin = p;
- continue;
- }
- else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
- sig = p;
- siglen = remain - i;
- state = process_signature;
- continue;
- }
- }
- break;
- case feed_token:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto process_exception;
- }
- else if (!u_isalnum (uc)) {
- token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto set_token;
- }
- processed ++;
- break;
- case process_signature:
- if (*p == '\r' || *p == '\n') {
- msg_debug ("signature found: %*s", (gint)siglen, sig);
- return FALSE;
- }
- else if (*p != ' ' && *p != '-' && *p != '_') {
- state = skip_delimiters;
- continue;
- }
- break;
+ return TRUE;
}
}
+ else {
+ /* Decaying probability */
+ /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+ *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
- /* Last character */
- if (state == feed_token) {
- p = &s[i];
- goto set_token;
+ if (*hv > *prob) {
+ return TRUE;
+ }
}
return FALSE;
+}
-set_token:
- if (rl) {
- *rl = processed;
- }
+static inline gboolean
+rspamd_utf_word_valid (const gchar *text, const gchar *end,
+ gint32 start, gint32 finish)
+{
+ const gchar *st = text + start, *fin = text + finish;
+ UChar32 c;
- if (token->len == 0 && processed > 0) {
- token->len = p - token->begin;
- g_assert (token->len > 0);
+ if (st >= end || fin > end || st >= fin) {
+ return FALSE;
}
- *cur = &s[i];
-
- return TRUE;
-
-process_exception:
- if (token->len == 0 && processed > 0) {
- /*
- * We have processed something before the next exception, so
- * continue processing on next iteration of this function call
- */
- token->len = p - token->begin;
- g_assert (token->len > 0);
-
- *cur = p;
+ U8_NEXT (text, start, finish, c);
+ if (u_isalnum (c)) {
return TRUE;
}
- if (ex->type == RSPAMD_EXCEPTION_URL) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- processed = token->len;
- }
-
- p += ex->len;
-
- /* We need to skip all exceptions that are within this exception */
- *exceptions = g_list_next (*exceptions);
-
- while (*exceptions) {
- ex = (*exceptions)->data;
-
- if (ex->pos < p - buf->begin) {
- /* Nested exception */
- if (ex->pos + ex->len > p - buf->begin) {
- /*
- * We have somehow overlapping nesting exception,
- * extend current offset
- */
- p = buf->begin + ex->pos + ex->len;
- }
-
- *exceptions = g_list_next (*exceptions);
- }
- else {
- break;
- }
- }
-
- *cur = p;
-
- if (rl) {
- *rl = processed;
- }
-
- return TRUE;
+ return FALSE;
}
GArray *
rspamd_tokenize_text (const gchar *text, gsize len,
+ const UText *utxt,
enum rspamd_tokenize_type how,
- struct rspamd_config *cfg, GList *exceptions,
+ struct rspamd_config *cfg,
+ GList *exceptions,
guint64 *hash)
{
rspamd_stat_token_t token, buf;
@@ -336,11 +239,11 @@ rspamd_tokenize_text (const gchar *text, gsize len,
gsize l = 0;
GArray *res;
GList *cur = exceptions;
- token_get_function func;
guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
guint64 hv = 0;
gboolean decay = FALSE;
guint64 prob;
+ static UBreakIterator* bi = NULL;
if (text == NULL) {
return NULL;
@@ -353,18 +256,6 @@ rspamd_tokenize_text (const gchar *text, gsize len,
token.len = 0;
token.flags = 0;
- switch (how) {
- case RSPAMD_TOKENIZE_RAW:
- func = rspamd_tokenizer_get_word_raw;
- break;
- case RSPAMD_TOKENIZE_UTF:
- func = rspamd_tokenizer_get_word_utf8;
- break;
- default:
- g_assert_not_reached ();
- break;
- }
-
if (cfg != NULL) {
min_len = cfg->min_word_len;
max_len = cfg->max_word_len;
@@ -375,56 +266,177 @@ rspamd_tokenize_text (const gchar *text, gsize len,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
initial_size);
- while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
- if (l == 0 || (min_len > 0 && l < min_len) ||
- (max_len > 0 && l > max_len)) {
+ if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
+ while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
+ if (l == 0 || (min_len > 0 && l < min_len) ||
+ (max_len > 0 && l > max_len)) {
+ token.begin = pos;
+ continue;
+ }
+
+ if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+ &hv, &prob, &token, pos - text, len)) {
+ if (!decay) {
+ decay = TRUE;
+ } else {
+ token.begin = pos;
+ continue;
+ }
+ }
+
+ g_array_append_val (res, token);
token.begin = pos;
- continue;
}
+ }
+ else {
+ /* UTF8 boundaries */
+ UErrorCode uc_err = U_ZERO_ERROR;
+ int32_t last, p;
+ struct rspamd_process_exception *ex = NULL;
- if (!decay) {
- if (token.len >= sizeof (guint64)) {
-#ifdef _MUM_UNALIGNED_ACCESS
- hv = mum_hash_step (hv, *(guint64 *)token.begin);
-#else
- guint64 tmp;
- memcpy (&tmp, token.begin, sizeof (tmp));
- hv = mum_hash_step (hv, tmp);
-#endif
- }
+ if (bi == NULL) {
+ bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
- /* Check for decay */
- if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
- /* Start decay */
- gdouble decay_prob;
+ g_assert (U_SUCCESS (uc_err));
+ }
- decay = TRUE;
- hv = mum_hash_finish (hv);
+ ubrk_setUText (bi, (UText*)utxt, &uc_err);
+ last = ubrk_first (bi);
+ p = last;
- /* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+ if (cur) {
+ ex = (struct rspamd_process_exception *)cur->data;
+ }
- if (decay_prob >= 1.0) {
- prob = G_MAXUINT64;
+ while (p != UBRK_DONE) {
+start_over:
+ token.len = 0;
+
+ if (p > last) {
+ if (ex && cur) {
+ /* Check exception */
+ if (ex->pos >= last && ex->pos <= p) {
+ /* We have an exception within boundary */
+ /* First, start to drain exceptions from the start */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ last += ex->len;
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ p = ubrk_next (bi);
+ }
+
+ /* We need to reset our scan with new p and last */
+ goto start_over;
+ }
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token.begin = "!!EX!!";
+ token.len = sizeof ("!!EX!!") - 1;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+ g_array_append_val (res, token);
+ token.flags = 0;
+ }
+
+ cur = g_list_next (cur);
+
+ if (cur) {
+ ex = (struct rspamd_process_exception *) cur->data;
+ }
+ }
+
+ /* Now, we can have an exception within boundary again */
+ if (cur && ex->pos >= last && ex->pos <= p) {
+ /* Append the first part */
+ if (rspamd_utf_word_valid (text, text + len, last,
+ ex->pos)) {
+ token.begin = text + last;
+ token.len = ex->pos - last;
+ token.flags = 0;
+ g_array_append_val (res, token);
+ }
+
+ /* Process the current exception */
+ last += ex->len + token.len;
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token.begin = "!!EX!!";
+ token.len = sizeof ("!!EX!!") - 1;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+ g_array_append_val (res, token);
+ }
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ p = ubrk_next (bi);
+ }
+ /* We need to reset our scan with new p and last */
+ goto start_over;
+ }
+ }
+ else if (p > last) {
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = 0;
+ }
+ }
+ }
+ else if (ex->pos < last) {
+ /* Forward exceptions list */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ cur = g_list_next (cur);
+
+ if (cur) {
+ ex = (struct rspamd_process_exception *) cur->data;
+ }
+ }
+
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = 0;
+ }
+ }
+ else {
+ /* No exceptions within boundary */
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ token.flags = 0;
+ }
+ }
}
else {
- prob = decay_prob * G_MAXUINT64;
+ if (rspamd_utf_word_valid (text, text + len, last, p)) {
+ token.begin = text + last;
+ token.len = p - last;
+ }
+ }
+
+ if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+ &hv, &prob, &token, pos - text, len)) {
+ if (!decay) {
+ decay = TRUE;
+ } else {
+ token.len = 0;
+ }
}
}
- }
- else {
- /* Decaying probability */
- /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
- hv = 2862933555777941757ULL * hv + 3037000493ULL;
- if (hv > prob) {
- token.begin = pos;
- continue;
+ if (token.len > 0) {
+ g_array_append_val (res, token);
}
- }
- g_array_append_val (res, token);
- token.begin = pos;
+ last = p;
+ p = ubrk_next (bi);
+ }
}
if (!decay) {
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 16ab142fd..6c538eafc 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -7,6 +7,8 @@
#include "rspamd.h"
#include "stat_api.h"
+#include <unicode/utext.h>
+
#define RSPAMD_DEFAULT_TOKENIZER "osb"
struct rspamd_tokenizer_runtime;
@@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_stat_token_t type) */
GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+ const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,