about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 14:43:36 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 14:43:36 +0000
commit: b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (patch)
tree: 5c42b7bbf7a274aa65a682bda9dbf07512865bbb /src
parent: d01688d6aabc2d51fd52c640c21265a7fd8e3bdc (diff)
download: rspamd-b522caaf83b4a3f16246bdc38d0f7ce866cdc660.tar.gz
rspamd-b522caaf83b4a3f16246bdc38d0f7ce866cdc660.zip
[Project] Start words unicode structure rework
Diffstat (limited to 'src')
-rw-r--r-- src/libstat/stat_api.h 21
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 100
-rw-r--r-- src/libutil/fstring.h 6
3 files changed, 71 insertions, 56 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 84db8ee01..645e1f1aa 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -26,16 +26,21 @@
* High level statistics API
*/
-#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0)
-#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1)
-#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2)
-#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4)
-#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1 << 5)
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
+#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
+#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1u << 4)
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
+#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
+#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
+#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
typedef struct rspamd_stat_token_s {
- const gchar *begin;
- gsize len;
+ rspamd_ftok_t original;
+ rspamd_ftok_unicode_t unicode;
+ rspamd_ftok_t normalised;
+ rspamd_ftok_t stemmed;
guint flags;
} rspamd_stat_token_t;
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 2ef5c08fb..8664b9e19 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -80,33 +80,33 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
ex = (*exceptions)->data;
}
- if (token->begin == NULL || *cur == NULL) {
+ if (token->original.begin == NULL || *cur == NULL) {
if (ex != NULL) {
if (ex->pos == 0) {
- token->begin = buf->begin + ex->len;
- token->len = ex->len;
+ token->original.begin = buf->original.begin + ex->len;
+ token->original.len = ex->len;
token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
}
else {
- token->begin = buf->begin;
- token->len = 0;
+ token->original.begin = buf->original.begin;
+ token->original.len = 0;
}
}
else {
- token->begin = buf->begin;
- token->len = 0;
+ token->original.begin = buf->original.begin;
+ token->original.len = 0;
}
- *cur = token->begin;
+ *cur = token->original.begin;
}
- token->len = 0;
+ token->original.len = 0;
- pos = *cur - buf->begin;
- if (pos >= buf->len) {
+ pos = *cur - buf->original.begin;
+ if (pos >= buf->original.len) {
return FALSE;
}
- remain = buf->len - pos;
+ remain = buf->original.len - pos;
p = *cur;
/* Skip non delimiters symbols */
@@ -122,7 +122,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
remain--;
} while (remain > 0 && t_delimiters[(guchar)*p]);
- token->begin = p;
+ token->original.begin = p;
while (remain > 0 && !t_delimiters[(guchar)*p]) {
if (ex != NULL && ex->pos == pos) {
@@ -130,7 +130,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
*cur = p + ex->len;
return TRUE;
}
- token->len++;
+ token->original.len++;
pos++;
remain--;
p++;
@@ -141,7 +141,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
}
if (rl) {
- *rl = token->len;
+ *rl = token->original.len;
}
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -164,12 +164,12 @@ rspamd_tokenize_check_limit (gboolean decay,
static const gdouble avg_word_len = 6.0;
if (!decay) {
- if (token->len >= sizeof (guint64)) {
+ if (token->original.len >= sizeof (guint64)) {
#ifdef _MUM_UNALIGNED_ACCESS
- *hv = mum_hash_step (*hv, *(guint64 *)token->begin);
+ *hv = mum_hash_step (*hv, *(guint64 *)token->original.begin);
#else
guint64 tmp;
- memcpy (&tmp, token->begin, sizeof (tmp));
+ memcpy (&tmp, token->original.begin, sizeof (tmp));
*hv = mum_hash_step (*hv, tmp);
#endif
}
@@ -260,11 +260,11 @@ rspamd_tokenize_text (const gchar *text, gsize len,
return NULL;
}
- buf.begin = text;
- buf.len = len;
+ buf.original.begin = text;
+ buf.original.len = len;
buf.flags = 0;
- token.begin = NULL;
- token.len = 0;
+ token.original.begin = NULL;
+ token.original.len = 0;
token.flags = 0;
if (cfg != NULL) {
@@ -281,24 +281,24 @@ rspamd_tokenize_text (const gchar *text, gsize len,
while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
(max_len > 0 && l > max_len)) {
- token.begin = pos;
+ token.original.begin = pos;
continue;
}
- if (token.len > 0 &&
+ if (token.original.len > 0 &&
rspamd_tokenize_check_limit (decay, word_decay, res->len,
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
}
else {
- token.begin = pos;
+ token.original.begin = pos;
continue;
}
}
g_array_append_val (res, token);
- token.begin = pos;
+ token.original.begin = pos;
}
}
else {
@@ -323,7 +323,7 @@ rspamd_tokenize_text (const gchar *text, gsize len,
while (p != UBRK_DONE) {
start_over:
- token.len = 0;
+ token.original.len = 0;
if (p > last) {
if (ex && cur) {
@@ -336,8 +336,8 @@ start_over:
last += ex->len;
if (ex->type == RSPAMD_EXCEPTION_URL) {
- token.begin = "!!EX!!";
- token.len = sizeof ("!!EX!!") - 1;
+ token.original.begin = "!!EX!!";
+ token.original.len = sizeof ("!!EX!!") - 1;
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
g_array_append_val (res, token);
@@ -363,8 +363,8 @@ start_over:
/* Append the first part */
if (rspamd_utf_word_valid (text, text + len, last,
ex->pos)) {
- token.begin = text + last;
- token.len = ex->pos - last;
+ token.original.begin = text + last;
+ token.original.len = ex->pos - last;
token.flags = 0;
g_array_append_val (res, token);
}
@@ -373,8 +373,8 @@ start_over:
last += ex->len + (ex->pos - last);
if (ex->type == RSPAMD_EXCEPTION_URL) {
- token.begin = "!!EX!!";
- token.len = sizeof ("!!EX!!") - 1;
+ token.original.begin = "!!EX!!";
+ token.original.len = sizeof ("!!EX!!") - 1;
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
g_array_append_val (res, token);
@@ -394,9 +394,10 @@ start_over:
}
else if (p > last) {
if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.begin = text + last;
- token.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
}
}
}
@@ -408,40 +409,43 @@ start_over:
}
if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.begin = text + last;
- token.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
}
}
else {
/* No exceptions within boundary */
if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.begin = text + last;
- token.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
}
}
}
else {
if (rspamd_utf_word_valid (text, text + len, last, p)) {
- token.begin = text + last;
- token.len = p - last;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
}
}
- if (token.len > 0 &&
+ if (token.original.len > 0 &&
rspamd_tokenize_check_limit (decay, word_decay, res->len,
&hv, &prob, &token, p, len)) {
if (!decay) {
decay = TRUE;
} else {
- token.len = 0;
+ token.original.len = 0;
}
}
}
- if (token.len > 0) {
+ if (token.original.len > 0) {
g_array_append_val (res, token);
}
diff --git a/src/libutil/fstring.h b/src/libutil/fstring.h
index 1f194827c..88e41b47a 100644
--- a/src/libutil/fstring.h
+++ b/src/libutil/fstring.h
@@ -18,6 +18,7 @@
#include "config.h"
#include "mem_pool.h"
+#include <unicode/uchar.h>
/**
* Fixed strings library
@@ -38,6 +39,11 @@ typedef struct f_str_tok {
const gchar *begin;
} rspamd_ftok_t;
+typedef struct f_str_unicode_tok {
+ gsize len; /* in uchars */
+ const UChar *begin;
+} rspamd_ftok_unicode_t;
+
/**
* Create new fixed length string
*/