summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-08-23 17:27:34 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-08-23 17:27:34 +0100
commit e9c773e6bb0e09b4802f3cb06b93b7a082e464ed (patch)
tree 96347e9b0885687b3ad6de3444c5bc5759f5e58a /src
parent ed9d4ec8c8b62664f0157ccb6dceaba264e1891b (diff)
download rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.tar.gz
rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.zip
[Project] Start unicode rework
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/message.c 16
-rw-r--r-- src/libmime/message.h 15
-rw-r--r-- src/libstat/stat_process.c 4
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 34
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 14
-rw-r--r-- src/lua/lua_util.c 8
-rw-r--r-- src/plugins/chartable.c 3
7 files changed, 55 insertions, 39 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index e4c59be63..5d9cf19d1 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -194,20 +194,28 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
{
rspamd_stat_token_t *w, ucs_w;
guint i, ucs_len = 0;
+ enum rspamd_tokenize_type tok_type;
+
+ if (IS_PART_UTF (part)) {
+ tok_type = RSPAMD_TOKENIZE_UTF;
+ }
+ else {
+ tok_type = RSPAMD_TOKENIZE_RAW;
+ }
/* Ugly workaround */
if (IS_PART_HTML (part)) {
part->normalized_words = rspamd_tokenize_text (
part->stripped_content->data,
- part->stripped_content->len, IS_PART_UTF (part), task->cfg,
- part->exceptions, FALSE,
+ part->stripped_content->len, tok_type, task->cfg,
+ part->exceptions,
NULL);
}
else {
part->normalized_words = rspamd_tokenize_text (
part->stripped_content->data,
- part->stripped_content->len, IS_PART_UTF (part), task->cfg,
- part->exceptions, FALSE,
+ part->stripped_content->len, tok_type, task->cfg,
+ part->exceptions,
NULL);
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index b16011666..b0a7983b4 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -13,6 +13,8 @@
#include "mime_headers.h"
#include "content_type.h"
+#include <unicode/uchar.h>
+
struct rspamd_task;
struct controller_session;
struct html_content;
@@ -77,16 +79,19 @@ struct rspamd_mime_part {
#define IS_PART_RAW(part) (!((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF))
#define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
+
struct rspamd_mime_text_part {
const gchar *language;
GPtrArray *languages;
const gchar *real_charset;
rspamd_ftok_t raw;
- rspamd_ftok_t parsed;
- GByteArray *content;
- GByteArray *utf_raw_content;
- GByteArray *stripped_content;
- GPtrArray *newlines; /**< positions of newlines in text */
+ rspamd_ftok_t parsed; /* decoded from mime encodings */
+ GByteArray *content; /* utf8 encoded processed content */
+
+ UChar *ucs_raw_content; /* unicode raw content */
+ GByteArray *utf_raw_content; /* utf raw content */
+ GByteArray *stripped_content; /* utf content with no newlines */
+ GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
struct html_content *html;
GList *exceptions; /**< list of offsets of urls */
struct rspamd_mime_part *mime_part;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index f58bf6150..540a9e23f 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
- NULL);
+ words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+ NULL, NULL, NULL);
if (words != NULL) {
for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 36861b196..fce98c53f 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -26,7 +26,7 @@
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
+ GList **exceptions, gsize *rl, gboolean check_signature);
const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -61,7 +61,7 @@ const gchar t_delimiters[255] = {
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
+ GList **exceptions, gsize *rl, gboolean unused)
{
gsize remain, pos;
const gchar *p;
@@ -138,12 +138,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
}
if (rl) {
- if (is_utf) {
- *rl = g_utf8_strlen (token->begin, token->len);
- }
- else {
- *rl = token->len;
- }
+ *rl = token->len;
}
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -156,7 +151,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
static gboolean
rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl,
+ GList **exceptions, gsize *rl,
gboolean check_signature)
{
gint32 i, siglen = 0, remain;
@@ -179,7 +174,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
ex = (*exceptions)->data;
}
- g_assert (is_utf);
g_assert (cur != NULL);
if (*cur == NULL) {
@@ -332,9 +326,10 @@ process_exception:
}
GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- struct rspamd_config *cfg, GList *exceptions, gboolean compat,
- guint64 *hash)
+rspamd_tokenize_text (const gchar *text, gsize len,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg, GList *exceptions,
+ guint64 *hash)
{
rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
@@ -358,11 +353,16 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
token.len = 0;
token.flags = 0;
- if (compat || !is_utf) {
+ switch (how) {
+ case RSPAMD_TOKENIZE_RAW:
func = rspamd_tokenizer_get_word_compat;
- }
- else {
+ break;
+ case RSPAMD_TOKENIZE_UTF:
func = rspamd_tokenizer_get_word;
+ break;
+ default:
+ g_assert_not_reached ();
+ break;
}
if (cfg != NULL) {
@@ -375,7 +375,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
initial_size);
- while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
+ while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
(max_len > 0 && l > max_len)) {
token.begin = pos;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 530eb40a0..8be5f98a8 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -25,14 +25,22 @@ struct rspamd_stat_tokenizer {
GPtrArray *result);
};
+enum rspamd_tokenize_type {
+ RSPAMD_TOKENIZE_UTF = 0,
+ RSPAMD_TOKENIZE_RAW,
+ RSPAMD_TOKENIZE_UCS
+};
+
/* Compare two token nodes */
gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- struct rspamd_config *cfg, GList *exceptions, gboolean compat,
- guint64 *hash);
+GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ guint64 *hash);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 70e16118d..3de68e60a 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1080,7 +1080,6 @@ lua_util_tokenize_text (lua_State *L)
struct rspamd_process_exception *ex;
GArray *res;
rspamd_stat_token_t *w;
- gboolean compat = FALSE;
if (lua_type (L, 1) == LUA_TSTRING) {
in = luaL_checklstring (L, 1, &len);
@@ -1126,15 +1125,12 @@ lua_util_tokenize_text (lua_State *L)
lua_pop (L, 1);
}
- if (lua_gettop (L) > 2 && lua_type (L, 3) == LUA_TBOOLEAN) {
- compat = lua_toboolean (L, 3);
- }
-
if (exceptions) {
exceptions = g_list_reverse (exceptions);
}
- res = rspamd_tokenize_text ((gchar *)in, len, TRUE, NULL, exceptions, compat,
+ res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+ exceptions,
NULL);
if (res == NULL) {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 9331e42dd..987879258 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -620,10 +620,9 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
gdouble cur_score = 0.0;
words = rspamd_tokenize_text (task->subject, strlen (task->subject),
- TRUE,
+ RSPAMD_TOKENIZE_UTF,
NULL,
NULL,
- FALSE,
NULL);
if (words && words->len > 0) {