about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-08-23 17:27:34 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-08-23 17:27:34 +0100
commit: e9c773e6bb0e09b4802f3cb06b93b7a082e464ed (patch)
tree: 96347e9b0885687b3ad6de3444c5bc5759f5e58a /src/libstat
parent: ed9d4ec8c8b62664f0157ccb6dceaba264e1891b (diff)
download: rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.tar.gz
rspamd-e9c773e6bb0e09b4802f3cb06b93b7a082e464ed.zip
[Project] Start unicode rework
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/stat_process.c | 4
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 34
-rw-r--r-- src/libstat/tokenizers/tokenizers.h | 14
3 files changed, 30 insertions, 22 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index f58bf6150..540a9e23f 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE,
- NULL);
+ words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+ NULL, NULL, NULL);
if (words != NULL) {
for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 36861b196..fce98c53f 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -26,7 +26,7 @@
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
+ GList **exceptions, gsize *rl, gboolean check_signature);
const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -61,7 +61,7 @@ const gchar t_delimiters[255] = {
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
+ GList **exceptions, gsize *rl, gboolean unused)
{
gsize remain, pos;
const gchar *p;
@@ -138,12 +138,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
}
if (rl) {
- if (is_utf) {
- *rl = g_utf8_strlen (token->begin, token->len);
- }
- else {
- *rl = token->len;
- }
+ *rl = token->len;
}
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -156,7 +151,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
static gboolean
rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
gchar const **cur, rspamd_stat_token_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl,
+ GList **exceptions, gsize *rl,
gboolean check_signature)
{
gint32 i, siglen = 0, remain;
@@ -179,7 +174,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
ex = (*exceptions)->data;
}
- g_assert (is_utf);
g_assert (cur != NULL);
if (*cur == NULL) {
@@ -332,9 +326,10 @@ process_exception:
}
GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- struct rspamd_config *cfg, GList *exceptions, gboolean compat,
- guint64 *hash)
+rspamd_tokenize_text (const gchar *text, gsize len,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg, GList *exceptions,
+ guint64 *hash)
{
rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
@@ -358,11 +353,16 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
token.len = 0;
token.flags = 0;
- if (compat || !is_utf) {
+ switch (how) {
+ case RSPAMD_TOKENIZE_RAW:
func = rspamd_tokenizer_get_word_compat;
- }
- else {
+ break;
+ case RSPAMD_TOKENIZE_UTF:
func = rspamd_tokenizer_get_word;
+ break;
+ default:
+ g_assert_not_reached ();
+ break;
}
if (cfg != NULL) {
@@ -375,7 +375,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
initial_size);
- while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
+ while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
(max_len > 0 && l > max_len)) {
token.begin = pos;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 530eb40a0..8be5f98a8 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -25,14 +25,22 @@ struct rspamd_stat_tokenizer {
GPtrArray *result);
};
+enum rspamd_tokenize_type {
+ RSPAMD_TOKENIZE_UTF = 0,
+ RSPAMD_TOKENIZE_RAW,
+ RSPAMD_TOKENIZE_UCS
+};
+
/* Compare two token nodes */
gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- struct rspamd_config *cfg, GList *exceptions, gboolean compat,
- guint64 *hash);
+GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ guint64 *hash);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,