diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-07-26 10:49:23 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-07-26 10:49:23 +0100 |
commit | 537a7180a0d5132c11636c4fd8b1450cd99d352c (patch) | |
tree | fb9f8c84955a411bdffbd6371ea32f2716fb3687 /src/libstat/tokenizers | |
parent | 5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365 (diff) | |
download | rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.tar.gz rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.zip |
[Rework] Use clang-format to unify formatting in all sources
No meaningful changes.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 213 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 476 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 68 |
3 files changed, 380 insertions, 377 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index a8007ec0f..d871c7a4e 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -27,16 +27,26 @@ #define DEFAULT_OSB_VERSION 2 static const int primes[] = { - 1, 7, - 3, 13, - 5, 29, - 11, 51, - 23, 101, - 47, 203, - 97, 407, - 197, 817, - 397, 1637, - 797, 3277, + 1, + 7, + 3, + 13, + 5, + 29, + 11, + 51, + 23, + 101, + 47, + 203, + 97, + 407, + 197, + 817, + 397, + 1637, + 797, + 3277, }; static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'}; @@ -60,13 +70,13 @@ struct rspamd_osb_tokenizer_config { * Return default config */ static struct rspamd_osb_tokenizer_config * -rspamd_tokenizer_osb_default_config (void) +rspamd_tokenizer_osb_default_config(void) { static struct rspamd_osb_tokenizer_config def; - if (memcmp (def.magic, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) { - memset (&def, 0, sizeof (def)); - memcpy (def.magic, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)); + if (memcmp(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic)) != 0) { + memset(&def, 0, sizeof(def)); + memcpy(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic)); def.version = DEFAULT_OSB_VERSION; def.window_size = DEFAULT_FEATURE_WINDOW_SIZE; def.ht = RSPAMD_OSB_HASH_XXHASH; @@ -77,8 +87,8 @@ rspamd_tokenizer_osb_default_config (void) } static struct rspamd_osb_tokenizer_config * -rspamd_tokenizer_osb_config_from_ucl (rspamd_mempool_t * pool, - const ucl_object_t *obj) +rspamd_tokenizer_osb_config_from_ucl(rspamd_mempool_t *pool, + const ucl_object_t *obj) { const ucl_object_t *elt; struct rspamd_osb_tokenizer_config *cf, *def; @@ -87,61 +97,58 @@ rspamd_tokenizer_osb_config_from_ucl (rspamd_mempool_t * pool, if (pool != NULL) { - cf = rspamd_mempool_alloc0 (pool, sizeof (*cf)); + cf = rspamd_mempool_alloc0(pool, sizeof(*cf)); } else { - cf = g_malloc0 (sizeof (*cf)); + cf = g_malloc0(sizeof(*cf)); } /* Use default config */ - def = rspamd_tokenizer_osb_default_config (); - memcpy (cf, def, sizeof (*cf)); + def = rspamd_tokenizer_osb_default_config(); + memcpy(cf, def, sizeof(*cf)); - elt = ucl_object_lookup (obj, "hash"); - if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { - if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3) - == 0) { + elt = ucl_object_lookup(obj, "hash"); + if (elt != NULL && ucl_object_type(elt) == UCL_STRING) { + if (g_ascii_strncasecmp(ucl_object_tostring(elt), "xxh", 3) == 0) { cf->ht = RSPAMD_OSB_HASH_XXHASH; - elt = ucl_object_lookup (obj, "seed"); - if (elt != NULL && ucl_object_type (elt) == UCL_INT) { - cf->seed = ucl_object_toint (elt); + elt = ucl_object_lookup(obj, "seed"); + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + cf->seed = ucl_object_toint(elt); } } - else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3) - == 0) { + else if (g_ascii_strncasecmp(ucl_object_tostring(elt), "sip", 3) == 0) { cf->ht = RSPAMD_OSB_HASH_SIPHASH; - elt = ucl_object_lookup (obj, "key"); - - if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { - key = rspamd_decode_base32 (ucl_object_tostring (elt), - 0, &keylen, RSPAMD_BASE32_DEFAULT); - if (keylen < sizeof (rspamd_sipkey_t)) { - msg_warn ("siphash key is too short: %z", keylen); - g_free (key); + elt = ucl_object_lookup(obj, "key"); + + if (elt != NULL && ucl_object_type(elt) == UCL_STRING) { + key = rspamd_decode_base32(ucl_object_tostring(elt), + 0, &keylen, RSPAMD_BASE32_DEFAULT); + if (keylen < sizeof(rspamd_sipkey_t)) { + msg_warn("siphash key is too short: %z", keylen); + g_free(key); } else { - memcpy (cf->sk, key, sizeof (cf->sk)); - g_free (key); + memcpy(cf->sk, key, sizeof(cf->sk)); + g_free(key); } } else { - msg_warn_pool ("siphash cannot be used without key"); + msg_warn_pool("siphash cannot be used without key"); } - } } else { - elt = ucl_object_lookup (obj, "compat"); - if (elt != NULL && ucl_object_toboolean (elt)) { + elt = ucl_object_lookup(obj, "compat"); + if (elt != NULL && ucl_object_toboolean(elt)) { cf->ht = RSPAMD_OSB_HASH_COMPAT; } } - elt = ucl_object_lookup (obj, "window"); - if (elt != NULL && ucl_object_type (elt) == UCL_INT) { - cf->window_size = ucl_object_toint (elt); + elt = ucl_object_lookup(obj, "window"); + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + cf->window_size = ucl_object_toint(elt); if (cf->window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) { - msg_err_pool ("too large window size: %d", cf->window_size); + msg_err_pool("too large window size: %d", cf->window_size); cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE; } } @@ -150,31 +157,31 @@ rspamd_tokenizer_osb_config_from_ucl (rspamd_mempool_t * pool, } gpointer -rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, - struct rspamd_tokenizer_config *cf, - gsize *len) +rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, + gsize *len) { struct rspamd_osb_tokenizer_config *osb_cf, *def; if (cf != NULL && cf->opts != NULL) { - osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, cf->opts); + osb_cf = rspamd_tokenizer_osb_config_from_ucl(pool, cf->opts); } else { - def = rspamd_tokenizer_osb_default_config (); - osb_cf = rspamd_mempool_alloc (pool, sizeof (*osb_cf)); - memcpy (osb_cf, def, sizeof (*osb_cf)); + def = rspamd_tokenizer_osb_default_config(); + osb_cf = rspamd_mempool_alloc(pool, sizeof(*osb_cf)); + memcpy(osb_cf, def, sizeof(*osb_cf)); /* Do not write sipkey to statfile */ } if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) { - msg_info_pool ("siphash key is not stored into statfiles, so you'd " - "need to keep it inside the configuration"); + msg_info_pool("siphash key is not stored into statfiles, so you'd " + "need to keep it inside the configuration"); } - memset (osb_cf->sk, 0, sizeof (osb_cf->sk)); + memset(osb_cf->sk, 0, sizeof(osb_cf->sk)); if (len != NULL) { - *len = sizeof (*osb_cf); + *len = sizeof(*osb_cf); } return osb_cf; @@ -259,13 +266,12 @@ struct token_pipe_entry { rspamd_stat_token_t *t; }; -gint -rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, - struct rspamd_task *task, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result) +gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result) { rspamd_token_t *new_tok = NULL; rspamd_stat_token_t *token; @@ -284,31 +290,31 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, window_size = osb_cf->window_size; if (prefix) { - seed = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, - prefix, strlen (prefix), osb_cf->seed); + seed = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + prefix, strlen(prefix), osb_cf->seed); } else { seed = osb_cf->seed; } - hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); + hashpipe = g_alloca(window_size * sizeof(hashpipe[0])); for (i = 0; i < window_size; i++) { hashpipe[i].h = 0xfe; hashpipe[i].t = NULL; } - token_size = sizeof (rspamd_token_t) + - sizeof (gdouble) * ctx->statfiles->len; - g_assert (token_size > 0); + token_size = sizeof(rspamd_token_t) + + sizeof(gdouble) * ctx->statfiles->len; + g_assert(token_size > 0); - for (w = 0; w < words->len; w ++) { - token = &g_array_index (words, rspamd_stat_token_t, w); + for (w = 0; w < words->len; w++) { + token = &g_array_index(words, rspamd_stat_token_t, w); token_flags = token->flags; const gchar *begin; gsize len; if (token->flags & - (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD | RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { /* Skip stop/skipped words */ continue; } @@ -327,17 +333,17 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, ftok.begin = begin; ftok.len = len; - cur = rspamd_fstrhash_lc (&ftok, is_utf); + cur = rspamd_fstrhash_lc(&ftok, is_utf); } else { /* We know that the words are normalized */ if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) { - cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, - begin, len, osb_cf->seed); + cur = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + begin, len, osb_cf->seed); } else { - rspamd_cryptobox_siphash ((guchar *)&cur, begin, - len, osb_cf->sk); + rspamd_cryptobox_siphash((guchar *) &cur, begin, + len, osb_cf->sk); if (prefix) { cur ^= seed; @@ -346,36 +352,37 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, } if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { - new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); + new_tok = rspamd_mempool_alloc0(task->task_pool, token_size); new_tok->flags = token_flags; new_tok->t1 = token; new_tok->t2 = token; new_tok->data = cur; new_tok->window_idx = 0; - g_ptr_array_add (result, new_tok); + g_ptr_array_add(result, new_tok); continue; } -#define ADD_TOKEN do {\ - new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); \ - new_tok->flags = token_flags; \ - new_tok->t1 = hashpipe[0].t; \ - new_tok->t2 = hashpipe[i].t; \ - if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ - h1 = ((guint32)hashpipe[0].h) * primes[0] + \ - ((guint32)hashpipe[i].h) * primes[i << 1]; \ - h2 = ((guint32)hashpipe[0].h) * primes[1] + \ - ((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \ - memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \ - memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \ - } \ - else { \ - new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ - } \ - new_tok->window_idx = i; \ - g_ptr_array_add (result, new_tok); \ - } while(0) +#define ADD_TOKEN \ + do { \ + new_tok = rspamd_mempool_alloc0(task->task_pool, token_size); \ + new_tok->flags = token_flags; \ + new_tok->t1 = hashpipe[0].t; \ + new_tok->t2 = hashpipe[i].t; \ + if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ + h1 = ((guint32) hashpipe[0].h) * primes[0] + \ + ((guint32) hashpipe[i].h) * primes[i << 1]; \ + h2 = ((guint32) hashpipe[0].h) * primes[1] + \ + ((guint32) hashpipe[i].h) * primes[(i << 1) - 1]; \ + memcpy((guchar *) &new_tok->data, &h1, sizeof(h1)); \ + memcpy(((guchar *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2)); \ + } \ + else { \ + new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ + } \ + new_tok->window_idx = i; \ + g_ptr_array_add(result, new_tok); \ + } while (0) if (processed < window_size) { /* Just fill a hashpipe */ @@ -402,9 +409,9 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, } if (processed > 1 && processed <= window_size) { - processed --; - memmove (hashpipe, &hashpipe[window_size - processed], - processed * sizeof (hashpipe[0])); + processed--; + memmove(hashpipe, &hashpipe[window_size - processed], + processed * sizeof(hashpipe[0])); for (i = 1; i < processed; i++) { ADD_TOKEN; diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 55ee62f85..6e55a33a6 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -35,9 +35,9 @@ #include <math.h> -typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, - rspamd_stat_token_t * token, - GList **exceptions, gsize *rl, gboolean check_signature); +typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, gchar const **pos, + rspamd_stat_token_t *token, + GList **exceptions, gsize *rl, gboolean check_signature); const gchar t_delimiters[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, @@ -65,14 +65,13 @@ const gchar t_delimiters[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0 -}; + 0, 0, 0, 0, 0, 0}; /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, - gchar const **cur, rspamd_stat_token_t * token, - GList **exceptions, gsize *rl, gboolean unused) +rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf, + gchar const **cur, rspamd_stat_token_t *token, + GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; const gchar *p; @@ -82,7 +81,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, return FALSE; } - g_assert (cur != NULL); + g_assert(cur != NULL); if (exceptions != NULL && *exceptions != NULL) { ex = (*exceptions)->data; @@ -121,20 +120,20 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, do { if (ex != NULL && ex->pos == pos) { /* Go to the next exception */ - *exceptions = g_list_next (*exceptions); + *exceptions = g_list_next(*exceptions); *cur = p + ex->len; return TRUE; } pos++; p++; remain--; - } while (remain > 0 && t_delimiters[(guchar)*p]); + } while (remain > 0 && t_delimiters[(guchar) *p]); token->original.begin = p; - while (remain > 0 && !t_delimiters[(guchar)*p]) { + while (remain > 0 && !t_delimiters[(guchar) *p]) { if (ex != NULL && ex->pos == pos) { - *exceptions = g_list_next (*exceptions); + *exceptions = g_list_next(*exceptions); *cur = p + ex->len; return TRUE; } @@ -160,40 +159,40 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, } static inline gboolean -rspamd_tokenize_check_limit (gboolean decay, - guint word_decay, - guint nwords, - guint64 *hv, - guint64 *prob, - const rspamd_stat_token_t *token, - gssize remain, - gssize total) +rspamd_tokenize_check_limit(gboolean decay, + guint word_decay, + guint nwords, + guint64 *hv, + guint64 *prob, + const rspamd_stat_token_t *token, + gssize remain, + gssize total) { static const gdouble avg_word_len = 6.0; if (!decay) { - if (token->original.len >= sizeof (guint64)) { + if (token->original.len >= sizeof(guint64)) { guint64 tmp; - memcpy (&tmp, token->original.begin, sizeof (tmp)); - *hv = mum_hash_step (*hv, tmp); + memcpy(&tmp, token->original.begin, sizeof(tmp)); + *hv = mum_hash_step(*hv, tmp); } /* Check for decay */ - if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) { + if (word_decay > 0 && nwords > word_decay && remain < (gssize) total) { /* Start decay */ gdouble decay_prob; - *hv = mum_hash_finish (*hv); + *hv = mum_hash_finish(*hv); /* We assume that word is 6 symbols length in average */ - decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10; - decay_prob = floor (decay_prob) / 10.0; + decay_prob = (gdouble) word_decay / ((total - (remain)) / avg_word_len) * 10; + decay_prob = floor(decay_prob) / 10.0; if (decay_prob >= 1.0) { *prob = G_MAXUINT64; } else { - *prob = (guint64)(decay_prob * (double)G_MAXUINT64); + *prob = (guint64) (decay_prob * (double) G_MAXUINT64); } return TRUE; @@ -213,8 +212,8 @@ rspamd_tokenize_check_limit (gboolean decay, } static inline gboolean -rspamd_utf_word_valid (const guchar *text, const guchar *end, - gint32 start, gint32 finish) +rspamd_utf_word_valid(const guchar *text, const guchar *end, + gint32 start, gint32 finish) { const guchar *st = text + start, *fin = text + finish; UChar32 c; @@ -223,37 +222,38 @@ rspamd_utf_word_valid (const guchar *text, const guchar *end, return FALSE; } - U8_NEXT (text, start, finish, c); + U8_NEXT(text, start, finish, c); - if (u_isJavaIDPart (c)) { + if (u_isJavaIDPart(c)) { return TRUE; } return FALSE; } -#define SHIFT_EX do { \ - cur = g_list_next (cur); \ - if (cur) { \ - ex = (struct rspamd_process_exception *) cur->data; \ - } \ - else { \ - ex = NULL; \ - } \ -} while(0) +#define SHIFT_EX \ + do { \ + cur = g_list_next(cur); \ + if (cur) { \ + ex = (struct rspamd_process_exception *) cur->data; \ + } \ + else { \ + ex = NULL; \ + } \ + } while (0) static inline void -rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res) +rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) { rspamd_stat_token_t token; - memset (&token, 0, sizeof (token)); + memset(&token, 0, sizeof(token)); if (ex->type == RSPAMD_EXCEPTION_GENERIC) { token.original.begin = "!!EX!!"; - token.original.len = sizeof ("!!EX!!") - 1; + token.original.len = sizeof("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val (res, token); + g_array_append_val(res, token); token.flags = 0; } else if (ex->type == RSPAMD_EXCEPTION_URL) { @@ -262,31 +262,30 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res) uri = ex->ptr; if (uri && uri->tldlen > 0) { - token.original.begin = rspamd_url_tld_unsafe (uri); + token.original.begin = rspamd_url_tld_unsafe(uri); token.original.len = uri->tldlen; - } else { token.original.begin = "!!EX!!"; - token.original.len = sizeof ("!!EX!!") - 1; + token.original.len = sizeof("!!EX!!") - 1; } token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val (res, token); + g_array_append_val(res, token); token.flags = 0; } } GArray * -rspamd_tokenize_text (const gchar *text, gsize len, - const UText *utxt, - enum rspamd_tokenize_type how, - struct rspamd_config *cfg, - GList *exceptions, - guint64 *hash, - GArray *cur_words, - rspamd_mempool_t *pool) +rspamd_tokenize_text(const gchar *text, gsize len, + const UText *utxt, + enum rspamd_tokenize_type how, + struct rspamd_config *cfg, + GList *exceptions, + guint64 *hash, + GArray *cur_words, + rspamd_mempool_t *pool) { rspamd_stat_token_t token, buf; const gchar *pos = NULL; @@ -297,7 +296,7 @@ rspamd_tokenize_text (const gchar *text, gsize len, guint64 hv = 0; gboolean decay = FALSE, long_text_mode = FALSE; guint64 prob = 0; - static UBreakIterator* bi = NULL; + static UBreakIterator *bi = NULL; static const gsize long_text_limit = 1 * 1024 * 1024; static const ev_tstamp max_exec_time = 0.2; /* 200 ms */ ev_tstamp start; @@ -311,14 +310,14 @@ rspamd_tokenize_text (const gchar *text, gsize len, * In this mode we do additional checks to avoid performance issues */ long_text_mode = TRUE; - start = ev_time (); + start = ev_time(); } buf.original.begin = text; buf.original.len = len; buf.flags = 0; - memset (&token, 0, sizeof (token)); + memset(&token, 0, sizeof(token)); if (cfg != NULL) { min_len = cfg->min_word_len; @@ -328,15 +327,15 @@ rspamd_tokenize_text (const gchar *text, gsize len, } if (!cur_words) { - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), - initial_size); + res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), + initial_size); } else { res = cur_words; } - if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { - while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { + if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { + while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || (max_len > 0 && l > max_len)) { token.original.begin = pos; @@ -344,8 +343,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit (decay, word_decay, res->len, - &hv, &prob, &token, pos - text, len)) { + rspamd_tokenize_check_limit(decay, word_decay, res->len, + &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; } @@ -357,27 +356,27 @@ rspamd_tokenize_text (const gchar *text, gsize len, if (long_text_mode) { if ((res->len + 1) % 16 == 0) { - ev_tstamp now = ev_time (); + ev_tstamp now = ev_time(); if (now - start > max_exec_time) { - msg_warn_pool_check ( - "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", - (now - start) * 1e3, max_exec_time * 1e3, - res->len); + msg_warn_pool_check( + "too long time has been spent on tokenization:" + " %.1f ms, limit is %.1f ms; %d words added so far", + (now - start) * 1e3, max_exec_time * 1e3, + res->len); goto end; } } } - g_array_append_val (res, token); + g_array_append_val(res, token); - if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) { + if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ - msg_err_pool_check ( - "too many words found: %d, stop tokenization to avoid DoS", - res->len); + msg_err_pool_check( + "too many words found: %d, stop tokenization to avoid DoS", + res->len); goto end; } @@ -392,21 +391,21 @@ rspamd_tokenize_text (const gchar *text, gsize len, struct rspamd_process_exception *ex = NULL; if (bi == NULL) { - bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err); + bi = ubrk_open(UBRK_WORD, NULL, NULL, 0, &uc_err); - g_assert (U_SUCCESS (uc_err)); + g_assert(U_SUCCESS(uc_err)); } - ubrk_setUText (bi, (UText*)utxt, &uc_err); - last = ubrk_first (bi); + ubrk_setUText(bi, (UText *) utxt, &uc_err); + last = ubrk_first(bi); p = last; if (cur) { - ex = (struct rspamd_process_exception *)cur->data; + ex = (struct rspamd_process_exception *) cur->data; } while (p != UBRK_DONE) { -start_over: + start_over: token.original.len = 0; if (p > last) { @@ -418,19 +417,19 @@ start_over: while (cur && ex->pos <= last) { /* We have an exception at the beginning, skip those */ last += ex->len; - rspamd_tokenize_exception (ex, res); + rspamd_tokenize_exception(ex, res); if (last > p) { /* Exception spread over the boundaries */ while (last > p && p != UBRK_DONE) { gint32 old_p = p; - p = ubrk_next (bi); + p = ubrk_next(bi); if (p != UBRK_DONE && p <= old_p) { - msg_warn_pool_check ( - "tokenization reversed back on position %d," - "%d new position (%d backward), likely libicu bug!", - (gint)(p), (gint)(old_p), old_p - p); + msg_warn_pool_check( + "tokenization reversed back on position %d," + "%d new position (%d backward), likely libicu bug!", + (gint) (p), (gint) (old_p), old_p - p); goto end; } @@ -447,8 +446,8 @@ start_over: /* Now, we can have an exception within boundary again */ if (cur && ex->pos >= last && ex->pos <= p) { /* Append the first part */ - if (rspamd_utf_word_valid (text, text + len, last, - ex->pos)) { + if (rspamd_utf_word_valid(text, text + len, last, + ex->pos)) { token.original.begin = text + last; token.original.len = ex->pos - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | @@ -458,18 +457,18 @@ start_over: /* Process the current exception */ last += ex->len + (ex->pos - last); - rspamd_tokenize_exception (ex, res); + rspamd_tokenize_exception(ex, res); if (last > p) { /* Exception spread over the boundaries */ while (last > p && p != UBRK_DONE) { gint32 old_p = p; - p = ubrk_next (bi); + p = ubrk_next(bi); if (p != UBRK_DONE && p <= old_p) { - msg_warn_pool_check ( - "tokenization reversed back on position %d," - "%d new position (%d backward), likely libicu bug!", - (gint)(p), (gint)(old_p), old_p - p); + msg_warn_pool_check( + "tokenization reversed back on position %d," + "%d new position (%d backward), likely libicu bug!", + (gint) (p), (gint) (old_p), old_p - p); goto end; } @@ -482,7 +481,7 @@ start_over: SHIFT_EX; } else if (p > last) { - if (rspamd_utf_word_valid (text, text + len, last, p)) { + if (rspamd_utf_word_valid(text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | @@ -497,7 +496,7 @@ start_over: SHIFT_EX; } - if (rspamd_utf_word_valid (text, text + len, last, p)) { + if (rspamd_utf_word_valid(text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | @@ -506,7 +505,7 @@ start_over: } else { /* No exceptions within boundary */ - if (rspamd_utf_word_valid (text, text + len, last, p)) { + if (rspamd_utf_word_valid(text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | @@ -515,7 +514,7 @@ start_over: } } else { - if (rspamd_utf_word_valid (text, text + len, last, p)) { + if (rspamd_utf_word_valid(text, text + len, last, p)) { token.original.begin = text + last; token.original.len = p - last; token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | @@ -524,11 +523,12 @@ start_over: } if (token.original.len > 0 && - rspamd_tokenize_check_limit (decay, word_decay, res->len, - &hv, &prob, &token, p, len)) { + rspamd_tokenize_check_limit(decay, word_decay, res->len, + &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; - } else { + } + else { token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED; } } @@ -536,15 +536,15 @@ start_over: if (token.original.len > 0) { /* Additional check for number of words */ - if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) { + if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ - msg_err ("too many words found: %d, stop tokenization to avoid DoS", + msg_err("too many words found: %d, stop tokenization to avoid DoS", res->len); goto end; } - g_array_append_val (res, token); + g_array_append_val(res, token); } /* Also check for long text mode */ @@ -553,14 +553,14 @@ start_over: const int words_check_mask = 0x7F; if ((res->len & words_check_mask) == words_check_mask) { - ev_tstamp now = ev_time (); + ev_tstamp now = ev_time(); if (now - start > max_exec_time) { - msg_warn_pool_check ( - "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", - (now - start) * 1e3, max_exec_time * 1e3, - res->len); + msg_warn_pool_check( + "too long time has been spent on tokenization:" + " %.1f ms, limit is %.1f ms; %d words added so far", + (now - start) * 1e3, max_exec_time * 1e3, + res->len); goto end; } @@ -568,12 +568,12 @@ start_over: } last = p; - p = ubrk_next (bi); + p = ubrk_next(bi); if (p != UBRK_DONE && p <= last) { - msg_warn_pool_check ("tokenization reversed back on position %d," - "%d new position (%d backward), likely libicu bug!", - (gint)(p), (gint)(last), last - p); + msg_warn_pool_check("tokenization reversed back on position %d," + "%d new position (%d backward), likely libicu bug!", + (gint) (p), (gint) (last), last - p); goto end; } @@ -582,7 +582,7 @@ start_over: end: if (!decay) { - hv = mum_hash_finish (hv); + hv = mum_hash_finish(hv); } if (hash) { @@ -595,8 +595,8 @@ end: #undef SHIFT_EX static void -rspamd_add_metawords_from_str (const gchar *beg, gsize len, - struct rspamd_task *task) +rspamd_add_metawords_from_str(const gchar *beg, gsize len, + struct rspamd_task *task) { UText utxt = UTEXT_INITIALIZER; UErrorCode uc_err = U_ZERO_ERROR; @@ -605,7 +605,7 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len, gboolean valid_utf = TRUE; while (i < len) { - U8_NEXT (beg, i, len, uc); + U8_NEXT(beg, i, len, uc); if (((gint32) uc) < 0) { valid_utf = FALSE; @@ -613,12 +613,12 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len, } #if U_ICU_VERSION_MAJOR_NUM < 50 - if (u_isalpha (uc)) { - gint32 sc = ublock_getCode (uc); + if (u_isalpha(uc)) { + gint32 sc = ublock_getCode(uc); if (sc == UBLOCK_THAI) { valid_utf = FALSE; - msg_info_task ("enable workaround for Thai characters for old libicu"); + msg_info_task("enable workaround for Thai characters for old libicu"); break; } } @@ -626,101 +626,100 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len, } if (valid_utf) { - utext_openUTF8 (&utxt, - beg, - len, - &uc_err); + utext_openUTF8(&utxt, + beg, + len, + &uc_err); - task->meta_words = rspamd_tokenize_text (beg, len, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL, - task->meta_words, - task->task_pool); + task->meta_words = rspamd_tokenize_text(beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, + task->meta_words, + task->task_pool); - utext_close (&utxt); + utext_close(&utxt); } else { - task->meta_words = rspamd_tokenize_text (beg, len, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL, task->meta_words, - task->task_pool); + task->meta_words = rspamd_tokenize_text(beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, task->meta_words, + task->task_pool); } } -void -rspamd_tokenize_meta_words (struct rspamd_task *task) +void rspamd_tokenize_meta_words(struct rspamd_task *task) { guint i = 0; rspamd_stat_token_t *tok; - if (MESSAGE_FIELD (task, subject)) { - rspamd_add_metawords_from_str (MESSAGE_FIELD (task, subject), - strlen (MESSAGE_FIELD (task, subject)), task); + if (MESSAGE_FIELD(task, subject)) { + rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject), + strlen(MESSAGE_FIELD(task, subject)), task); } - if (MESSAGE_FIELD (task, from_mime) && MESSAGE_FIELD (task, from_mime)->len > 0) { + if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) { struct rspamd_email_address *addr; - addr = g_ptr_array_index (MESSAGE_FIELD (task, from_mime), 0); + addr = g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0); if (addr->name) { - rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task); + rspamd_add_metawords_from_str(addr->name, strlen(addr->name), task); } } if (task->meta_words != NULL) { const gchar *language = NULL; - if (MESSAGE_FIELD (task, text_parts) && - MESSAGE_FIELD (task, text_parts)->len > 0) { - struct rspamd_mime_text_part *tp = g_ptr_array_index ( - MESSAGE_FIELD (task, text_parts), 0); + if (MESSAGE_FIELD(task, text_parts) && + MESSAGE_FIELD(task, text_parts)->len > 0) { + struct rspamd_mime_text_part *tp = g_ptr_array_index( + MESSAGE_FIELD(task, text_parts), 0); if (tp->language) { language = tp->language; } } - rspamd_normalize_words (task->meta_words, task->task_pool); - rspamd_stem_words (task->meta_words, task->task_pool, language, - task->lang_det); + rspamd_normalize_words(task->meta_words, task->task_pool); + rspamd_stem_words(task->meta_words, task->task_pool, language, + task->lang_det); for (i = 0; i < task->meta_words->len; i++) { - tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i); + tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; } } } static inline void -rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, - rspamd_stat_token_t *tok, - rspamd_mempool_t *pool) +rspamd_uchars_to_ucs32(const UChar *src, gsize srclen, + rspamd_stat_token_t *tok, + rspamd_mempool_t *pool) { UChar32 *dest, t, *d; gint32 i = 0; - dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32)); + dest = rspamd_mempool_alloc(pool, srclen * sizeof(UChar32)); d = dest; while (i < srclen) { - U16_NEXT_UNSAFE (src, i, t); + U16_NEXT_UNSAFE(src, i, t); - if (u_isgraph (t)) { + if (u_isgraph(t)) { UCharCategory cat; - cat = u_charType (t); + cat = u_charType(t); #if U_ICU_VERSION_MAJOR_NUM >= 57 - if (u_hasBinaryProperty (t, UCHAR_EMOJI)) { + if (u_hasBinaryProperty(t, UCHAR_EMOJI)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI; } #endif if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) || - cat == U_CONNECTOR_PUNCTUATION || - cat == U_MATH_SYMBOL || - cat == U_CURRENCY_SYMBOL) { - *d++ = u_tolower (t); + cat == U_CONNECTOR_PUNCTUATION || + cat == U_MATH_SYMBOL || + cat == U_CURRENCY_SYMBOL) { + *d++ = u_tolower(t); } } else { @@ -734,52 +733,51 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, } static inline void -rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok, - rspamd_mempool_t *pool) +rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok, + rspamd_mempool_t *pool) { guint i, doff = 0; gsize utflen = 0; gchar *dest; UChar32 t; - for (i = 0; i < tok->unicode.len; i ++) { - utflen += U8_LENGTH (tok->unicode.begin[i]); + for (i = 0; i < tok->unicode.len; i++) { + utflen += U8_LENGTH(tok->unicode.begin[i]); } - dest = rspamd_mempool_alloc (pool, utflen + 1); + dest = rspamd_mempool_alloc(pool, utflen + 1); - for (i = 0; i < tok->unicode.len; i ++) { + for (i = 0; i < tok->unicode.len; i++) { t = tok->unicode.begin[i]; - U8_APPEND_UNSAFE (dest, doff, t); + U8_APPEND_UNSAFE(dest, doff, t); } - g_assert (doff <= utflen); + g_assert(doff <= utflen); dest[doff] = '\0'; tok->normalized.len = doff; tok->normalized.begin = dest; } -void -rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) +void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool) { UErrorCode uc_err = U_ZERO_ERROR; UConverter *utf8_converter; UChar tmpbuf[1024]; /* Assume that we have no longer words... */ gsize ulen; - utf8_converter = rspamd_get_utf8_converter (); + utf8_converter = rspamd_get_utf8_converter(); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - ulen = ucnv_toUChars (utf8_converter, - tmpbuf, - G_N_ELEMENTS (tmpbuf), - tok->original.begin, - tok->original.len, - &uc_err); + ulen = ucnv_toUChars(utf8_converter, + tmpbuf, + G_N_ELEMENTS(tmpbuf), + tok->original.begin, + tok->original.len, + &uc_err); /* Now, we need to understand if we need to normalise the word */ - if (!U_SUCCESS (uc_err)) { + if (!U_SUCCESS(uc_err)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; tok->unicode.begin = NULL; tok->unicode.len = 0; @@ -788,14 +786,14 @@ rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) } else { #if U_ICU_VERSION_MAJOR_NUM >= 44 - const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); + const UNormalizer2 *norm = rspamd_get_unicode_normalizer(); gint32 end; /* We can now check if we need to decompose */ - end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err); + end = unorm2_spanQuickCheckYes(norm, tmpbuf, ulen, &uc_err); - if (!U_SUCCESS (uc_err)) { - rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); + if (!U_SUCCESS(uc_err)) { + rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool); tok->normalized.begin = NULL; tok->normalized.len = 0; tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; @@ -803,46 +801,46 @@ rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) else { if (end == ulen) { /* Already normalised, just lowercase */ - rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); - rspamd_ucs32_to_normalised (tok, pool); + rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised(tok, pool); } else { /* Perform normalization */ UChar normbuf[1024]; - g_assert (end < G_N_ELEMENTS (normbuf)); + g_assert(end < G_N_ELEMENTS(normbuf)); /* First part */ - memcpy (normbuf, tmpbuf, end * sizeof (UChar)); + memcpy(normbuf, tmpbuf, end * sizeof(UChar)); /* Second part */ - ulen = unorm2_normalizeSecondAndAppend (norm, - normbuf, end, - G_N_ELEMENTS (normbuf), - tmpbuf + end, - ulen - end, - &uc_err); - - if (!U_SUCCESS (uc_err)) { + ulen = unorm2_normalizeSecondAndAppend(norm, + normbuf, end, + G_N_ELEMENTS(normbuf), + tmpbuf + end, + ulen - end, + &uc_err); + + if (!U_SUCCESS(uc_err)) { if (uc_err != U_BUFFER_OVERFLOW_ERROR) { - msg_warn_pool_check ("cannot normalise text '%*s': %s", - (gint)tok->original.len, tok->original.begin, - u_errorName (uc_err)); - rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); - rspamd_ucs32_to_normalised (tok, pool); + msg_warn_pool_check("cannot normalise text '%*s': %s", + (gint) tok->original.len, tok->original.begin, + u_errorName(uc_err)); + rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised(tok, pool); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; } } else { /* Copy normalised back */ - rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool); + rspamd_uchars_to_ucs32(normbuf, ulen, tok, pool); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED; - rspamd_ucs32_to_normalised (tok, pool); + rspamd_ucs32_to_normalised(tok, pool); } } } #else /* Legacy version with no unorm2 interface */ - rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); - rspamd_ucs32_to_normalised (tok, pool); + rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised(tok, pool); #endif } } @@ -851,31 +849,29 @@ rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) /* Simple lowercase */ gchar *dest; - dest = rspamd_mempool_alloc (pool, tok->original.len + 1); - rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1); - rspamd_str_lc (dest, tok->original.len); + dest = rspamd_mempool_alloc(pool, tok->original.len + 1); + rspamd_strlcpy(dest, tok->original.begin, tok->original.len + 1); + rspamd_str_lc(dest, tok->original.len); tok->normalized.len = tok->original.len; tok->normalized.begin = dest; } } } -void -rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) +void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool) { rspamd_stat_token_t *tok; guint i; for (i = 0; i < words->len; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - rspamd_normalize_single_word (tok, pool); + tok = &g_array_index(words, rspamd_stat_token_t, i); + rspamd_normalize_single_word(tok, pool); } } -void -rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, - const gchar *language, - struct rspamd_lang_detector *d) +void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, + const gchar *language, + struct rspamd_lang_detector *d) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; @@ -885,49 +881,49 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, gsize dlen; if (!stemmers) { - stemmers = g_hash_table_new (rspamd_strcase_hash, - rspamd_strcase_equal); + stemmers = g_hash_table_new(rspamd_strcase_hash, + rspamd_strcase_equal); } if (language && language[0] != '\0') { - stem = g_hash_table_lookup (stemmers, language); + stem = g_hash_table_lookup(stemmers, language); if (stem == NULL) { - stem = sb_stemmer_new (language, "UTF_8"); + stem = sb_stemmer_new(language, "UTF_8"); if (stem == NULL) { - msg_debug_pool ( - "<%s> cannot create lemmatizer for %s language", - language); - g_hash_table_insert (stemmers, g_strdup (language), - GINT_TO_POINTER (-1)); + msg_debug_pool( + "<%s> cannot create lemmatizer for %s language", + language); + g_hash_table_insert(stemmers, g_strdup(language), + GINT_TO_POINTER(-1)); } else { - g_hash_table_insert (stemmers, g_strdup (language), - stem); + g_hash_table_insert(stemmers, g_strdup(language), + stem); } } - else if (stem == GINT_TO_POINTER (-1)) { + else if (stem == GINT_TO_POINTER(-1)) { /* Negative cache */ stem = NULL; } } for (i = 0; i < words->len; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); + tok = &g_array_index(words, rspamd_stat_token_t, i); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { const gchar *stemmed = NULL; - stemmed = sb_stemmer_stem (stem, - tok->normalized.begin, tok->normalized.len); + stemmed = sb_stemmer_stem(stem, + tok->normalized.begin, tok->normalized.len); - dlen = stemmed ? strlen (stemmed) : 0; + dlen = stemmed ? strlen(stemmed) : 0; if (dlen > 0) { - dest = rspamd_mempool_alloc (pool, dlen + 1); - memcpy (dest, stemmed, dlen); + dest = rspamd_mempool_alloc(pool, dlen + 1); + memcpy(dest, stemmed, dlen); dest[dlen] = '\0'; tok->stemmed.len = dlen; tok->stemmed.begin = dest; @@ -945,7 +941,7 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, } if (tok->stemmed.len > 0 && d != NULL && - rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) { + rspamd_language_detector_is_stop_word(d, tok->stemmed.begin, tok->stemmed.len)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD; } } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index ca7261802..e908c359d 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -11,7 +11,7 @@ #define RSPAMD_DEFAULT_TOKENIZER "osb" -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif @@ -22,15 +22,15 @@ struct rspamd_stat_ctx; struct rspamd_stat_tokenizer { gchar *name; - gpointer (*get_config) (rspamd_mempool_t *pool, - struct rspamd_tokenizer_config *cf, gsize *len); + gpointer (*get_config)(rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, gsize *len); - gint (*tokenize_func) (struct rspamd_stat_ctx *ctx, - struct rspamd_task *task, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result); + gint (*tokenize_func)(struct rspamd_stat_ctx *ctx, + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result); }; enum rspamd_tokenize_type { @@ -40,44 +40,44 @@ enum rspamd_tokenize_type { }; /* Compare two token nodes */ -gint token_node_compare_func (gconstpointer a, gconstpointer b); +gint token_node_compare_func(gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_stat_token_t type) */ -GArray *rspamd_tokenize_text (const gchar *text, gsize len, - const UText *utxt, - enum rspamd_tokenize_type how, - struct rspamd_config *cfg, - GList *exceptions, - guint64 *hash, - GArray *cur_words, - rspamd_mempool_t *pool); +GArray *rspamd_tokenize_text(const gchar *text, gsize len, + const UText *utxt, + enum rspamd_tokenize_type how, + struct rspamd_config *cfg, + GList *exceptions, + guint64 *hash, + GArray *cur_words, + rspamd_mempool_t *pool); /* OSB tokenize function */ -gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, - struct rspamd_task *task, - GArray *words, - gboolean is_utf, - const gchar *prefix, - GPtrArray *result); +gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result); -gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, - struct rspamd_tokenizer_config *cf, - gsize *len); +gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, + gsize *len); struct rspamd_lang_detector; -void rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool); +void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool); -void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool); +void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool); -void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, - const gchar *language, - struct rspamd_lang_detector *d); +void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, + const gchar *language, + struct rspamd_lang_detector *d); -void rspamd_tokenize_meta_words (struct rspamd_task *task); +void rspamd_tokenize_meta_words(struct rspamd_task *task); -#ifdef __cplusplus +#ifdef __cplusplus } #endif |