summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rspamd.com> 2024-03-18 18:56:33 +0000
committer Vsevolod Stakhov <vsevolod@rspamd.com> 2024-03-18 18:56:33 +0000
commit 14c13854d3cae9d93c3d148be30fb72f1eaffe55 (patch)
tree 7b1a3e41b75490fac4d45722c90a1847543c6796 /src/libstat/tokenizers
parent 6b2b4167187fee09365271cca182866ecb029af3 (diff)
download rspamd-14c13854d3cae9d93c3d148be30fb72f1eaffe55.tar.gz
rspamd-14c13854d3cae9d93c3d148be30fb72f1eaffe55.zip
[Rework] Further types conversion (no functional changes)
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/osb.c 28
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 64
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 32
3 files changed, 62 insertions, 62 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 3f770c69e..039ead231 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -49,7 +49,7 @@ static const int primes[] = {
3277,
};
-static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'};
+static const unsigned char osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'};
enum rspamd_osb_hash_type {
RSPAMD_OSB_HASH_COMPAT = 0,
@@ -58,7 +58,7 @@ enum rspamd_osb_hash_type {
};
struct rspamd_osb_tokenizer_config {
- guchar magic[8];
+ unsigned char magic[8];
gshort version;
gshort window_size;
enum rspamd_osb_hash_type ht;
@@ -92,7 +92,7 @@ rspamd_tokenizer_osb_config_from_ucl(rspamd_mempool_t *pool,
{
const ucl_object_t *elt;
struct rspamd_osb_tokenizer_config *cf, *def;
- guchar *key = NULL;
+ unsigned char *key = NULL;
gsize keylen;
@@ -266,12 +266,12 @@ struct token_pipe_entry {
rspamd_stat_token_t *t;
};
-gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
- struct rspamd_task *task,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result)
+int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const char *prefix,
+ GPtrArray *result)
{
rspamd_token_t *new_tok = NULL;
rspamd_stat_token_t *token;
@@ -280,7 +280,7 @@ gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
struct token_pipe_entry *hashpipe;
uint32_t h1, h2;
gsize token_size;
- guint processed = 0, i, w, window_size, token_flags = 0;
+ unsigned int processed = 0, i, w, window_size, token_flags = 0;
if (words == NULL) {
return FALSE;
@@ -309,7 +309,7 @@ gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
for (w = 0; w < words->len; w++) {
token = &g_array_index(words, rspamd_stat_token_t, w);
token_flags = token->flags;
- const gchar *begin;
+ const char *begin;
gsize len;
if (token->flags &
@@ -341,7 +341,7 @@ gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
begin, len, osb_cf->seed);
}
else {
- rspamd_cryptobox_siphash((guchar *) &cur, begin,
+ rspamd_cryptobox_siphash((unsigned char *) &cur, begin,
len, osb_cf->sk);
if (prefix) {
@@ -373,8 +373,8 @@ gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
((uint32_t) hashpipe[i].h) * primes[i << 1]; \
h2 = ((uint32_t) hashpipe[0].h) * primes[1] + \
((uint32_t) hashpipe[i].h) * primes[(i << 1) - 1]; \
- memcpy((guchar *) &new_tok->data, &h1, sizeof(h1)); \
- memcpy(((guchar *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2)); \
+ memcpy((unsigned char *) &new_tok->data, &h1, sizeof(h1)); \
+ memcpy(((unsigned char *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2)); \
} \
else { \
new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 702668142..0ea1bcfc6 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -35,11 +35,11 @@
#include <math.h>
-typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, gchar const **pos,
+typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos,
rspamd_stat_token_t *token,
GList **exceptions, gsize *rl, gboolean check_signature);
-const gchar t_delimiters[256] = {
+const char t_delimiters[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -70,11 +70,11 @@ const gchar t_delimiters[256] = {
/* Get next word from specified f_str_t buf */
static gboolean
rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
- gchar const **cur, rspamd_stat_token_t *token,
+ char const **cur, rspamd_stat_token_t *token,
GList **exceptions, gsize *rl, gboolean unused)
{
gsize remain, pos;
- const gchar *p;
+ const char *p;
struct rspamd_process_exception *ex = NULL;
if (buf == NULL) {
@@ -127,11 +127,11 @@ rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
pos++;
p++;
remain--;
- } while (remain > 0 && t_delimiters[(guchar) *p]);
+ } while (remain > 0 && t_delimiters[(unsigned char) *p]);
token->original.begin = p;
- while (remain > 0 && !t_delimiters[(guchar) *p]) {
+ while (remain > 0 && !t_delimiters[(unsigned char) *p]) {
if (ex != NULL && ex->pos == pos) {
*exceptions = g_list_next(*exceptions);
*cur = p + ex->len;
@@ -160,15 +160,15 @@ rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
static inline gboolean
rspamd_tokenize_check_limit(gboolean decay,
- guint word_decay,
- guint nwords,
+ unsigned int word_decay,
+ unsigned int nwords,
uint64_t *hv,
uint64_t *prob,
const rspamd_stat_token_t *token,
gssize remain,
gssize total)
{
- static const gdouble avg_word_len = 6.0;
+ static const double avg_word_len = 6.0;
if (!decay) {
if (token->original.len >= sizeof(uint64_t)) {
@@ -180,12 +180,12 @@ rspamd_tokenize_check_limit(gboolean decay,
/* Check for decay */
if (word_decay > 0 && nwords > word_decay && remain < (gssize) total) {
/* Start decay */
- gdouble decay_prob;
+ double decay_prob;
*hv = mum_hash_finish(*hv);
/* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble) word_decay / ((total - (remain)) / avg_word_len) * 10;
+ decay_prob = (double) word_decay / ((total - (remain)) / avg_word_len) * 10;
decay_prob = floor(decay_prob) / 10.0;
if (decay_prob >= 1.0) {
@@ -212,10 +212,10 @@ rspamd_tokenize_check_limit(gboolean decay,
}
static inline gboolean
-rspamd_utf_word_valid(const guchar *text, const guchar *end,
+rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end,
int32_t start, int32_t finish)
{
- const guchar *st = text + start, *fin = text + finish;
+ const unsigned char *st = text + start, *fin = text + finish;
UChar32 c;
if (st >= end || fin > end || st >= fin) {
@@ -278,7 +278,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
GArray *
-rspamd_tokenize_text(const gchar *text, gsize len,
+rspamd_tokenize_text(const char *text, gsize len,
const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
@@ -288,11 +288,11 @@ rspamd_tokenize_text(const gchar *text, gsize len,
rspamd_mempool_t *pool)
{
rspamd_stat_token_t token, buf;
- const gchar *pos = NULL;
+ const char *pos = NULL;
gsize l = 0;
GArray *res;
GList *cur = exceptions;
- guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+ unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
uint64_t hv = 0;
gboolean decay = FALSE, long_text_mode = FALSE;
uint64_t prob = 0;
@@ -429,7 +429,7 @@ rspamd_tokenize_text(const gchar *text, gsize len,
msg_warn_pool_check(
"tokenization reversed back on position %d,"
"%d new position (%d backward), likely libicu bug!",
- (gint) (p), (gint) (old_p), old_p - p);
+ (int) (p), (int) (old_p), old_p - p);
goto end;
}
@@ -468,7 +468,7 @@ rspamd_tokenize_text(const gchar *text, gsize len,
msg_warn_pool_check(
"tokenization reversed back on position %d,"
"%d new position (%d backward), likely libicu bug!",
- (gint) (p), (gint) (old_p), old_p - p);
+ (int) (p), (int) (old_p), old_p - p);
goto end;
}
@@ -573,7 +573,7 @@ rspamd_tokenize_text(const gchar *text, gsize len,
if (p != UBRK_DONE && p <= last) {
msg_warn_pool_check("tokenization reversed back on position %d,"
"%d new position (%d backward), likely libicu bug!",
- (gint) (p), (gint) (last), last - p);
+ (int) (p), (int) (last), last - p);
goto end;
}
@@ -595,12 +595,12 @@ end:
#undef SHIFT_EX
static void
-rspamd_add_metawords_from_str(const gchar *beg, gsize len,
+rspamd_add_metawords_from_str(const char *beg, gsize len,
struct rspamd_task *task)
{
UText utxt = UTEXT_INITIALIZER;
UErrorCode uc_err = U_ZERO_ERROR;
- guint i = 0;
+ unsigned int i = 0;
UChar32 uc;
gboolean valid_utf = TRUE;
@@ -649,7 +649,7 @@ rspamd_add_metawords_from_str(const gchar *beg, gsize len,
void rspamd_tokenize_meta_words(struct rspamd_task *task)
{
- guint i = 0;
+ unsigned int i = 0;
rspamd_stat_token_t *tok;
if (MESSAGE_FIELD(task, subject)) {
@@ -668,7 +668,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
}
if (task->meta_words != NULL) {
- const gchar *language = NULL;
+ const char *language = NULL;
if (MESSAGE_FIELD(task, text_parts) &&
MESSAGE_FIELD(task, text_parts)->len > 0) {
@@ -736,9 +736,9 @@ static inline void
rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
rspamd_mempool_t *pool)
{
- guint i, doff = 0;
+ unsigned int i, doff = 0;
gsize utflen = 0;
- gchar *dest;
+ char *dest;
UChar32 t;
for (i = 0; i < tok->unicode.len; i++) {
@@ -822,7 +822,7 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
if (!U_SUCCESS(uc_err)) {
if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
msg_warn_pool_check("cannot normalise text '%*s': %s",
- (gint) tok->original.len, tok->original.begin,
+ (int) tok->original.len, tok->original.begin,
u_errorName(uc_err));
rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
rspamd_ucs32_to_normalised(tok, pool);
@@ -847,7 +847,7 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
else {
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
/* Simple lowercase */
- gchar *dest;
+ char *dest;
dest = rspamd_mempool_alloc(pool, tok->original.len + 1);
rspamd_strlcpy(dest, tok->original.begin, tok->original.len + 1);
@@ -861,7 +861,7 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
{
rspamd_stat_token_t *tok;
- guint i;
+ unsigned int i;
for (i = 0; i < words->len; i++) {
tok = &g_array_index(words, rspamd_stat_token_t, i);
@@ -870,14 +870,14 @@ void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
}
void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
- const gchar *language,
+ const char *language,
struct rspamd_lang_detector *lang_detector)
{
static GHashTable *stemmers = NULL;
struct sb_stemmer *stem = NULL;
- guint i;
+ unsigned int i;
rspamd_stat_token_t *tok;
- gchar *dest;
+ char *dest;
gsize dlen;
if (!stemmers) {
@@ -914,7 +914,7 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
if (stem) {
- const gchar *stemmed = NULL;
+ const char *stemmed = NULL;
stemmed = sb_stemmer_stem(stem,
tok->normalized.begin, tok->normalized.len);
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index ff5c530c5..d4a8824a8 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -36,17 +36,17 @@ struct rspamd_stat_ctx;
/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
- gchar *name;
+ char *name;
gpointer (*get_config)(rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf, gsize *len);
- gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
- struct rspamd_task *task,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+ int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const char *prefix,
+ GPtrArray *result);
};
enum rspamd_tokenize_type {
@@ -56,11 +56,11 @@ enum rspamd_tokenize_type {
};
/* Compare two token nodes */
-gint token_node_compare_func(gconstpointer a, gconstpointer b);
+int token_node_compare_func(gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const gchar *text, gsize len,
+GArray *rspamd_tokenize_text(const char *text, gsize len,
const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
@@ -70,12 +70,12 @@ GArray *rspamd_tokenize_text(const gchar *text, gsize len,
rspamd_mempool_t *pool);
/* OSB tokenize function */
-gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
- struct rspamd_task *task,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const char *prefix,
+ GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
@@ -88,7 +88,7 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
- const gchar *language,
+ const char *language,
struct rspamd_lang_detector *lang_detector);
void rspamd_tokenize_meta_words(struct rspamd_task *task);