summary | refs | log | tree | commit | diff | stats
path: root/src/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:57:31 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:57:31 +0100
commit: 379055dbbb4af997b4d3ffb161d447872d7ca357 (patch)
tree: 3774553d470f93e12ddeb454aad9b3b607cf8918 /src/tokenizers
parent: 602ae7a0b7e215ba2677131b8fdc70abc156b3ca (diff)
download: rspamd-379055dbbb4af997b4d3ffb161d447872d7ca357.tar.gz
download: rspamd-379055dbbb4af997b4d3ffb161d447872d7ca357.zip
Unify style without sorting headers.
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- src/tokenizers/osb.c 38
-rw-r--r-- src/tokenizers/tokenizers.c 131
-rw-r--r-- src/tokenizers/tokenizers.h 24
3 files changed, 114 insertions, 79 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 823e1e5b5..b74441eca 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -32,26 +32,34 @@
/* Minimum length of token */
#define MIN_LEN 4
-extern const int primes[];
+extern const int primes[];
int
-osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t * input, GTree ** tree,
- gboolean save_token, gboolean is_utf, GList *exceptions)
+osb_tokenize_text (struct tokenizer *tokenizer,
+ rspamd_mempool_t * pool,
+ f_str_t * input,
+ GTree ** tree,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions)
{
- token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 };
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, l, processed = 0;
- gchar *res;
+ token_node_t *new = NULL;
+ f_str_t token = { NULL, 0, 0 };
+ guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+ gint i, l, processed = 0;
+ gchar *res;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+ rspamd_mempool_add_destructor (pool,
+ (rspamd_mempool_destruct_t) g_tree_destroy,
+ *tree);
}
memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
- while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
+ while ((res =
+ tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
/* Skip small words */
if (is_utf) {
l = g_utf8_strlen (token.begin, token.len);
@@ -67,7 +75,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t
if (processed < FEATURE_WINDOW_SIZE) {
/* Just fill a hashpipe */
hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- fstrhash_lowercase (&token, is_utf);
+ fstrhash_lowercase (&token, is_utf);
}
else {
/* Shift hashpipe */
@@ -75,16 +83,18 @@ osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, f_str_t
hashpipe[i] = hashpipe[i - 1];
}
hashpipe[0] = fstrhash_lowercase (&token, is_utf);
- processed ++;
+ processed++;
for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+ h2 = hashpipe[0] * primes[1] + hashpipe[i] *
+ primes[(i << 1) - 1];
new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
new->h1 = h1;
new->h2 = h2;
if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
+ new->extra =
+ (uintptr_t)rspamd_mempool_fstrdup (pool, &token);
}
if (g_tree_lookup (*tree, new) == NULL) {
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 448dcd53e..eb7a489e5 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -30,11 +30,11 @@
#include "main.h"
#include "tokenizers.h"
-struct tokenizer tokenizers[] = {
+struct tokenizer tokenizers[] = {
{"osb-text", osb_tokenize_text, get_next_word},
};
-const int primes[] = {
+const int primes[] = {
1, 7,
3, 13,
5, 29,
@@ -48,38 +48,38 @@ const int primes[] = {
};
const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0
};
-struct tokenizer *
+struct tokenizer *
get_tokenizer (const char *name)
{
- guint i;
+ guint i;
for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
if (strcmp (tokenizers[i].name, name) == 0) {
@@ -93,7 +93,7 @@ get_tokenizer (const char *name)
int
token_node_compare_func (gconstpointer a, gconstpointer b)
{
- const token_node_t *aa = a, *bb = b;
+ const token_node_t *aa = a, *bb = b;
if (aa->h1 == bb->h1) {
return aa->h2 - bb->h2;
@@ -106,9 +106,9 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
gchar *
get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
{
- gsize remain, pos;
- guchar *p;
- struct process_exception *ex = NULL;
+ gsize remain, pos;
+ guchar *p;
+ struct process_exception *ex = NULL;
if (buf == NULL) {
return NULL;
@@ -165,7 +165,7 @@ get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
token->len++;
pos++;
remain--;
- p ++;
+ p++;
}
if (remain == 0) {
@@ -177,30 +177,34 @@ get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
/* Struct to access gmime headers */
struct raw_header {
- struct raw_header *next;
- char *name;
- char *value;
+ struct raw_header *next;
+ char *name;
+ char *value;
};
typedef struct _GMimeHeader {
- GHashTable *hash;
- GHashTable *writers;
- struct raw_header *headers;
+ GHashTable *hash;
+ GHashTable *writers;
+ struct raw_header *headers;
} local_GMimeHeader;
int
-tokenize_headers (rspamd_mempool_t * pool, struct rspamd_task *task, GTree ** tree)
+tokenize_headers (rspamd_mempool_t * pool,
+ struct rspamd_task *task,
+ GTree ** tree)
{
- token_node_t *new = NULL;
- f_str_t headername;
- f_str_t headervalue;
+ token_node_t *new = NULL;
+ f_str_t headername;
+ f_str_t headervalue;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+ rspamd_mempool_add_destructor (pool,
+ (rspamd_mempool_destruct_t) g_tree_destroy,
+ *tree);
}
#ifndef GMIME24
- struct raw_header *h;
+ struct raw_header *h;
h = GMIME_OBJECT (task->message)->headers->headers;
while (h) {
@@ -219,10 +223,10 @@ tokenize_headers (rspamd_mempool_t * pool, struct rspamd_task *task, GTree ** tr
h = h->next;
}
#else
- GMimeHeaderList *ls;
- GMimeHeaderIter *iter;
- const char *name;
- const char *value;
+ GMimeHeaderList *ls;
+ GMimeHeaderIter *iter;
+ const char *name;
+ const char *value;
ls = GMIME_OBJECT (task->message)->headers;
iter = g_mime_header_iter_new ();
@@ -254,13 +258,14 @@ tokenize_headers (rspamd_mempool_t * pool, struct rspamd_task *task, GTree ** tr
void
tokenize_subject (struct rspamd_task *task, GTree ** tree)
{
- f_str_t subject;
- const gchar *sub;
- struct tokenizer *osb_tokenizer;
+ f_str_t subject;
+ const gchar *sub;
+ struct tokenizer *osb_tokenizer;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
}
osb_tokenizer = get_tokenizer ("osb-text");
@@ -269,12 +274,24 @@ tokenize_subject (struct rspamd_task *task, GTree ** tree)
if (task->subject != NULL) {
subject.begin = task->subject;
subject.len = strlen (task->subject);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
+ osb_tokenizer->tokenize_func (osb_tokenizer,
+ task->task_pool,
+ &subject,
+ tree,
+ FALSE,
+ TRUE,
+ NULL);
}
if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
subject.begin = (gchar *)sub;
subject.len = strlen (sub);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
+ osb_tokenizer->tokenize_func (osb_tokenizer,
+ task->task_pool,
+ &subject,
+ tree,
+ FALSE,
+ TRUE,
+ NULL);
}
}
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 207602dc8..883f38058 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -19,22 +19,30 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
gchar *name;
- gint (*tokenize_func)(struct tokenizer *tokenizer, rspamd_mempool_t *pool, f_str_t *input,
- GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
- gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions);
+ gint (*tokenize_func)(struct tokenizer *tokenizer, rspamd_mempool_t *pool,
+ f_str_t *input,
+ GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
+ gchar * (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions);
};
/* Compare two token nodes */
int token_node_compare_func (gconstpointer a, gconstpointer b);
/* Get tokenizer structure by name or return NULL if this name is not found */
-struct tokenizer* get_tokenizer (const char *name);
+struct tokenizer * get_tokenizer (const char *name);
/* Get next word from specified f_str_t buf */
-gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions);
+gchar * get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t *pool, f_str_t *input,
- GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
+int osb_tokenize_text (struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ f_str_t *input,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions);
/* Common tokenizer for headers */
-int tokenize_headers (rspamd_mempool_t *pool, struct rspamd_task *task, GTree **cur);
+int tokenize_headers (rspamd_mempool_t *pool,
+ struct rspamd_task *task,
+ GTree **cur);
/* Make tokens for a subject */
void tokenize_subject (struct rspamd_task *task, GTree ** tree);