aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-24 20:25:54 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-24 20:25:54 +0400
commita3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch)
tree352c634bbbc74cf17644545ace66a8feedc841c3 /src
parent63725086863e4f422340479f83dd7ef374613e76 (diff)
downloadrspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz
rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip
* Welcome 0.4.0
Uncompatible changes: - Statistics is uncompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URL's in statistics - Improve speed of bayes classifier by using integer arithmetics - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of legacy code - Types polishing
Diffstat (limited to 'src')
-rw-r--r--src/binlog.c9
-rw-r--r--src/cfg_file.h2
-rw-r--r--src/cfg_utils.c3
-rw-r--r--src/cfg_xml.c12
-rw-r--r--src/classifiers/bayes.c31
-rw-r--r--src/controller.c21
-rw-r--r--src/filter.c10
-rw-r--r--src/fstring.c95
-rw-r--r--src/fstring.h4
-rw-r--r--src/fuzzy.c22
-rw-r--r--src/main.h7
-rw-r--r--src/mem_pool.c20
-rw-r--r--src/message.c21
-rw-r--r--src/statfile.c8
-rw-r--r--src/statfile_sync.c39
-rw-r--r--src/tokenizers/osb.c57
-rw-r--r--src/tokenizers/tokenizers.c64
-rw-r--r--src/tokenizers/tokenizers.h16
-rw-r--r--src/url.c12
-rw-r--r--src/url.h3
20 files changed, 274 insertions, 182 deletions
diff --git a/src/binlog.c b/src/binlog.c
index 76cfa0cca..2a1eb9fcb 100644
--- a/src/binlog.c
+++ b/src/binlog.c
@@ -120,7 +120,7 @@ binlog_check_file (struct rspamd_binlog *log)
return FALSE;
}
- log->cur_seq = log->cur_idx->last_index;
+ log->cur_seq = log->metaindex->last_index * BINLOG_IDX_LEN + log->cur_idx->last_index;
log->cur_time = log->cur_idx->indexes[log->cur_idx->last_index].time;
return TRUE;
@@ -425,6 +425,13 @@ binlog_sync (struct rspamd_binlog *log, guint64 from_rev, guint64 *from_time, GB
*rep = NULL;
return FALSE;
}
+ else if (from_rev > log->cur_seq) {
+ /* Slave has more actual copy, write this to log and abort sync */
+ msg_warn ("slave has more recent revision of statfile %s: %uL and our is: %uL", log->filename, from_rev, log->cur_seq);
+ *rep = NULL;
+ *from_time = 0;
+ return FALSE;
+ }
metaindex_num = from_rev / BINLOG_IDX_LEN;
/* First of all try to find this revision */
diff --git a/src/cfg_file.h b/src/cfg_file.h
index 9e10cd8fc..268906d40 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -270,6 +270,8 @@ struct config_file {
gboolean log_extended; /**< log extended information */
gsize max_statfile_size; /**< maximum size for statfile */
+ guint32 statfile_sync_interval; /**< synchronization interval */
+ guint32 statfile_sync_timeout; /**< synchronization timeout */
struct memcached_server memcached_servers[MAX_MEMCACHED_SERVERS]; /**< memcached servers */
gsize memcached_servers_num; /**< number of memcached servers */
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 99a4bc3d9..720d931ef 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -172,6 +172,9 @@ init_defaults (struct config_file *cfg)
cfg->dns_throttling_errors = 20;
cfg->dns_throttling_time = 10000;
+ cfg->statfile_sync_interval = 60000;
+ cfg->statfile_sync_timeout = 20000;
+
cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
diff --git a/src/cfg_xml.c b/src/cfg_xml.c
index fba925cc6..09f6574a0 100644
--- a/src/cfg_xml.c
+++ b/src/cfg_xml.c
@@ -281,6 +281,18 @@ static struct xml_parser_rule grammar[] = {
G_STRUCT_OFFSET (struct config_file, filters_str),
NULL
},
+ {
+ "sync_interval",
+ xml_handle_seconds,
+ G_STRUCT_OFFSET (struct config_file, statfile_sync_interval),
+ NULL
+ },
+ {
+ "sync_timeout",
+ xml_handle_seconds,
+ G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout),
+ NULL
+ },
NULL_ATTR
},
NULL_DEF_ATTR
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index b4f7826e5..dadd33e5e 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -43,11 +43,11 @@ bayes_error_quark (void)
}
struct bayes_statfile_data {
- double hits;
- double total_hits;
+ guint64 hits;
+ guint64 total_hits;
double local_probability;
double post_probability;
- double value;
+ guint value;
struct statfile *st;
stat_file_t *file;
};
@@ -67,25 +67,22 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data)
{
token_node_t *node = key;
struct bayes_callback_data *cd = data;
- double v, c;
+ gint v, c;
c = (cd->in_class) ? 1 : -1;
/* Consider that not found blocks have value 1 */
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
- if (fabs (v) < ALPHA && c > 0) {
+ if (v == 0 && c > 0) {
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
}
else {
- if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) {
- v += c;
+ if (G_LIKELY (c > 0)) {
+ v ++;
}
else if (c < 0){
- if (v > -c) {
- v -= c;
- }
- else {
- v = 0;
+ if (v != 0) {
+ v --;
}
}
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
@@ -103,14 +100,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
token_node_t *node = key;
struct bayes_callback_data *cd = data;
- double local_hits = 0, renorm = 0;
- int i;
+ double renorm = 0;
+ gint i;
+ guint64 local_hits = 0;
struct bayes_statfile_data *cur;
for (i = 0; i < cd->statfiles_num; i ++) {
cur = &cd->statfiles[i];
cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
- if (cur->value > ALPHA) {
+ if (cur->value > 0) {
cur->total_hits += cur->value;
cur->hits = cur->value;
local_hits += cur->value;
@@ -121,7 +119,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
}
for (i = 0; i < cd->statfiles_num; i ++) {
cur = &cd->statfiles[i];
- cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0));
+ cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) /
+ (LOCAL_PROB_DENOM * (1.0 + local_hits));
renorm += cur->post_probability * cur->local_probability;
}
diff --git a/src/controller.c b/src/controller.c
index f899518fe..e7e4c10d8 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -212,7 +212,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
struct statfile *st;
gchar out_buf[BUFSIZ];
gint i;
- guint64 rev, ti, len, pos;
+ guint64 rev, ti, len, pos, blocks;
gchar *out;
struct rspamd_binlog_element log_elt;
struct stat_file_block *stat_elt;
@@ -222,7 +222,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
if (statfile == NULL) {
return FALSE;
}
-
+
/* Begin to copy all blocks into array */
statfile_get_revision (statfile, &rev, (time_t *)&ti);
if (ti == 0) {
@@ -230,10 +230,13 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct
ti = time (NULL);
statfile_set_revision (statfile, rev, ti);
}
- len = statfile->cur_section.length * sizeof (struct rspamd_binlog_element);
+ msg_info ("send a whole statfile %s with version %uL to slave", symbol, rev);
+
+ blocks = statfile_get_total_blocks (statfile);
+ len = blocks * sizeof (struct rspamd_binlog_element);
out = memory_pool_alloc (session->session_pool, len);
- for (i = 0, pos = 0; i < statfile->cur_section.length; i ++) {
+ for (i = 0, pos = 0; i < blocks; i ++) {
stat_elt = (struct stat_file_block *)((u_char *)statfile->map + statfile->seek_pos + i * sizeof (struct stat_file_block));
if (fabs (stat_elt->value) > 0.001) {
/* Write only those values which value is not 0 */
@@ -324,6 +327,7 @@ process_sync_command (struct controller_session *session, gchar **args)
}
while (binlog_sync (binlog, rev, &time, &data)) {
+ rev ++;
r = rspamd_snprintf (out_buf, sizeof (out_buf), "%uL %uL %z" CRLF, rev, time, data->len);
if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) {
if (data != NULL) {
@@ -339,7 +343,6 @@ process_sync_command (struct controller_session *session, gchar **args)
return FALSE;
}
}
- rev ++;
}
if (time == 0) {
@@ -666,12 +669,6 @@ process_command (struct controller_command *cmd, gchar **cmd_args, struct contro
}
return TRUE;
}
- else {
- if (! rspamd_dispatcher_write (session->dispatcher, CRLF, sizeof (CRLF) - 1, FALSE, TRUE)) {
- return FALSE;
- }
- return TRUE;
- }
break;
case COMMAND_HELP:
r = rspamd_snprintf (out_buf, sizeof (out_buf),
@@ -851,7 +848,7 @@ controller_read_socket (f_str_t * in, void *arg)
c.begin = part->content->data;
c.len = part->content->len;
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
- session->session_pool, &c, &tokens, FALSE, part->is_utf)) {
+ session->session_pool, &c, &tokens, FALSE, part->is_utf, part->urls_offset)) {
i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END);
free_task (task, FALSE);
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
diff --git a/src/filter.c b/src/filter.c
index 2c094fda8..797b4f6fe 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
c.len = strlen (cur->data);
if (c.len > 0) {
c.begin = cur->data;
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
msg_info ("cannot tokenize input");
return;
}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
msg_info ("cannot tokenize input");
return;
}
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric)
gboolean
learn_task (const gchar *statfile, struct worker_task *task, GError **err)
{
- GList *cur;
+ GList *cur, *ex;
struct classifier_config *cl;
struct classifier_ctx *cls_ctx;
gchar *s;
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
if (s != NULL) {
c.len = strlen (cur->data);
c.begin = cur->data;
+ ex = NULL;
}
else {
part = cur->data;
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
c.begin = part->content->data;
c.len = part->content->len;
is_utf = part->is_utf;
+ ex = part->urls_offset;
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE, is_utf)) {
+ &c, &tokens, FALSE, is_utf, ex)) {
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
return FALSE;
}
diff --git a/src/fstring.c b/src/fstring.c
index 5fcb12bd2..84c8c54bd 100644
--- a/src/fstring.c
+++ b/src/fstring.c
@@ -297,6 +297,34 @@ fstrgrow (memory_pool_t * pool, f_str_t * orig, size_t newlen)
return res;
}
+static guint32
+fstrhash_c (gchar c, guint32 hval)
+{
+ guint32 tmp;
+ /*
+ * xor in the current byte against each byte of hval
+ * (which alone gaurantees that every bit of input will have
+ * an effect on the output)
+ */
+ tmp = c & 0xFF;
+ tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
+ hval ^= tmp;
+
+ /* add some bits out of the middle as low order bits */
+ hval = hval + ((hval >> 12) & 0x0000ffff);
+
+ /* swap most and min significative bytes */
+ tmp = (hval << 24) | ((hval >> 24) & 0xff);
+ /* zero most and min significative bytes of hval */
+ hval &= 0x00ffff00;
+ hval |= tmp;
+ /*
+ * rotate hval 3 bits to the left (thereby making the
+ * 3rd msb of the above mess the hsb of the output hash)
+ */
+ return (hval << 3) + (hval >> 29);
+}
+
/*
* Return hash value for a string
*/
@@ -305,7 +333,6 @@ fstrhash (f_str_t * str)
{
size_t i;
guint32 hval;
- guint32 tmp;
gchar *c = str->begin;
if (str == NULL) {
@@ -314,32 +341,54 @@ fstrhash (f_str_t * str)
hval = str->len;
for (i = 0; i < str->len; i++, c++) {
- /*
- * xor in the current byte against each byte of hval
- * (which alone gaurantees that every bit of input will have
- * an effect on the output)
- */
- tmp = *c & 0xFF;
- tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
- hval ^= tmp;
-
- /* add some bits out of the middle as low order bits */
- hval = hval + ((hval >> 12) & 0x0000ffff);
-
- /* swap most and min significative bytes */
- tmp = (hval << 24) | ((hval >> 24) & 0xff);
- /* zero most and min significative bytes of hval */
- hval &= 0x00ffff00;
- hval |= tmp;
- /*
- * rotate hval 3 bits to the left (thereby making the
- * 3rd msb of the above mess the hsb of the output hash)
- */
- hval = (hval << 3) + (hval >> 29);
+ hval = fstrhash_c (*c, hval);
}
return hval;
}
+/*
+ * Return hash value for a string
+ */
+guint32
+fstrhash_lowercase (f_str_t * str, gboolean is_utf)
+{
+ gsize i;
+ guint32 j, hval;
+ const gchar *p = str->begin, *end = NULL;
+ gchar t;
+ gunichar uc;
+
+ if (str == NULL) {
+ return 0;
+ }
+ hval = str->len;
+
+ if (is_utf) {
+ while (end < str->begin + str->len) {
+ g_utf8_validate (p, str->len, &end);
+ while (p < end) {
+ uc = g_unichar_tolower (g_utf8_get_char (p));
+ for (j = 0; j < sizeof (gunichar); j ++) {
+ t = (uc >> (j * 8)) & 0xff;
+ if (t != 0) {
+ hval = fstrhash_c (t, hval);
+ }
+ }
+ p = g_utf8_next_char (p);
+ }
+ p = end + 1;
+ }
+
+ }
+ else {
+ for (i = 0; i < str->len; i++, p++) {
+ hval = fstrhash_c (g_ascii_tolower (*p), hval);
+ }
+ }
+
+ return hval;
+}
+
void
fstrstrip (f_str_t * str)
{
diff --git a/src/fstring.h b/src/fstring.h
index 616287fe4..eb32dc285 100644
--- a/src/fstring.h
+++ b/src/fstring.h
@@ -94,6 +94,10 @@ f_str_t* fstrgrow (memory_pool_t *pool, f_str_t *orig, size_t newlen);
guint32 fstrhash (f_str_t *str);
/*
+ * Return fast hash value for fixed string converted to lowercase
+ */
+guint32 fstrhash_lowercase (f_str_t *str, gboolean is_utf);
+/*
* Make copy of string to 0-terminated string
*/
gchar* fstrcstr (f_str_t *str, memory_pool_t *pool);
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 1a9337071..0a24494e2 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -320,11 +320,11 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
gchar *c;
gsize real_len = 0, len = part->content->len;
GList *cur_offset;
- struct uri *cur_url = NULL;
+ struct process_exception *cur_ex = NULL;
cur_offset = part->urls_offset;
if (cur_offset != NULL) {
- cur_url = cur_offset->data;
+ cur_ex = cur_offset->data;
}
c = part->content->data;
@@ -332,12 +332,12 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
bzero (&rs, sizeof (rs));
for (i = 0; i < len;) {
- if (cur_url != NULL && cur_url->pos == i) {
- i += cur_url->len + 1;
- c += cur_url->len + 1;
+ if (cur_ex != NULL && cur_ex->pos == i) {
+ i += cur_ex->len + 1;
+ c += cur_ex->len + 1;
cur_offset = g_list_next (cur_offset);
if (cur_offset != NULL) {
- cur_url = cur_offset->data;
+ cur_ex = cur_offset->data;
}
}
else {
@@ -354,18 +354,18 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool)
cur_offset = part->urls_offset;
if (cur_offset != NULL) {
- cur_url = cur_offset->data;
+ cur_ex = cur_offset->data;
}
c = part->content->data;
for (i = 0; i < len;) {
- if (cur_url != NULL && cur_url->pos == i) {
- i += cur_url->len + 1;
- c += cur_url->len + 1;
+ if (cur_ex != NULL && cur_ex->pos == i) {
+ i += cur_ex->len + 1;
+ c += cur_ex->len + 1;
cur_offset = g_list_next (cur_offset);
if (cur_offset != NULL) {
- cur_url = cur_offset->data;
+ cur_ex = cur_offset->data;
}
}
else {
diff --git a/src/main.h b/src/main.h
index d8f90b03f..9a3335d0a 100644
--- a/src/main.h
+++ b/src/main.h
@@ -113,6 +113,13 @@ struct save_point {
guint saved; /**< how much time we have delayed processing */
};
+/**
+ * Structure to point exception in text from processing
+ */
+struct process_exception {
+ gsize pos;
+ gsize len;
+};
/**
* Union that would be used for storing sockaddrs
diff --git a/src/mem_pool.c b/src/mem_pool.c
index 5c48b55ee..43ffc86a0 100644
--- a/src/mem_pool.c
+++ b/src/mem_pool.c
@@ -91,6 +91,7 @@ pool_chain_new (gsize size)
chain->pos = align_ptr (chain->begin, MEM_ALIGNMENT);
chain->next = NULL;
STAT_LOCK ();
+ mem_pool_stat->bytes_allocated += size;
mem_pool_stat->chunks_allocated++;
STAT_UNLOCK ();
@@ -135,6 +136,7 @@ pool_chain_new_shared (gsize size)
chain->next = NULL;
STAT_LOCK ();
mem_pool_stat->shared_chunks_allocated++;
+ mem_pool_stat->bytes_allocated += size;
STAT_UNLOCK ();
return chain;
@@ -225,16 +227,11 @@ memory_pool_alloc (memory_pool_t * pool, gsize size)
cur->next = new;
pool->cur_pool = new;
new->pos += size;
- STAT_LOCK ();
- mem_pool_stat->bytes_allocated += size;
- STAT_UNLOCK ();
+
return new->begin;
}
tmp = align_ptr (cur->pos, MEM_ALIGNMENT);
cur->pos = tmp + size;
- STAT_LOCK ();
- mem_pool_stat->bytes_allocated += size;
- STAT_UNLOCK ();
return tmp;
}
return NULL;
@@ -349,9 +346,6 @@ memory_pool_alloc_shared (memory_pool_t * pool, gsize size)
}
tmp = align_ptr (cur->pos, MEM_ALIGNMENT);
cur->pos = tmp + size;
- STAT_LOCK ();
- mem_pool_stat->bytes_allocated += size;
- STAT_UNLOCK ();
return tmp;
}
return NULL;
@@ -506,20 +500,22 @@ memory_pool_delete (memory_pool_t * pool)
while (cur) {
tmp = cur;
cur = cur->next;
- g_slice_free1 (tmp->len, tmp->begin);
- g_slice_free (struct _pool_chain, tmp);
STAT_LOCK ();
mem_pool_stat->chunks_freed++;
+ mem_pool_stat->bytes_allocated -= tmp->len;
STAT_UNLOCK ();
+ g_slice_free1 (tmp->len, tmp->begin);
+ g_slice_free (struct _pool_chain, tmp);
}
/* Unmap shared memory */
while (cur_shared) {
tmp_shared = cur_shared;
cur_shared = cur_shared->next;
- munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared));
STAT_LOCK ();
mem_pool_stat->chunks_freed++;
+ mem_pool_stat->bytes_allocated -= tmp->len;
STAT_UNLOCK ();
+ munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared));
}
if (pool->variables) {
g_hash_table_destroy (pool->variables);
diff --git a/src/message.c b/src/message.c
index 0586be8d7..4db4bef7d 100644
--- a/src/message.c
+++ b/src/message.c
@@ -686,25 +686,6 @@ free_byte_array_callback (void *pointer)
g_byte_array_free (arr, TRUE);
}
-static void
-detect_real_charset (struct worker_task *task, GByteArray * part_content, struct mime_text_part *text_part)
-{
- /* First of all try to detect UTF symbols */
- text_part->is_utf = FALSE;
- /* At first decision try to validate a single character */
- if (g_utf8_get_char_validated (part_content->data, part_content->len) != -1) {
- /* Now validate the whole part */
- if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
- text_part->is_utf = TRUE;
- text_part->real_charset = UTF8_CHARSET;
- return;
- }
- }
-
- /* Now try to detect specific symbols from some charsets */
-
-}
-
static GByteArray *
convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeContentType * type, struct mime_text_part *text_part)
{
@@ -726,6 +707,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC
if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
text_part->is_raw = FALSE;
+ text_part->is_utf = TRUE;
return part_content;
}
@@ -741,6 +723,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC
result_array->len = write_bytes;
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, res_str);
text_part->is_raw = FALSE;
+ text_part->is_utf = TRUE;
return result_array;
}
diff --git a/src/statfile.c b/src/statfile.c
index 9d21b5adf..0359c0c4d 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -580,7 +580,6 @@ statfile_pool_get_block (statfile_pool_t * pool, stat_file_t * file, guint32 h1,
return 0;
}
-#define RANDOM_EXPIRE G_MAXINT / CHAIN_LENGTH
static void
statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guint32 h1, guint32 h2, time_t t, double value, gboolean from_now)
{
@@ -590,7 +589,6 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
u_char *c;
double min = G_MAXDOUBLE;
-
if (from_now) {
file->access_time = t;
}
@@ -626,14 +624,10 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
return;
}
- /* Expire block if we have some random value that is lower than RANDOM_EXPIRE value */
- if (g_random_int () < RANDOM_EXPIRE) {
- to_expire = block;
- break;
- }
/* Expire block with minimum value otherwise */
if (block->value < min) {
to_expire = block;
+ min = block->value;
}
c += sizeof (struct stat_file_block);
block = (struct stat_file_block *)c;
diff --git a/src/statfile_sync.c b/src/statfile_sync.c
index 5189f1ead..44e34454b 100644
--- a/src/statfile_sync.c
+++ b/src/statfile_sync.c
@@ -31,11 +31,6 @@
#include "buffer.h"
#include "statfile_sync.h"
-/* XXX: hardcoding this value is not very smart */
-#define MAX_SYNC_TIME 60
-#define IO_TIMEOUT 20
-
-
enum rspamd_sync_state {
SYNC_STATE_GREETING,
SYNC_STATE_READ_LINE,
@@ -54,7 +49,9 @@ struct rspamd_sync_ctx {
struct timeval interval;
struct timeval io_tv;
- gint sock;
+ gint sock;
+ guint32 timeout;
+ guint32 sync_interval;
enum rspamd_sync_state state;
gboolean is_busy;
@@ -141,9 +138,9 @@ parse_revision_line (struct rspamd_sync_ctx *ctx, f_str_t *in)
}
}
}
-
+
/* Current value must be len value and its value must not be 0 */
- return ((val == &ctx->new_len) && *val != 0);
+ return ((val == &ctx->new_len));
}
static gboolean
@@ -193,8 +190,10 @@ sync_read (f_str_t * in, void *arg)
return FALSE;
}
else if (ctx->state != SYNC_STATE_QUIT) {
- ctx->state = SYNC_STATE_READ_REV;
- rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len);
+ if (ctx->new_len > 0) {
+ ctx->state = SYNC_STATE_READ_REV;
+ rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len);
+ }
}
else {
/* Quit this session */
@@ -247,11 +246,13 @@ static void
sync_timer_callback (gint fd, short what, void *ud)
{
struct rspamd_sync_ctx *ctx = ud;
+ guint32 jittered_interval;
/* Plan new event */
evtimer_del (&ctx->tm_ev);
- ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2);
- ctx->interval.tv_usec = 0;
+ /* Add some jittering for synchronization */
+ jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2);
+ msec_to_tv (jittered_interval, &ctx->interval);
evtimer_add (&ctx->tm_ev, &ctx->interval);
log_next_sync (ctx->st->symbol, ctx->interval.tv_sec);
@@ -266,8 +267,7 @@ sync_timer_callback (gint fd, short what, void *ud)
return;
}
/* Now create and activate dispatcher */
- ctx->io_tv.tv_sec = IO_TIMEOUT;
- ctx->io_tv.tv_usec = 0;
+ msec_to_tv (ctx->timeout, &ctx->io_tv);
ctx->dispatcher = rspamd_create_dispatcher (ctx->sock, BUFFER_LINE, sync_read, NULL, sync_err, &ctx->io_tv, ctx);
ctx->state = SYNC_STATE_GREETING;
@@ -278,17 +278,20 @@ sync_timer_callback (gint fd, short what, void *ud)
}
static gboolean
-add_statfile_watch (statfile_pool_t *pool, struct statfile *st)
+add_statfile_watch (statfile_pool_t *pool, struct statfile *st, struct config_file *cfg)
{
struct rspamd_sync_ctx *ctx;
+ guint32 jittered_interval;
if (st->binlog->master_addr.s_addr != INADDR_NONE &&
st->binlog->master_addr.s_addr != INADDR_ANY) {
ctx = memory_pool_alloc (pool->pool, sizeof (struct rspamd_sync_ctx));
ctx->st = st;
+ ctx->timeout = cfg->statfile_sync_timeout;
+ ctx->sync_interval = cfg->statfile_sync_interval;
/* Add some jittering for synchronization */
- ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2);
- ctx->interval.tv_usec = 0;
+ jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2);
+ msec_to_tv (jittered_interval, &ctx->interval);
/* Open statfile and attach it to pool */
if ((ctx->real_statfile = statfile_pool_is_open (pool, st->path)) == NULL) {
if ((ctx->real_statfile = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
@@ -331,7 +334,7 @@ start_statfile_sync (statfile_pool_t *pool, struct config_file *cfg)
while (l) {
st = l->data;
if (st->binlog != NULL && st->binlog->affinity == AFFINITY_SLAVE) {
- if (!add_statfile_watch (pool, st)) {
+ if (!add_statfile_watch (pool, st, cfg)) {
return FALSE;
}
}
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 5f5dfcdcd..bc57255cb 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -36,55 +36,56 @@ extern const int primes[];
int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
- gboolean save_token, gboolean is_utf)
+ gboolean save_token, gboolean is_utf, GList *exceptions)
{
token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 }, *res;
- uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- int i;
-
- /* First set all bytes of hashpipe to some common value */
- for (i = 0; i < FEATURE_WINDOW_SIZE; i++) {
- hashpipe[i] = 0xABCDEF;
- }
+ f_str_t token = { NULL, 0, 0 };
+ guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+ gint i, k = 0, l;
+ gchar *res;
if (*tree == NULL) {
*tree = g_tree_new (token_node_compare_func);
memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
}
- while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+ while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) {
/* Skip small words */
if (is_utf) {
- if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
- continue;
- }
+ l = g_utf8_strlen (token.begin, token.len);
}
else {
- if (token.len < MIN_LEN) {
- continue;
- }
+ l = token.len;
}
+ if (l < MIN_LEN) {
+ token.begin = res;
+ continue;
+ }
+
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
hashpipe[i] = hashpipe[i - 1];
}
- hashpipe[0] = fstrhash (&token);
+ hashpipe[0] = fstrhash_lowercase (&token, is_utf);
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = memory_pool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
- }
+ if (k > FEATURE_WINDOW_SIZE) {
+ for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
+ h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+ h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+ new = memory_pool_alloc0 (pool, sizeof (token_node_t));
+ new->h1 = h1;
+ new->h2 = h2;
+ if (save_token) {
+ new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token);
+ }
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
+ if (g_tree_lookup (*tree, new) == NULL) {
+ g_tree_insert (*tree, new, new);
+ }
}
}
+ k ++;
+ token.begin = res;
}
return TRUE;
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 9e41a9101..be73e506d 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -52,7 +52,7 @@ const gchar t_delimiters[255] = {
1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
}
/* Get next word from specified f_str_t buf */
-f_str_t *
-get_next_word (f_str_t * buf, f_str_t * token)
+gchar *
+get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions)
{
- size_t remain;
- guchar *pos;
+ gsize remain, pos;
+ guchar *p;
+ struct process_exception *ex = NULL;
if (buf == NULL) {
return NULL;
}
+
+ if (*exceptions != NULL) {
+ ex = (*exceptions)->data;
+ }
+
if (token->begin == NULL) {
- token->begin = buf->begin;
+ if (ex != NULL) {
+ if (ex->pos == 0) {
+ token->begin = buf->begin + ex->len;
+ token->len = ex->len;
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
}
- token->begin = token->begin + token->len;
token->len = 0;
remain = buf->len - (token->begin - buf->begin);
if (remain <= 0) {
return NULL;
}
- pos = token->begin;
+ pos = token->begin - buf->begin;
+ p = token->begin;
/* Skip non delimiters symbols */
- while (remain > 0 && t_delimiters[*pos]) {
- token->begin++;
+ do {
+ if (ex != NULL && ex->pos == pos) {
+ /* Go to the next exception */
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len + 1;
+ }
pos++;
+ p++;
remain--;
- }
- while (remain > 0 && !t_delimiters[*pos]) {
+ } while (remain > 0 && t_delimiters[*p]);
+
+ token->begin = p;
+
+ while (remain > 0 && !t_delimiters[*p]) {
+ if (ex != NULL && ex->pos == pos) {
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len + 1;
+ }
token->len++;
pos++;
remain--;
+ p ++;
}
- if (token->len == 0) {
+ if (remain == 0) {
return NULL;
}
- return token;
+ return p;
}
/* Struct to access gmime headers */
@@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree)
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = task->subject;
subject.len = strlen (task->subject);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
}
if ((sub = g_mime_message_get_subject (task->message)) != NULL) {
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t));
subject.begin = (gchar *)sub;
subject.len = strlen (sub);
- osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE);
+ osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL);
}
}
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index df5481a1f..c78d90b0e 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -15,17 +15,18 @@
#define FEATURE_WINDOW_SIZE 5
typedef struct token_node_s {
- uint32_t h1;
- uint32_t h2;
+ guint32 h1;
+ guint32 h2;
float value;
uintptr_t extra;
} token_node_t;
/* Common tokenizer structure */
struct tokenizer {
- char *name;
- int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
- f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
+ gchar *name;
+ gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
+ GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
+ gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions);
};
/* Compare two token nodes */
@@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b);
/* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
-f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input,
+ GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */
diff --git a/src/url.c b/src/url.c
index dbc04ffab..c8895e107 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1160,6 +1160,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
gint rc;
gchar *url_str = NULL, *url_start, *url_end;
struct uri *new;
+ struct process_exception *ex;
gchar *p, *end, *begin;
@@ -1183,13 +1184,14 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) {
if (url_str != NULL) {
new = memory_pool_alloc0 (pool, sizeof (struct uri));
+ ex = memory_pool_alloc0 (pool, sizeof (struct process_exception));
if (new != NULL) {
g_strstrip (url_str);
rc = parse_uri (new, url_str, pool);
if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
new->hostlen > 0) {
- new->pos = url_start - begin;
- new->len = url_end - url_start;
+ ex->pos = url_start - begin;
+ ex->len = url_end - url_start;
if (new->protocol == PROTOCOL_MAILTO) {
if (!g_tree_lookup (task->emails, new)) {
g_tree_insert (task->emails, new, new);
@@ -1200,7 +1202,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
g_tree_insert (task->urls, new, new);
}
}
- part->urls_offset = g_list_prepend (part->urls_offset, new);
+ part->urls_offset = g_list_prepend (part->urls_offset, ex);
}
else if (rc != URI_ERRNO_OK) {
msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
@@ -1256,10 +1258,10 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start,
*url_str = NULL;
}
if (start != NULL) {
- *start = (gchar *)pos;
+ *start = (gchar *)m.m_begin;
}
if (fin != NULL) {
- *fin = (gchar *)pos + m.m_len;
+ *fin = (gchar *)m.m_begin + m.m_len;
}
return TRUE;
}
diff --git a/src/url.h b/src/url.h
index 9c0812e62..5a8e7e1e2 100644
--- a/src/url.h
+++ b/src/url.h
@@ -32,9 +32,6 @@ struct uri {
struct uri *phished_url;
- gsize pos;
- gsize len;
-
/* @protocollen should only be usable if @protocol is either
* PROTOCOL_USER or an uri string should be composed. */
guint protocollen;