From a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 24 Jun 2011 20:25:54 +0400 Subject: [PATCH] * Welcome 0.4.0 Uncompatible changes: - Statistics is uncompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URL's in statistics - Improve speed of bayes classifier by using integer arithmetics - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of legacy code - Types polishing --- CMakeLists.txt | 4 +- src/binlog.c | 9 +++- src/cfg_file.h | 2 + src/cfg_utils.c | 3 ++ src/cfg_xml.c | 12 +++++ src/classifiers/bayes.c | 31 ++++++------ src/controller.c | 21 ++++---- src/filter.c | 10 ++-- src/fstring.c | 95 ++++++++++++++++++++++++++++--------- src/fstring.h | 4 ++ src/fuzzy.c | 22 ++++----- src/main.h | 7 +++ src/mem_pool.c | 20 ++++---- src/message.c | 21 +------- src/statfile.c | 8 +--- src/statfile_sync.c | 39 ++++++++------- src/tokenizers/osb.c | 57 +++++++++++----------- src/tokenizers/tokenizers.c | 64 ++++++++++++++++++------- src/tokenizers/tokenizers.h | 16 ++++--- src/url.c | 12 +++-- src/url.h | 3 -- 21 files changed, 276 insertions(+), 184 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27b088160..1cffe6953 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,8 @@ PROJECT(rspamd C) SET(RSPAMD_VERSION_MAJOR 0) -SET(RSPAMD_VERSION_MINOR 3) -SET(RSPAMD_VERSION_PATCH 14) +SET(RSPAMD_VERSION_MINOR 4) +SET(RSPAMD_VERSION_PATCH 0) SET(RSPAMD_VERSION "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}") diff --git a/src/binlog.c b/src/binlog.c index 76cfa0cca..2a1eb9fcb 100644 --- a/src/binlog.c +++ b/src/binlog.c @@ -120,7 +120,7 @@ binlog_check_file (struct rspamd_binlog *log) return FALSE; } - log->cur_seq = log->cur_idx->last_index; + log->cur_seq = log->metaindex->last_index * BINLOG_IDX_LEN + log->cur_idx->last_index; log->cur_time = log->cur_idx->indexes[log->cur_idx->last_index].time; return TRUE; @@ -425,6 +425,13 @@ binlog_sync (struct rspamd_binlog *log, guint64 from_rev, guint64 *from_time, GB *rep = NULL; return FALSE; } + else if (from_rev > log->cur_seq) { + /* Slave has more actual copy, write this to log and abort sync */ + msg_warn ("slave has more recent revision of statfile %s: %uL and our is: %uL", log->filename, from_rev, log->cur_seq); + *rep = NULL; + *from_time = 0; + return FALSE; + } metaindex_num = from_rev / BINLOG_IDX_LEN; /* First of all try to find this revision */ diff --git a/src/cfg_file.h b/src/cfg_file.h index 9e10cd8fc..268906d40 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -270,6 +270,8 @@ struct config_file { gboolean log_extended; /**< log extended information */ gsize max_statfile_size; /**< maximum size for statfile */ + guint32 statfile_sync_interval; /**< synchronization interval */ + guint32 statfile_sync_timeout; /**< synchronization timeout */ struct memcached_server memcached_servers[MAX_MEMCACHED_SERVERS]; /**< memcached servers */ gsize memcached_servers_num; /**< number of memcached servers */ diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 99a4bc3d9..720d931ef 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -172,6 +172,9 @@ init_defaults (struct config_file *cfg) cfg->dns_throttling_errors = 20; cfg->dns_throttling_time = 10000; + cfg->statfile_sync_interval = 60000; + cfg->statfile_sync_timeout = 20000; + cfg->max_statfile_size = DEFAULT_STATFILE_SIZE; cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal); cfg->variables = g_hash_table_new (g_str_hash, g_str_equal); diff --git a/src/cfg_xml.c b/src/cfg_xml.c index fba925cc6..09f6574a0 100644 --- a/src/cfg_xml.c +++ b/src/cfg_xml.c @@ -281,6 +281,18 @@ static struct xml_parser_rule grammar[] = { G_STRUCT_OFFSET (struct config_file, filters_str), NULL }, + { + "sync_interval", + xml_handle_seconds, + G_STRUCT_OFFSET (struct config_file, statfile_sync_interval), + NULL + }, + { + "sync_timeout", + xml_handle_seconds, + G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout), + NULL + }, NULL_ATTR }, NULL_DEF_ATTR diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index b4f7826e5..dadd33e5e 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -43,11 +43,11 @@ bayes_error_quark (void) } struct bayes_statfile_data { - double hits; - double total_hits; + guint64 hits; + guint64 total_hits; double local_probability; double post_probability; - double value; + guint value; struct statfile *st; stat_file_t *file; }; @@ -67,25 +67,22 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) { token_node_t *node = key; struct bayes_callback_data *cd = data; - double v, c; + gint v, c; c = (cd->in_class) ? 1 : -1; /* Consider that not found blocks have value 1 */ v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); - if (fabs (v) < ALPHA && c > 0) { + if (v == 0 && c > 0) { statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c); } else { - if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) { - v += c; + if (G_LIKELY (c > 0)) { + v ++; } else if (c < 0){ - if (v > -c) { - v -= c; - } - else { - v = 0; + if (v != 0) { + v --; } } statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v); @@ -103,14 +100,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) token_node_t *node = key; struct bayes_callback_data *cd = data; - double local_hits = 0, renorm = 0; - int i; + double renorm = 0; + gint i; + guint64 local_hits = 0; struct bayes_statfile_data *cur; for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now); - if (cur->value > ALPHA) { + if (cur->value > 0) { cur->total_hits += cur->value; cur->hits = cur->value; local_hits += cur->value; @@ -121,7 +119,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) } for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0)); + cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) / + (LOCAL_PROB_DENOM * (1.0 + local_hits)); renorm += cur->post_probability * cur->local_probability; } diff --git a/src/controller.c b/src/controller.c index f899518fe..e7e4c10d8 100644 --- a/src/controller.c +++ b/src/controller.c @@ -212,7 +212,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct struct statfile *st; gchar out_buf[BUFSIZ]; gint i; - guint64 rev, ti, len, pos; + guint64 rev, ti, len, pos, blocks; gchar *out; struct rspamd_binlog_element log_elt; struct stat_file_block *stat_elt; @@ -222,7 +222,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct if (statfile == NULL) { return FALSE; } - + /* Begin to copy all blocks into array */ statfile_get_revision (statfile, &rev, (time_t *)&ti); if (ti == 0) { @@ -230,10 +230,13 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct ti = time (NULL); statfile_set_revision (statfile, rev, ti); } - len = statfile->cur_section.length * sizeof (struct rspamd_binlog_element); + msg_info ("send a whole statfile %s with version %uL to slave", symbol, rev); + + blocks = statfile_get_total_blocks (statfile); + len = blocks * sizeof (struct rspamd_binlog_element); out = memory_pool_alloc (session->session_pool, len); - for (i = 0, pos = 0; i < statfile->cur_section.length; i ++) { + for (i = 0, pos = 0; i < blocks; i ++) { stat_elt = (struct stat_file_block *)((u_char *)statfile->map + statfile->seek_pos + i * sizeof (struct stat_file_block)); if (fabs (stat_elt->value) > 0.001) { /* Write only those values which value is not 0 */ @@ -324,6 +327,7 @@ process_sync_command (struct controller_session *session, gchar **args) } while (binlog_sync (binlog, rev, &time, &data)) { + rev ++; r = rspamd_snprintf (out_buf, sizeof (out_buf), "%uL %uL %z" CRLF, rev, time, data->len); if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { if (data != NULL) { @@ -339,7 +343,6 @@ process_sync_command (struct controller_session *session, gchar **args) return FALSE; } } - rev ++; } if (time == 0) { @@ -666,12 +669,6 @@ process_command (struct controller_command *cmd, gchar **cmd_args, struct contro } return TRUE; } - else { - if (! rspamd_dispatcher_write (session->dispatcher, CRLF, sizeof (CRLF) - 1, FALSE, TRUE)) { - return FALSE; - } - return TRUE; - } break; case COMMAND_HELP: r = rspamd_snprintf (out_buf, sizeof (out_buf), @@ -851,7 +848,7 @@ controller_read_socket (f_str_t * in, void *arg) c.begin = part->content->data; c.len = part->content->len; if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, - session->session_pool, &c, &tokens, FALSE, part->is_utf)) { + session->session_pool, &c, &tokens, FALSE, part->is_utf, part->urls_offset)) { i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END); free_task (task, FALSE); if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { diff --git a/src/filter.c b/src/filter.c index 2c094fda8..797b4f6fe 100644 --- a/src/filter.c +++ b/src/filter.c @@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg) c.len = strlen (cur->data); if (c.len > 0) { c.begin = cur->data; - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) { msg_info ("cannot tokenize input"); return; } @@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg) c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) { msg_info ("cannot tokenize input"); return; } @@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric) gboolean learn_task (const gchar *statfile, struct worker_task *task, GError **err) { - GList *cur; + GList *cur, *ex; struct classifier_config *cl; struct classifier_ctx *cls_ctx; gchar *s; @@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) if (s != NULL) { c.len = strlen (cur->data); c.begin = cur->data; + ex = NULL; } else { part = cur->data; @@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) c.begin = part->content->data; c.len = part->content->len; is_utf = part->is_utf; + ex = part->urls_offset; } /* Get tokens */ if (!cl->tokenizer->tokenize_func ( cl->tokenizer, task->task_pool, - &c, &tokens, FALSE, is_utf)) { + &c, &tokens, FALSE, is_utf, ex)) { g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message"); return FALSE; } diff --git a/src/fstring.c b/src/fstring.c index 5fcb12bd2..84c8c54bd 100644 --- a/src/fstring.c +++ b/src/fstring.c @@ -297,6 +297,34 @@ fstrgrow (memory_pool_t * pool, f_str_t * orig, size_t newlen) return res; } +static guint32 +fstrhash_c (gchar c, guint32 hval) +{ + guint32 tmp; + /* + * xor in the current byte against each byte of hval + * (which alone gaurantees that every bit of input will have + * an effect on the output) + */ + tmp = c & 0xFF; + tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); + hval ^= tmp; + + /* add some bits out of the middle as low order bits */ + hval = hval + ((hval >> 12) & 0x0000ffff); + + /* swap most and min significative bytes */ + tmp = (hval << 24) | ((hval >> 24) & 0xff); + /* zero most and min significative bytes of hval */ + hval &= 0x00ffff00; + hval |= tmp; + /* + * rotate hval 3 bits to the left (thereby making the + * 3rd msb of the above mess the hsb of the output hash) + */ + return (hval << 3) + (hval >> 29); +} + /* * Return hash value for a string */ @@ -305,7 +333,6 @@ fstrhash (f_str_t * str) { size_t i; guint32 hval; - guint32 tmp; gchar *c = str->begin; if (str == NULL) { @@ -314,32 +341,54 @@ fstrhash (f_str_t * str) hval = str->len; for (i = 0; i < str->len; i++, c++) { - /* - * xor in the current byte against each byte of hval - * (which alone gaurantees that every bit of input will have - * an effect on the output) - */ - tmp = *c & 0xFF; - tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); - hval ^= tmp; - - /* add some bits out of the middle as low order bits */ - hval = hval + ((hval >> 12) & 0x0000ffff); - - /* swap most and min significative bytes */ - tmp = (hval << 24) | ((hval >> 24) & 0xff); - /* zero most and min significative bytes of hval */ - hval &= 0x00ffff00; - hval |= tmp; - /* - * rotate hval 3 bits to the left (thereby making the - * 3rd msb of the above mess the hsb of the output hash) - */ - hval = (hval << 3) + (hval >> 29); + hval = fstrhash_c (*c, hval); } return hval; } +/* + * Return hash value for a string + */ +guint32 +fstrhash_lowercase (f_str_t * str, gboolean is_utf) +{ + gsize i; + guint32 j, hval; + const gchar *p = str->begin, *end = NULL; + gchar t; + gunichar uc; + + if (str == NULL) { + return 0; + } + hval = str->len; + + if (is_utf) { + while (end < str->begin + str->len) { + g_utf8_validate (p, str->len, &end); + while (p < end) { + uc = g_unichar_tolower (g_utf8_get_char (p)); + for (j = 0; j < sizeof (gunichar); j ++) { + t = (uc >> (j * 8)) & 0xff; + if (t != 0) { + hval = fstrhash_c (t, hval); + } + } + p = g_utf8_next_char (p); + } + p = end + 1; + } + + } + else { + for (i = 0; i < str->len; i++, p++) { + hval = fstrhash_c (g_ascii_tolower (*p), hval); + } + } + + return hval; +} + void fstrstrip (f_str_t * str) { diff --git a/src/fstring.h b/src/fstring.h index 616287fe4..eb32dc285 100644 --- a/src/fstring.h +++ b/src/fstring.h @@ -93,6 +93,10 @@ f_str_t* fstrgrow (memory_pool_t *pool, f_str_t *orig, size_t newlen); */ guint32 fstrhash (f_str_t *str); +/* + * Return fast hash value for fixed string converted to lowercase + */ +guint32 fstrhash_lowercase (f_str_t *str, gboolean is_utf); /* * Make copy of string to 0-terminated string */ diff --git a/src/fuzzy.c b/src/fuzzy.c index 1a9337071..0a24494e2 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -320,11 +320,11 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) gchar *c; gsize real_len = 0, len = part->content->len; GList *cur_offset; - struct uri *cur_url = NULL; + struct process_exception *cur_ex = NULL; cur_offset = part->urls_offset; if (cur_offset != NULL) { - cur_url = cur_offset->data; + cur_ex = cur_offset->data; } c = part->content->data; @@ -332,12 +332,12 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); bzero (&rs, sizeof (rs)); for (i = 0; i < len;) { - if (cur_url != NULL && cur_url->pos == i) { - i += cur_url->len + 1; - c += cur_url->len + 1; + if (cur_ex != NULL && cur_ex->pos == i) { + i += cur_ex->len + 1; + c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { - cur_url = cur_offset->data; + cur_ex = cur_offset->data; } } else { @@ -354,18 +354,18 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) cur_offset = part->urls_offset; if (cur_offset != NULL) { - cur_url = cur_offset->data; + cur_ex = cur_offset->data; } c = part->content->data; for (i = 0; i < len;) { - if (cur_url != NULL && cur_url->pos == i) { - i += cur_url->len + 1; - c += cur_url->len + 1; + if (cur_ex != NULL && cur_ex->pos == i) { + i += cur_ex->len + 1; + c += cur_ex->len + 1; cur_offset = g_list_next (cur_offset); if (cur_offset != NULL) { - cur_url = cur_offset->data; + cur_ex = cur_offset->data; } } else { diff --git a/src/main.h b/src/main.h index d8f90b03f..9a3335d0a 100644 --- a/src/main.h +++ b/src/main.h @@ -113,6 +113,13 @@ struct save_point { guint saved; /**< how much time we have delayed processing */ }; +/** + * Structure to point exception in text from processing + */ +struct process_exception { + gsize pos; + gsize len; +}; /** * Union that would be used for storing sockaddrs diff --git a/src/mem_pool.c b/src/mem_pool.c index 5c48b55ee..43ffc86a0 100644 --- a/src/mem_pool.c +++ b/src/mem_pool.c @@ -91,6 +91,7 @@ pool_chain_new (gsize size) chain->pos = align_ptr (chain->begin, MEM_ALIGNMENT); chain->next = NULL; STAT_LOCK (); + mem_pool_stat->bytes_allocated += size; mem_pool_stat->chunks_allocated++; STAT_UNLOCK (); @@ -135,6 +136,7 @@ pool_chain_new_shared (gsize size) chain->next = NULL; STAT_LOCK (); mem_pool_stat->shared_chunks_allocated++; + mem_pool_stat->bytes_allocated += size; STAT_UNLOCK (); return chain; @@ -225,16 +227,11 @@ memory_pool_alloc (memory_pool_t * pool, gsize size) cur->next = new; pool->cur_pool = new; new->pos += size; - STAT_LOCK (); - mem_pool_stat->bytes_allocated += size; - STAT_UNLOCK (); + return new->begin; } tmp = align_ptr (cur->pos, MEM_ALIGNMENT); cur->pos = tmp + size; - STAT_LOCK (); - mem_pool_stat->bytes_allocated += size; - STAT_UNLOCK (); return tmp; } return NULL; @@ -349,9 +346,6 @@ memory_pool_alloc_shared (memory_pool_t * pool, gsize size) } tmp = align_ptr (cur->pos, MEM_ALIGNMENT); cur->pos = tmp + size; - STAT_LOCK (); - mem_pool_stat->bytes_allocated += size; - STAT_UNLOCK (); return tmp; } return NULL; @@ -506,20 +500,22 @@ memory_pool_delete (memory_pool_t * pool) while (cur) { tmp = cur; cur = cur->next; - g_slice_free1 (tmp->len, tmp->begin); - g_slice_free (struct _pool_chain, tmp); STAT_LOCK (); mem_pool_stat->chunks_freed++; + mem_pool_stat->bytes_allocated -= tmp->len; STAT_UNLOCK (); + g_slice_free1 (tmp->len, tmp->begin); + g_slice_free (struct _pool_chain, tmp); } /* Unmap shared memory */ while (cur_shared) { tmp_shared = cur_shared; cur_shared = cur_shared->next; - munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared)); STAT_LOCK (); mem_pool_stat->chunks_freed++; + mem_pool_stat->bytes_allocated -= tmp->len; STAT_UNLOCK (); + munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared)); } if (pool->variables) { g_hash_table_destroy (pool->variables); diff --git a/src/message.c b/src/message.c index 0586be8d7..4db4bef7d 100644 --- a/src/message.c +++ b/src/message.c @@ -686,25 +686,6 @@ free_byte_array_callback (void *pointer) g_byte_array_free (arr, TRUE); } -static void -detect_real_charset (struct worker_task *task, GByteArray * part_content, struct mime_text_part *text_part) -{ - /* First of all try to detect UTF symbols */ - text_part->is_utf = FALSE; - /* At first decision try to validate a single character */ - if (g_utf8_get_char_validated (part_content->data, part_content->len) != -1) { - /* Now validate the whole part */ - if (g_utf8_validate (part_content->data, part_content->len, NULL)) { - text_part->is_utf = TRUE; - text_part->real_charset = UTF8_CHARSET; - return; - } - } - - /* Now try to detect specific symbols from some charsets */ - -} - static GByteArray * convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeContentType * type, struct mime_text_part *text_part) { @@ -726,6 +707,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) { text_part->is_raw = FALSE; + text_part->is_utf = TRUE; return part_content; } @@ -741,6 +723,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC result_array->len = write_bytes; memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, res_str); text_part->is_raw = FALSE; + text_part->is_utf = TRUE; return result_array; } diff --git a/src/statfile.c b/src/statfile.c index 9d21b5adf..0359c0c4d 100644 --- a/src/statfile.c +++ b/src/statfile.c @@ -580,7 +580,6 @@ statfile_pool_get_block (statfile_pool_t * pool, stat_file_t * file, guint32 h1, return 0; } -#define RANDOM_EXPIRE G_MAXINT / CHAIN_LENGTH static void statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guint32 h1, guint32 h2, time_t t, double value, gboolean from_now) { @@ -590,7 +589,6 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin u_char *c; double min = G_MAXDOUBLE; - if (from_now) { file->access_time = t; } @@ -626,14 +624,10 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin return; } - /* Expire block if we have some random value that is lower than RANDOM_EXPIRE value */ - if (g_random_int () < RANDOM_EXPIRE) { - to_expire = block; - break; - } /* Expire block with minimum value otherwise */ if (block->value < min) { to_expire = block; + min = block->value; } c += sizeof (struct stat_file_block); block = (struct stat_file_block *)c; diff --git a/src/statfile_sync.c b/src/statfile_sync.c index 5189f1ead..44e34454b 100644 --- a/src/statfile_sync.c +++ b/src/statfile_sync.c @@ -31,11 +31,6 @@ #include "buffer.h" #include "statfile_sync.h" -/* XXX: hardcoding this value is not very smart */ -#define MAX_SYNC_TIME 60 -#define IO_TIMEOUT 20 - - enum rspamd_sync_state { SYNC_STATE_GREETING, SYNC_STATE_READ_LINE, @@ -54,7 +49,9 @@ struct rspamd_sync_ctx { struct timeval interval; struct timeval io_tv; - gint sock; + gint sock; + guint32 timeout; + guint32 sync_interval; enum rspamd_sync_state state; gboolean is_busy; @@ -141,9 +138,9 @@ parse_revision_line (struct rspamd_sync_ctx *ctx, f_str_t *in) } } } - + /* Current value must be len value and its value must not be 0 */ - return ((val == &ctx->new_len) && *val != 0); + return ((val == &ctx->new_len)); } static gboolean @@ -193,8 +190,10 @@ sync_read (f_str_t * in, void *arg) return FALSE; } else if (ctx->state != SYNC_STATE_QUIT) { - ctx->state = SYNC_STATE_READ_REV; - rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len); + if (ctx->new_len > 0) { + ctx->state = SYNC_STATE_READ_REV; + rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len); + } } else { /* Quit this session */ @@ -247,11 +246,13 @@ static void sync_timer_callback (gint fd, short what, void *ud) { struct rspamd_sync_ctx *ctx = ud; + guint32 jittered_interval; /* Plan new event */ evtimer_del (&ctx->tm_ev); - ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2); - ctx->interval.tv_usec = 0; + /* Add some jittering for synchronization */ + jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2); + msec_to_tv (jittered_interval, &ctx->interval); evtimer_add (&ctx->tm_ev, &ctx->interval); log_next_sync (ctx->st->symbol, ctx->interval.tv_sec); @@ -266,8 +267,7 @@ sync_timer_callback (gint fd, short what, void *ud) return; } /* Now create and activate dispatcher */ - ctx->io_tv.tv_sec = IO_TIMEOUT; - ctx->io_tv.tv_usec = 0; + msec_to_tv (ctx->timeout, &ctx->io_tv); ctx->dispatcher = rspamd_create_dispatcher (ctx->sock, BUFFER_LINE, sync_read, NULL, sync_err, &ctx->io_tv, ctx); ctx->state = SYNC_STATE_GREETING; @@ -278,17 +278,20 @@ sync_timer_callback (gint fd, short what, void *ud) } static gboolean -add_statfile_watch (statfile_pool_t *pool, struct statfile *st) +add_statfile_watch (statfile_pool_t *pool, struct statfile *st, struct config_file *cfg) { struct rspamd_sync_ctx *ctx; + guint32 jittered_interval; if (st->binlog->master_addr.s_addr != INADDR_NONE && st->binlog->master_addr.s_addr != INADDR_ANY) { ctx = memory_pool_alloc (pool->pool, sizeof (struct rspamd_sync_ctx)); ctx->st = st; + ctx->timeout = cfg->statfile_sync_timeout; + ctx->sync_interval = cfg->statfile_sync_interval; /* Add some jittering for synchronization */ - ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2); - ctx->interval.tv_usec = 0; + jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2); + msec_to_tv (jittered_interval, &ctx->interval); /* Open statfile and attach it to pool */ if ((ctx->real_statfile = statfile_pool_is_open (pool, st->path)) == NULL) { if ((ctx->real_statfile = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { @@ -331,7 +334,7 @@ start_statfile_sync (statfile_pool_t *pool, struct config_file *cfg) while (l) { st = l->data; if (st->binlog != NULL && st->binlog->affinity == AFFINITY_SLAVE) { - if (!add_statfile_watch (pool, st)) { + if (!add_statfile_watch (pool, st, cfg)) { return FALSE; } } diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 5f5dfcdcd..bc57255cb 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -36,55 +36,56 @@ extern const int primes[]; int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree, - gboolean save_token, gboolean is_utf) + gboolean save_token, gboolean is_utf, GList *exceptions) { token_node_t *new = NULL; - f_str_t token = { NULL, 0, 0 }, *res; - uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - int i; - - /* First set all bytes of hashpipe to some common value */ - for (i = 0; i < FEATURE_WINDOW_SIZE; i++) { - hashpipe[i] = 0xABCDEF; - } + f_str_t token = { NULL, 0, 0 }; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, k = 0, l; + gchar *res; if (*tree == NULL) { *tree = g_tree_new (token_node_compare_func); memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree); } - while ((res = tokenizer->get_next_word (input, &token)) != NULL) { + while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { /* Skip small words */ if (is_utf) { - if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) { - continue; - } + l = g_utf8_strlen (token.begin, token.len); } else { - if (token.len < MIN_LEN) { - continue; - } + l = token.len; } + if (l < MIN_LEN) { + token.begin = res; + continue; + } + /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { hashpipe[i] = hashpipe[i - 1]; } - hashpipe[0] = fstrhash (&token); + hashpipe[0] = fstrhash_lowercase (&token, is_utf); - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = memory_pool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); - } + if (k > FEATURE_WINDOW_SIZE) { + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = memory_pool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); + } - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } } } + k ++; + token.begin = res; } return TRUE; diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 9e41a9101..be73e506d 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -52,7 +52,7 @@ const gchar t_delimiters[255] = { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b) } /* Get next word from specified f_str_t buf */ -f_str_t * -get_next_word (f_str_t * buf, f_str_t * token) +gchar * +get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions) { - size_t remain; - guchar *pos; + gsize remain, pos; + guchar *p; + struct process_exception *ex = NULL; if (buf == NULL) { return NULL; } + + if (*exceptions != NULL) { + ex = (*exceptions)->data; + } + if (token->begin == NULL) { - token->begin = buf->begin; + if (ex != NULL) { + if (ex->pos == 0) { + token->begin = buf->begin + ex->len; + token->len = ex->len; + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + else { + token->begin = buf->begin; + token->len = 0; + } } - token->begin = token->begin + token->len; token->len = 0; remain = buf->len - (token->begin - buf->begin); if (remain <= 0) { return NULL; } - pos = token->begin; + pos = token->begin - buf->begin; + p = token->begin; /* Skip non delimiters symbols */ - while (remain > 0 && t_delimiters[*pos]) { - token->begin++; + do { + if (ex != NULL && ex->pos == pos) { + /* Go to the next exception */ + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } pos++; + p++; remain--; - } - while (remain > 0 && !t_delimiters[*pos]) { + } while (remain > 0 && t_delimiters[*p]); + + token->begin = p; + + while (remain > 0 && !t_delimiters[*p]) { + if (ex != NULL && ex->pos == pos) { + *exceptions = g_list_next (*exceptions); + return p + ex->len + 1; + } token->len++; pos++; remain--; + p ++; } - if (token->len == 0) { + if (remain == 0) { return NULL; } - return token; + return p; } /* Struct to access gmime headers */ @@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = task->subject; subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } if ((sub = g_mime_message_get_subject (task->message)) != NULL) { new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = (gchar *)sub; subject.len = strlen (sub); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); } } diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index df5481a1f..c78d90b0e 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -15,17 +15,18 @@ #define FEATURE_WINDOW_SIZE 5 typedef struct token_node_s { - uint32_t h1; - uint32_t h2; + guint32 h1; + guint32 h2; float value; uintptr_t extra; } token_node_t; /* Common tokenizer structure */ struct tokenizer { - char *name; - int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); - f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); + gchar *name; + gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, + GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); + gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions); }; /* Compare two token nodes */ @@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b); /* Get tokenizer structure by name or return NULL if this name is not found */ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ -f_str_t *get_next_word (f_str_t *buf, f_str_t *token); +gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions); /* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, + GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Make tokens for a subject */ diff --git a/src/url.c b/src/url.c index dbc04ffab..c8895e107 100644 --- a/src/url.c +++ b/src/url.c @@ -1160,6 +1160,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text gint rc; gchar *url_str = NULL, *url_start, *url_end; struct uri *new; + struct process_exception *ex; gchar *p, *end, *begin; @@ -1183,13 +1184,14 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) { if (url_str != NULL) { new = memory_pool_alloc0 (pool, sizeof (struct uri)); + ex = memory_pool_alloc0 (pool, sizeof (struct process_exception)); if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, pool); if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && new->hostlen > 0) { - new->pos = url_start - begin; - new->len = url_end - url_start; + ex->pos = url_start - begin; + ex->len = url_end - url_start; if (new->protocol == PROTOCOL_MAILTO) { if (!g_tree_lookup (task->emails, new)) { g_tree_insert (task->emails, new, new); @@ -1200,7 +1202,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text g_tree_insert (task->urls, new, new); } } - part->urls_offset = g_list_prepend (part->urls_offset, new); + part->urls_offset = g_list_prepend (part->urls_offset, ex); } else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); @@ -1256,10 +1258,10 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, *url_str = NULL; } if (start != NULL) { - *start = (gchar *)pos; + *start = (gchar *)m.m_begin; } if (fin != NULL) { - *fin = (gchar *)pos + m.m_len; + *fin = (gchar *)m.m_begin + m.m_len; } return TRUE; } diff --git a/src/url.h b/src/url.h index 9c0812e62..5a8e7e1e2 100644 --- a/src/url.h +++ b/src/url.h @@ -32,9 +32,6 @@ struct uri { struct uri *phished_url; - gsize pos; - gsize len; - /* @protocollen should only be usable if @protocol is either * PROTOCOL_USER or an uri string should be composed. */ guint protocollen; -- 2.39.5