Uncompatible changes: - Statistics is uncompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URL's in statistics - Improve speed of bayes classifier by using integer arithmetics - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of legacy code - Types polishingtags/0.4.0
@@ -6,8 +6,8 @@ | |||
PROJECT(rspamd C) | |||
SET(RSPAMD_VERSION_MAJOR 0) | |||
SET(RSPAMD_VERSION_MINOR 3) | |||
SET(RSPAMD_VERSION_PATCH 14) | |||
SET(RSPAMD_VERSION_MINOR 4) | |||
SET(RSPAMD_VERSION_PATCH 0) | |||
SET(RSPAMD_VERSION "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}") |
@@ -120,7 +120,7 @@ binlog_check_file (struct rspamd_binlog *log) | |||
return FALSE; | |||
} | |||
log->cur_seq = log->cur_idx->last_index; | |||
log->cur_seq = log->metaindex->last_index * BINLOG_IDX_LEN + log->cur_idx->last_index; | |||
log->cur_time = log->cur_idx->indexes[log->cur_idx->last_index].time; | |||
return TRUE; | |||
@@ -425,6 +425,13 @@ binlog_sync (struct rspamd_binlog *log, guint64 from_rev, guint64 *from_time, GB | |||
*rep = NULL; | |||
return FALSE; | |||
} | |||
else if (from_rev > log->cur_seq) { | |||
/* Slave has more actual copy, write this to log and abort sync */ | |||
msg_warn ("slave has more recent revision of statfile %s: %uL and our is: %uL", log->filename, from_rev, log->cur_seq); | |||
*rep = NULL; | |||
*from_time = 0; | |||
return FALSE; | |||
} | |||
metaindex_num = from_rev / BINLOG_IDX_LEN; | |||
/* First of all try to find this revision */ |
@@ -270,6 +270,8 @@ struct config_file { | |||
gboolean log_extended; /**< log extended information */ | |||
gsize max_statfile_size; /**< maximum size for statfile */ | |||
guint32 statfile_sync_interval; /**< synchronization interval */ | |||
guint32 statfile_sync_timeout; /**< synchronization timeout */ | |||
struct memcached_server memcached_servers[MAX_MEMCACHED_SERVERS]; /**< memcached servers */ | |||
gsize memcached_servers_num; /**< number of memcached servers */ |
@@ -172,6 +172,9 @@ init_defaults (struct config_file *cfg) | |||
cfg->dns_throttling_errors = 20; | |||
cfg->dns_throttling_time = 10000; | |||
cfg->statfile_sync_interval = 60000; | |||
cfg->statfile_sync_timeout = 20000; | |||
cfg->max_statfile_size = DEFAULT_STATFILE_SIZE; | |||
cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal); | |||
cfg->variables = g_hash_table_new (g_str_hash, g_str_equal); |
@@ -281,6 +281,18 @@ static struct xml_parser_rule grammar[] = { | |||
G_STRUCT_OFFSET (struct config_file, filters_str), | |||
NULL | |||
}, | |||
{ | |||
"sync_interval", | |||
xml_handle_seconds, | |||
G_STRUCT_OFFSET (struct config_file, statfile_sync_interval), | |||
NULL | |||
}, | |||
{ | |||
"sync_timeout", | |||
xml_handle_seconds, | |||
G_STRUCT_OFFSET (struct config_file, statfile_sync_timeout), | |||
NULL | |||
}, | |||
NULL_ATTR | |||
}, | |||
NULL_DEF_ATTR |
@@ -43,11 +43,11 @@ bayes_error_quark (void) | |||
} | |||
struct bayes_statfile_data { | |||
double hits; | |||
double total_hits; | |||
guint64 hits; | |||
guint64 total_hits; | |||
double local_probability; | |||
double post_probability; | |||
double value; | |||
guint value; | |||
struct statfile *st; | |||
stat_file_t *file; | |||
}; | |||
@@ -67,25 +67,22 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) | |||
{ | |||
token_node_t *node = key; | |||
struct bayes_callback_data *cd = data; | |||
double v, c; | |||
gint v, c; | |||
c = (cd->in_class) ? 1 : -1; | |||
/* Consider that not found blocks have value 1 */ | |||
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); | |||
if (fabs (v) < ALPHA && c > 0) { | |||
if (v == 0 && c > 0) { | |||
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c); | |||
} | |||
else { | |||
if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) { | |||
v += c; | |||
if (G_LIKELY (c > 0)) { | |||
v ++; | |||
} | |||
else if (c < 0){ | |||
if (v > -c) { | |||
v -= c; | |||
} | |||
else { | |||
v = 0; | |||
if (v != 0) { | |||
v --; | |||
} | |||
} | |||
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v); | |||
@@ -103,14 +100,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) | |||
token_node_t *node = key; | |||
struct bayes_callback_data *cd = data; | |||
double local_hits = 0, renorm = 0; | |||
int i; | |||
double renorm = 0; | |||
gint i; | |||
guint64 local_hits = 0; | |||
struct bayes_statfile_data *cur; | |||
for (i = 0; i < cd->statfiles_num; i ++) { | |||
cur = &cd->statfiles[i]; | |||
cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now); | |||
if (cur->value > ALPHA) { | |||
if (cur->value > 0) { | |||
cur->total_hits += cur->value; | |||
cur->hits = cur->value; | |||
local_hits += cur->value; | |||
@@ -121,7 +119,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) | |||
} | |||
for (i = 0; i < cd->statfiles_num; i ++) { | |||
cur = &cd->statfiles[i]; | |||
cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0)); | |||
cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) / | |||
(LOCAL_PROB_DENOM * (1.0 + local_hits)); | |||
renorm += cur->post_probability * cur->local_probability; | |||
} | |||
@@ -212,7 +212,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct | |||
struct statfile *st; | |||
gchar out_buf[BUFSIZ]; | |||
gint i; | |||
guint64 rev, ti, len, pos; | |||
guint64 rev, ti, len, pos, blocks; | |||
gchar *out; | |||
struct rspamd_binlog_element log_elt; | |||
struct stat_file_block *stat_elt; | |||
@@ -222,7 +222,7 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct | |||
if (statfile == NULL) { | |||
return FALSE; | |||
} | |||
/* Begin to copy all blocks into array */ | |||
statfile_get_revision (statfile, &rev, (time_t *)&ti); | |||
if (ti == 0) { | |||
@@ -230,10 +230,13 @@ write_whole_statfile (struct controller_session *session, gchar *symbol, struct | |||
ti = time (NULL); | |||
statfile_set_revision (statfile, rev, ti); | |||
} | |||
len = statfile->cur_section.length * sizeof (struct rspamd_binlog_element); | |||
msg_info ("send a whole statfile %s with version %uL to slave", symbol, rev); | |||
blocks = statfile_get_total_blocks (statfile); | |||
len = blocks * sizeof (struct rspamd_binlog_element); | |||
out = memory_pool_alloc (session->session_pool, len); | |||
for (i = 0, pos = 0; i < statfile->cur_section.length; i ++) { | |||
for (i = 0, pos = 0; i < blocks; i ++) { | |||
stat_elt = (struct stat_file_block *)((u_char *)statfile->map + statfile->seek_pos + i * sizeof (struct stat_file_block)); | |||
if (fabs (stat_elt->value) > 0.001) { | |||
/* Write only those values which value is not 0 */ | |||
@@ -324,6 +327,7 @@ process_sync_command (struct controller_session *session, gchar **args) | |||
} | |||
while (binlog_sync (binlog, rev, &time, &data)) { | |||
rev ++; | |||
r = rspamd_snprintf (out_buf, sizeof (out_buf), "%uL %uL %z" CRLF, rev, time, data->len); | |||
if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { | |||
if (data != NULL) { | |||
@@ -339,7 +343,6 @@ process_sync_command (struct controller_session *session, gchar **args) | |||
return FALSE; | |||
} | |||
} | |||
rev ++; | |||
} | |||
if (time == 0) { | |||
@@ -666,12 +669,6 @@ process_command (struct controller_command *cmd, gchar **cmd_args, struct contro | |||
} | |||
return TRUE; | |||
} | |||
else { | |||
if (! rspamd_dispatcher_write (session->dispatcher, CRLF, sizeof (CRLF) - 1, FALSE, TRUE)) { | |||
return FALSE; | |||
} | |||
return TRUE; | |||
} | |||
break; | |||
case COMMAND_HELP: | |||
r = rspamd_snprintf (out_buf, sizeof (out_buf), | |||
@@ -851,7 +848,7 @@ controller_read_socket (f_str_t * in, void *arg) | |||
c.begin = part->content->data; | |||
c.len = part->content->len; | |||
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, | |||
session->session_pool, &c, &tokens, FALSE, part->is_utf)) { | |||
session->session_pool, &c, &tokens, FALSE, part->is_utf, part->urls_offset)) { | |||
i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END); | |||
free_task (task, FALSE); | |||
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { |
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg) | |||
c.len = strlen (cur->data); | |||
if (c.len > 0) { | |||
c.begin = cur->data; | |||
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) { | |||
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) { | |||
msg_info ("cannot tokenize input"); | |||
return; | |||
} | |||
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg) | |||
c.begin = text_part->content->data; | |||
c.len = text_part->content->len; | |||
/* Tree would be freed at task pool freeing */ | |||
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) { | |||
if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) { | |||
msg_info ("cannot tokenize input"); | |||
return; | |||
} | |||
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric) | |||
gboolean | |||
learn_task (const gchar *statfile, struct worker_task *task, GError **err) | |||
{ | |||
GList *cur; | |||
GList *cur, *ex; | |||
struct classifier_config *cl; | |||
struct classifier_ctx *cls_ctx; | |||
gchar *s; | |||
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) | |||
if (s != NULL) { | |||
c.len = strlen (cur->data); | |||
c.begin = cur->data; | |||
ex = NULL; | |||
} | |||
else { | |||
part = cur->data; | |||
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) | |||
c.begin = part->content->data; | |||
c.len = part->content->len; | |||
is_utf = part->is_utf; | |||
ex = part->urls_offset; | |||
} | |||
/* Get tokens */ | |||
if (!cl->tokenizer->tokenize_func ( | |||
cl->tokenizer, task->task_pool, | |||
&c, &tokens, FALSE, is_utf)) { | |||
&c, &tokens, FALSE, is_utf, ex)) { | |||
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message"); | |||
return FALSE; | |||
} |
@@ -297,6 +297,34 @@ fstrgrow (memory_pool_t * pool, f_str_t * orig, size_t newlen) | |||
return res; | |||
} | |||
static guint32 | |||
fstrhash_c (gchar c, guint32 hval) | |||
{ | |||
guint32 tmp; | |||
/* | |||
* xor in the current byte against each byte of hval | |||
* (which alone gaurantees that every bit of input will have | |||
* an effect on the output) | |||
*/ | |||
tmp = c & 0xFF; | |||
tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); | |||
hval ^= tmp; | |||
/* add some bits out of the middle as low order bits */ | |||
hval = hval + ((hval >> 12) & 0x0000ffff); | |||
/* swap most and min significative bytes */ | |||
tmp = (hval << 24) | ((hval >> 24) & 0xff); | |||
/* zero most and min significative bytes of hval */ | |||
hval &= 0x00ffff00; | |||
hval |= tmp; | |||
/* | |||
* rotate hval 3 bits to the left (thereby making the | |||
* 3rd msb of the above mess the hsb of the output hash) | |||
*/ | |||
return (hval << 3) + (hval >> 29); | |||
} | |||
/* | |||
* Return hash value for a string | |||
*/ | |||
@@ -305,7 +333,6 @@ fstrhash (f_str_t * str) | |||
{ | |||
size_t i; | |||
guint32 hval; | |||
guint32 tmp; | |||
gchar *c = str->begin; | |||
if (str == NULL) { | |||
@@ -314,32 +341,54 @@ fstrhash (f_str_t * str) | |||
hval = str->len; | |||
for (i = 0; i < str->len; i++, c++) { | |||
/* | |||
* xor in the current byte against each byte of hval | |||
* (which alone gaurantees that every bit of input will have | |||
* an effect on the output) | |||
*/ | |||
tmp = *c & 0xFF; | |||
tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); | |||
hval ^= tmp; | |||
/* add some bits out of the middle as low order bits */ | |||
hval = hval + ((hval >> 12) & 0x0000ffff); | |||
/* swap most and min significative bytes */ | |||
tmp = (hval << 24) | ((hval >> 24) & 0xff); | |||
/* zero most and min significative bytes of hval */ | |||
hval &= 0x00ffff00; | |||
hval |= tmp; | |||
/* | |||
* rotate hval 3 bits to the left (thereby making the | |||
* 3rd msb of the above mess the hsb of the output hash) | |||
*/ | |||
hval = (hval << 3) + (hval >> 29); | |||
hval = fstrhash_c (*c, hval); | |||
} | |||
return hval; | |||
} | |||
/* | |||
* Return hash value for a string | |||
*/ | |||
guint32 | |||
fstrhash_lowercase (f_str_t * str, gboolean is_utf) | |||
{ | |||
gsize i; | |||
guint32 j, hval; | |||
const gchar *p = str->begin, *end = NULL; | |||
gchar t; | |||
gunichar uc; | |||
if (str == NULL) { | |||
return 0; | |||
} | |||
hval = str->len; | |||
if (is_utf) { | |||
while (end < str->begin + str->len) { | |||
g_utf8_validate (p, str->len, &end); | |||
while (p < end) { | |||
uc = g_unichar_tolower (g_utf8_get_char (p)); | |||
for (j = 0; j < sizeof (gunichar); j ++) { | |||
t = (uc >> (j * 8)) & 0xff; | |||
if (t != 0) { | |||
hval = fstrhash_c (t, hval); | |||
} | |||
} | |||
p = g_utf8_next_char (p); | |||
} | |||
p = end + 1; | |||
} | |||
} | |||
else { | |||
for (i = 0; i < str->len; i++, p++) { | |||
hval = fstrhash_c (g_ascii_tolower (*p), hval); | |||
} | |||
} | |||
return hval; | |||
} | |||
void | |||
fstrstrip (f_str_t * str) | |||
{ |
@@ -93,6 +93,10 @@ f_str_t* fstrgrow (memory_pool_t *pool, f_str_t *orig, size_t newlen); | |||
*/ | |||
guint32 fstrhash (f_str_t *str); | |||
/* | |||
* Return fast hash value for fixed string converted to lowercase | |||
*/ | |||
guint32 fstrhash_lowercase (f_str_t *str, gboolean is_utf); | |||
/* | |||
* Make copy of string to 0-terminated string | |||
*/ |
@@ -320,11 +320,11 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) | |||
gchar *c; | |||
gsize real_len = 0, len = part->content->len; | |||
GList *cur_offset; | |||
struct uri *cur_url = NULL; | |||
struct process_exception *cur_ex = NULL; | |||
cur_offset = part->urls_offset; | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
cur_ex = cur_offset->data; | |||
} | |||
c = part->content->data; | |||
@@ -332,12 +332,12 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) | |||
new2 = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); | |||
bzero (&rs, sizeof (rs)); | |||
for (i = 0; i < len;) { | |||
if (cur_url != NULL && cur_url->pos == i) { | |||
i += cur_url->len + 1; | |||
c += cur_url->len + 1; | |||
if (cur_ex != NULL && cur_ex->pos == i) { | |||
i += cur_ex->len + 1; | |||
c += cur_ex->len + 1; | |||
cur_offset = g_list_next (cur_offset); | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
cur_ex = cur_offset->data; | |||
} | |||
} | |||
else { | |||
@@ -354,18 +354,18 @@ fuzzy_init_part (struct mime_text_part *part, memory_pool_t *pool) | |||
cur_offset = part->urls_offset; | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
cur_ex = cur_offset->data; | |||
} | |||
c = part->content->data; | |||
for (i = 0; i < len;) { | |||
if (cur_url != NULL && cur_url->pos == i) { | |||
i += cur_url->len + 1; | |||
c += cur_url->len + 1; | |||
if (cur_ex != NULL && cur_ex->pos == i) { | |||
i += cur_ex->len + 1; | |||
c += cur_ex->len + 1; | |||
cur_offset = g_list_next (cur_offset); | |||
if (cur_offset != NULL) { | |||
cur_url = cur_offset->data; | |||
cur_ex = cur_offset->data; | |||
} | |||
} | |||
else { |
@@ -113,6 +113,13 @@ struct save_point { | |||
guint saved; /**< how much time we have delayed processing */ | |||
}; | |||
/** | |||
* Structure to point exception in text from processing | |||
*/ | |||
struct process_exception { | |||
gsize pos; | |||
gsize len; | |||
}; | |||
/** | |||
* Union that would be used for storing sockaddrs |
@@ -91,6 +91,7 @@ pool_chain_new (gsize size) | |||
chain->pos = align_ptr (chain->begin, MEM_ALIGNMENT); | |||
chain->next = NULL; | |||
STAT_LOCK (); | |||
mem_pool_stat->bytes_allocated += size; | |||
mem_pool_stat->chunks_allocated++; | |||
STAT_UNLOCK (); | |||
@@ -135,6 +136,7 @@ pool_chain_new_shared (gsize size) | |||
chain->next = NULL; | |||
STAT_LOCK (); | |||
mem_pool_stat->shared_chunks_allocated++; | |||
mem_pool_stat->bytes_allocated += size; | |||
STAT_UNLOCK (); | |||
return chain; | |||
@@ -225,16 +227,11 @@ memory_pool_alloc (memory_pool_t * pool, gsize size) | |||
cur->next = new; | |||
pool->cur_pool = new; | |||
new->pos += size; | |||
STAT_LOCK (); | |||
mem_pool_stat->bytes_allocated += size; | |||
STAT_UNLOCK (); | |||
return new->begin; | |||
} | |||
tmp = align_ptr (cur->pos, MEM_ALIGNMENT); | |||
cur->pos = tmp + size; | |||
STAT_LOCK (); | |||
mem_pool_stat->bytes_allocated += size; | |||
STAT_UNLOCK (); | |||
return tmp; | |||
} | |||
return NULL; | |||
@@ -349,9 +346,6 @@ memory_pool_alloc_shared (memory_pool_t * pool, gsize size) | |||
} | |||
tmp = align_ptr (cur->pos, MEM_ALIGNMENT); | |||
cur->pos = tmp + size; | |||
STAT_LOCK (); | |||
mem_pool_stat->bytes_allocated += size; | |||
STAT_UNLOCK (); | |||
return tmp; | |||
} | |||
return NULL; | |||
@@ -506,20 +500,22 @@ memory_pool_delete (memory_pool_t * pool) | |||
while (cur) { | |||
tmp = cur; | |||
cur = cur->next; | |||
g_slice_free1 (tmp->len, tmp->begin); | |||
g_slice_free (struct _pool_chain, tmp); | |||
STAT_LOCK (); | |||
mem_pool_stat->chunks_freed++; | |||
mem_pool_stat->bytes_allocated -= tmp->len; | |||
STAT_UNLOCK (); | |||
g_slice_free1 (tmp->len, tmp->begin); | |||
g_slice_free (struct _pool_chain, tmp); | |||
} | |||
/* Unmap shared memory */ | |||
while (cur_shared) { | |||
tmp_shared = cur_shared; | |||
cur_shared = cur_shared->next; | |||
munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared)); | |||
STAT_LOCK (); | |||
mem_pool_stat->chunks_freed++; | |||
mem_pool_stat->bytes_allocated -= tmp->len; | |||
STAT_UNLOCK (); | |||
munmap ((void *)tmp_shared, tmp_shared->len + sizeof (struct _pool_chain_shared)); | |||
} | |||
if (pool->variables) { | |||
g_hash_table_destroy (pool->variables); |
@@ -686,25 +686,6 @@ free_byte_array_callback (void *pointer) | |||
g_byte_array_free (arr, TRUE); | |||
} | |||
static void | |||
detect_real_charset (struct worker_task *task, GByteArray * part_content, struct mime_text_part *text_part) | |||
{ | |||
/* First of all try to detect UTF symbols */ | |||
text_part->is_utf = FALSE; | |||
/* At first decision try to validate a single character */ | |||
if (g_utf8_get_char_validated (part_content->data, part_content->len) != -1) { | |||
/* Now validate the whole part */ | |||
if (g_utf8_validate (part_content->data, part_content->len, NULL)) { | |||
text_part->is_utf = TRUE; | |||
text_part->real_charset = UTF8_CHARSET; | |||
return; | |||
} | |||
} | |||
/* Now try to detect specific symbols from some charsets */ | |||
} | |||
static GByteArray * | |||
convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeContentType * type, struct mime_text_part *text_part) | |||
{ | |||
@@ -726,6 +707,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC | |||
if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) { | |||
text_part->is_raw = FALSE; | |||
text_part->is_utf = TRUE; | |||
return part_content; | |||
} | |||
@@ -741,6 +723,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray * part_content, GMimeC | |||
result_array->len = write_bytes; | |||
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, res_str); | |||
text_part->is_raw = FALSE; | |||
text_part->is_utf = TRUE; | |||
return result_array; | |||
} |
@@ -580,7 +580,6 @@ statfile_pool_get_block (statfile_pool_t * pool, stat_file_t * file, guint32 h1, | |||
return 0; | |||
} | |||
#define RANDOM_EXPIRE G_MAXINT / CHAIN_LENGTH | |||
static void | |||
statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guint32 h1, guint32 h2, time_t t, double value, gboolean from_now) | |||
{ | |||
@@ -590,7 +589,6 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin | |||
u_char *c; | |||
double min = G_MAXDOUBLE; | |||
if (from_now) { | |||
file->access_time = t; | |||
} | |||
@@ -626,14 +624,10 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin | |||
return; | |||
} | |||
/* Expire block if we have some random value that is lower than RANDOM_EXPIRE value */ | |||
if (g_random_int () < RANDOM_EXPIRE) { | |||
to_expire = block; | |||
break; | |||
} | |||
/* Expire block with minimum value otherwise */ | |||
if (block->value < min) { | |||
to_expire = block; | |||
min = block->value; | |||
} | |||
c += sizeof (struct stat_file_block); | |||
block = (struct stat_file_block *)c; |
@@ -31,11 +31,6 @@ | |||
#include "buffer.h" | |||
#include "statfile_sync.h" | |||
/* XXX: hardcoding this value is not very smart */ | |||
#define MAX_SYNC_TIME 60 | |||
#define IO_TIMEOUT 20 | |||
enum rspamd_sync_state { | |||
SYNC_STATE_GREETING, | |||
SYNC_STATE_READ_LINE, | |||
@@ -54,7 +49,9 @@ struct rspamd_sync_ctx { | |||
struct timeval interval; | |||
struct timeval io_tv; | |||
gint sock; | |||
gint sock; | |||
guint32 timeout; | |||
guint32 sync_interval; | |||
enum rspamd_sync_state state; | |||
gboolean is_busy; | |||
@@ -141,9 +138,9 @@ parse_revision_line (struct rspamd_sync_ctx *ctx, f_str_t *in) | |||
} | |||
} | |||
} | |||
/* Current value must be len value and its value must not be 0 */ | |||
return ((val == &ctx->new_len) && *val != 0); | |||
return ((val == &ctx->new_len)); | |||
} | |||
static gboolean | |||
@@ -193,8 +190,10 @@ sync_read (f_str_t * in, void *arg) | |||
return FALSE; | |||
} | |||
else if (ctx->state != SYNC_STATE_QUIT) { | |||
ctx->state = SYNC_STATE_READ_REV; | |||
rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len); | |||
if (ctx->new_len > 0) { | |||
ctx->state = SYNC_STATE_READ_REV; | |||
rspamd_set_dispatcher_policy (ctx->dispatcher, BUFFER_CHARACTER, ctx->new_len); | |||
} | |||
} | |||
else { | |||
/* Quit this session */ | |||
@@ -247,11 +246,13 @@ static void | |||
sync_timer_callback (gint fd, short what, void *ud) | |||
{ | |||
struct rspamd_sync_ctx *ctx = ud; | |||
guint32 jittered_interval; | |||
/* Plan new event */ | |||
evtimer_del (&ctx->tm_ev); | |||
ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2); | |||
ctx->interval.tv_usec = 0; | |||
/* Add some jittering for synchronization */ | |||
jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2); | |||
msec_to_tv (jittered_interval, &ctx->interval); | |||
evtimer_add (&ctx->tm_ev, &ctx->interval); | |||
log_next_sync (ctx->st->symbol, ctx->interval.tv_sec); | |||
@@ -266,8 +267,7 @@ sync_timer_callback (gint fd, short what, void *ud) | |||
return; | |||
} | |||
/* Now create and activate dispatcher */ | |||
ctx->io_tv.tv_sec = IO_TIMEOUT; | |||
ctx->io_tv.tv_usec = 0; | |||
msec_to_tv (ctx->timeout, &ctx->io_tv); | |||
ctx->dispatcher = rspamd_create_dispatcher (ctx->sock, BUFFER_LINE, sync_read, NULL, sync_err, &ctx->io_tv, ctx); | |||
ctx->state = SYNC_STATE_GREETING; | |||
@@ -278,17 +278,20 @@ sync_timer_callback (gint fd, short what, void *ud) | |||
} | |||
static gboolean | |||
add_statfile_watch (statfile_pool_t *pool, struct statfile *st) | |||
add_statfile_watch (statfile_pool_t *pool, struct statfile *st, struct config_file *cfg) | |||
{ | |||
struct rspamd_sync_ctx *ctx; | |||
guint32 jittered_interval; | |||
if (st->binlog->master_addr.s_addr != INADDR_NONE && | |||
st->binlog->master_addr.s_addr != INADDR_ANY) { | |||
ctx = memory_pool_alloc (pool->pool, sizeof (struct rspamd_sync_ctx)); | |||
ctx->st = st; | |||
ctx->timeout = cfg->statfile_sync_timeout; | |||
ctx->sync_interval = cfg->statfile_sync_interval; | |||
/* Add some jittering for synchronization */ | |||
ctx->interval.tv_sec = g_random_int_range (MAX_SYNC_TIME, MAX_SYNC_TIME * 2); | |||
ctx->interval.tv_usec = 0; | |||
jittered_interval = g_random_int_range (ctx->sync_interval, ctx->sync_interval * 2); | |||
msec_to_tv (jittered_interval, &ctx->interval); | |||
/* Open statfile and attach it to pool */ | |||
if ((ctx->real_statfile = statfile_pool_is_open (pool, st->path)) == NULL) { | |||
if ((ctx->real_statfile = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { | |||
@@ -331,7 +334,7 @@ start_statfile_sync (statfile_pool_t *pool, struct config_file *cfg) | |||
while (l) { | |||
st = l->data; | |||
if (st->binlog != NULL && st->binlog->affinity == AFFINITY_SLAVE) { | |||
if (!add_statfile_watch (pool, st)) { | |||
if (!add_statfile_watch (pool, st, cfg)) { | |||
return FALSE; | |||
} | |||
} |
@@ -36,55 +36,56 @@ extern const int primes[]; | |||
int | |||
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree, | |||
gboolean save_token, gboolean is_utf) | |||
gboolean save_token, gboolean is_utf, GList *exceptions) | |||
{ | |||
token_node_t *new = NULL; | |||
f_str_t token = { NULL, 0, 0 }, *res; | |||
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | |||
int i; | |||
/* First set all bytes of hashpipe to some common value */ | |||
for (i = 0; i < FEATURE_WINDOW_SIZE; i++) { | |||
hashpipe[i] = 0xABCDEF; | |||
} | |||
f_str_t token = { NULL, 0, 0 }; | |||
guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | |||
gint i, k = 0, l; | |||
gchar *res; | |||
if (*tree == NULL) { | |||
*tree = g_tree_new (token_node_compare_func); | |||
memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree); | |||
} | |||
while ((res = tokenizer->get_next_word (input, &token)) != NULL) { | |||
while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { | |||
/* Skip small words */ | |||
if (is_utf) { | |||
if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) { | |||
continue; | |||
} | |||
l = g_utf8_strlen (token.begin, token.len); | |||
} | |||
else { | |||
if (token.len < MIN_LEN) { | |||
continue; | |||
} | |||
l = token.len; | |||
} | |||
if (l < MIN_LEN) { | |||
token.begin = res; | |||
continue; | |||
} | |||
/* Shift hashpipe */ | |||
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { | |||
hashpipe[i] = hashpipe[i - 1]; | |||
} | |||
hashpipe[0] = fstrhash (&token); | |||
hashpipe[0] = fstrhash_lowercase (&token, is_utf); | |||
for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { | |||
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | |||
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; | |||
new = memory_pool_alloc0 (pool, sizeof (token_node_t)); | |||
new->h1 = h1; | |||
new->h2 = h2; | |||
if (save_token) { | |||
new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); | |||
} | |||
if (k > FEATURE_WINDOW_SIZE) { | |||
for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { | |||
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | |||
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; | |||
new = memory_pool_alloc0 (pool, sizeof (token_node_t)); | |||
new->h1 = h1; | |||
new->h2 = h2; | |||
if (save_token) { | |||
new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); | |||
} | |||
if (g_tree_lookup (*tree, new) == NULL) { | |||
g_tree_insert (*tree, new, new); | |||
if (g_tree_lookup (*tree, new) == NULL) { | |||
g_tree_insert (*tree, new, new); | |||
} | |||
} | |||
} | |||
k ++; | |||
token.begin = res; | |||
} | |||
return TRUE; |
@@ -52,7 +52,7 @@ const gchar t_delimiters[255] = { | |||
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 1, 0, 1, 1, 1, 1, 1, 0, | |||
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, | |||
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, | |||
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
@@ -103,44 +103,76 @@ token_node_compare_func (gconstpointer a, gconstpointer b) | |||
} | |||
/* Get next word from specified f_str_t buf */ | |||
f_str_t * | |||
get_next_word (f_str_t * buf, f_str_t * token) | |||
gchar * | |||
get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions) | |||
{ | |||
size_t remain; | |||
guchar *pos; | |||
gsize remain, pos; | |||
guchar *p; | |||
struct process_exception *ex = NULL; | |||
if (buf == NULL) { | |||
return NULL; | |||
} | |||
if (*exceptions != NULL) { | |||
ex = (*exceptions)->data; | |||
} | |||
if (token->begin == NULL) { | |||
token->begin = buf->begin; | |||
if (ex != NULL) { | |||
if (ex->pos == 0) { | |||
token->begin = buf->begin + ex->len; | |||
token->len = ex->len; | |||
} | |||
else { | |||
token->begin = buf->begin; | |||
token->len = 0; | |||
} | |||
} | |||
else { | |||
token->begin = buf->begin; | |||
token->len = 0; | |||
} | |||
} | |||
token->begin = token->begin + token->len; | |||
token->len = 0; | |||
remain = buf->len - (token->begin - buf->begin); | |||
if (remain <= 0) { | |||
return NULL; | |||
} | |||
pos = token->begin; | |||
pos = token->begin - buf->begin; | |||
p = token->begin; | |||
/* Skip non delimiters symbols */ | |||
while (remain > 0 && t_delimiters[*pos]) { | |||
token->begin++; | |||
do { | |||
if (ex != NULL && ex->pos == pos) { | |||
/* Go to the next exception */ | |||
*exceptions = g_list_next (*exceptions); | |||
return p + ex->len + 1; | |||
} | |||
pos++; | |||
p++; | |||
remain--; | |||
} | |||
while (remain > 0 && !t_delimiters[*pos]) { | |||
} while (remain > 0 && t_delimiters[*p]); | |||
token->begin = p; | |||
while (remain > 0 && !t_delimiters[*p]) { | |||
if (ex != NULL && ex->pos == pos) { | |||
*exceptions = g_list_next (*exceptions); | |||
return p + ex->len + 1; | |||
} | |||
token->len++; | |||
pos++; | |||
remain--; | |||
p ++; | |||
} | |||
if (token->len == 0) { | |||
if (remain == 0) { | |||
return NULL; | |||
} | |||
return token; | |||
return p; | |||
} | |||
/* Struct to access gmime headers */ | |||
@@ -239,13 +271,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) | |||
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); | |||
subject.begin = task->subject; | |||
subject.len = strlen (task->subject); | |||
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); | |||
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); | |||
} | |||
if ((sub = g_mime_message_get_subject (task->message)) != NULL) { | |||
new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); | |||
subject.begin = (gchar *)sub; | |||
subject.len = strlen (sub); | |||
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); | |||
osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE, NULL); | |||
} | |||
} | |||
@@ -15,17 +15,18 @@ | |||
#define FEATURE_WINDOW_SIZE 5 | |||
typedef struct token_node_s { | |||
uint32_t h1; | |||
uint32_t h2; | |||
guint32 h1; | |||
guint32 h2; | |||
float value; | |||
uintptr_t extra; | |||
} token_node_t; | |||
/* Common tokenizer structure */ | |||
struct tokenizer { | |||
char *name; | |||
int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); | |||
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); | |||
gchar *name; | |||
gint (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, | |||
GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); | |||
gchar* (*get_next_word)(f_str_t *buf, f_str_t *token, GList **exceptions); | |||
}; | |||
/* Compare two token nodes */ | |||
@@ -33,9 +34,10 @@ int token_node_compare_func (gconstpointer a, gconstpointer b); | |||
/* Get tokenizer structure by name or return NULL if this name is not found */ | |||
struct tokenizer* get_tokenizer (char *name); | |||
/* Get next word from specified f_str_t buf */ | |||
f_str_t *get_next_word (f_str_t *buf, f_str_t *token); | |||
gchar* get_next_word (f_str_t *buf, f_str_t *token, GList **exceptions); | |||
/* OSB tokenize function */ | |||
int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); | |||
int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, | |||
GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); | |||
/* Common tokenizer for headers */ | |||
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); | |||
/* Make tokens for a subject */ |
@@ -1160,6 +1160,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text | |||
gint rc; | |||
gchar *url_str = NULL, *url_start, *url_end; | |||
struct uri *new; | |||
struct process_exception *ex; | |||
gchar *p, *end, *begin; | |||
@@ -1183,13 +1184,14 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text | |||
if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str)) { | |||
if (url_str != NULL) { | |||
new = memory_pool_alloc0 (pool, sizeof (struct uri)); | |||
ex = memory_pool_alloc0 (pool, sizeof (struct process_exception)); | |||
if (new != NULL) { | |||
g_strstrip (url_str); | |||
rc = parse_uri (new, url_str, pool); | |||
if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && | |||
new->hostlen > 0) { | |||
new->pos = url_start - begin; | |||
new->len = url_end - url_start; | |||
ex->pos = url_start - begin; | |||
ex->len = url_end - url_start; | |||
if (new->protocol == PROTOCOL_MAILTO) { | |||
if (!g_tree_lookup (task->emails, new)) { | |||
g_tree_insert (task->emails, new, new); | |||
@@ -1200,7 +1202,7 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text | |||
g_tree_insert (task->urls, new, new); | |||
} | |||
} | |||
part->urls_offset = g_list_prepend (part->urls_offset, new); | |||
part->urls_offset = g_list_prepend (part->urls_offset, ex); | |||
} | |||
else if (rc != URI_ERRNO_OK) { | |||
msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); | |||
@@ -1256,10 +1258,10 @@ url_try_text (memory_pool_t *pool, const gchar *begin, gsize len, gchar **start, | |||
*url_str = NULL; | |||
} | |||
if (start != NULL) { | |||
*start = (gchar *)pos; | |||
*start = (gchar *)m.m_begin; | |||
} | |||
if (fin != NULL) { | |||
*fin = (gchar *)pos + m.m_len; | |||
*fin = (gchar *)m.m_begin + m.m_len; | |||
} | |||
return TRUE; | |||
} |
@@ -32,9 +32,6 @@ struct uri { | |||
struct uri *phished_url; | |||
gsize pos; | |||
gsize len; | |||
/* @protocollen should only be usable if @protocol is either | |||
* PROTOCOL_USER or an uri string should be composed. */ | |||
guint protocollen; |