about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 17:33:31 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 17:33:31 +0100
commit: 02b6117a397bb5cba27ca63a7e2df1c5dbfd0125 (patch)
tree: 14bab1422a7d4eec4a6d2040b3d93f82f38c47f0 /src
parent: 828c31c52830e4a78da94d66c2ce8936380633e2 (diff)
download: rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.tar.gz
download: rspamd-02b6117a397bb5cba27ca63a7e2df1c5dbfd0125.zip
Implement skipping of signatures in text messages.
Diffstat (limited to 'src')
-rw-r--r--  src/libmime/message.c  6
-rw-r--r--  src/libstat/stat_process.c  3
-rw-r--r--  src/libstat/tokenizers/tokenizers.c  45
-rw-r--r--  src/libstat/tokenizers/tokenizers.h  3
-rw-r--r--  src/lua/lua_util.c  9
5 files changed, 48 insertions, 18 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 56fa85333..2fcb4f7cd 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1214,7 +1214,8 @@ rspamd_normalize_text_part (struct rspamd_task *task,
/* Ugly workaround */
tmp = rspamd_tokenize_text (part->content->data,
part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
- part->urls_offset, FALSE);
+ part->urls_offset, FALSE,
+ !(part->flags & RSPAMD_MIME_PART_FLAG_HTML));
if (tmp) {
for (i = 0; i < tmp->len; i ++) {
@@ -1415,7 +1416,8 @@ process_text_part (struct rspamd_task *task,
detect_text_language (text_part);
text_part->words = rspamd_tokenize_text (text_part->content->data,
text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
- text_part->urls_offset, FALSE);
+ text_part->urls_offset, FALSE,
+ !(text_part->flags & RSPAMD_MIME_PART_FLAG_HTML));
rspamd_normalize_text_part (task, text_part);
/* Calculate number of lines */
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index c634944ff..5318ab144 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -333,7 +333,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat);
+ words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat,
+ FALSE);
if (words != NULL) {
tok->tokenizer->tokenize_func (cf,
task->task_pool,
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 51ef9038d..d06afa055 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -32,7 +32,7 @@
typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos,
rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl);
+ GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
const gchar t_delimiters[255] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
@@ -79,7 +79,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
gchar **cur, rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl)
+ GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
{
gsize remain, pos;
guchar *p;
@@ -171,17 +171,19 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
static gboolean
rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
gchar **cur, rspamd_fstring_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl)
+ GList **exceptions, gboolean is_utf, gsize *rl,
+ gboolean check_signature)
{
- gsize remain, pos;
- gchar *p, *next_p;
+ gsize remain, pos, siglen = 0;
+ gchar *p, *next_p, *sig = NULL;
gunichar uc;
guint processed = 0;
struct process_exception *ex = NULL;
enum {
skip_delimiters = 0,
feed_token,
- skip_exception
+ skip_exception,
+ process_signature
} state = skip_delimiters;
if (buf == NULL) {
@@ -227,10 +229,18 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
state = skip_exception;
continue;
}
- else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
- state = feed_token;
- token->begin = p;
- continue;
+ else if (g_unichar_isgraph (uc)) {
+ if (!g_unichar_ispunct (uc)) {
+ state = feed_token;
+ token->begin = p;
+ continue;
+ }
+ else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
+ sig = p;
+ siglen = remain;
+ state = process_signature;
+ continue;
+ }
}
break;
case feed_token:
@@ -247,6 +257,16 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
*exceptions = g_list_next (*exceptions);
goto set_token;
break;
+ case process_signature:
+ if (*p == '\r' || *p == '\n') {
+ msg_debug ("signature found: %*s", siglen, sig);
+ return FALSE;
+ }
+ else if (*p != ' ' && *p != '-' && *p != '_') {
+ state = skip_delimiters;
+ continue;
+ }
+ break;
}
remain -= next_p - p;
@@ -269,7 +289,8 @@ set_token:
GArray *
rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat)
+ gsize min_len, GList *exceptions, gboolean compat,
+ gboolean check_signature)
{
rspamd_fstring_t token, buf;
gchar *pos = NULL;
@@ -297,7 +318,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
- while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
+ while (func (&buf, &pos, &token, &cur, is_utf, &l, check_signature)) {
if (l == 0 || (min_len > 0 && l < min_len)) {
token.begin = pos;
continue;
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index fb4b42a96..2c96b7cff 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,8 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
/* Tokenize text into array of words (rspamd_fstring_t type) */
GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList *exceptions, gboolean compat);
+ gsize min_len, GList *exceptions, gboolean compat,
+ gboolean check_signature);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 9a670da04..8d5686f7c 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -340,7 +340,7 @@ lua_util_tokenize_text (lua_State *L)
struct process_exception *ex;
GArray *res;
rspamd_fstring_t *w;
- gboolean compat = FALSE;
+ gboolean compat = FALSE, check_sig = FALSE;
if (lua_type (L, 1) == LUA_TSTRING) {
in = luaL_checklstring (L, 1, &len);
@@ -389,11 +389,16 @@ lua_util_tokenize_text (lua_State *L)
compat = lua_toboolean (L, 3);
}
+ if (lua_gettop (L) > 3 && lua_type (L, 4) == LUA_TBOOLEAN) {
+ check_sig = lua_toboolean (L, 4);
+ }
+
if (exceptions) {
exceptions = g_list_reverse (exceptions);
}
- res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat);
+ res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat,
+ check_sig);
if (res == NULL) {
lua_pushnil (L);