diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-26 16:58:25 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-26 16:58:25 +0000 |
commit | 5e9cd04af5b9314defdf5bdc6b014b7cfefa896e (patch) | |
tree | 91623cd594c61de361227cdfe386329fed3b18eb /src/libstat | |
parent | 2dcb09b6eae0b21d4da68f9323be4cbfeb2b4237 (diff) | |
download | rspamd-5e9cd04af5b9314defdf5bdc6b014b7cfefa896e.tar.gz rspamd-5e9cd04af5b9314defdf5bdc6b014b7cfefa896e.zip |
[Project] Use URLs TLDs instead of !!EX!! in stat tokens
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 55 |
1 files changed, 39 insertions, 16 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index dcd556910..19a5dba98 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -245,6 +245,43 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end,
 	} \
 } while(0)
 
+static inline void
+rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
+{
+	rspamd_stat_token_t token;
+
+	memset (&token, 0, sizeof (token));
+
+	if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
+		token.original.begin = "!!EX!!";
+		token.original.len = sizeof ("!!EX!!") - 1;
+		token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+		g_array_append_val (res, token);
+		token.flags = 0;
+	}
+	else if (ex->type == RSPAMD_EXCEPTION_URL) {
+		struct rspamd_url *uri;
+
+		uri = ex->ptr;
+
+		if (uri && uri->tldlen > 0) {
+			token.original.begin = uri->tld;
+			token.original.len = uri->tldlen;
+
+		}
+		else {
+			token.original.begin = "!!EX!!";
+			token.original.len = sizeof ("!!EX!!") - 1;
+		}
+
+		token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+		g_array_append_val (res, token);
+		token.flags = 0;
+	}
+}
+
+
 GArray *
 rspamd_tokenize_text (const gchar *text, gsize len,
 		const UText *utxt,
@@ -347,15 +384,7 @@ start_over:
 		while (cur && ex->pos <= last) {
 			/* We have an exception at the beginning, skip those */
 			last += ex->len;
-
-			if (ex->type == RSPAMD_EXCEPTION_URL) {
-				token.original.begin = "!!EX!!";
-				token.original.len = sizeof ("!!EX!!") - 1;
-				token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
-				g_array_append_val (res, token);
-				token.flags = 0;
-			}
+			rspamd_tokenize_exception (ex, res);
 
 			if (last > p) {
 				/* Exception spread over the boundaries */
@@ -385,13 +414,7 @@ start_over:
 			/* Process the current exception */
 			last += ex->len + (ex->pos - last);
 
-			if (ex->type == RSPAMD_EXCEPTION_URL) {
-				token.original.begin = "!!EX!!";
-				token.original.len = sizeof ("!!EX!!") - 1;
-				token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
-				g_array_append_val (res, token);
-			}
+			rspamd_tokenize_exception (ex, res);
 
 			if (last > p) {
 				/* Exception spread over the boundaries */