path: root/src/tokenizers
author     Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-01-16 15:28:40 +0000
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-01-16 15:28:40 +0000
commit     b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch)
tree       a647a4306708df37a3ea1d97666fd2d325e24464 /src/tokenizers
parent     ffd95d7c71307bb9540f07bbaac3b04859226837 (diff)
download   rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz
           rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip
Reorganize statfiles and classifiers into libstat.
Diffstat (limited to 'src/tokenizers')
-rw-r--r--    src/tokenizers/osb.c           122
-rw-r--r--    src/tokenizers/tokenizers.c    260
-rw-r--r--    src/tokenizers/tokenizers.h     64
3 files changed, 0 insertions, 446 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
deleted file mode 100644
index 9dd12a8dd..000000000
--- a/src/tokenizers/osb.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * OSB tokenizer
- */
-
-#include <sys/types.h>
-#include "tokenizers.h"
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
-
-int
-osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t * pool,
- GArray * input,
- GTree ** tree,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions)
-{
- token_node_t *new = NULL;
- rspamd_fstring_t *token;
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, processed = 0;
- guint w;
-
- if (input == NULL) {
- return FALSE;
- }
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_tree_destroy,
- *tree);
- }
-
- memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
-
- for (w = 0; w < input->len; w ++) {
- token = &g_array_index (input, rspamd_fstring_t, w);
-
- if (processed < FEATURE_WINDOW_SIZE) {
- /* Just fill a hashpipe */
- hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- rspamd_fstrhash_lc (token, is_utf);
- }
- else {
- /* Shift hashpipe */
- for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
- hashpipe[i] = hashpipe[i - 1];
- }
- hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
- processed++;
-
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] *
- primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra =
- (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
- }
-
- if (processed <= FEATURE_WINDOW_SIZE) {
- for (i = 1; i < processed; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
-
- return TRUE;
-}
-
-/*
- * vi:ts=4
- */
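
For reference, the pairing scheme the deleted osb.c implemented — hash each token, keep the last FEATURE_WINDOW_SIZE hashes in a pipe, and combine the newest hash with each older one through the primes table — can be sketched in standalone C as below. This is a simplified illustration, not rspamd code: toy_hash() and the shortened primes table stand in for rspamd_fstrhash_lc() and the full table in tokenizers.c, and the window-fill and tail handling of the original are omitted.

    /*
     * Simplified sketch of the OSB (Orthogonal Sparse Bigram) pairing
     * from osb_tokenize_text() above.  Unlike the original, pairing
     * starts immediately instead of first filling the window.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define FEATURE_WINDOW_SIZE 5

    /* First half of the primes table defined in tokenizers.c above */
    static const int primes[] = {
        1, 7, 3, 13, 5, 29, 11, 51, 23, 101,
    };

    /* Placeholder hash; the real code uses rspamd_fstrhash_lc() */
    static uint32_t toy_hash(const char *s)
    {
        uint32_t h = 2166136261u;
        while (*s) {
            h ^= (unsigned char)*s++;
            h *= 16777619u;
        }
        return h;
    }

    int main(void)
    {
        const char *words[] = { "this", "is", "a", "sample", "subject" };
        uint32_t hashpipe[FEATURE_WINDOW_SIZE];
        size_t w, i;

        memset(hashpipe, 0xfe, sizeof(hashpipe));

        for (w = 0; w < sizeof(words) / sizeof(words[0]); w++) {
            /* Shift the pipe and push the newest token hash to slot 0 */
            for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
                hashpipe[i] = hashpipe[i - 1];
            }
            hashpipe[0] = toy_hash(words[w]);

            /* Pair the newest token with each older token in the window */
            for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
                uint32_t h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
                uint32_t h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
                printf("token %zu, distance %zu -> h1=%u h2=%u\n", w, i, h1, h2);
            }
        }
        return 0;
    }
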
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
deleted file mode 100644
index 3e6c745ec..000000000
--- a/src/tokenizers/tokenizers.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common tokenization functions
- */
-
-#include <sys/types.h>
-#include "main.h"
-#include "tokenizers.h"
-
-struct tokenizer tokenizers[] = {
- {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
-};
-
-const int primes[] = {
- 1, 7,
- 3, 13,
- 5, 29,
- 11, 51,
- 23, 101,
- 47, 203,
- 97, 407,
- 197, 817,
- 397, 1637,
- 797, 3277,
-};
-
-const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-struct tokenizer *
-get_tokenizer (const char *name)
-{
- guint i;
-
- for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
- if (strcmp (tokenizers[i].name, name) == 0) {
- return &tokenizers[i];
- }
- }
-
- return NULL;
-}
-
-int
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
- const token_node_t *aa = a, *bb = b;
-
- if (aa->h1 == bb->h1) {
- return aa->h2 - bb->h2;
- }
-
- return aa->h1 - bb->h1;
-}
-
-/* Get next word from specified f_str_t buf */
-gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
-{
- gsize remain, pos;
- guchar *p;
- struct process_exception *ex = NULL;
-
- if (buf == NULL) {
- return NULL;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- if (token->begin == NULL) {
- if (ex != NULL) {
- if (ex->pos == 0) {
- token->begin = buf->begin + ex->len;
- token->len = ex->len;
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
-
- token->len = 0;
-
- pos = token->begin - buf->begin;
- if (pos >= buf->len) {
- return NULL;
- }
-
- remain = buf->len - pos;
- p = token->begin;
- /* Skip non delimiters symbols */
- do {
- if (ex != NULL && ex->pos == pos) {
- /* Go to the next exception */
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- pos++;
- p++;
- remain--;
- } while (remain > 0 && t_delimiters[*p]);
-
- token->begin = p;
-
- while (remain > 0 && !t_delimiters[*p]) {
- if (ex != NULL && ex->pos == pos) {
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- token->len++;
- pos++;
- remain--;
- p++;
- }
-
- if (remain == 0) {
- return NULL;
- }
-
- return p;
-}
-
-GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions)
-{
- rspamd_fstring_t token, buf;
- gchar *pos;
- gsize l;
- GArray *res;
-
- if (len == 0 || text == NULL) {
- return NULL;
- }
-
- buf.begin = text;
- buf.len = len;
- buf.size = buf.len;
- token.begin = NULL;
- token.len = 0;
-
- res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
- while ((pos = rspamd_tokenizer_get_word (&buf,
- &token, exceptions)) != NULL) {
- if (is_utf) {
- l = g_utf8_strlen (token.begin, token.len);
- }
- else {
- l = token.len;
- }
- if (min_len > 0 && l < min_len) {
- token.begin = pos;
- continue;
- }
- g_array_append_val (res, token);
-
- token.begin = pos;
- }
-
- return res;
-}
-
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
- gchar *sub;
- struct tokenizer *osb_tokenizer;
- GArray *words;
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
- }
-
- osb_tokenizer = get_tokenizer ("osb-text");
-
- /* Try to use pre-defined subject */
- if (task->subject != NULL) {
- sub = task->subject;
- }
- else {
- sub = (gchar *)g_mime_message_get_subject (task->message);
- }
-
- if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
- if (words != NULL) {
- osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- words,
- tree,
- FALSE,
- TRUE,
- NULL);
- g_array_free (words, TRUE);
- }
- }
-}
-
-/*
- * vi:ts=4
- */
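
The word-splitting loop the deleted rspamd_tokenizer_get_word() implemented — skip delimiter bytes, then collect non-delimiter bytes into a token — can be sketched in standalone C as below. This is an approximation for illustration only: is_delim() is a broader stand-in for the t_delimiters[] lookup table, and the exception-list and UTF-8 handling are left out.

    /*
     * Standalone sketch of the delimiter-table word splitter used by
     * rspamd_tokenizer_get_word()/rspamd_tokenize_text() above.
     */
    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Rough stand-in for the t_delimiters[] table */
    static int is_delim(unsigned char c)
    {
        return isspace(c) || ispunct(c);
    }

    int main(void)
    {
        const char *text = "Hello, tokenizer world: split me!";
        const char *p = text, *end = text + strlen(text);

        while (p < end) {
            /* Skip delimiter characters */
            while (p < end && is_delim((unsigned char)*p)) {
                p++;
            }
            const char *start = p;
            /* Collect the non-delimiter run as one token */
            while (p < end && !is_delim((unsigned char)*p)) {
                p++;
            }
            if (p > start) {
                printf("token: %.*s\n", (int)(p - start), start);
            }
        }
        return 0;
    }
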
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
deleted file mode 100644
index ed47e0add..000000000
--- a/src/tokenizers/tokenizers.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef TOKENIZERS_H
-#define TOKENIZERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "main.h"
-
-/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-typedef struct token_node_s {
- guint32 h1;
- guint32 h2;
- double value;
- uintptr_t extra;
-} token_node_t;
-
-/* Common tokenizer structure */
-struct tokenizer {
- gchar *name;
- gint (*tokenize_func)(struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *words,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
- gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
-};
-
-/* Compare two token nodes */
-int token_node_compare_func (gconstpointer a, gconstpointer b);
-
-/* Get tokenizer structure by name or return NULL if this name is not found */
-struct tokenizer * get_tokenizer (const char *name);
-
-/* Get next word from specified f_str_t buf */
-gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
- rspamd_fstring_t *token, GList **exceptions);
-
-/* Tokenize text into array of words (rspamd_fstring_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions);
-
-/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *input,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
-
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
-/* Array of all defined tokenizers */
-extern struct tokenizer tokenizers[];
-
-#endif
-/*
- * vi:ts=4
- */
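
The header above also fixes the dispatch pattern the rest of the old code relied on: a static table of named tokenizers searched with strcmp(), with the chosen entry's function pointer then invoked, as tokenize_subject() does via get_tokenizer("osb-text"). A minimal standalone sketch of that pattern, using hypothetical toy_* names rather than the rspamd types, might look like this:

    /*
     * Sketch of name-based tokenizer dispatch: a static { name, callback }
     * table searched by strcmp(), mirroring get_tokenizer() above.
     * toy_tokenize() is a stand-in for osb_tokenize_text().
     */
    #include <stdio.h>
    #include <string.h>

    struct toy_tokenizer {
        const char *name;
        int (*tokenize_func)(const char *text);
    };

    static int toy_tokenize(const char *text)
    {
        printf("tokenizing: %s\n", text);
        return 0;
    }

    static struct toy_tokenizer toy_tokenizers[] = {
        { "osb-text", toy_tokenize },
    };

    static struct toy_tokenizer *toy_get_tokenizer(const char *name)
    {
        size_t i;

        for (i = 0; i < sizeof(toy_tokenizers) / sizeof(toy_tokenizers[0]); i++) {
            if (strcmp(toy_tokenizers[i].name, name) == 0) {
                return &toy_tokenizers[i];
            }
        }
        return NULL;
    }

    int main(void)
    {
        struct toy_tokenizer *tok = toy_get_tokenizer("osb-text");

        if (tok != NULL) {
            tok->tokenize_func("example subject line");
        }
        return 0;
    }
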