aboutsummaryrefslogtreecommitdiffstats
path: root/src/trie.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2013-06-14 17:43:00 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2013-06-14 17:43:00 +0100
commit632c28aced816c1faa4b73c1e49d5cddd9fda534 (patch)
tree2057c86581c3bb1d5c8af4ae56b090b58901bc2d /src/trie.c
parent2b0629a969e492e6cb957edaafddf5160d7b46d2 (diff)
downloadrspamd-632c28aced816c1faa4b73c1e49d5cddd9fda534.tar.gz
rspamd-632c28aced816c1faa4b73c1e49d5cddd9fda534.zip
Rework suffix trie implementation.
Diffstat (limited to 'src/trie.c')
-rw-r--r--src/trie.c121
1 files changed, 64 insertions, 57 deletions
diff --git a/src/trie.c b/src/trie.c
index c32b34039..9068abe08 100644
--- a/src/trie.c
+++ b/src/trie.c
@@ -21,16 +21,10 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-/*
- * XXX: This code was derived from CamelTrie implementation (lgpl code) and
- * is subject to be rewritten completely from scratch (or from bsd grep)
- */
-
#include "config.h"
#include "mem_pool.h"
#include "trie.h"
-
rspamd_trie_t*
rspamd_trie_create (gboolean icase)
{
@@ -51,85 +45,88 @@ rspamd_trie_create (gboolean icase)
}
/*
- * Insert a single character as level of binary trie
+ * Insert a single character as the specified level of the suffix tree
*/
static struct rspamd_trie_state *
-rspamd_trie_insert_char (rspamd_trie_t *trie, guint depth, struct rspamd_trie_state *q, gchar c)
+rspamd_trie_insert_char (rspamd_trie_t *trie, guint depth, struct rspamd_trie_state *pos, gchar c)
{
- struct rspamd_trie_match *m;
+ struct rspamd_trie_match *new_match;
+ struct rspamd_trie_state *new_pos;
- /* Insert new match into a chain */
- m = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_match));
- m->next = q->match;
- m->c = c;
+ /* New match is inserted before pos */
+ new_match = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_match));
+ new_match->next = pos->match;
+ new_match->c = c;
- q->match = m;
- m->state = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_state));
- q = m->state;
- q->match = NULL;
- q->fail = &trie->root;
- q->final = 0;
- q->id = -1;
+ /* Now set match link */
+ pos->match = new_match;
+
+ new_match->state = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_state));
+ new_pos = new_match->state;
+ new_pos->match = NULL;
+ new_pos->fail = &trie->root;
+ new_pos->final = 0;
+ new_pos->id = -1;
if (trie->fail_states->len < depth + 1) {
- /* Grow fail states array */
+ /* Grow fail states array if depth is more than its size */
guint size = trie->fail_states->len;
- size = MAX (size + 64, depth + 1);
+ size = MAX (size * 2, depth + 1);
g_ptr_array_set_size (trie->fail_states, size);
}
- q->next = trie->fail_states->pdata[depth];
- trie->fail_states->pdata[depth] = q;
+ new_pos->next = trie->fail_states->pdata[depth];
+ trie->fail_states->pdata[depth] = new_pos;
- return q;
+ return new_pos;
}
+/* Traverse the specified node to find corresponding match */
static inline struct rspamd_trie_match *
check_match (struct rspamd_trie_state *s, gchar c)
{
- struct rspamd_trie_match *m = s->match;
+ struct rspamd_trie_match *match = s->match;
- while (m && m->c != c) {
- m = m->next;
+ while (match && match->c != c) {
+ match = match->next;
}
- return m;
+ return match;
}
void
rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id)
{
const guchar *p = pattern;
- struct rspamd_trie_state *q, *q1, *r;
+ struct rspamd_trie_state *q, *q1, *r, *cur_node;
struct rspamd_trie_match *m, *n;
guint i, depth = 0;
gchar c;
/* Insert pattern to the trie */
- q = &trie->root;
+ cur_node = &trie->root;
while (*p) {
c = trie->icase ? g_ascii_tolower (*p) : *p;
- m = check_match (q, c);
+ m = check_match (cur_node, c);
if (m == NULL) {
- /* Insert gchar at specified level depth */
- q = rspamd_trie_insert_char (trie, depth, q, c);
+ /* Insert a character at specified level depth */
+ cur_node = rspamd_trie_insert_char (trie, depth, cur_node, c);
}
else {
- /* Switch current state to matched state */
- q = m->state;
+ cur_node = m->state;
}
p ++;
depth ++;
}
- q->final = depth;
- q->id = pattern_id;
+ cur_node->final = depth;
+ cur_node->id = pattern_id;
/* Update fail states and build fail states graph */
- /* Go throught the whole depth of prefixes */
+ /* Go through the whole depth of prefixes */
for (i = 0; i < trie->fail_states->len; i++) {
q = trie->fail_states->pdata[i];
while (q) {
@@ -171,39 +168,49 @@ rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id)
const gchar*
rspamd_trie_lookup (rspamd_trie_t *trie, const gchar *buffer, gsize buflen, gint *matched_id)
{
- const guchar *p = buffer, *prev, *pat;
- struct rspamd_trie_state *q;
+ const guchar *p = buffer, *prev, *ret;
+ struct rspamd_trie_state *cur_node;
struct rspamd_trie_match *m = NULL;
gchar c;
- q = &trie->root;
+ cur_node = &trie->root;
prev = p;
- pat = p;
+ ret = p;
while (buflen) {
c = trie->icase ? g_ascii_tolower (*p) : *p;
- while (q != NULL && (m = check_match (q, c)) == NULL) {
- q = q->fail;
+ /* Match pattern or use fail-path to restore state */
+ while (cur_node != NULL && (m = check_match (cur_node, c)) == NULL) {
+ cur_node = cur_node->fail;
}
- if (q == &trie->root) {
- pat = prev;
+ /* Shift left in the text */
+ if (cur_node == &trie->root) {
+ /* 1 character pattern found */
+ ret = prev;
}
-
- if (q == NULL) {
- q = &trie->root;
- pat = p;
+ else if (cur_node == NULL) {
+ /* We have tried the pattern but eventually it was not found */
+ cur_node = &trie->root;
+ ret = p;
+ p ++;
+ prev = p;
+ buflen --;
+ continue;
}
- else if (m != NULL) {
- q = m->state;
- if (q->final) {
- if (matched_id) {
- *matched_id = q->id;
+ if (m != NULL) {
+ /* Match found */
+ cur_node = m->state;
+
+ if (cur_node->final) {
+ /* The complete pattern found */
+ if (matched_id != NULL) {
+ *matched_id = cur_node->id;
}
- return (const gchar *) pat;
+ return (const gchar *) ret;
}
}
p ++;