]> source.dussan.org Git - rspamd.git/commitdiff
* Add custom filter for making marks for new user:
authorcebka@lenovo-laptop <cebka@lenovo-laptop>
Fri, 26 Feb 2010 18:12:24 +0000 (21:12 +0300)
committercebka@lenovo-laptop <cebka@lenovo-laptop>
Fri, 26 Feb 2010 18:12:24 +0000 (21:12 +0300)
  - each username is passed through a metaphone filter
  - then we build a prefix tree based on the English letters produced by metaphone
  - then we search for the longest common string and output the result (how many times we have seen that string)

src/plugins/custom/CMakeLists.txt
src/plugins/custom/ipmark/CMakeLists.txt
src/plugins/custom/ipmark/ipmark.c
src/plugins/custom/regmark/CMakeLists.txt [new file with mode: 0644]
src/plugins/custom/regmark/metaphone.c [new file with mode: 0644]
src/plugins/custom/regmark/metaphone.h [new file with mode: 0644]
src/plugins/custom/regmark/prefix_tree.c [new file with mode: 0644]
src/plugins/custom/regmark/prefix_tree.h [new file with mode: 0644]
src/plugins/custom/regmark/regmark.c [new file with mode: 0644]

index c5e1ec99fde2872f85aab8bff69e57b500195de0..5d076c41ded8d1d67c5f6aab6d4d3a2e06e6d337 100644 (file)
@@ -1 +1,2 @@
 ADD_SUBDIRECTORY(ipmark)
+ADD_SUBDIRECTORY(regmark)
index 42785599b5326b541b90a8e378d3fdb02f4842a4..1c1b1e64c170e9a5c7abe33c4bc4e0e6b4e371ee 100644 (file)
@@ -4,7 +4,7 @@ SET(IPMARKSRC             ipmark.c
                                          ../../../mem_pool.c
 )
 
-ADD_LIBRARY(rspamd_ipmark SHARED ${IPMARKSRC})
+ADD_LIBRARY(rspamd_ipmark MODULE ${IPMARKSRC})
 TARGET_LINK_LIBRARIES(rspamd_ipmark ${GLIB2_LIBRARIES})
 
 INSTALL(TARGETS rspamd_ipmark DESTINATION lib)
index e919cd41778b7e9b230ffba59ee4163a95da1081..9bebcab22d22efcbed73fc0e4ec5ea3e903bca15 100644 (file)
@@ -54,8 +54,8 @@ void after_connect (char **output, char **log_line, void *user_data);
 void module_fin (void);        
 
 /* Internal variables */
-char *filename = NULL;
-radix_tree_t *radix = NULL;
+static char *filename = NULL;
+static radix_tree_t *radix = NULL;
 
 /* Implementation */
 
@@ -259,7 +259,7 @@ parse_line (const char *line, size_t len, char **output, void *user_data)
        const char *p;
        char *c = ip_buf, *err_str;
        struct in_addr ina;
-       int state = 0, next_state, dots;
+       int state = 0, next_state = 0, dots = 0;
        int16_t value;
        uint32_t mask;
        enum ipmark_command cmd;
diff --git a/src/plugins/custom/regmark/CMakeLists.txt b/src/plugins/custom/regmark/CMakeLists.txt
new file mode 100644 (file)
index 0000000..e6242ab
--- /dev/null
@@ -0,0 +1,10 @@
+# Regmark plugin makefile
+# Sources compiled into the plugin
+SET(REGMARKSRC           regmark.c
+                                         metaphone.c
+                                         prefix_tree.c
+)
+
+# MODULE: build a plugin that is loaded at runtime rather than linked against
+ADD_LIBRARY(rspamd_regmark MODULE ${REGMARKSRC})
+TARGET_LINK_LIBRARIES(rspamd_regmark ${GLIB2_LIBRARIES})
+
+# Install the plugin into the "lib" directory of the install prefix
+INSTALL(TARGETS rspamd_regmark DESTINATION lib)
diff --git a/src/plugins/custom/regmark/metaphone.c b/src/plugins/custom/regmark/metaphone.c
new file mode 100644 (file)
index 0000000..81fa054
--- /dev/null
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is implementation of metaphone algorithm that was originally written by
+ * Michael G Schwern <schwern@pobox.com> as perl XS module
+ */
+
+/*
+ * I suppose I could have been using a character pointer instead of
+ * accessing the array directly...
+ */
+
+#include "../../../config.h"
+#include "metaphone.h"
+
+/*
+ * Letter-access helpers for metaphone (). They expand in place and expect a
+ * `char *word' and an `int w_idx' (index of the current letter) to be in
+ * scope. Every macro yields an upper-cased ASCII character.
+ */
+
+/*
+ * Look at the next letter in the word. Only meaningful while the current
+ * letter is not the terminating NUL.
+ */
+#define Next_Letter (g_ascii_toupper (word[w_idx+1]))
+/*
+ * Look at the current letter in the word
+ */
+#define Curr_Letter (g_ascii_toupper(word[w_idx]))
+/*
+ * Go N letters back; yields '\0' when that would be before the start of the
+ * word. The argument is parenthesized so that expressions can be passed.
+ */
+#define Look_Back_Letter(n) (w_idx >= (n) ? g_ascii_toupper(word[w_idx-(n)]) : '\0')
+/*
+ * Previous letter, or '\0' when the current letter is the first one
+ */
+#define Prev_Letter (Look_Back_Letter(1))
+/*
+ * Look two letters down.  It makes sure you don't walk off the string.
+ */
+#define After_Next_Letter   (Next_Letter != '\0' ? g_ascii_toupper(word[w_idx+2]) \
+                                                 : '\0')
+/*
+ * Look N letters down through Lookahead (), which stops at the end of the
+ * string.
+ */
+#define Look_Ahead_Letter(n) (g_ascii_toupper(Lookahead(word+w_idx, (n))))
+
+/*
+ * Single-character codes emitted for the 'sh' and 'th' sounds.
+ * Note that TH is the digit zero, not the letter O.
+ */
+#define  SH    'X'
+#define  TH            '0'
+
+/*-- Character encoding array & accessing macros --*/
+/* Stolen directly out of the book... */
+/*
+ * One bit-mask per letter 'A'..'Z' (index = letter - 'A'), read through the
+ * ENCODE () macro of metaphone.h. Bit meanings as tested by that header:
+ * 1 = vowel, 2 = passed through unchanged, 4 = affects a following H,
+ * 8 = makes C and G soft, 16 = prevents GH from becoming F.
+ */
+char _codes[26] = {
+       1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0
+/*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
+};
+
+
+/*
+ * Return the character `how_far' positions after the start of `word'.
+ * The walk stops early at the terminating NUL, so '\0' is returned when the
+ * string is shorter than `how_far'; a non-positive `how_far' yields the first
+ * character.
+ */
+static char
+Lookahead (char *word, int how_far)
+{
+    char           *cursor = word;
+    int             steps_left = how_far;
+
+    /*
+     * Edge forward, never stepping over the end of the string
+     */
+    while (*cursor != '\0' && steps_left > 0) {
+        cursor++;
+        steps_left--;
+    }
+
+    return *cursor;
+}
+
+
+/*
+ * Output helpers for metaphone (). They expand in place and expect a
+ * `char **phoned_word' and an `int p_idx' (number of characters written so
+ * far) to be in scope.
+ */
+
+/*
+ * phonize one letter: append `c' to the output and advance p_idx.
+ * No bounds check is done here; the buffer must be large enough.
+ */
+#define Phonize(c)  {(*phoned_word)[p_idx++] = c;}
+/*
+ * Slap a null character on the end of the phoned word
+ * (p_idx is not advanced)
+ */
+#define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';}
+/*
+ * How long is the phoned word?
+ */
+#define Phone_Len   (p_idx)
+
+/*
+ * Note if a letter is a 'break' in the word, i.e. anything that is not an
+ * ASCII letter (including the terminating NUL)
+ */
+#define Isbreak(c)  (!g_ascii_isalpha(c))
+
+
+
+
+/*
+ * Compute the metaphone (phonetic) key of a word.
+ *
+ * word:         NUL-terminated input; non-alphabetic characters are ignored
+ * max_phonemes: stop encoding once this many phonemes were produced; 0 (or a
+ *               negative value) selects strlen (word) as the limit
+ * phoned_word:  receives a g_malloc ()'ed, NUL-terminated key made of
+ *               upper-case letters and '0' (for 'th'); the caller releases
+ *               it with g_free ()
+ *
+ * Always returns TRUE.
+ */
+gboolean
+metaphone (char *word, int max_phonemes, char **phoned_word)
+{
+    int             w_idx = 0; /* point in the phonization we're at. */
+    int             p_idx = 0; /* end of the phoned phrase */
+
+    /*-- Parameter checks --*/
+    /*
+     * Assume largest possible if we're given no limit
+     */
+    if (max_phonemes <= 0) {
+        max_phonemes = strlen (word);
+    }
+
+    /*-- Allocate memory for our phoned_phrase --*/
+    /*
+     * The limit is only tested before a letter is encoded and 'X' emits two
+     * phonemes ("KS"), so up to max_phonemes + 1 characters may be stored
+     * before the terminating NUL: reserve max_phonemes + 2 bytes. This also
+     * keeps the buffer non-empty when the word is empty.
+     */
+    *phoned_word = g_malloc (((size_t) max_phonemes + 2) * sizeof (char));
+
+    /*-- The first phoneme has to be processed specially. --*/
+    /*
+     * Find our first letter
+     */
+    for (; ! g_ascii_isalpha (Curr_Letter); w_idx++) {
+        /*
+         * On the off chance we were given nothing but crap...
+         */
+        if (Curr_Letter == '\0') {
+            End_Phoned_Word;
+            return TRUE;        /* For testing */
+        }
+    }
+
+    switch (Curr_Letter) {
+    /*
+     * AE becomes E
+     */
+    case 'A':
+        if (Next_Letter == 'E') {
+            Phonize ('E');
+            w_idx += 2;
+        }
+        /*
+         * Remember, preserve vowels at the beginning
+         */
+        else {
+            Phonize ('A');
+            w_idx++;
+        }
+        break;
+    /*
+     * [GKP]N becomes N
+     */
+    case 'G':
+    case 'K':
+    case 'P':
+        if (Next_Letter == 'N') {
+            Phonize ('N');
+            w_idx += 2;
+        }
+        break;
+    /*
+     * WH becomes H, WR becomes R, W is kept if followed by a vowel
+     */
+    case 'W':
+        if (Next_Letter == 'H' || Next_Letter == 'R') {
+            Phonize (Next_Letter);
+            w_idx += 2;
+        }
+        else if (isvowel (Next_Letter)) {
+            Phonize ('W');
+            w_idx += 2;
+        }
+        /*
+         * else ignore
+         */
+        break;
+    /*
+     * X becomes S
+     */
+    case 'X':
+        Phonize ('S');
+        w_idx++;
+        break;
+    /*
+     * Vowels are kept (A was handled above)
+     */
+    case 'E':
+    case 'I':
+    case 'O':
+    case 'U':
+        Phonize (Curr_Letter);
+        w_idx++;
+        break;
+    default:
+        /*
+         * do nothing
+         */
+        break;
+    }
+
+    /*
+     * On to the metaphoning
+     */
+    for (; Curr_Letter != '\0' && Phone_Len < max_phonemes; w_idx++) {
+        /*
+         * How many letters to skip because an earlier encoding handled
+         * multiple letters
+         */
+        unsigned short int skip_letter = 0;
+
+        /*
+         * Ignore non-alphas
+         */
+        if (! g_ascii_isalpha (Curr_Letter)) {
+            continue;
+        }
+
+        /*
+         * Drop duplicates, except CC
+         */
+        if (Curr_Letter == Prev_Letter && Curr_Letter != 'C') {
+            continue;
+        }
+
+        switch (Curr_Letter) {
+        /*
+         * B -> B unless in MB
+         */
+        case 'B':
+            if (Prev_Letter != 'M') {
+                Phonize ('B');
+            }
+            break;
+        /*
+         * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
+         * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
+         * SCE-, -SCY- (handled in S) else K
+         */
+        case 'C':
+            if (MAKESOFT (Next_Letter)) {       /* C[IEY] */
+                if (After_Next_Letter == 'A' && Next_Letter == 'I') {
+                    /* CIA */
+                    Phonize (SH);
+                }
+                else if (Prev_Letter == 'S') {
+                    /*
+                     * SC[IEY]: dropped
+                     */
+                }
+                else {
+                    Phonize ('S');
+                }
+            }
+            else if (Next_Letter == 'H') {
+#ifndef USE_TRADITIONAL_METAPHONE
+                if (After_Next_Letter == 'R' || Prev_Letter == 'S') {
+                    /* Christ, School */
+                    Phonize ('K');
+                }
+                else {
+                    Phonize (SH);
+                }
+#else
+                Phonize (SH);
+#endif
+                skip_letter++;
+            }
+            else {
+                Phonize ('K');
+            }
+            break;
+        /*
+         * J if in -DGE-, -DGI- or -DGY- else T
+         */
+        case 'D':
+            if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
+                Phonize ('J');
+                skip_letter++;
+            }
+            else {
+                Phonize ('T');
+            }
+            break;
+        /*
+         * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
+         * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
+         * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
+         * else K
+         */
+        case 'G':
+            if (Next_Letter == 'H') {
+                if (!(NOGHTOF (Look_Back_Letter (3)) ||
+                      Look_Back_Letter (4) == 'H')) {
+                    Phonize ('F');
+                    skip_letter++;
+                }
+                else {
+                    /*
+                     * silent
+                     */
+                }
+            }
+            else if (Next_Letter == 'N') {
+                if (Isbreak (After_Next_Letter) ||
+                    (After_Next_Letter == 'E' &&
+                     Look_Ahead_Letter (3) == 'D')) {
+                    /*
+                     * dropped
+                     */
+                }
+                else {
+                    Phonize ('K');
+                }
+            }
+            else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
+                Phonize ('J');
+            }
+            else {
+                Phonize ('K');
+            }
+            break;
+        /*
+         * H if before a vowel and not after C,G,P,S,T
+         */
+        case 'H':
+            if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
+                Phonize ('H');
+            }
+            break;
+        /*
+         * dropped if after C else K
+         */
+        case 'K':
+            if (Prev_Letter != 'C') {
+                Phonize ('K');
+            }
+            break;
+        /*
+         * F if before H else P
+         */
+        case 'P':
+            if (Next_Letter == 'H') {
+                Phonize ('F');
+            }
+            else {
+                Phonize ('P');
+            }
+            break;
+        /*
+         * K
+         */
+        case 'Q':
+            Phonize ('K');
+            break;
+        /*
+         * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
+         */
+        case 'S':
+            if (Next_Letter == 'I' &&
+                (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                Phonize (SH);
+            }
+            else if (Next_Letter == 'H') {
+                Phonize (SH);
+                skip_letter++;
+            }
+#ifndef USE_TRADITIONAL_METAPHONE
+            else if (Next_Letter == 'C' &&
+                     Look_Ahead_Letter (2) == 'H' &&
+                     Look_Ahead_Letter (3) == 'W') {
+                Phonize (SH);
+                skip_letter += 2;
+            }
+#endif
+            else {
+                Phonize ('S');
+            }
+            break;
+        /*
+         * 'sh' in -TIA- or -TIO- else 'th' before H else T
+         */
+        case 'T':
+            if (Next_Letter == 'I' &&
+                (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                Phonize (SH);
+            }
+            else if (Next_Letter == 'H') {
+                Phonize (TH);
+                skip_letter++;
+            }
+            else {
+                Phonize ('T');
+            }
+            break;
+        /*
+         * F
+         */
+        case 'V':
+            Phonize ('F');
+            break;
+        /*
+         * W before a vowel, else dropped
+         */
+        case 'W':
+            if (isvowel (Next_Letter)) {
+                Phonize ('W');
+            }
+            break;
+        /*
+         * KS (two phonemes at once, see the allocation note above)
+         */
+        case 'X':
+            Phonize ('K');
+            Phonize ('S');
+            break;
+        /*
+         * Y if followed by a vowel
+         */
+        case 'Y':
+            if (isvowel (Next_Letter)) {
+                Phonize ('Y');
+            }
+            break;
+        /*
+         * S
+         */
+        case 'Z':
+            Phonize ('S');
+            break;
+        /*
+         * No transformation
+         */
+        case 'F':
+        case 'J':
+        case 'L':
+        case 'M':
+        case 'N':
+        case 'R':
+            Phonize (Curr_Letter);
+            break;
+        default:
+            /*
+             * nothing (vowels inside the word are dropped)
+             */
+            break;
+        }                       /* END SWITCH */
+
+        w_idx += skip_letter;
+    }                           /* END FOR */
+
+    End_Phoned_Word;
+
+    return TRUE;
+}
diff --git a/src/plugins/custom/regmark/metaphone.h b/src/plugins/custom/regmark/metaphone.h
new file mode 100644 (file)
index 0000000..4de3292
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef RSPAMD_METAPHONE_H
+#define RSPAMD_METAPHONE_H
+
+#include "../../../config.h"
+
+
+/*
+ * Property bit-mask of letter `c', taken from the _codes table (defined in
+ * metaphone.c); anything that is not an ASCII letter yields 0.
+ * `c' is evaluated more than once, so pass expressions without side effects.
+ */
+#define ENCODE(c) (g_ascii_isalpha(c) ? _codes[((g_ascii_toupper(c)) - 'A')] : 0)
+
+#define isvowel(c)     (ENCODE(c) & 1)         /* AEIOU */
+
+/* These letters are passed through unchanged */
+#define NOCHANGE(c)    (ENCODE(c) & 2)         /* FJLMNR */
+
+/* These form diphthongs when preceding H */
+#define AFFECTH(c)     (ENCODE(c) & 4)         /* CGPST */
+
+/* These make C and G soft */
+#define MAKESOFT(c)    (ENCODE(c) & 8)         /* EIY */
+
+/* These prevent GH from becoming F */
+#define NOGHTOF(c)     (ENCODE(c) & 16)        /* BDH */
+
+/*
+ * Kept undefined: metaphone.c then uses its non-traditional rules for CH
+ * (K in CHR and SCH) and SCHW ('sh').
+ */
+#undef USE_TRADITIONAL_METAPHONE
+
+/*
+ * Compute the metaphone key of `word'.
+ * max_phonemes limits the number of phonemes; 0 means strlen (word).
+ * *phoned_word receives a buffer allocated with g_malloc () that the caller
+ * has to g_free (). The implementation always returns TRUE.
+ */
+gboolean metaphone (char *word, int max_phonemes, char **phoned_word);
+
+#endif
diff --git a/src/plugins/custom/regmark/prefix_tree.c b/src/plugins/custom/regmark/prefix_tree.c
new file mode 100644 (file)
index 0000000..0e29206
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../../../config.h"
+#include "prefix_tree.h"
+
+
+/*
+ * Create a tree with `levels' zero-filled levels.
+ * Returns NULL when `levels' is not positive.
+ */
+prefix_tree_t* 
+prefix_tree_new (int levels)
+{
+       prefix_tree_t *tree;
+
+       if (levels < 1) {
+               return NULL;
+       }
+
+       /* The tree header first, then the zeroed array of levels */
+       tree = g_new (prefix_tree_t, 1);
+       tree->nodes = g_new0 (prefix_tree_level_t, levels);
+       tree->levels = levels;
+
+       return tree;
+}
+
+/*
+ * Shared worker behind add_string (), check_string () and their *_longest
+ * variants.
+ *
+ * Character i of `input' is looked up in level i of the tree; characters
+ * outside 'A'..'Z' occupy a level but are otherwise ignored. Walking stops at
+ * the end of the string or after tree->levels characters.
+ *
+ * read_only:   do not modify counters and return as soon as a letter with a
+ *              zero counter is met
+ * get_longest: report the counter of the last matching level instead of the
+ *              maximum counter
+ *
+ * Only levels with an index greater than `skip_levels' contribute to the
+ * result. Returns 0 when `tree' is NULL.
+ */
+static uintptr_t 
+add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboolean read_only, gboolean get_longest)
+{
+       uintptr_t best = 0;
+       prefix_tree_leaf_t *leaf;
+       int depth;
+
+       if (tree == NULL) {
+               return 0;
+       }
+
+       for (depth = 0; input[depth] != '\0' && depth < tree->levels; depth ++) {
+               if (input[depth] < 'A' || input[depth] > 'Z') {
+                       /* Not a letter we store, but it still consumes a level */
+                       continue;
+               }
+
+               leaf = &tree->nodes[depth].leafs[input[depth] - 'A'];
+
+               if (leaf->data == 0) {
+                       /* Letter not seen at this position yet */
+                       if (read_only) {
+                               return best;
+                       }
+                       leaf->data = 1;
+                       continue;
+               }
+
+               if (depth <= skip_levels) {
+                       /* Known letter, but still inside the skipped prefix */
+                       continue;
+               }
+
+               if (! read_only) {
+                       leaf->data ++;
+               }
+               /* Either track the last counter or the maximum seen so far */
+               if (get_longest || best < leaf->data) {
+                       best = leaf->data;
+               }
+       }
+
+       return best;
+}
+
+/*
+ * Insert `input' into the tree and return the largest counter found beyond
+ * `skip_levels'.
+ */
+uintptr_t 
+add_string (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+       const gboolean read_only = FALSE, get_longest = FALSE;
+
+       return add_string_common (tree, input, skip_levels, read_only, get_longest);
+}
+
+/*
+ * Look `input' up without modifying the tree and return the largest counter
+ * found beyond `skip_levels'.
+ */
+uintptr_t 
+check_string (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+       const gboolean read_only = TRUE, get_longest = FALSE;
+
+       return add_string_common (tree, input, skip_levels, read_only, get_longest);
+}
+
+/*
+ * Insert `input' into the tree and return the counter of the last matching
+ * level beyond `skip_levels'.
+ */
+uintptr_t 
+add_string_longest (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+       const gboolean read_only = FALSE, get_longest = TRUE;
+
+       return add_string_common (tree, input, skip_levels, read_only, get_longest);
+}
+
+/*
+ * Look `input' up without modifying the tree and return the counter of the
+ * last matching level beyond `skip_levels'.
+ */
+uintptr_t 
+check_string_longest (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+       const gboolean read_only = TRUE, get_longest = TRUE;
+
+       return add_string_common (tree, input, skip_levels, read_only, get_longest);
+}
+
+/*
+ * Release a tree and its levels; a NULL tree is ignored.
+ */
+void 
+prefix_tree_free (prefix_tree_t *tree)
+{
+       if (tree == NULL) {
+               return;
+       }
+
+       g_free (tree->nodes);
+       g_free (tree);
+}
+
+/*
+ * Dump the tree to `filename' (created or truncated, mode 0600).
+ *
+ * File layout: the number of levels as a native `int', followed by the raw
+ * array of levels.
+ *
+ * Returns TRUE only when everything was written and the file was closed
+ * without error; FALSE otherwise (the file may then be incomplete). The
+ * descriptor is closed on every path.
+ */
+gboolean 
+save_prefix_tree (prefix_tree_t *tree, const char *filename)
+{
+       int fd;
+       ssize_t r;
+       size_t nodes_size;
+       gboolean ok = FALSE;
+
+       if (tree == NULL || filename == NULL) {
+               return FALSE;
+       }
+
+       if ((fd = open (filename, O_RDWR | O_TRUNC | O_CREAT, S_IWUSR | S_IRUSR)) == -1) {
+               return FALSE;
+       }
+
+       nodes_size = (size_t) tree->levels * sizeof (prefix_tree_level_t);
+
+       /* A short write is treated as a failure, not only -1 */
+       r = write (fd, &tree->levels, sizeof (int));
+       if (r == (ssize_t) sizeof (int)) {
+               r = write (fd, tree->nodes, nodes_size);
+               ok = (r == (ssize_t) nodes_size);
+       }
+
+       if (close (fd) == -1) {
+               ok = FALSE;
+       }
+
+       return ok;
+}
+
+/*
+ * Load a tree previously written by save_prefix_tree ().
+ *
+ * The file is opened read-only: opening it with O_TRUNC | O_CREAT, as was
+ * done before, wiped the stored data before it could be read.
+ *
+ * Returns a newly allocated tree, or NULL when the file cannot be opened,
+ * is truncated, or carries a non-positive number of levels. The descriptor
+ * is closed on every path.
+ */
+prefix_tree_t* 
+load_prefix_tree (const char *filename)
+{
+       int fd, levels = 0;
+       size_t nodes_size;
+       prefix_tree_t *tree = NULL;
+
+       if (filename == NULL) {
+               return NULL;
+       }
+
+       if ((fd = open (filename, O_RDONLY)) == -1) {
+               return NULL;
+       }
+
+       /* Header: number of levels; a short read means a damaged file */
+       if (read (fd, &levels, sizeof (int)) == (ssize_t) sizeof (int) && levels > 0) {
+               tree = prefix_tree_new (levels);
+       }
+
+       if (tree != NULL) {
+               nodes_size = (size_t) tree->levels * sizeof (prefix_tree_level_t);
+               if (read (fd, tree->nodes, nodes_size) != (ssize_t) nodes_size) {
+                       prefix_tree_free (tree);
+                       tree = NULL;
+               }
+       }
+
+       close (fd);
+
+       return tree;
+}
diff --git a/src/plugins/custom/regmark/prefix_tree.h b/src/plugins/custom/regmark/prefix_tree.h
new file mode 100644 (file)
index 0000000..30580d6
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef RSPAMD_PREFIX_TREE_H
+#define RSPAMD_PREFIX_TREE_H
+
+#include "../../../config.h"
+
+/* Number of slots per tree level: one for each letter 'A'..'Z' */
+#define LETTERS_NUMBER 26
+
+/*
+ * Counter for one letter at one position. prefix_tree.c treats 0 as "never
+ * seen", stores 1 on the first insertion and increments it on later hits.
+ */
+typedef struct prefix_tree_leaf_s {
+       uintptr_t data;
+} prefix_tree_leaf_t;
+
+/* One level of the tree: a leaf per letter, indexed by letter - 'A' */
+typedef struct prefix_tree_level_s {
+       struct prefix_tree_leaf_s leafs[LETTERS_NUMBER];
+} prefix_tree_level_t;
+
+/*
+ * The tree itself: an array of `levels' levels, where level i holds the
+ * counters for character i of the inserted strings. Counters are kept per
+ * (position, letter) only; they are not keyed by the preceding letters.
+ */
+typedef struct prefix_tree_s {
+       prefix_tree_level_t *nodes;
+       int levels;
+} prefix_tree_t;
+
+/* Allocate a tree with `levels' zeroed levels; NULL if levels <= 0 */
+prefix_tree_t* prefix_tree_new (int levels);
+
+/*
+ * Insert `input' and return the largest counter met at positions greater
+ * than `skip_levels'
+ */
+uintptr_t add_string (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/* Same lookup as add_string () but without modifying the tree */
+uintptr_t check_string (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/*
+ * Variants that return the counter of the last matching position instead of
+ * the largest one
+ */
+uintptr_t add_string_longest (prefix_tree_t *tree, const char *input, int skip_levels);
+uintptr_t check_string_longest (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/* Release a tree; NULL is accepted */
+void prefix_tree_free (prefix_tree_t *tree);
+
+/* Write the tree to / read a tree from a file */
+gboolean save_prefix_tree (prefix_tree_t *tree, const char *filename);
+prefix_tree_t* load_prefix_tree (const char *filename);
+
+#endif
diff --git a/src/plugins/custom/regmark/regmark.c b/src/plugins/custom/regmark/regmark.c
new file mode 100644 (file)
index 0000000..c8cede4
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This plugin can be used as registration spam tester. Algorithm of its work:
+ * 
+ * 1) got string that identifies username, for example alexeyssad
+ * 2) do metaphone normalization
+ * 3) break this string into pieces: (al) (ex) (ey) (ss) (ad)
+ * 4) go through the tree and increment each node value
+ * 5) find the biggest number of occurrences in some level of tree, for example:
+ *                           (root)
+ *                             |
+ *            l1:  (al:4)   (hu:5) (tt:9)
+ *            l2: (ex:4)   (is:5)  (hh:9)
+ *            l3: (ey:3)   ....
+ *            l4: (ss:2)
+ *            l5: (ad:1)
+ *  then if we have requirement of minimum l3 (6 symbols of original string) then the maximum number would be 3, so we
+ *  got name alexey 3 times before.
+ *
+ *  So input line should look like this:
+ *
+ *  <string> level
+ */
+
+#include "../../../config.h"
+#include "../../../cfg_file.h"
+#include "metaphone.h"
+#include "prefix_tree.h"
+
+#define MAX_LEVELS 32
+
+/* Exported functions */
+void module_init (struct config_file *cfg);
+void* before_connect (void);
+gboolean parse_line (const char *line, size_t len, char **output, void *user_data);
+void after_connect (char **output, char **log_line, void *user_data);
+void module_fin (void);        
+
+/* Internal variables */
+/* Path of the tree dump file from the module options; NULL when not set */
+static char *filename = NULL;
+/* Tree of letter counters shared by all parsed lines */
+static prefix_tree_t *tree = NULL;
+
+/* Implementation */
+
+/*
+ * Return the value of option `opt_name' of module `module_name', or NULL
+ * when the module has no options or the option is not present.
+ * The returned string belongs to the configuration.
+ */
+char                           *
+get_module_opt (struct config_file *cfg, char *module_name, char *opt_name)
+{
+       GList                          *node;
+       struct module_opt              *opt;
+
+       /* Walk the option list registered for the module, if there is one */
+       for (node = g_hash_table_lookup (cfg->modules_opts, module_name);
+                       node != NULL; node = g_list_next (node)) {
+               opt = node->data;
+               if (strcmp (opt->param, opt_name) == 0) {
+                       return opt->value;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Plugin start-up: read the "file" option and load the tree from it, or
+ * start with an empty tree of MAX_LEVELS levels.
+ *
+ * The option is looked up in the "regmark" module section. The "ipmark"
+ * section that was consulted before is still honoured as a fallback so that
+ * existing configurations keep working.
+ * NOTE(review): with the fallback both plugins use the same file - confirm
+ * whether the legacy lookup can be dropped.
+ */
+void 
+module_init (struct config_file *cfg)
+{
+       char *value = NULL;
+
+       if (cfg) {
+               value = get_module_opt (cfg, "regmark", "file");
+               if (value == NULL) {
+                       /* Legacy location of the option */
+                       value = get_module_opt (cfg, "ipmark", "file");
+               }
+       }
+       if (value != NULL) {
+               filename = g_strdup (value);
+       }
+
+       /* No file, or a file that cannot be loaded: start from scratch */
+       tree = filename ? load_prefix_tree (filename) : NULL;
+       if (tree == NULL) {
+               tree = prefix_tree_new (MAX_LEVELS);
+       }
+}
+
+/*
+ * Per-connection hook: this plugin keeps no session data, so the session
+ * pointer is always NULL.
+ */
+void *
+before_connect (void)
+{
+       void *session_data = NULL;
+
+       return session_data;
+}
+
+/*
+ * Plugin shutdown: dump the tree when a file is configured, then release
+ * the file name and the tree.
+ */
+void
+module_fin (void)
+{
+       if (filename != NULL) {
+               save_prefix_tree (tree, filename);
+               g_free (filename);
+               filename = NULL;
+       }
+
+       if (tree == NULL) {
+               return;
+       }
+
+       prefix_tree_free (tree);
+       tree = NULL;
+}
+
+/*
+ * Handle one input line of the form "<name> <level>".
+ *
+ * The name is everything up to the first whitespace character; it is passed
+ * through metaphone () and added to the tree. <level> is the number of
+ * leading positions to skip; when it is missing or not positive, the length
+ * of the name is used.
+ *
+ * line:      input, `len' bytes long; the level is parsed with strtol (), so
+ *            the buffer is expected to be NUL-terminated after `len' bytes
+ * output:    receives a newly allocated reply: "OK: <count>" when the tree
+ *            reported a non-zero counter, "ERR" otherwise (also for a line
+ *            without a name or without a separator)
+ * user_data: unused
+ *
+ * Always returns TRUE.
+ */
+gboolean 
+parse_line (const char *line, size_t len, char **output, void *user_data)
+{
+       const char *end = line + len, *name_end = line, *p;
+       char *name, *metaname = NULL;
+       int levels;
+       uintptr_t res = 0;
+
+       /* Find the end of the name, always advancing so the scan terminates */
+       while (name_end < end && ! g_ascii_isspace (*name_end)) {
+               name_end ++;
+       }
+
+       /* Need a non-empty name that is followed by a separator */
+       if (name_end > line && name_end < end) {
+               /* Copies the whole name and NUL-terminates it */
+               name = g_strndup (line, name_end - line);
+               metaphone (name, 0, &metaname);
+
+               /* Skip spaces without consuming the first digit of the level */
+               p = name_end;
+               while (p < end && g_ascii_isspace (*p)) {
+                       p ++;
+               }
+
+               levels = (p < end) ? strtol (p, NULL, 10) : 0;
+               if (levels <= 0) {
+                       levels = strlen (name);
+               }
+
+               if (metaname) {
+                       res = add_string (tree, metaname, levels);
+                       g_free (metaname);
+               }
+               g_free (name);
+       }
+
+       /* Build exactly one reply so that nothing is leaked */
+       if (res == 0) {
+               *output = g_strdup ("ERR");
+       }
+       else {
+               *output = g_strdup_printf ("OK: %u", (unsigned int)res);
+       }
+
+       return TRUE;
+}
+
+
+/*
+ * Per-connection teardown hook; this plugin has nothing to do here and
+ * leaves all arguments untouched.
+ */
+void after_connect (char **output, char **log_line, void *user_data)
+{
+       /* Placeholder */
+       (void) output;
+       (void) log_line;
+       (void) user_data;
+}