From 246f7ec03f58e7efce2727c9196a4031d45ae5dd Mon Sep 17 00:00:00 2001 From: "cebka@lenovo-laptop" Date: Fri, 26 Feb 2010 21:12:24 +0300 Subject: [PATCH] * Add custom filter for making marks for new user: - each username is passed throught metaphone filter - then we make prefix tree based on english letters from metaphone - then we are searching for the longest common string and output result (how many times we got that string) --- src/plugins/custom/CMakeLists.txt | 1 + src/plugins/custom/ipmark/CMakeLists.txt | 2 +- src/plugins/custom/ipmark/ipmark.c | 6 +- src/plugins/custom/regmark/CMakeLists.txt | 10 + src/plugins/custom/regmark/metaphone.c | 466 ++++++++++++++++++++++ src/plugins/custom/regmark/metaphone.h | 27 ++ src/plugins/custom/regmark/prefix_tree.c | 177 ++++++++ src/plugins/custom/regmark/prefix_tree.h | 35 ++ src/plugins/custom/regmark/regmark.c | 172 ++++++++ 9 files changed, 892 insertions(+), 4 deletions(-) create mode 100644 src/plugins/custom/regmark/CMakeLists.txt create mode 100644 src/plugins/custom/regmark/metaphone.c create mode 100644 src/plugins/custom/regmark/metaphone.h create mode 100644 src/plugins/custom/regmark/prefix_tree.c create mode 100644 src/plugins/custom/regmark/prefix_tree.h create mode 100644 src/plugins/custom/regmark/regmark.c diff --git a/src/plugins/custom/CMakeLists.txt b/src/plugins/custom/CMakeLists.txt index c5e1ec99f..5d076c41d 100644 --- a/src/plugins/custom/CMakeLists.txt +++ b/src/plugins/custom/CMakeLists.txt @@ -1 +1,2 @@ ADD_SUBDIRECTORY(ipmark) +ADD_SUBDIRECTORY(regmark) diff --git a/src/plugins/custom/ipmark/CMakeLists.txt b/src/plugins/custom/ipmark/CMakeLists.txt index 42785599b..1c1b1e64c 100644 --- a/src/plugins/custom/ipmark/CMakeLists.txt +++ b/src/plugins/custom/ipmark/CMakeLists.txt @@ -4,7 +4,7 @@ SET(IPMARKSRC ipmark.c ../../../mem_pool.c ) -ADD_LIBRARY(rspamd_ipmark SHARED ${IPMARKSRC}) +ADD_LIBRARY(rspamd_ipmark MODULE ${IPMARKSRC}) TARGET_LINK_LIBRARIES(rspamd_ipmark ${GLIB2_LIBRARIES}) 
INSTALL(TARGETS rspamd_ipmark DESTINATION lib) diff --git a/src/plugins/custom/ipmark/ipmark.c b/src/plugins/custom/ipmark/ipmark.c index e919cd417..9bebcab22 100644 --- a/src/plugins/custom/ipmark/ipmark.c +++ b/src/plugins/custom/ipmark/ipmark.c @@ -54,8 +54,8 @@ void after_connect (char **output, char **log_line, void *user_data); void module_fin (void); /* Internal variables */ -char *filename = NULL; -radix_tree_t *radix = NULL; +static char *filename = NULL; +static radix_tree_t *radix = NULL; /* Implementation */ @@ -259,7 +259,7 @@ parse_line (const char *line, size_t len, char **output, void *user_data) const char *p; char *c = ip_buf, *err_str; struct in_addr ina; - int state = 0, next_state, dots; + int state = 0, next_state = 0, dots = 0; int16_t value; uint32_t mask; enum ipmark_command cmd; diff --git a/src/plugins/custom/regmark/CMakeLists.txt b/src/plugins/custom/regmark/CMakeLists.txt new file mode 100644 index 000000000..e6242ab90 --- /dev/null +++ b/src/plugins/custom/regmark/CMakeLists.txt @@ -0,0 +1,10 @@ +# Regmark plugin makefile +SET(REGMARKSRC regmark.c + metaphone.c + prefix_tree.c +) + +ADD_LIBRARY(rspamd_regmark MODULE ${REGMARKSRC}) +TARGET_LINK_LIBRARIES(rspamd_regmark ${GLIB2_LIBRARIES}) + +INSTALL(TARGETS rspamd_regmark DESTINATION lib) diff --git a/src/plugins/custom/regmark/metaphone.c b/src/plugins/custom/regmark/metaphone.c new file mode 100644 index 000000000..81fa0544e --- /dev/null +++ b/src/plugins/custom/regmark/metaphone.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is implementation of metaphone algorithm that was originally written by
+ * Michael G Schwern as perl XS module
+ */
+
+/*
+ * I suppose I could have been using a character pointer instead of
+ * accessing the array directly...
+ */
+
+#include "../../../config.h"
+#include "metaphone.h"
+
+/*
+ * Look at the next letter in the word
+ */
+#define Next_Letter (g_ascii_toupper (word[w_idx+1]))
+/*
+ * Look at the current letter in the word
+ */
+#define Curr_Letter (g_ascii_toupper(word[w_idx]))
+/*
+ * Go N letters back ('\0' when that would be before the start of the word).
+ */
+#define Look_Back_Letter(n) (w_idx >= (n) ? g_ascii_toupper(word[w_idx-(n)]) : '\0')
+/*
+ * Previous letter. I dunno, should this return null on failure?
+ */
+#define Prev_Letter (Look_Back_Letter(1))
+/*
+ * Look two letters down. It makes sure you don't walk off the string.
+ */
+#define After_Next_Letter (Next_Letter != '\0' ? g_ascii_toupper(word[w_idx+2]) \
+                                               : '\0')
+#define Look_Ahead_Letter(n) (g_ascii_toupper(Lookahead(word+w_idx, n)))
+
+#define SH 'X'
+#define TH '0'
+
+/*-- Character encoding array & accessing macros --*/
+/* Stolen directly out of the book... */
+char _codes[26] = {
+    1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0
+/*  a b c d e f g h i j k l m n o p q r s t u v w x y z */
+};
+
+
+/*
+ * Allows us to safely look ahead an arbitrary # of letters:
+ * returns word[how_far], or '\0' if the string ends before that position.
+ */
+/*
+ * I probably could have just used strlen...
+ */
+static char
+Lookahead (char *word, int how_far)
+{
+    char letter_ahead = '\0';  /* null by default */
+    int idx;
+    for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
+    /*
+     * Edge forward in the string...
+     */
+
+    letter_ahead = word[idx];  /* idx will be either == to how_far or at
+                                * the end of the string */
+    return letter_ahead;
+}
+
+
+/*
+ * phonize one letter
+ */
+#define Phonize(c) {(*phoned_word)[p_idx++] = c;}
+/*
+ * Slap a null character on the end of the phoned word
+ */
+#define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';}
+/*
+ * How long is the phoned word?
+ */
+#define Phone_Len (p_idx)
+
+/*
+ * Note is a letter is a 'break' in the word
+ */
+#define Isbreak(c) (!g_ascii_isalpha(c))
+
+
+/*
+ * Compute the metaphone key of `word'.
+ *
+ * word          NUL terminated input string; non-letters are skipped
+ * max_phonemes  limit for the key length; 0 (or a negative value) means
+ *               "as many phonemes as the word has characters". The key may
+ *               exceed the limit by one character, because 'X' is encoded
+ *               as the two phonemes "KS" in a single step.
+ * phoned_word   receives a g_malloc'ed, NUL terminated key; the caller has
+ *               to g_free it
+ *
+ * Returns TRUE on success, FALSE if word or phoned_word is NULL.
+ */
+gboolean
+metaphone (char *word, int max_phonemes, char **phoned_word)
+{
+    int w_idx = 0;  /* point in the phonization we're at. */
+    int p_idx = 0;  /* end of the phoned phrase */
+
+    /*-- Parameter checks --*/
+    if (word == NULL || phoned_word == NULL) {
+        return FALSE;
+    }
+    /*
+     * Assume largest possible if we're given no (or a bogus) limit
+     */
+    if (max_phonemes <= 0) {
+        max_phonemes = strlen (word);
+    }
+
+    /*-- Allocate memory for our phoned_phrase --*/
+    /*
+     * The limit is only checked once per input letter and 'X' emits two
+     * phonemes, so the key can be max_phonemes + 1 characters long; one
+     * more byte is needed for the terminating NUL.
+     */
+    *phoned_word = g_malloc (((gsize)max_phonemes + 2) * sizeof (char));
+
+    /*-- The first phoneme has to be processed specially. --*/
+    /*
+     * Find our first letter
+     */
+    for (; ! g_ascii_isalpha (Curr_Letter); w_idx++) {
+        /*
+         * On the off chance we were given nothing but crap...
+         */
+        if (Curr_Letter == '\0') {
+            End_Phoned_Word;
+            return TRUE;  /* For testing */
+        }
+    }
+
+    switch (Curr_Letter) {
+        /*
+         * AE becomes E
+         */
+    case 'A':
+        if (Next_Letter == 'E') {
+            Phonize ('E');
+            w_idx += 2;
+        }
+        /*
+         * Remember, preserve vowels at the beginning
+         */
+        else {
+            Phonize ('A');
+            w_idx++;
+        }
+        break;
+        /*
+         * [GKP]N becomes N
+         */
+    case 'G':
+    case 'K':
+    case 'P':
+        if (Next_Letter == 'N') {
+            Phonize ('N');
+            w_idx += 2;
+        }
+        break;
+        /*
+         * WH becomes H, WR becomes R W if followed by a vowel
+         */
+    case 'W':
+        if (Next_Letter == 'H' || Next_Letter == 'R') {
+            Phonize (Next_Letter);
+            w_idx += 2;
+        } else if (isvowel (Next_Letter)) {
+            Phonize ('W');
+            w_idx += 2;
+        }
+        /*
+         * else ignore
+         */
+        break;
+        /*
+         * X becomes S
+         */
+    case 'X':
+        Phonize ('S');
+        w_idx++;
+        break;
+        /*
+         * Vowels are kept
+         */
+        /*
+         * We did A already case 'A': case 'a':
+         */
+    case 'E':
+    case 'I':
+    case 'O':
+    case 'U':
+        Phonize (Curr_Letter);
+        w_idx++;
+        break;
+    default:
+        /*
+         * do nothing
+         */
+        break;
+    }
+
+
+
+    /*
+     * On to the metaphoning
+     */
+    for (; Curr_Letter != '\0' && Phone_Len < max_phonemes; w_idx++) {
+        /*
+         * How many letters to skip because an earlier encoding handled
+         * multiple letters
+         */
+        unsigned short int skip_letter = 0;
+
+
+        /*
+         * THOUGHT: It would be nice if, rather than having things like...
+         * well, SCI. For SCI you encode the S, then have to remember to
+         * skip the C. So the phonome SCI invades both S and C. It would
+         * be better, IMHO, to skip the C from the S part of the encoding.
+         * Hell, I'm trying it.
+         */
+
+        /*
+         * Ignore non-alphas
+         */
+        if (! g_ascii_isalpha (Curr_Letter))
+            continue;
+
+        /*
+         * Drop duplicates, except CC
+         */
+        if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
+            continue;
+
+        switch (Curr_Letter) {
+            /*
+             * B -> B unless in MB
+             */
+        case 'B':
+            if (Prev_Letter != 'M')
+                Phonize ('B');
+            break;
+            /*
+             * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
+             * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
+             * SCE-, -SCY- (handed in S) else K
+             */
+        case 'C':
+            if (MAKESOFT (Next_Letter)) {  /* C[IEY] */
+                if (After_Next_Letter == 'A' && Next_Letter == 'I') {  /* CIA */
+                    Phonize (SH);
+                }
+                /*
+                 * SC[IEY]
+                 */
+                else if (Prev_Letter == 'S') {
+                    /*
+                     * Dropped
+                     */
+                } else {
+                    Phonize ('S');
+                }
+            } else if (Next_Letter == 'H') {
+#ifndef USE_TRADITIONAL_METAPHONE
+                if (After_Next_Letter == 'R' || Prev_Letter == 'S') {  /* Christ,
+                                                                         * School
+                                                                         */
+                    Phonize ('K');
+                } else {
+                    Phonize (SH);
+                }
+#else
+                Phonize (SH);
+#endif
+                skip_letter++;
+            } else {
+                Phonize ('K');
+            }
+            break;
+            /*
+             * J if in -DGE-, -DGI- or -DGY- else T
+             */
+        case 'D':
+            if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
+                Phonize ('J');
+                skip_letter++;
+            } else {
+                Phonize ('T');
+            }
+            break;
+            /*
+             * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
+             * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
+             * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
+             * else K
+             */
+        case 'G':
+            if (Next_Letter == 'H') {
+                if (!(NOGHTOF (Look_Back_Letter (3)) ||
+                      Look_Back_Letter (4) == 'H')) {
+                    Phonize ('F');
+                    skip_letter++;
+                } else {
+                    /*
+                     * silent
+                     */
+                }
+            } else if (Next_Letter == 'N') {
+                if (Isbreak (After_Next_Letter) ||
+                    (After_Next_Letter == 'E' &&
+                     Look_Ahead_Letter (3) == 'D')) {
+                    /*
+                     * dropped
+                     */
+                } else {
+                    Phonize ('K');
+                }
+            } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
+                Phonize ('J');
+            } else {
+                Phonize ('K');
+            }
+            break;
+            /*
+             * H if before a vowel and not after C,G,P,S,T
+             */
+        case 'H':
+            if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
+                Phonize ('H');
+            }
+            break;
+            /*
+             * dropped if after C else K
+             */
+        case 'K':
+            if (Prev_Letter != 'C') {
+                Phonize ('K');
+            }
+            break;
+            /*
+             * F if before H else P
+             */
+        case 'P':
+            if (Next_Letter == 'H') {
+                Phonize ('F');
+            } else {
+                Phonize ('P');
+            }
+            break;
+            /*
+             * K
+             */
+        case 'Q':
+            Phonize ('K');
+            break;
+            /*
+             * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
+             */
+        case 'S':
+            if (Next_Letter == 'I' &&
+                (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                Phonize (SH);
+            } else if (Next_Letter == 'H') {
+                Phonize (SH);
+                skip_letter++;
+            }
+#ifndef USE_TRADITIONAL_METAPHONE
+            else if (Next_Letter == 'C' &&
+                     Look_Ahead_Letter (2) == 'H' &&
+                     Look_Ahead_Letter (3) == 'W') {
+                Phonize (SH);
+                skip_letter += 2;
+            }
+#endif
+            else {
+                Phonize ('S');
+            }
+            break;
+            /*
+             * 'sh' in -TIA- or -TIO- else 'th' before H else T
+             */
+        case 'T':
+            if (Next_Letter == 'I' &&
+                (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                Phonize (SH);
+            } else if (Next_Letter == 'H') {
+                Phonize (TH);
+                skip_letter++;
+            } else {
+                Phonize ('T');
+            }
+            break;
+            /*
+             * F
+             */
+        case 'V':
+            Phonize ('F');
+            break;
+            /*
+             * W before a vowel, else dropped
+             */
+        case 'W':
+            if (isvowel (Next_Letter)) {
+                Phonize ('W');
+            }
+            break;
+            /*
+             * KS (two phonemes at once: see the allocation note above)
+             */
+        case 'X':
+            Phonize ('K');
+            Phonize ('S');
+            break;
+            /*
+             * Y if followed by a vowel
+             */
+        case 'Y':
+            if (isvowel (Next_Letter)) {
+                Phonize ('Y');
+            }
+            break;
+            /*
+             * S
+             */
+        case 'Z':
+            Phonize ('S');
+            break;
+            /*
+             * No transformation
+             */
+        case 'F':
+        case 'J':
+        case 'L':
+        case 'M':
+        case 'N':
+        case 'R':
+            Phonize (Curr_Letter);
+            break;
+        default:
+            /*
+             * nothing
+             */
+            break;
+        }  /* END SWITCH */
+
+        w_idx += skip_letter;
+    }  /* END FOR */
+
+    End_Phoned_Word;
+
+    return TRUE;
+}
diff --git a/src/plugins/custom/regmark/metaphone.h b/src/plugins/custom/regmark/metaphone.h
new file mode 100644
index 000000000..4de3292f9
--- /dev/null
+++ b/src/plugins/custom/regmark/metaphone.h
@@ -0,0 +1,27 @@
+#ifndef RSPAMD_METAPHONE_H
+#define RSPAMD_METAPHONE_H
+
+#include "../../../config.h"
+
+
+#define ENCODE(c) (g_ascii_isalpha(c) ? _codes[((g_ascii_toupper(c)) - 'A')] : 0)
+
+#define isvowel(c) (ENCODE(c) & 1)  /* AEIOU */
+
+/* These letters are passed through unchanged */
+#define NOCHANGE(c) (ENCODE(c) & 2)  /* FJMNR */
+
+/* These form dipthongs when preceding H */
+#define AFFECTH(c) (ENCODE(c) & 4)  /* CGPST */
+
+/* These make C and G soft */
+#define MAKESOFT(c) (ENCODE(c) & 8)  /* EIY */
+
+/* These prevent GH from becoming F */
+#define NOGHTOF(c) (ENCODE(c) & 16)  /* BDH */
+
+#undef USE_TRADITIONAL_METAPHONE
+
+gboolean metaphone (char *word, int max_phonemes, char **phoned_word);
+
+#endif
diff --git a/src/plugins/custom/regmark/prefix_tree.c b/src/plugins/custom/regmark/prefix_tree.c
new file mode 100644
index 000000000..0e2920616
--- /dev/null
+++ b/src/plugins/custom/regmark/prefix_tree.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
+ * IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../../../config.h"
+#include "prefix_tree.h"
+
+
+/*
+ * Create a tree with `levels' zeroed levels.
+ * Returns NULL if levels is not positive; release with prefix_tree_free.
+ */
+prefix_tree_t*
+prefix_tree_new (int levels)
+{
+    prefix_tree_t *result = NULL;
+
+    if (levels <= 0) {
+        return NULL;
+    }
+    /* Allocate tree */
+    result = g_new (prefix_tree_t, 1);
+    result->levels = levels;
+
+    /* Allocate levels */
+    result->nodes = g_new0 (prefix_tree_level_t, levels);
+
+    return result;
+}
+
+/*
+ * Common walker behind add_string/check_string and their *_longest variants.
+ *
+ * The n-th character of input selects a counter on the n-th level; only the
+ * letters 'A'..'Z' have counters, any other character just consumes a level.
+ *
+ * skip_levels  counters on levels 0..skip_levels are never reported
+ * read_only    TRUE: do not modify the tree and stop at the first counter
+ *              that is still zero; FALSE: create/increment counters
+ * get_longest  TRUE: return the counter of the deepest visited level;
+ *              FALSE: return the largest counter seen
+ *
+ * Returns 0 if tree or input is NULL or nothing beyond skip_levels matched.
+ */
+static uintptr_t
+add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboolean read_only, gboolean get_longest)
+{
+    int cur_level = 0, num;
+    prefix_tree_level_t *cur;
+    uintptr_t res = 0;
+
+    if (tree == NULL || input == NULL) {
+        return 0;
+    }
+
+    while (*input && cur_level < tree->levels) {
+        cur = &tree->nodes[cur_level];
+        if (*input >= 'A' && *input <= 'Z') {
+            num = *input - 'A';
+            /* Go through each level and check specified letter */
+            if (cur->leafs[num].data == 0) {
+                /* Create new leaf */
+                if (read_only) {
+                    return res;
+                }
+                cur->leafs[num].data = 1;
+            }
+            else if (cur_level > skip_levels) {
+                /* Got some node, so check it */
+                if (! read_only) {
+                    cur->leafs[num].data ++;
+                }
+                if (! get_longest) {
+                    /* Get maximum after skip */
+                    if (res < cur->leafs[num].data) {
+                        res = cur->leafs[num].data;
+                    }
+                }
+                else {
+                    res = cur->leafs[num].data;
+                }
+            }
+        }
+        input ++;
+        cur_level ++;
+    }
+
+    return res;
+}
+
+/* Account input in the tree and return the largest counter after skip_levels */
+uintptr_t
+add_string (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+    return add_string_common (tree, input, skip_levels, FALSE, FALSE);
+}
+
+/* Like add_string, but the tree is not modified */
+uintptr_t
+check_string (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+    return add_string_common (tree, input, skip_levels, TRUE, FALSE);
+}
+
+/* Account input in the tree and return the counter of the deepest level */
+uintptr_t
+add_string_longest (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+    return add_string_common (tree, input, skip_levels, FALSE, TRUE);
+}
+
+/* Like add_string_longest, but the tree is not modified */
+uintptr_t
+check_string_longest (prefix_tree_t *tree, const char *input, int skip_levels)
+{
+    return add_string_common (tree, input, skip_levels, TRUE, TRUE);
+}
+
+/* Release a tree created by prefix_tree_new or load_prefix_tree (NULL is ok) */
+void
+prefix_tree_free (prefix_tree_t *tree)
+{
+    if (tree != NULL) {
+        g_free (tree->nodes);
+        g_free (tree);
+    }
+}
+
+/*
+ * Write the tree to filename (created or truncated, mode 0600) as the number
+ * of levels followed by the raw array of levels.
+ *
+ * Returns TRUE only if everything was written and the file was closed
+ * successfully; the descriptor is closed on every path.
+ */
+gboolean
+save_prefix_tree (prefix_tree_t *tree, const char *filename)
+{
+    int fd;
+    size_t nodes_size;
+    gboolean ok = FALSE;
+
+    if (tree == NULL || tree->nodes == NULL || filename == NULL) {
+        return FALSE;
+    }
+
+    if ((fd = open (filename, O_RDWR | O_TRUNC | O_CREAT, S_IWUSR | S_IRUSR)) == -1) {
+        return FALSE;
+    }
+
+    nodes_size = (size_t)tree->levels * sizeof (prefix_tree_level_t);
+
+    /* A short write would leave a file that can not be loaded again */
+    if (write (fd, &tree->levels, sizeof (int)) == (ssize_t)sizeof (int) &&
+        write (fd, tree->nodes, nodes_size) == (ssize_t)nodes_size) {
+        ok = TRUE;
+    }
+
+    if (close (fd) == -1) {
+        ok = FALSE;
+    }
+
+    return ok;
+}
+
+/*
+ * Read a tree written by save_prefix_tree.
+ *
+ * The file is opened read-only: opening it with O_TRUNC | O_CREAT, as this
+ * function used to do, wiped the stored tree before it could be read.
+ *
+ * Returns NULL if the file can not be opened, is truncated or holds a
+ * non-positive level count; the descriptor is closed on every path.
+ */
+prefix_tree_t*
+load_prefix_tree (const char *filename)
+{
+    int fd, levels = 0;
+    size_t nodes_size;
+    prefix_tree_t *tree = NULL;
+
+    if (filename == NULL) {
+        return NULL;
+    }
+
+    if ((fd = open (filename, O_RDONLY)) == -1) {
+        return NULL;
+    }
+
+    if (read (fd, &levels, sizeof (int)) == (ssize_t)sizeof (int) && levels > 0) {
+        tree = prefix_tree_new (levels);
+        if (tree != NULL) {
+            nodes_size = (size_t)tree->levels * sizeof (prefix_tree_level_t);
+            if (read (fd, tree->nodes, nodes_size) != (ssize_t)nodes_size) {
+                /* Truncated or unreadable file */
+                prefix_tree_free (tree);
+                tree = NULL;
+            }
+        }
+    }
+
+    close (fd);
+
+    return tree;
+}
diff --git a/src/plugins/custom/regmark/prefix_tree.h
b/src/plugins/custom/regmark/prefix_tree.h
new file mode 100644
index 000000000..30580d612
--- /dev/null
+++ b/src/plugins/custom/regmark/prefix_tree.h
@@ -0,0 +1,35 @@
+#ifndef RSPAMD_PREFIX_TREE_H
+#define RSPAMD_PREFIX_TREE_H
+
+#include "../../../config.h"
+
+/* Number of counters per level: one for each letter 'A'..'Z' */
+#define LETTERS_NUMBER 26
+
+/* One counter: 0 means "never seen", otherwise it is incremented on each hit */
+typedef struct prefix_tree_leaf_s {
+    uintptr_t data;
+} prefix_tree_leaf_t;
+
+/* One level: the counters for the character at that position of a string */
+typedef struct prefix_tree_level_s {
+    struct prefix_tree_leaf_s leafs[LETTERS_NUMBER];
+} prefix_tree_level_t;
+
+/* The tree itself: an array of `levels' levels */
+typedef struct prefix_tree_s {
+    prefix_tree_level_t *nodes;
+    int levels;
+} prefix_tree_t;
+
+/* Create a zeroed tree; returns NULL if levels <= 0 */
+prefix_tree_t* prefix_tree_new (int levels);
+
+/* Account input in the tree; returns the largest counter found on levels deeper than skip_levels */
+uintptr_t add_string (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/* Same as add_string, but the tree is left unmodified */
+uintptr_t check_string (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/* Variants that return the counter of the deepest visited level instead of the largest one */
+uintptr_t add_string_longest (prefix_tree_t *tree, const char *input, int skip_levels);
+uintptr_t check_string_longest (prefix_tree_t *tree, const char *input, int skip_levels);
+
+/* Release the tree; NULL is accepted */
+void prefix_tree_free (prefix_tree_t *tree);
+
+/* Store the tree in / restore it from a file (level count followed by the raw levels) */
+gboolean save_prefix_tree (prefix_tree_t *tree, const char *filename);
+prefix_tree_t* load_prefix_tree (const char *filename);
+
+#endif
diff --git a/src/plugins/custom/regmark/regmark.c b/src/plugins/custom/regmark/regmark.c
new file mode 100644
index 000000000..c8cede47a
--- /dev/null
+++ b/src/plugins/custom/regmark/regmark.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This plugin can be used as a registration spam tester. How it works:
+ *
+ * 1) get a string that identifies the username, for example alexeyssad
+ * 2) do metaphone normalization
+ * 3) break this string into pieces: (al) (ex) (ey) (ss) (ad)
+ * 4) go through the tree and increment each node value
+ * 5) find the biggest number of occurrences in some level of the tree, for example:
+ *          (root)
+ *            |
+ *    l1: (al:4) (hu:5) (tt:9)
+ *    l2: (ex:4) (is:5) (hh:9)
+ *    l3: (ey:3) ....
+ *    l4: (ss:2)
+ *    l5: (ad:1)
+ * then if we have a requirement of minimum l3 (6 symbols of the original string) then the maximum number would be 3, so we
+ * got the name alexey 3 times before.
+ *
+ * So input line should look like this:
+ *
+ * username level
+ */
+
+#include "../../../config.h"
+#include "../../../cfg_file.h"
+#include "metaphone.h"
+#include "prefix_tree.h"
+
+#define MAX_LEVELS 32
+
+/* Exported functions */
+void module_init (struct config_file *cfg);
+void* before_connect (void);
+gboolean parse_line (const char *line, size_t len, char **output, void *user_data);
+void after_connect (char **output, char **log_line, void *user_data);
+void module_fin (void);
+
+/* Internal variables */
+static char *filename = NULL;
+static prefix_tree_t *tree = NULL;
+
+/* Implementation */
+
+/*
+ * Look up option opt_name of module module_name in the parsed config.
+ * Returns the stored value (not a copy) or NULL if there is no such option.
+ */
+char *
+get_module_opt (struct config_file *cfg, char *module_name, char *opt_name)
+{
+    GList *cur_opt;
+    struct module_opt *cur;
+
+    cur_opt = g_hash_table_lookup (cfg->modules_opts, module_name);
+
+    while (cur_opt) {
+        cur = cur_opt->data;
+        if (strcmp (cur->param, opt_name) == 0) {
+            return cur->value;
+        }
+        cur_opt = g_list_next (cur_opt);
+    }
+
+    return NULL;
+}
+
+/*
+ * Read the "file" option of this module and load the tree stored there;
+ * start with an empty tree of MAX_LEVELS levels if there is nothing to load.
+ */
+void
+module_init (struct config_file *cfg)
+{
+    char *value;
+
+    /*
+     * This used to read the option of the "ipmark" module (copy-paste), so
+     * both plugins ended up sharing, and overwriting, the same file.
+     */
+    if (cfg && (value = get_module_opt (cfg, "regmark", "file")) != NULL) {
+        filename = g_strdup (value);
+    }
+
+    if (filename) {
+        tree = load_prefix_tree (filename);
+    }
+    if (! tree) {
+        tree = prefix_tree_new (MAX_LEVELS);
+    }
+
+}
+
+void *
+before_connect (void)
+{
+    /* In fact we do not need any session data, so just return NULL */
+    return NULL;
+}
+
+/* Store the tree in the configured file (if any) and release everything */
+void
+module_fin (void)
+{
+    if (filename) {
+        if (tree) {
+            save_prefix_tree (tree, filename);
+        }
+        g_free (filename);
+        filename = NULL;
+    }
+    if (tree) {
+        prefix_tree_free (tree);
+        tree = NULL;
+    }
+}
+
+/*
+ * Handle one request of the form "username level".
+ *
+ * line       request, len bytes long, not necessarily NUL terminated
+ * output     receives a g_malloc'ed reply: "OK: N" where N is the counter
+ *            returned by add_string, or "ERR" when that counter is 0 or the
+ *            line holds no username
+ * user_data  unused
+ *
+ * A missing or non-positive level defaults to the length of the username.
+ * Always returns TRUE.
+ */
+gboolean
+parse_line (const char *line, size_t len, char **output, void *user_data)
+{
+    const char *p = line, *end;
+    char *name, *level_str, *metaname = NULL;
+    long levels;
+    uintptr_t res = 0;
+
+    if (line != NULL) {
+        end = line + len;
+
+        /* The username is everything up to the first whitespace */
+        while (p < end && !g_ascii_isspace (*p)) {
+            p ++;
+        }
+
+        if (p > line) {
+            name = g_strndup (line, p - line);
+
+            /* Skip spaces */
+            while (p < end && g_ascii_isspace (*p)) {
+                p ++;
+            }
+
+            /* strtol needs a NUL terminated string, line is bounded by len only */
+            level_str = g_strndup (p, end - p);
+            levels = strtol (level_str, NULL, 10);
+            g_free (level_str);
+            if (levels <= 0 || levels > G_MAXINT) {
+                levels = strlen (name);
+            }
+
+            if (metaphone (name, 0, &metaname) && metaname != NULL) {
+                res = add_string (tree, metaname, (int)levels);
+                g_free (metaname);
+            }
+            g_free (name);
+        }
+    }
+
+    if (res == 0) {
+        *output = g_strdup ("ERR");
+    }
+    else {
+        *output = g_strdup_printf ("OK: %u", (unsigned int)res);
+    }
+
+    return TRUE;
+}
+
+
+void after_connect (char **output, char **log_line, void *user_data)
+{
+    /* Placeholder */
+    return;
+}
-- 
2.39.5