source.dussan.org Git - rspamd.git/commitdiff
* Make the regmark custom plugin work
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 23 Apr 2010 15:53:15 +0000 (19:53 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 23 Apr 2010 15:53:15 +0000 (19:53 +0400)
src/plugins/custom/regmark/metaphone.c
src/plugins/custom/regmark/prefix_tree.c
src/plugins/custom/regmark/regmark.c
src/worker.c

index 81fa0544e83e4d643ea56f5867bb7be2a22f7822..3ac7b7c092a58511af8d5115d00d03d37ee5cdd9 100644 (file)
@@ -94,11 +94,11 @@ Lookahead (char *word, int how_far)
 /*
  * phonize one letter 
  */
-#define Phonize(c)  {(*phoned_word)[p_idx++] = c;}
+#define Phonize(c)  {p[p_idx++] = c;}
 /*
  * Slap a null character on the end of the phoned word 
  */
-#define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';}
+#define End_Phoned_Word {p[p_idx] = '\0';}
 /*
  * How long is the phoned word? 
  */
@@ -115,17 +115,22 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
 {
     int             w_idx = 0; /* point in the phonization we're at. */
     int             p_idx = 0; /* end of the phoned phrase */
+       char           *p;
 
     /*-- Parameter checks --*/
     /*
      * Assume largest possible if we're given no limit 
      */
     if (max_phonemes == 0) {
-               max_phonemes = strlen (word);
+               max_phonemes = strlen (word) * 2 + 1;
+       }
+       if (max_phonemes == 0) {
+               return FALSE;
        }
 
     /*-- Allocate memory for our phoned_phrase --*/
        *phoned_word = g_malloc (max_phonemes * sizeof (char));
+       p = *phoned_word;
 
     /*-- The first phoneme has to be processed specially. --*/
     /*
@@ -142,73 +147,68 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
     }
 
     switch (Curr_Letter) {
-       /*
-        * AE becomes E 
-        */
-    case 'A':
-       if (Next_Letter == 'E') {
-           Phonize ('E');
-           w_idx += 2;
-       }
-       /*
-        * Remember, preserve vowels at the beginning 
-        */
-       else {
-           Phonize ('A');
-           w_idx++;
-       }
-       break;
-       /*
-        * [GKP]N becomes N 
-        */
-    case 'G':
-    case 'K':
-    case 'P':
-       if (Next_Letter == 'N') {
-           Phonize ('N');
-           w_idx += 2;
-       }
-       break;
-       /*
-        * WH becomes H, WR becomes R W if followed by a vowel 
-        */
-    case 'W':
-       if (Next_Letter == 'H' || Next_Letter == 'R') {
-           Phonize (Next_Letter);
-           w_idx += 2;
-       } else if (isvowel (Next_Letter)) {
-           Phonize ('W');
-           w_idx += 2;
-       }
-       /*
-        * else ignore 
-        */
-       break;
-       /*
-        * X becomes S 
-        */
-    case 'X':
-       Phonize ('S');
-       w_idx++;
-       break;
-       /*
-        * Vowels are kept 
-        */
-       /*
-        * We did A already case 'A': case 'a': 
-        */
-    case 'E':
-    case 'I':
-    case 'O':
-    case 'U':
-       Phonize (Curr_Letter);
-       w_idx++;
-       break;
-    default:
-       /*
-        * do nothing 
-        */
-       break;
+               /*
+               * AE becomes E 
+               */
+               case 'A':
+                       if (Next_Letter == 'E') {
+                               Phonize ('E');
+                               w_idx += 2;
+                       }
+                       /*
+                       * Remember, preserve vowels at the beginning 
+                       */
+                       else {
+                               Phonize ('A');
+                               w_idx++;
+                       }
+                       break;
+               /*
+               * [GKP]N becomes N 
+               */
+               case 'G':
+               case 'K':
+               case 'P':
+                       if (Next_Letter == 'N') {
+                               Phonize ('N');
+                               w_idx += 2;
+                       }
+                       break;
+               /*
+               * WH becomes H, WR becomes R W if followed by a vowel 
+               */
+               case 'W':
+                       if (Next_Letter == 'H' || Next_Letter == 'R') {
+                               Phonize (Next_Letter);
+                               w_idx += 2;
+                       } else if (isvowel (Next_Letter)) {
+                               Phonize ('W');
+                               w_idx += 2;
+                       }
+                       /*
+                       * else ignore 
+                       */
+                       break;
+               /*
+               * X becomes S 
+               */
+               case 'X':
+                       Phonize ('S');
+                       w_idx++;
+                       break;
+               /*
+               * Vowels are kept 
+               */
+               /*
+               * We did A already case 'A': case 'a': 
+               */
+               case 'E':
+               case 'I':
+               case 'O':
+               case 'U':
+                       Phonize (Curr_Letter);
+                       w_idx++;
+                       break;
     }
 
 
@@ -216,248 +216,242 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
     /*
      * On to the metaphoning 
      */
-    for (; Curr_Letter != '\0' &&
-        (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) {
-       /*
-        * How many letters to skip because an eariler encoding handled
-        * multiple letters 
-        */
-       unsigned short int skip_letter = 0;
+    for (; Curr_Letter != '\0' && (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) {
+               /*
+               * How many letters to skip because an earlier encoding handled
+               * multiple letters 
+               */
+               unsigned short int skip_letter = 0;
 
 
-       /*
-        * THOUGHT: It would be nice if, rather than having things like...
-        * well, SCI.  For SCI you encode the S, then have to remember to
-        * skip the C.  So the phonome SCI invades both S and C.  It would
-        * be better, IMHO, to skip the C from the S part of the encoding.
-        * Hell, I'm trying it. 
-        */
+               /*
+               * THOUGHT: It would be nice if, rather than having things like...
+               * well, SCI.  For SCI you encode the S, then have to remember to
+               * skip the C.  So the phoneme SCI invades both S and C.  It would
+               * be better, IMHO, to skip the C from the S part of the encoding.
+               * Hell, I'm trying it. 
+               */
 
-       /*
-        * Ignore non-alphas 
-        */
-       if (! g_ascii_isalpha (Curr_Letter))
-           continue;
+               /*
+               * Ignore non-alphas 
+               */
+               if (! g_ascii_isalpha (Curr_Letter))
+                       continue;
 
-       /*
-        * Drop duplicates, except CC 
-        */
-       if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
-           continue;
+               /*
+               * Drop duplicates, except CC 
+               */
+               if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
+                       continue;
 
-       switch (Curr_Letter) {
-           /*
-            * B -> B unless in MB 
-            */
-       case 'B':
-           if (Prev_Letter != 'M')
-               Phonize ('B');
-           break;
-           /*
-            * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
-            * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
-            * SCE-, -SCY- (handed in S) else K 
-            */
-       case 'C':
-           if (MAKESOFT (Next_Letter)) {       /* C[IEY] */
-                       if (After_Next_Letter == 'A' && Next_Letter == 'I') {   /* CIA 
-                                                                               */
-                               Phonize (SH);
-                       }
+               switch (Curr_Letter) {
                        /*
-                       * SC[IEY] 
+                       * B -> B unless in MB 
                        */
-                       else if (Prev_Letter == 'S') {
+                       case 'B':
+                               if (Prev_Letter != 'M')
+                               Phonize ('B');
+                               break;
                                /*
-                               * Dropped 
+                               * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
+                               * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
+                               * SCE-, -SCY- (handled in S) else K 
                                */
-                       } else {
-                               Phonize ('S');
-                       }
-           } else if (Next_Letter == 'H') {
+                       case 'C':
+                               if (MAKESOFT (Next_Letter)) {   /* C[IEY] */
+                                       if (After_Next_Letter == 'A' && Next_Letter == 'I') {   /* CIA 
+                                                                                               */
+                                               Phonize (SH);
+                                       }
+                                       /*
+                                       * SC[IEY] 
+                                       */
+                                       else if (Prev_Letter == 'S') {
+                                               /*
+                                               * Dropped 
+                                               */
+                                       } else {
+                                               Phonize ('S');
+                                       }
+                               } else if (Next_Letter == 'H') {
 #ifndef USE_TRADITIONAL_METAPHONE
-                       if (After_Next_Letter == 'R' || Prev_Letter == 'S') {   /* Christ, 
-                                                                               * School 
-                                                                               */
-                               Phonize ('K');
-                       } else {
-                               Phonize (SH);
-                       }
+                                       if (After_Next_Letter == 'R' || Prev_Letter == 'S') {   /* Christ, 
+                                                                                               * School 
+                                                                                               */
+                                               Phonize ('K');
+                                       } else {
+                                               Phonize (SH);
+                                       }
 #else
-                       Phonize (SH);
+                                       Phonize (SH);
 #endif
-                       skip_letter++;
-           } else {
-                       Phonize ('K');
-           }
-           break;
-           /*
-            * J if in -DGE-, -DGI- or -DGY- else T 
-            */
-       case 'D':
-           if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
-                       Phonize ('J');
-                       skip_letter++;
-           } else {
-                       Phonize ('T');
-               }
-           break;
-           /*
-            * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
-            * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
-            * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
-            * else K 
-            */
-       case 'G':
-           if (Next_Letter == 'H') {
-                       if (!(NOGHTOF (Look_Back_Letter (3)) ||
-                               Look_Back_Letter (4) == 'H')) {
-                               Phonize ('F');
-                               skip_letter++;
-                       } else {
+                                       skip_letter++;
+                               } else {
+                                       Phonize ('K');
+                               }
+                               break;
                                /*
-                               * silent 
+                               * J if in -DGE-, -DGI- or -DGY- else T 
                                */
-                       }
-           } else if (Next_Letter == 'N') {
-                       if (Isbreak (After_Next_Letter) ||
-                               (After_Next_Letter == 'E' &&
-                               Look_Ahead_Letter (3) == 'D')) {
+                       case 'D':
+                               if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
+                                       Phonize ('J');
+                                       skip_letter++;
+                               } else {
+                                       Phonize ('T');
+                               }
+                               break;
+                               /*
+                               * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
+                               * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
+                               * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
+                               * else K 
+                               */
+                       case 'G':
+                               if (Next_Letter == 'H') {
+                                       if (!(NOGHTOF (Look_Back_Letter (3)) ||
+                                               Look_Back_Letter (4) == 'H')) {
+                                               Phonize ('F');
+                                               skip_letter++;
+                                       } else {
+                                               /*
+                                               * silent 
+                                               */
+                                       }
+                               } else if (Next_Letter == 'N') {
+                                       if (Isbreak (After_Next_Letter) ||
+                                               (After_Next_Letter == 'E' &&
+                                               Look_Ahead_Letter (3) == 'D')) {
+                                               /*
+                                               * dropped 
+                                               */
+                                       } else {
+                                               Phonize ('K');
+                                       }
+                               } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
+                                       Phonize ('J');
+                               } else {
+                                       Phonize ('K');
+                               }
+                               break;
+                               /*
+                               * H if before a vowel and not after C,G,P,S,T 
+                               */
+                       case 'H':
+                               if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
+                                       Phonize ('H');
+                               }
+                               break;
+                               /*
+                               * dropped if after C else K 
+                               */
+                       case 'K':
+                               if (Prev_Letter != 'C') {
+                                       Phonize ('K');
+                               }
+                               break;
                                /*
-                               * dropped 
+                               * F if before H else P 
                                */
-                       } else {
+                       case 'P':
+                               if (Next_Letter == 'H') {
+                                       Phonize ('F');
+                               } else {
+                                       Phonize ('P');
+                               }
+                               break;
+                               /*
+                               * K 
+                               */
+                       case 'Q':
                                Phonize ('K');
-                       }
-           } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
-                       Phonize ('J');
-           } else {
-                       Phonize ('K');
-           }
-           break;
-           /*
-            * H if before a vowel and not after C,G,P,S,T 
-            */
-       case 'H':
-           if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
-                       Phonize ('H');
-               }
-           break;
-           /*
-            * dropped if after C else K 
-            */
-       case 'K':
-           if (Prev_Letter != 'C') {
-                       Phonize ('K');
-               }
-           break;
-           /*
-            * F if before H else P 
-            */
-       case 'P':
-           if (Next_Letter == 'H') {
-                       Phonize ('F');
-           } else {
-                       Phonize ('P');
-           }
-           break;
-           /*
-            * K 
-            */
-       case 'Q':
-           Phonize ('K');
-           break;
-           /*
-            * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S 
-            */
-       case 'S':
-           if (Next_Letter == 'I' &&
-                    (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
-                       Phonize (SH);
-           } else if (Next_Letter == 'H') {
-                       Phonize (SH);
-                       skip_letter++;
-           }
+                               break;
+                               /*
+                               * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S 
+                               */
+                       case 'S':
+                               if (Next_Letter == 'I' &&
+                                       (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                                       Phonize (SH);
+                               } else if (Next_Letter == 'H') {
+                                       Phonize (SH);
+                                       skip_letter++;
+                               }
 #ifndef USE_TRADITIONAL_METAPHONE
-           else if (Next_Letter == 'C' &&
-                    Look_Ahead_Letter (2) == 'H' &&
-                    Look_Ahead_Letter (3) == 'W') {
-                       Phonize (SH);
-                       skip_letter += 2;
-           }
+                               else if (Next_Letter == 'C' &&
+                                       Look_Ahead_Letter (2) == 'H' &&
+                                       Look_Ahead_Letter (3) == 'W') {
+                                       Phonize (SH);
+                                       skip_letter += 2;
+                               }
 #endif
-           else {
-                       Phonize ('S');
-           }
-           break;
-           /*
-            * 'sh' in -TIA- or -TIO- else 'th' before H else T 
-            */
-       case 'T':
-           if (Next_Letter == 'I' &&
-               (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
-                       Phonize (SH);
-           } else if (Next_Letter == 'H') {
-                       Phonize (TH);
-                       skip_letter++;
-           } else {
-                       Phonize ('T');
-           }
-           break;
-           /*
-            * F 
-            */
-       case 'V':
-           Phonize ('F');
-           break;
-           /*
-            * W before a vowel, else dropped 
-            */
-       case 'W':
-           if (isvowel (Next_Letter)) {
-                       Phonize ('W');
-               }
-           break;
-           /*
-            * KS 
-            */
-       case 'X':
-           Phonize ('K');
-           Phonize ('S');
-           break;
-           /*
-            * Y if followed by a vowel 
-            */
-       case 'Y':
-           if (isvowel (Next_Letter)) {
-                       Phonize ('Y');
-               }
-           break;
-           /*
-            * S 
-            */
-       case 'Z':
-           Phonize ('S');
-           break;
-           /*
-            * No transformation 
-            */
-       case 'F':
-       case 'J':
-       case 'L':
-       case 'M':
-       case 'N':
-       case 'R':
-           Phonize (Curr_Letter);
-           break;
-       default:
-           /*
-            * nothing 
-            */
-           break;
-       }                       /* END SWITCH */
+                               else {
+                                       Phonize ('S');
+                               }
+                               break;
+                               /*
+                               * 'sh' in -TIA- or -TIO- else 'th' before H else T 
+                               */
+                       case 'T':
+                               if (Next_Letter == 'I' &&
+                               (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+                                       Phonize (SH);
+                               } else if (Next_Letter == 'H') {
+                                       Phonize (TH);
+                                       skip_letter++;
+                               } else {
+                                       Phonize ('T');
+                               }
+                               break;
+                               /*
+                               * F 
+                               */
+                       case 'V':
+                               Phonize ('F');
+                               break;
+                               /*
+                               * W before a vowel, else dropped 
+                               */
+                       case 'W':
+                               if (isvowel (Next_Letter)) {
+                                       Phonize ('W');
+                               }
+                               break;
+                               /*
+                               * KS 
+                               */
+                       case 'X':
+                               Phonize ('K');
+                               Phonize ('S');
+                               break;
+                               /*
+                               * Y if followed by a vowel 
+                               */
+                       case 'Y':
+                               if (isvowel (Next_Letter)) {
+                                       Phonize ('Y');
+                               }
+                               break;
+                               /*
+                               * S 
+                               */
+                       case 'Z':
+                               Phonize ('S');
+                               break;
+                               /*
+                               * No transformation 
+                               */
+                       case 'F':
+                       case 'J':
+                       case 'L':
+                       case 'M':
+                       case 'N':
+                       case 'R':
+                               Phonize (Curr_Letter);
+                               break;
+               }                       /* END SWITCH */
 
-       w_idx += skip_letter;
+               w_idx += skip_letter;
     }                          /* END FOR */
 
     End_Phoned_Word;
index 3af1ae283b6aec37c07c83d555194be1e29df37f..4d593ebbc1b7991ad306990b166f0c1fb0d0618b 100644 (file)
@@ -69,6 +69,8 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo
                if (*input >= 'A' && *input <= 'Z') {
                        num = *input - 'A';
                        if (cur_level < skip_levels) {
+                               input ++;
+                               cur_level ++;
                                continue;
                        }
                        /* Go throught each level and check specified letter */
@@ -91,7 +93,7 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo
                                g_strlcpy (tmp, orig, MIN (sizeof (tmp), cur_level + 1));
                                if ((res = (uintptr_t)g_tree_lookup (cur->leafs[num].data, tmp)) != 0) {
                                        if (! read_only) {
-                                               g_tree_insert (cur->leafs[num].data, tmp, GUINT_TO_POINTER (res + 1));
+                                               g_tree_insert (cur->leafs[num].data, g_strdup (tmp), GUINT_TO_POINTER (res + 1));
                                        }
                                        return res + 1;
                                }
index c8cede47a4352768a8561a8c39fe3a8183661a94..b28c071834767685d52c19dcd4e9c087431a8ce9 100644 (file)
@@ -47,6 +47,7 @@
 
 #include "../../../config.h"
 #include "../../../cfg_file.h"
+#include "../../../main.h"
 #include "metaphone.h"
 #include "prefix_tree.h"
 
@@ -134,31 +135,38 @@ parse_line (const char *line, size_t len, char **output, void *user_data)
 {
        const char *p = line;
        char *name, *metaname = NULL;
-       int levels;
+       int levels = 0;
        uintptr_t res = 0;
 
-       while (p - line < len) {
-               if (g_ascii_isspace (*p)) {
+       while (p - line <= len) {
+               if (g_ascii_isspace (*p) || p - line == len) {
                        name = g_malloc (p - line + 1);
-                       g_strlcpy (name, line, p - line);
-                       metaphone (name, 0, &metaname);
-                       /* Skip spaces */
-                       while (p - line < len && g_ascii_isspace (*p++));
-                       levels = strtol (p, NULL, 10);
-                       if (levels <= 0) {
-                               levels = strlen (name);
-                       }
-                       if (metaname) {
-                               res = add_string (tree, metaname, levels);
-                               *output = g_strdup_printf ("OK: %u", (unsigned int)res);
+                       g_strlcpy (name, line, p - line + 1);
+                       if (metaphone (name, 0, &metaname)) {
+                               /* Skip spaces */
+                               while (p - line <= len && g_ascii_isspace (*p)) {
+                                       p ++;
+                               }
+                               levels = strtol (p, NULL, 10);
+                               if (levels <= 0) {
+                                       levels = strlen (metaname) / 2;
+                               }
+                               if (metaname) {
+                                       res = add_string (tree, metaname, levels);
+                                       *output = g_strdup_printf ("OK: %u" CRLF, (unsigned int)res);
+                                       g_free (metaname);
+                                       g_free (name);
+                                       return TRUE;
+                               }
                                g_free (metaname);
                        }
                        break;
                }
+               p ++;
        }
 
        if (res == 0) {
-               *output = g_strdup ("ERR");
+               *output = g_strdup ("ERR" CRLF);
        }
 
        return TRUE;
index 77a6e64eee68b3bd219eb5cfc872bc9baa637ef8..59a620eebf770a75cb5b61d2f4cba12d30f59b8b 100644 (file)
@@ -149,7 +149,7 @@ fin_custom_filters (struct worker_task *task)
 {
        GList                          *cur, *curd;
        struct custom_filter           *filt;
-       char                           *output, *log;
+       char                           *output = NULL, *log = NULL;
 
        cur = custom_filters;
        curd = task->rcpt;
@@ -178,7 +178,7 @@ parse_line_custom (struct worker_task *task, f_str_t *in)
 {
        GList                          *cur, *curd;
        struct custom_filter           *filt;
-       char                           *output;
+       char                           *output = NULL;
        gboolean                        res = TRUE;
 
        cur = custom_filters;
@@ -350,34 +350,34 @@ write_socket (void *arg)
        switch (task->state) {
        case WRITE_REPLY:
                write_reply (task);
-               destroy_session (task->s);
                if (is_custom) {
                        fin_custom_filters (task);
                }
+               destroy_session (task->s);
                return FALSE;
                break;
        case WRITE_ERROR:
                write_reply (task);
-               destroy_session (task->s);
                if (is_custom) {
                        fin_custom_filters (task);
                }
+               destroy_session (task->s);
                return FALSE;
                break;
        case CLOSING_CONNECTION:
                debug_task ("normally closing connection");
-               destroy_session (task->s);
                if (is_custom) {
                        fin_custom_filters (task);
                }
+               destroy_session (task->s);
                return FALSE;
                break;
        default:
                msg_info ("abnormally closing connection");
-               destroy_session (task->s);
                if (is_custom) {
                        fin_custom_filters (task);
                }
+               destroy_session (task->s);
                return FALSE;
                break;
        }
@@ -393,10 +393,10 @@ err_socket (GError * err, void *arg)
        struct worker_task             *task = (struct worker_task *)arg;
        msg_info ("abnormally closing connection, error: %s", err->message);
        /* Free buffers */
-       destroy_session (task->s);
        if (is_custom) {
                fin_custom_filters (task);
        }
+       destroy_session (task->s);
 }
 
 struct worker_task             *
@@ -626,6 +626,7 @@ start_worker (struct rspamd_worker *worker)
        /* Check if this worker is not usual rspamd worker, but uses custom filters from specified path */
        is_custom_str = g_hash_table_lookup (worker->cf->params, "custom_filters");
        if (is_custom_str && g_module_supported () && load_custom_filters (worker, is_custom_str)) {
+               msg_info ("starting custom process, loaded modules from %s", is_custom_str);
                is_custom = TRUE;
        }
        else {