diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-04-23 19:53:15 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-04-23 19:53:15 +0400 |
commit | 5b9251914c25fa6559e5680f03d7efddad9af736 (patch) | |
tree | 3db9698290083a156070ba1eb87391e0ddcd1be2 /src/plugins | |
parent | e09d4f6ceaa808cef9ce965804db98f032b20a83 (diff) | |
download | rspamd-5b9251914c25fa6559e5680f03d7efddad9af736.tar.gz rspamd-5b9251914c25fa6559e5680f03d7efddad9af736.zip |
* Make the regmark custom plugin work
Diffstat (limited to 'src/plugins')
-rw-r--r-- | src/plugins/custom/regmark/metaphone.c | 578 | ||||
-rw-r--r-- | src/plugins/custom/regmark/prefix_tree.c | 4 | ||||
-rw-r--r-- | src/plugins/custom/regmark/regmark.c | 38 |
3 files changed, 312 insertions, 308 deletions
diff --git a/src/plugins/custom/regmark/metaphone.c b/src/plugins/custom/regmark/metaphone.c index 81fa0544e..3ac7b7c09 100644 --- a/src/plugins/custom/regmark/metaphone.c +++ b/src/plugins/custom/regmark/metaphone.c @@ -94,11 +94,11 @@ Lookahead (char *word, int how_far) /* * phonize one letter */ -#define Phonize(c) {(*phoned_word)[p_idx++] = c;} +#define Phonize(c) {p[p_idx++] = c;} /* * Slap a null character on the end of the phoned word */ -#define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';} +#define End_Phoned_Word {p[p_idx] = '\0';} /* * How long is the phoned word? */ @@ -115,17 +115,22 @@ metaphone (char *word, int max_phonemes, char **phoned_word) { int w_idx = 0; /* point in the phonization we're at. */ int p_idx = 0; /* end of the phoned phrase */ + char *p; /*-- Parameter checks --*/ /* * Assume largest possible if we're given no limit */ if (max_phonemes == 0) { - max_phonemes = strlen (word); + max_phonemes = strlen (word) * 2 + 1; + } + if (max_phonemes == 0) { + return FALSE; } /*-- Allocate memory for our phoned_phrase --*/ *phoned_word = g_malloc (max_phonemes * sizeof (char)); + p = *phoned_word; /*-- The first phoneme has to be processed specially. 
--*/ /* @@ -142,73 +147,68 @@ metaphone (char *word, int max_phonemes, char **phoned_word) } switch (Curr_Letter) { - /* - * AE becomes E - */ - case 'A': - if (Next_Letter == 'E') { - Phonize ('E'); - w_idx += 2; - } - /* - * Remember, preserve vowels at the beginning - */ - else { - Phonize ('A'); - w_idx++; - } - break; - /* - * [GKP]N becomes N - */ - case 'G': - case 'K': - case 'P': - if (Next_Letter == 'N') { - Phonize ('N'); - w_idx += 2; - } - break; - /* - * WH becomes H, WR becomes R W if followed by a vowel - */ - case 'W': - if (Next_Letter == 'H' || Next_Letter == 'R') { - Phonize (Next_Letter); - w_idx += 2; - } else if (isvowel (Next_Letter)) { - Phonize ('W'); - w_idx += 2; - } - /* - * else ignore - */ - break; - /* - * X becomes S - */ - case 'X': - Phonize ('S'); - w_idx++; - break; - /* - * Vowels are kept - */ - /* - * We did A already case 'A': case 'a': - */ - case 'E': - case 'I': - case 'O': - case 'U': - Phonize (Curr_Letter); - w_idx++; - break; - default: - /* - * do nothing - */ - break; + /* + * AE becomes E + */ + case 'A': + if (Next_Letter == 'E') { + Phonize ('E'); + w_idx += 2; + } + /* + * Remember, preserve vowels at the beginning + */ + else { + Phonize ('A'); + w_idx++; + } + break; + /* + * [GKP]N becomes N + */ + case 'G': + case 'K': + case 'P': + if (Next_Letter == 'N') { + Phonize ('N'); + w_idx += 2; + } + break; + /* + * WH becomes H, WR becomes R W if followed by a vowel + */ + case 'W': + if (Next_Letter == 'H' || Next_Letter == 'R') { + Phonize (Next_Letter); + w_idx += 2; + } else if (isvowel (Next_Letter)) { + Phonize ('W'); + w_idx += 2; + } + /* + * else ignore + */ + break; + /* + * X becomes S + */ + case 'X': + Phonize ('S'); + w_idx++; + break; + /* + * Vowels are kept + */ + /* + * We did A already case 'A': case 'a': + */ + case 'E': + case 'I': + case 'O': + case 'U': + Phonize (Curr_Letter); + w_idx++; + break; } @@ -216,248 +216,242 @@ metaphone (char *word, int max_phonemes, char **phoned_word) /* * On 
to the metaphoning */ - for (; Curr_Letter != '\0' && - (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) { - /* - * How many letters to skip because an eariler encoding handled - * multiple letters - */ - unsigned short int skip_letter = 0; + for (; Curr_Letter != '\0' && (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) { + /* + * How many letters to skip because an eariler encoding handled + * multiple letters + */ + unsigned short int skip_letter = 0; - /* - * THOUGHT: It would be nice if, rather than having things like... - * well, SCI. For SCI you encode the S, then have to remember to - * skip the C. So the phonome SCI invades both S and C. It would - * be better, IMHO, to skip the C from the S part of the encoding. - * Hell, I'm trying it. - */ + /* + * THOUGHT: It would be nice if, rather than having things like... + * well, SCI. For SCI you encode the S, then have to remember to + * skip the C. So the phonome SCI invades both S and C. It would + * be better, IMHO, to skip the C from the S part of the encoding. + * Hell, I'm trying it. + */ - /* - * Ignore non-alphas - */ - if (! g_ascii_isalpha (Curr_Letter)) - continue; + /* + * Ignore non-alphas + */ + if (! g_ascii_isalpha (Curr_Letter)) + continue; - /* - * Drop duplicates, except CC - */ - if (Curr_Letter == Prev_Letter && Curr_Letter != 'C') - continue; + /* + * Drop duplicates, except CC + */ + if (Curr_Letter == Prev_Letter && Curr_Letter != 'C') + continue; - switch (Curr_Letter) { - /* - * B -> B unless in MB - */ - case 'B': - if (Prev_Letter != 'M') - Phonize ('B'); - break; - /* - * 'sh' if -CIA- or -CH, but not SCH, except SCHW. 
(SCHW is - * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-, - * SCE-, -SCY- (handed in S) else K - */ - case 'C': - if (MAKESOFT (Next_Letter)) { /* C[IEY] */ - if (After_Next_Letter == 'A' && Next_Letter == 'I') { /* CIA - */ - Phonize (SH); - } + switch (Curr_Letter) { /* - * SC[IEY] + * B -> B unless in MB */ - else if (Prev_Letter == 'S') { + case 'B': + if (Prev_Letter != 'M') + Phonize ('B'); + break; /* - * Dropped + * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is + * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-, + * SCE-, -SCY- (handed in S) else K */ - } else { - Phonize ('S'); - } - } else if (Next_Letter == 'H') { + case 'C': + if (MAKESOFT (Next_Letter)) { /* C[IEY] */ + if (After_Next_Letter == 'A' && Next_Letter == 'I') { /* CIA + */ + Phonize (SH); + } + /* + * SC[IEY] + */ + else if (Prev_Letter == 'S') { + /* + * Dropped + */ + } else { + Phonize ('S'); + } + } else if (Next_Letter == 'H') { #ifndef USE_TRADITIONAL_METAPHONE - if (After_Next_Letter == 'R' || Prev_Letter == 'S') { /* Christ, - * School - */ - Phonize ('K'); - } else { - Phonize (SH); - } + if (After_Next_Letter == 'R' || Prev_Letter == 'S') { /* Christ, + * School + */ + Phonize ('K'); + } else { + Phonize (SH); + } #else - Phonize (SH); + Phonize (SH); #endif - skip_letter++; - } else { - Phonize ('K'); - } - break; - /* - * J if in -DGE-, -DGI- or -DGY- else T - */ - case 'D': - if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) { - Phonize ('J'); - skip_letter++; - } else { - Phonize ('T'); - } - break; - /* - * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else - * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or - * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG - * else K - */ - case 'G': - if (Next_Letter == 'H') { - if (!(NOGHTOF (Look_Back_Letter (3)) || - Look_Back_Letter (4) == 'H')) { - Phonize ('F'); - skip_letter++; - } else { + skip_letter++; + } else { + Phonize ('K'); + } + break; /* - * silent + * J if in 
-DGE-, -DGI- or -DGY- else T */ - } - } else if (Next_Letter == 'N') { - if (Isbreak (After_Next_Letter) || - (After_Next_Letter == 'E' && - Look_Ahead_Letter (3) == 'D')) { + case 'D': + if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) { + Phonize ('J'); + skip_letter++; + } else { + Phonize ('T'); + } + break; + /* + * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else + * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or + * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG + * else K + */ + case 'G': + if (Next_Letter == 'H') { + if (!(NOGHTOF (Look_Back_Letter (3)) || + Look_Back_Letter (4) == 'H')) { + Phonize ('F'); + skip_letter++; + } else { + /* + * silent + */ + } + } else if (Next_Letter == 'N') { + if (Isbreak (After_Next_Letter) || + (After_Next_Letter == 'E' && + Look_Ahead_Letter (3) == 'D')) { + /* + * dropped + */ + } else { + Phonize ('K'); + } + } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') { + Phonize ('J'); + } else { + Phonize ('K'); + } + break; + /* + * H if before a vowel and not after C,G,P,S,T + */ + case 'H': + if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) { + Phonize ('H'); + } + break; + /* + * dropped if after C else K + */ + case 'K': + if (Prev_Letter != 'C') { + Phonize ('K'); + } + break; /* - * dropped + * F if before H else P */ - } else { + case 'P': + if (Next_Letter == 'H') { + Phonize ('F'); + } else { + Phonize ('P'); + } + break; + /* + * K + */ + case 'Q': Phonize ('K'); - } - } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') { - Phonize ('J'); - } else { - Phonize ('K'); - } - break; - /* - * H if before a vowel and not after C,G,P,S,T - */ - case 'H': - if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) { - Phonize ('H'); - } - break; - /* - * dropped if after C else K - */ - case 'K': - if (Prev_Letter != 'C') { - Phonize ('K'); - } - break; - /* - * F if before H else P - */ - case 'P': - if (Next_Letter == 'H') { - Phonize ('F'); - } else { - Phonize ('P'); - } - 
break; - /* - * K - */ - case 'Q': - Phonize ('K'); - break; - /* - * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S - */ - case 'S': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || After_Next_Letter == 'A')) { - Phonize (SH); - } else if (Next_Letter == 'H') { - Phonize (SH); - skip_letter++; - } + break; + /* + * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S + */ + case 'S': + if (Next_Letter == 'I' && + (After_Next_Letter == 'O' || After_Next_Letter == 'A')) { + Phonize (SH); + } else if (Next_Letter == 'H') { + Phonize (SH); + skip_letter++; + } #ifndef USE_TRADITIONAL_METAPHONE - else if (Next_Letter == 'C' && - Look_Ahead_Letter (2) == 'H' && - Look_Ahead_Letter (3) == 'W') { - Phonize (SH); - skip_letter += 2; - } + else if (Next_Letter == 'C' && + Look_Ahead_Letter (2) == 'H' && + Look_Ahead_Letter (3) == 'W') { + Phonize (SH); + skip_letter += 2; + } #endif - else { - Phonize ('S'); - } - break; - /* - * 'sh' in -TIA- or -TIO- else 'th' before H else T - */ - case 'T': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || After_Next_Letter == 'A')) { - Phonize (SH); - } else if (Next_Letter == 'H') { - Phonize (TH); - skip_letter++; - } else { - Phonize ('T'); - } - break; - /* - * F - */ - case 'V': - Phonize ('F'); - break; - /* - * W before a vowel, else dropped - */ - case 'W': - if (isvowel (Next_Letter)) { - Phonize ('W'); - } - break; - /* - * KS - */ - case 'X': - Phonize ('K'); - Phonize ('S'); - break; - /* - * Y if followed by a vowel - */ - case 'Y': - if (isvowel (Next_Letter)) { - Phonize ('Y'); - } - break; - /* - * S - */ - case 'Z': - Phonize ('S'); - break; - /* - * No transformation - */ - case 'F': - case 'J': - case 'L': - case 'M': - case 'N': - case 'R': - Phonize (Curr_Letter); - break; - default: - /* - * nothing - */ - break; - } /* END SWITCH */ + else { + Phonize ('S'); + } + break; + /* + * 'sh' in -TIA- or -TIO- else 'th' before H else T + */ + case 'T': + if (Next_Letter == 'I' && + (After_Next_Letter == 'O' || 
After_Next_Letter == 'A')) { + Phonize (SH); + } else if (Next_Letter == 'H') { + Phonize (TH); + skip_letter++; + } else { + Phonize ('T'); + } + break; + /* + * F + */ + case 'V': + Phonize ('F'); + break; + /* + * W before a vowel, else dropped + */ + case 'W': + if (isvowel (Next_Letter)) { + Phonize ('W'); + } + break; + /* + * KS + */ + case 'X': + Phonize ('K'); + Phonize ('S'); + break; + /* + * Y if followed by a vowel + */ + case 'Y': + if (isvowel (Next_Letter)) { + Phonize ('Y'); + } + break; + /* + * S + */ + case 'Z': + Phonize ('S'); + break; + /* + * No transformation + */ + case 'F': + case 'J': + case 'L': + case 'M': + case 'N': + case 'R': + Phonize (Curr_Letter); + break; + } /* END SWITCH */ - w_idx += skip_letter; + w_idx += skip_letter; } /* END FOR */ End_Phoned_Word; diff --git a/src/plugins/custom/regmark/prefix_tree.c b/src/plugins/custom/regmark/prefix_tree.c index 3af1ae283..4d593ebbc 100644 --- a/src/plugins/custom/regmark/prefix_tree.c +++ b/src/plugins/custom/regmark/prefix_tree.c @@ -69,6 +69,8 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo if (*input >= 'A' && *input <= 'Z') { num = *input - 'A'; if (cur_level < skip_levels) { + input ++; + cur_level ++; continue; } /* Go throught each level and check specified letter */ @@ -91,7 +93,7 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo g_strlcpy (tmp, orig, MIN (sizeof (tmp), cur_level + 1)); if ((res = (uintptr_t)g_tree_lookup (cur->leafs[num].data, tmp)) != 0) { if (! 
read_only) { - g_tree_insert (cur->leafs[num].data, tmp, GUINT_TO_POINTER (res + 1)); + g_tree_insert (cur->leafs[num].data, g_strdup (tmp), GUINT_TO_POINTER (res + 1)); } return res + 1; } diff --git a/src/plugins/custom/regmark/regmark.c b/src/plugins/custom/regmark/regmark.c index c8cede47a..b28c07183 100644 --- a/src/plugins/custom/regmark/regmark.c +++ b/src/plugins/custom/regmark/regmark.c @@ -47,6 +47,7 @@ #include "../../../config.h" #include "../../../cfg_file.h" +#include "../../../main.h" #include "metaphone.h" #include "prefix_tree.h" @@ -134,31 +135,38 @@ parse_line (const char *line, size_t len, char **output, void *user_data) { const char *p = line; char *name, *metaname = NULL; - int levels; + int levels = 0; uintptr_t res = 0; - while (p - line < len) { - if (g_ascii_isspace (*p)) { + while (p - line <= len) { + if (g_ascii_isspace (*p) || p - line == len) { name = g_malloc (p - line + 1); - g_strlcpy (name, line, p - line); - metaphone (name, 0, &metaname); - /* Skip spaces */ - while (p - line < len && g_ascii_isspace (*p++)); - levels = strtol (p, NULL, 10); - if (levels <= 0) { - levels = strlen (name); - } - if (metaname) { - res = add_string (tree, metaname, levels); - *output = g_strdup_printf ("OK: %u", (unsigned int)res); + g_strlcpy (name, line, p - line + 1); + if (metaphone (name, 0, &metaname)) { + /* Skip spaces */ + while (p - line <= len && g_ascii_isspace (*p)) { + p ++; + } + levels = strtol (p, NULL, 10); + if (levels <= 0) { + levels = strlen (metaname) / 2; + } + if (metaname) { + res = add_string (tree, metaname, levels); + *output = g_strdup_printf ("OK: %u" CRLF, (unsigned int)res); + g_free (metaname); + g_free (name); + return TRUE; + } g_free (metaname); } break; } + p ++; } if (res == 0) { - *output = g_strdup ("ERR"); + *output = g_strdup ("ERR" CRLF); } return TRUE; |