summary | refs | log | tree | commit | diff | stats
path: root/src/plugins
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2010-04-23 19:53:15 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2010-04-23 19:53:15 +0400
commit: 5b9251914c25fa6559e5680f03d7efddad9af736 (patch)
tree: 3db9698290083a156070ba1eb87391e0ddcd1be2 /src/plugins
parent: e09d4f6ceaa808cef9ce965804db98f032b20a83 (diff)
download: rspamd-5b9251914c25fa6559e5680f03d7efddad9af736.tar.gz
download: rspamd-5b9251914c25fa6559e5680f03d7efddad9af736.zip
* Make regmark customplugin working
Diffstat (limited to 'src/plugins')
-rw-r--r-- src/plugins/custom/regmark/metaphone.c | 578
-rw-r--r-- src/plugins/custom/regmark/prefix_tree.c | 4
-rw-r--r-- src/plugins/custom/regmark/regmark.c | 38
3 files changed, 312 insertions, 308 deletions
diff --git a/src/plugins/custom/regmark/metaphone.c b/src/plugins/custom/regmark/metaphone.c
index 81fa0544e..3ac7b7c09 100644
--- a/src/plugins/custom/regmark/metaphone.c
+++ b/src/plugins/custom/regmark/metaphone.c
@@ -94,11 +94,11 @@ Lookahead (char *word, int how_far)
/*
* phonize one letter
*/
-#define Phonize(c) {(*phoned_word)[p_idx++] = c;}
+#define Phonize(c) {p[p_idx++] = c;}
/*
* Slap a null character on the end of the phoned word
*/
-#define End_Phoned_Word {(*phoned_word)[p_idx] = '\0';}
+#define End_Phoned_Word {p[p_idx] = '\0';}
/*
* How long is the phoned word?
*/
@@ -115,17 +115,22 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
{
int w_idx = 0; /* point in the phonization we're at. */
int p_idx = 0; /* end of the phoned phrase */
+ char *p;
/*-- Parameter checks --*/
/*
* Assume largest possible if we're given no limit
*/
if (max_phonemes == 0) {
- max_phonemes = strlen (word);
+ max_phonemes = strlen (word) * 2 + 1;
+ }
+ if (max_phonemes == 0) {
+ return FALSE;
}
/*-- Allocate memory for our phoned_phrase --*/
*phoned_word = g_malloc (max_phonemes * sizeof (char));
+ p = *phoned_word;
/*-- The first phoneme has to be processed specially. --*/
/*
@@ -142,73 +147,68 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
}
switch (Curr_Letter) {
- /*
- * AE becomes E
- */
- case 'A':
- if (Next_Letter == 'E') {
- Phonize ('E');
- w_idx += 2;
- }
- /*
- * Remember, preserve vowels at the beginning
- */
- else {
- Phonize ('A');
- w_idx++;
- }
- break;
- /*
- * [GKP]N becomes N
- */
- case 'G':
- case 'K':
- case 'P':
- if (Next_Letter == 'N') {
- Phonize ('N');
- w_idx += 2;
- }
- break;
- /*
- * WH becomes H, WR becomes R W if followed by a vowel
- */
- case 'W':
- if (Next_Letter == 'H' || Next_Letter == 'R') {
- Phonize (Next_Letter);
- w_idx += 2;
- } else if (isvowel (Next_Letter)) {
- Phonize ('W');
- w_idx += 2;
- }
- /*
- * else ignore
- */
- break;
- /*
- * X becomes S
- */
- case 'X':
- Phonize ('S');
- w_idx++;
- break;
- /*
- * Vowels are kept
- */
- /*
- * We did A already case 'A': case 'a':
- */
- case 'E':
- case 'I':
- case 'O':
- case 'U':
- Phonize (Curr_Letter);
- w_idx++;
- break;
- default:
- /*
- * do nothing
- */
- break;
+ /*
+ * AE becomes E
+ */
+ case 'A':
+ if (Next_Letter == 'E') {
+ Phonize ('E');
+ w_idx += 2;
+ }
+ /*
+ * Remember, preserve vowels at the beginning
+ */
+ else {
+ Phonize ('A');
+ w_idx++;
+ }
+ break;
+ /*
+ * [GKP]N becomes N
+ */
+ case 'G':
+ case 'K':
+ case 'P':
+ if (Next_Letter == 'N') {
+ Phonize ('N');
+ w_idx += 2;
+ }
+ break;
+ /*
+ * WH becomes H, WR becomes R W if followed by a vowel
+ */
+ case 'W':
+ if (Next_Letter == 'H' || Next_Letter == 'R') {
+ Phonize (Next_Letter);
+ w_idx += 2;
+ } else if (isvowel (Next_Letter)) {
+ Phonize ('W');
+ w_idx += 2;
+ }
+ /*
+ * else ignore
+ */
+ break;
+ /*
+ * X becomes S
+ */
+ case 'X':
+ Phonize ('S');
+ w_idx++;
+ break;
+ /*
+ * Vowels are kept
+ */
+ /*
+ * We did A already case 'A': case 'a':
+ */
+ case 'E':
+ case 'I':
+ case 'O':
+ case 'U':
+ Phonize (Curr_Letter);
+ w_idx++;
+ break;
}
@@ -216,248 +216,242 @@ metaphone (char *word, int max_phonemes, char **phoned_word)
/*
* On to the metaphoning
*/
- for (; Curr_Letter != '\0' &&
- (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) {
- /*
- * How many letters to skip because an eariler encoding handled
- * multiple letters
- */
- unsigned short int skip_letter = 0;
+ for (; Curr_Letter != '\0' && (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) {
+ /*
+ * How many letters to skip because an eariler encoding handled
+ * multiple letters
+ */
+ unsigned short int skip_letter = 0;
- /*
- * THOUGHT: It would be nice if, rather than having things like...
- * well, SCI. For SCI you encode the S, then have to remember to
- * skip the C. So the phonome SCI invades both S and C. It would
- * be better, IMHO, to skip the C from the S part of the encoding.
- * Hell, I'm trying it.
- */
+ /*
+ * THOUGHT: It would be nice if, rather than having things like...
+ * well, SCI. For SCI you encode the S, then have to remember to
+ * skip the C. So the phonome SCI invades both S and C. It would
+ * be better, IMHO, to skip the C from the S part of the encoding.
+ * Hell, I'm trying it.
+ */
- /*
- * Ignore non-alphas
- */
- if (! g_ascii_isalpha (Curr_Letter))
- continue;
+ /*
+ * Ignore non-alphas
+ */
+ if (! g_ascii_isalpha (Curr_Letter))
+ continue;
- /*
- * Drop duplicates, except CC
- */
- if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
- continue;
+ /*
+ * Drop duplicates, except CC
+ */
+ if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
+ continue;
- switch (Curr_Letter) {
- /*
- * B -> B unless in MB
- */
- case 'B':
- if (Prev_Letter != 'M')
- Phonize ('B');
- break;
- /*
- * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
- * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
- * SCE-, -SCY- (handed in S) else K
- */
- case 'C':
- if (MAKESOFT (Next_Letter)) { /* C[IEY] */
- if (After_Next_Letter == 'A' && Next_Letter == 'I') { /* CIA
- */
- Phonize (SH);
- }
+ switch (Curr_Letter) {
/*
- * SC[IEY]
+ * B -> B unless in MB
*/
- else if (Prev_Letter == 'S') {
+ case 'B':
+ if (Prev_Letter != 'M')
+ Phonize ('B');
+ break;
/*
- * Dropped
+ * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
+ * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
+ * SCE-, -SCY- (handed in S) else K
*/
- } else {
- Phonize ('S');
- }
- } else if (Next_Letter == 'H') {
+ case 'C':
+ if (MAKESOFT (Next_Letter)) { /* C[IEY] */
+ if (After_Next_Letter == 'A' && Next_Letter == 'I') { /* CIA
+ */
+ Phonize (SH);
+ }
+ /*
+ * SC[IEY]
+ */
+ else if (Prev_Letter == 'S') {
+ /*
+ * Dropped
+ */
+ } else {
+ Phonize ('S');
+ }
+ } else if (Next_Letter == 'H') {
#ifndef USE_TRADITIONAL_METAPHONE
- if (After_Next_Letter == 'R' || Prev_Letter == 'S') { /* Christ,
- * School
- */
- Phonize ('K');
- } else {
- Phonize (SH);
- }
+ if (After_Next_Letter == 'R' || Prev_Letter == 'S') { /* Christ,
+ * School
+ */
+ Phonize ('K');
+ } else {
+ Phonize (SH);
+ }
#else
- Phonize (SH);
+ Phonize (SH);
#endif
- skip_letter++;
- } else {
- Phonize ('K');
- }
- break;
- /*
- * J if in -DGE-, -DGI- or -DGY- else T
- */
- case 'D':
- if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
- Phonize ('J');
- skip_letter++;
- } else {
- Phonize ('T');
- }
- break;
- /*
- * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
- * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
- * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
- * else K
- */
- case 'G':
- if (Next_Letter == 'H') {
- if (!(NOGHTOF (Look_Back_Letter (3)) ||
- Look_Back_Letter (4) == 'H')) {
- Phonize ('F');
- skip_letter++;
- } else {
+ skip_letter++;
+ } else {
+ Phonize ('K');
+ }
+ break;
/*
- * silent
+ * J if in -DGE-, -DGI- or -DGY- else T
*/
- }
- } else if (Next_Letter == 'N') {
- if (Isbreak (After_Next_Letter) ||
- (After_Next_Letter == 'E' &&
- Look_Ahead_Letter (3) == 'D')) {
+ case 'D':
+ if (Next_Letter == 'G' && MAKESOFT (After_Next_Letter)) {
+ Phonize ('J');
+ skip_letter++;
+ } else {
+ Phonize ('T');
+ }
+ break;
+ /*
+ * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
+ * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
+ * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
+ * else K
+ */
+ case 'G':
+ if (Next_Letter == 'H') {
+ if (!(NOGHTOF (Look_Back_Letter (3)) ||
+ Look_Back_Letter (4) == 'H')) {
+ Phonize ('F');
+ skip_letter++;
+ } else {
+ /*
+ * silent
+ */
+ }
+ } else if (Next_Letter == 'N') {
+ if (Isbreak (After_Next_Letter) ||
+ (After_Next_Letter == 'E' &&
+ Look_Ahead_Letter (3) == 'D')) {
+ /*
+ * dropped
+ */
+ } else {
+ Phonize ('K');
+ }
+ } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
+ Phonize ('J');
+ } else {
+ Phonize ('K');
+ }
+ break;
+ /*
+ * H if before a vowel and not after C,G,P,S,T
+ */
+ case 'H':
+ if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
+ Phonize ('H');
+ }
+ break;
+ /*
+ * dropped if after C else K
+ */
+ case 'K':
+ if (Prev_Letter != 'C') {
+ Phonize ('K');
+ }
+ break;
/*
- * dropped
+ * F if before H else P
*/
- } else {
+ case 'P':
+ if (Next_Letter == 'H') {
+ Phonize ('F');
+ } else {
+ Phonize ('P');
+ }
+ break;
+ /*
+ * K
+ */
+ case 'Q':
Phonize ('K');
- }
- } else if (MAKESOFT (Next_Letter) && Prev_Letter != 'G') {
- Phonize ('J');
- } else {
- Phonize ('K');
- }
- break;
- /*
- * H if before a vowel and not after C,G,P,S,T
- */
- case 'H':
- if (isvowel (Next_Letter) && !AFFECTH (Prev_Letter)) {
- Phonize ('H');
- }
- break;
- /*
- * dropped if after C else K
- */
- case 'K':
- if (Prev_Letter != 'C') {
- Phonize ('K');
- }
- break;
- /*
- * F if before H else P
- */
- case 'P':
- if (Next_Letter == 'H') {
- Phonize ('F');
- } else {
- Phonize ('P');
- }
- break;
- /*
- * K
- */
- case 'Q':
- Phonize ('K');
- break;
- /*
- * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
- */
- case 'S':
- if (Next_Letter == 'I' &&
- (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
- Phonize (SH);
- } else if (Next_Letter == 'H') {
- Phonize (SH);
- skip_letter++;
- }
+ break;
+ /*
+ * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
+ */
+ case 'S':
+ if (Next_Letter == 'I' &&
+ (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+ Phonize (SH);
+ } else if (Next_Letter == 'H') {
+ Phonize (SH);
+ skip_letter++;
+ }
#ifndef USE_TRADITIONAL_METAPHONE
- else if (Next_Letter == 'C' &&
- Look_Ahead_Letter (2) == 'H' &&
- Look_Ahead_Letter (3) == 'W') {
- Phonize (SH);
- skip_letter += 2;
- }
+ else if (Next_Letter == 'C' &&
+ Look_Ahead_Letter (2) == 'H' &&
+ Look_Ahead_Letter (3) == 'W') {
+ Phonize (SH);
+ skip_letter += 2;
+ }
#endif
- else {
- Phonize ('S');
- }
- break;
- /*
- * 'sh' in -TIA- or -TIO- else 'th' before H else T
- */
- case 'T':
- if (Next_Letter == 'I' &&
- (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
- Phonize (SH);
- } else if (Next_Letter == 'H') {
- Phonize (TH);
- skip_letter++;
- } else {
- Phonize ('T');
- }
- break;
- /*
- * F
- */
- case 'V':
- Phonize ('F');
- break;
- /*
- * W before a vowel, else dropped
- */
- case 'W':
- if (isvowel (Next_Letter)) {
- Phonize ('W');
- }
- break;
- /*
- * KS
- */
- case 'X':
- Phonize ('K');
- Phonize ('S');
- break;
- /*
- * Y if followed by a vowel
- */
- case 'Y':
- if (isvowel (Next_Letter)) {
- Phonize ('Y');
- }
- break;
- /*
- * S
- */
- case 'Z':
- Phonize ('S');
- break;
- /*
- * No transformation
- */
- case 'F':
- case 'J':
- case 'L':
- case 'M':
- case 'N':
- case 'R':
- Phonize (Curr_Letter);
- break;
- default:
- /*
- * nothing
- */
- break;
- } /* END SWITCH */
+ else {
+ Phonize ('S');
+ }
+ break;
+ /*
+ * 'sh' in -TIA- or -TIO- else 'th' before H else T
+ */
+ case 'T':
+ if (Next_Letter == 'I' &&
+ (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
+ Phonize (SH);
+ } else if (Next_Letter == 'H') {
+ Phonize (TH);
+ skip_letter++;
+ } else {
+ Phonize ('T');
+ }
+ break;
+ /*
+ * F
+ */
+ case 'V':
+ Phonize ('F');
+ break;
+ /*
+ * W before a vowel, else dropped
+ */
+ case 'W':
+ if (isvowel (Next_Letter)) {
+ Phonize ('W');
+ }
+ break;
+ /*
+ * KS
+ */
+ case 'X':
+ Phonize ('K');
+ Phonize ('S');
+ break;
+ /*
+ * Y if followed by a vowel
+ */
+ case 'Y':
+ if (isvowel (Next_Letter)) {
+ Phonize ('Y');
+ }
+ break;
+ /*
+ * S
+ */
+ case 'Z':
+ Phonize ('S');
+ break;
+ /*
+ * No transformation
+ */
+ case 'F':
+ case 'J':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'R':
+ Phonize (Curr_Letter);
+ break;
+ } /* END SWITCH */
- w_idx += skip_letter;
+ w_idx += skip_letter;
} /* END FOR */
End_Phoned_Word;
diff --git a/src/plugins/custom/regmark/prefix_tree.c b/src/plugins/custom/regmark/prefix_tree.c
index 3af1ae283..4d593ebbc 100644
--- a/src/plugins/custom/regmark/prefix_tree.c
+++ b/src/plugins/custom/regmark/prefix_tree.c
@@ -69,6 +69,8 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo
if (*input >= 'A' && *input <= 'Z') {
num = *input - 'A';
if (cur_level < skip_levels) {
+ input ++;
+ cur_level ++;
continue;
}
/* Go throught each level and check specified letter */
@@ -91,7 +93,7 @@ add_string_common (prefix_tree_t *tree, const char *input, int skip_levels, gboo
g_strlcpy (tmp, orig, MIN (sizeof (tmp), cur_level + 1));
if ((res = (uintptr_t)g_tree_lookup (cur->leafs[num].data, tmp)) != 0) {
if (! read_only) {
- g_tree_insert (cur->leafs[num].data, tmp, GUINT_TO_POINTER (res + 1));
+ g_tree_insert (cur->leafs[num].data, g_strdup (tmp), GUINT_TO_POINTER (res + 1));
}
return res + 1;
}
diff --git a/src/plugins/custom/regmark/regmark.c b/src/plugins/custom/regmark/regmark.c
index c8cede47a..b28c07183 100644
--- a/src/plugins/custom/regmark/regmark.c
+++ b/src/plugins/custom/regmark/regmark.c
@@ -47,6 +47,7 @@
#include "../../../config.h"
#include "../../../cfg_file.h"
+#include "../../../main.h"
#include "metaphone.h"
#include "prefix_tree.h"
@@ -134,31 +135,38 @@ parse_line (const char *line, size_t len, char **output, void *user_data)
{
const char *p = line;
char *name, *metaname = NULL;
- int levels;
+ int levels = 0;
uintptr_t res = 0;
- while (p - line < len) {
- if (g_ascii_isspace (*p)) {
+ while (p - line <= len) {
+ if (g_ascii_isspace (*p) || p - line == len) {
name = g_malloc (p - line + 1);
- g_strlcpy (name, line, p - line);
- metaphone (name, 0, &metaname);
- /* Skip spaces */
- while (p - line < len && g_ascii_isspace (*p++));
- levels = strtol (p, NULL, 10);
- if (levels <= 0) {
- levels = strlen (name);
- }
- if (metaname) {
- res = add_string (tree, metaname, levels);
- *output = g_strdup_printf ("OK: %u", (unsigned int)res);
+ g_strlcpy (name, line, p - line + 1);
+ if (metaphone (name, 0, &metaname)) {
+ /* Skip spaces */
+ while (p - line <= len && g_ascii_isspace (*p)) {
+ p ++;
+ }
+ levels = strtol (p, NULL, 10);
+ if (levels <= 0) {
+ levels = strlen (metaname) / 2;
+ }
+ if (metaname) {
+ res = add_string (tree, metaname, levels);
+ *output = g_strdup_printf ("OK: %u" CRLF, (unsigned int)res);
+ g_free (metaname);
+ g_free (name);
+ return TRUE;
+ }
g_free (metaname);
}
break;
}
+ p ++;
}
if (res == 0) {
- *output = g_strdup ("ERR");
+ *output = g_strdup ("ERR" CRLF);
}
return TRUE;