Przeglądaj źródła

Lemmatize words for fuzzy check.

tags/0.8.0
Vsevolod Stakhov 9 lat temu
rodzic
commit
a75c10a8b1
3 zmienionych plików z 57 dodań i 5 usunięć
  1. 1
    1
      contrib/snowball
  2. 1
    0
      src/CMakeLists.txt
  3. 55
    4
      src/plugins/fuzzy_check.c

+ 1
- 1
contrib/snowball

@@ -1 +1 @@
Subproject commit 558c36912b3ca63e2917d7df504a92dbc78e1b75
Subproject commit 4bc9f365c4b674d0dd0cfd14c314a866f00e0883

+ 1
- 0
src/CMakeLists.txt Wyświetl plik

@@ -101,6 +101,7 @@ TARGET_LINK_LIBRARIES(rspamd rspamd-mime)
TARGET_LINK_LIBRARIES(rspamd rspamd-server)
TARGET_LINK_LIBRARIES(rspamd rspamd-util)
TARGET_LINK_LIBRARIES(rspamd rspamd-lua)
TARGET_LINK_LIBRARIES(rspamd stemmer)

TARGET_LINK_LIBRARIES(rspamd event)
IF(HAVE_LIBEVENT2)

+ 55
- 4
src/plugins/fuzzy_check.c Wyświetl plik

@@ -49,6 +49,7 @@
#include "main.h"
#include "blake2.h"
#include "ottery.h"
#include "libstemmer.h"

#define DEFAULT_SYMBOL "R_FUZZY_HASH"
#define DEFAULT_UPSTREAM_ERROR_TIME 10
@@ -533,6 +534,52 @@ fuzzy_io_fin (void *ud)
close (session->fd);
}

/*
 * Memory pool destructor callback: releases a GArray together with its
 * element storage. The opaque argument is expected to be the GArray pointer
 * that was registered alongside this callback.
 */
static void
fuzzy_g_array_destructor (gpointer a)
{
	/* TRUE asks GLib to free the element data as well as the wrapper */
	g_array_free ((GArray *)a, TRUE);
}

/*
 * Prepare the word list of a text part for fuzzy hashing.
 *
 * If the part is flagged as UTF (is_utf), has a non-empty language and a
 * stemmer can be created for that language, every word is stemmed, copied
 * into `pool` and lowercased; the new array is released by a destructor
 * registered on `pool`. In every other case the part's own `words` array
 * is returned unchanged (which may be NULL if the part has no words).
 *
 * Either way the caller must NOT free the returned array.
 *
 * Words for which the stemmer returns NULL are skipped, so the result may
 * be shorter than part->words.
 */
static GArray *
fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
{
	GArray *res;
	struct sb_stemmer *stem;
	rspamd_fstring_t *w, stw;
	const guchar *r;
	guint i;

	/* Nothing to stem, or no way to pick a stemmer: use the words as is */
	if (part->words == NULL || !part->is_utf || !part->language ||
			part->language[0] == '\0') {
		return part->words;
	}

	/* Lemmatize words */
	stem = sb_stemmer_new (part->language, "UTF_8");
	if (stem == NULL) {
		msg_debug ("cannot lemmatize %s language", part->language);
		return part->words;
	}

	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t),
			part->words->len);

	for (i = 0; i < part->words->len; i ++) {
		w = &g_array_index (part->words, rspamd_fstring_t, i);
		r = sb_stemmer_stem (stem, (const guchar *)w->begin, w->len);

		if (r == NULL) {
			/* Stemmer failed for this word: skip it */
			continue;
		}

		/*
		 * The stemmer's buffer is reused on the next call, so the stem is
		 * copied into the pool. The code relies on the stem being
		 * NUL-terminated (it is measured with strlen).
		 */
		memset (&stw, 0, sizeof (stw)); /* do not copy indeterminate fields */
		stw.begin = rspamd_mempool_strdup (pool, (const char *)r);
		stw.len = strlen ((const char *)r);
		rspamd_str_lc (stw.begin, stw.len);
		g_array_append_val (res, stw);
	}

	/* The array wrapper is heap allocated by GLib: tie its life to the pool */
	rspamd_mempool_add_destructor (pool, fuzzy_g_array_destructor, res);
	sb_stemmer_delete (stem);

	return res;
}

/*
* Create fuzzy command from a text part
@@ -553,6 +600,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
guint i;
blake2b_state st;
rspamd_fstring_t *word;
GArray *words;

if (legacy || part->words == NULL || part->words->len == 0) {
cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd));
@@ -572,15 +620,18 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
*/
g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
rule->hash_key->len) != -1);
for (i = 0; i < part->words->len; i ++) {
word = &g_array_index (part->words, rspamd_fstring_t, i);
words = fuzzy_preprocess_words (part, pool);

for (i = 0; i < words->len; i ++) {
word = &g_array_index (words, rspamd_fstring_t, i);
blake2b_update (&st, word->begin, word->len);
}
blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));

msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
sh = rspamd_shingles_generate (part->words, rule->shingles_key->str,
pool, rspamd_shingles_default_filter, NULL);
sh = rspamd_shingles_generate (words,
rule->shingles_key->str, pool,
rspamd_shingles_default_filter, NULL);
if (sh != NULL) {
memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;

Ładowanie…
Anuluj
Zapisz