diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-05-13 13:23:32 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-05-13 13:23:32 +0100 |
commit | eb0ec2feb63afdf7d16babbcc82f59950233e0ee (patch) | |
tree | fd24ced0c68f52d3f51cccc73a098379420310b2 /src | |
parent | 2cebc2f8b4e75135e0842af5af44bc1f9f891a27 (diff) | |
download | rspamd-eb0ec2feb63afdf7d16babbcc82f59950233e0ee.tar.gz rspamd-eb0ec2feb63afdf7d16babbcc82f59950233e0ee.zip |
[Feature] Reuse stemmers in the cache
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/message.c | 29 |
1 file changed, 21 insertions, 8 deletions
```diff
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 9cf611e79..0cf18e0a4 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -204,11 +204,29 @@ rspamd_extract_words (struct rspamd_task *task,
 	guint i, nlen, total_len = 0, short_len = 0;
 
 #ifdef WITH_SNOWBALL
+	static GHashTable *stemmers = NULL;
+
 	if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
-		stem = sb_stemmer_new (part->language, "UTF_8");
+
+		if (!stemmers) {
+			stemmers = g_hash_table_new (rspamd_strcase_hash,
+					rspamd_strcase_equal);
+		}
+
+		stem = g_hash_table_lookup (stemmers, part->language);
+
 		if (stem == NULL) {
-			msg_debug_task ("<%s> cannot create lemmatizer for %s language",
-				task->message_id, part->language);
+
+			stem = sb_stemmer_new (part->language, "UTF_8");
+
+			if (stem == NULL) {
+				msg_debug_task ("<%s> cannot create lemmatizer for %s language",
+					task->message_id, part->language);
+			}
+			else {
+				g_hash_table_insert (stemmers, g_strdup (part->language),
+						stem);
+			}
 		}
 	}
 #endif
@@ -284,11 +302,6 @@ rspamd_extract_words (struct rspamd_task *task,
 			}
 		}
 	}
-#ifdef WITH_SNOWBALL
-	if (stem != NULL) {
-		sb_stemmer_delete (stem);
-	}
-#endif
 
 	if (part->normalized_words && part->normalized_words->len) {
 		gdouble *avg_len_p, *short_len_p;
```