From: Vsevolod Stakhov
Date: Sat, 13 May 2017 12:23:32 +0000 (+0100)
Subject: [Feature] Reuse stemmers in the cache
X-Git-Tag: 1.6.0~219
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=eb0ec2feb63afdf7d16babbcc82f59950233e0ee;p=rspamd.git

[Feature] Reuse stemmers in the cache
---

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 9cf611e79..0cf18e0a4 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -204,11 +204,29 @@ rspamd_extract_words (struct rspamd_task *task,
 	guint i, nlen, total_len = 0, short_len = 0;
 
 #ifdef WITH_SNOWBALL
+	static GHashTable *stemmers = NULL;
+
 	if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
-		stem = sb_stemmer_new (part->language, "UTF_8");
+
+		if (!stemmers) {
+			stemmers = g_hash_table_new (rspamd_strcase_hash,
+					rspamd_strcase_equal);
+		}
+
+		stem = g_hash_table_lookup (stemmers, part->language);
+
 		if (stem == NULL) {
-			msg_debug_task ("<%s> cannot create lemmatizer for %s language",
-				task->message_id, part->language);
+
+			stem = sb_stemmer_new (part->language, "UTF_8");
+
+			if (stem == NULL) {
+				msg_debug_task ("<%s> cannot create lemmatizer for %s language",
+					task->message_id, part->language);
+			}
+			else {
+				g_hash_table_insert (stemmers, g_strdup (part->language),
+						stem);
+			}
 		}
 	}
 #endif
@@ -284,11 +302,6 @@ rspamd_extract_words (struct rspamd_task *task,
 			}
 		}
 	}
-#ifdef WITH_SNOWBALL
-	if (stem != NULL) {
-		sb_stemmer_delete (stem);
-	}
-#endif
 
 	if (part->normalized_words && part->normalized_words->len) {
 		gdouble *avg_len_p, *short_len_p;