aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-24 17:16:32 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-24 17:16:32 +0000
commit99b1cf76771eed3824693ed84751ba8054645e18 (patch)
tree6c217800a67f0c23005c68ac08f46a0a5d0f8a2b /src/libstat/tokenizers
parentabd5300a45ff290656926b61603a65e9621e090f (diff)
downloadrspamd-99b1cf76771eed3824693ed84751ba8054645e18.tar.gz
rspamd-99b1cf76771eed3824693ed84751ba8054645e18.zip
[Project] Rework stemming
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r--src/libstat/tokenizers/osb.c5
-rw-r--r--src/libstat/tokenizers/tokenizers.c100
-rw-r--r--src/libstat/tokenizers/tokenizers.h9
3 files changed, 105 insertions, 9 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index d68e3bc60..a19217a89 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -306,9 +306,8 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
token_flags = token->flags;
if (task->lang_det) {
- if (rspamd_language_detector_is_stop_word (task->lang_det,
- token->begin, token->len)) {
- /* Skip it */
+ if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+ /* Skip stop word */
continue;
}
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 247c24dbd..9bbe899fb 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,6 +21,8 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "contrib/mumhash/mum.h"
+#include "libmime/lang_detection.h"
+#include "libstemmer.h"
#include <unicode/utf8.h>
#include <unicode/uchar.h>
@@ -664,5 +666,99 @@ rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
}
}
-void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
- const gchar *language); \ No newline at end of file
+void
+rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+ const gchar *language,
+ struct rspamd_lang_detector *d)
+{
+ static GHashTable *stemmers = NULL;
+ struct sb_stemmer *stem = NULL;
+ guint i;
+ rspamd_stat_token_t *tok;
+ gchar *dest;
+ gsize dlen;
+
+ if (!stemmers) {
+ stemmers = g_hash_table_new (rspamd_strcase_hash,
+ rspamd_strcase_equal);
+ }
+
+ if (language && language[0] != '\0') {
+ stem = g_hash_table_lookup (stemmers, language);
+
+ if (stem == NULL) {
+
+ stem = sb_stemmer_new (language, "UTF_8");
+
+ if (stem == NULL) {
+ msg_debug_pool (
+ "<%s> cannot create lemmatizer for %s language",
+ language);
+ g_hash_table_insert (stemmers, g_strdup (language),
+ GINT_TO_POINTER (-1));
+ }
+ else {
+ g_hash_table_insert (stemmers, g_strdup (language),
+ stem);
+ }
+ }
+ else if (stem == GINT_TO_POINTER (-1)) {
+ /* Negative cache */
+ stem = NULL;
+ }
+ }
+ for (i = 0; i < words->len; i++) {
+ tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+ if (stem) {
+ const gchar *stemmed;
+
+ stemmed = sb_stemmer_stem (stem,
+ tok->normalized.begin, tok->normalized.len);
+
+ dlen = strlen (stemmed);
+
+ if (dlen > 0) {
+ dest = rspamd_mempool_alloc (pool, dlen);
+ memcpy (dest, stemmed, dlen);
+ rspamd_str_lc_utf8 (dest, dlen);
+ tok->stemmed.len = dlen;
+ tok->stemmed.begin = dest;
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
+ }
+ else {
+ /* Fallback */
+ dest = rspamd_mempool_alloc (pool, tok->normalized.len);
+ memcpy (dest, tok->normalized.begin, tok->normalized.len);
+ rspamd_str_lc_utf8 (dest, tok->normalized.len);
+ tok->stemmed.len = tok->normalized.len;
+ tok->stemmed.begin = dest;
+ }
+ }
+ else {
+ /* No stemmer, utf8 lowercase */
+ dest = rspamd_mempool_alloc (pool, tok->normalized.len);
+ memcpy (dest, tok->normalized.begin, tok->normalized.len);
+ rspamd_str_lc_utf8 (dest, tok->normalized.len);
+ tok->stemmed.len = tok->normalized.len;
+ tok->stemmed.begin = dest;
+ }
+
+ if (tok->stemmed.len > 0 && rspamd_language_detector_is_stop_word (d,
+ tok->stemmed.begin, tok->stemmed.len)) {
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+ }
+ }
+ else {
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ /* Raw text, lowercase */
+ dest = rspamd_mempool_alloc (pool, tok->original.len);
+ memcpy (dest, tok->original.begin, tok->original.len);
+ rspamd_str_lc (dest, tok->original.len);
+ tok->stemmed.len = tok->original.len;
+ tok->stemmed.begin = dest;
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 9a5561671..eb4a285de 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -54,13 +54,14 @@ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_config *cf,
- gsize *len);
+ struct rspamd_tokenizer_config *cf,
+ gsize *len);
+struct rspamd_lang_detector;
void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
-
void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
- const gchar *language);
+ const gchar *language,
+ struct rspamd_lang_detector *d);
GArray * rspamd_tokenize_subject (struct rspamd_task *task);
#endif