-rw-r--r--  src/libstat/tokenizers/tokenizers.c  21
-rw-r--r--  src/libstat/tokenizers/tokenizers.h  18
2 files changed, 27 insertions, 12 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 6e55a33a6..ee7234df7 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2023 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -871,7 +871,7 @@ void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
const gchar *language,
- struct rspamd_lang_detector *d)
+ struct rspamd_lang_detector *lang_detector)
{
static GHashTable *stemmers = NULL;
struct sb_stemmer *stem = NULL;
@@ -894,7 +894,7 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
if (stem == NULL) {
msg_debug_pool(
- "<%s> cannot create lemmatizer for %s language",
+ "cannot create lemmatizer for %s language",
language);
g_hash_table_insert(stemmers, g_strdup(language),
GINT_TO_POINTER(-1));
@@ -919,12 +919,11 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
stemmed = sb_stemmer_stem(stem,
tok->normalized.begin, tok->normalized.len);
- dlen = stemmed ? strlen(stemmed) : 0;
+ dlen = sb_stemmer_length(stem);
- if (dlen > 0) {
- dest = rspamd_mempool_alloc(pool, dlen + 1);
+ if (stemmed != NULL && dlen > 0) {
+ dest = rspamd_mempool_alloc(pool, dlen);
memcpy(dest, stemmed, dlen);
- dest[dlen] = '\0';
tok->stemmed.len = dlen;
tok->stemmed.begin = dest;
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
@@ -940,8 +939,8 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
tok->stemmed.begin = tok->normalized.begin;
}
- if (tok->stemmed.len > 0 && d != NULL &&
- rspamd_language_detector_is_stop_word(d, tok->stemmed.begin, tok->stemmed.len)) {
+ if (tok->stemmed.len > 0 && lang_detector != NULL &&
+ rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
}
}
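
Note on the hunk above: the copy path now takes the result length from sb_stemmer_length(), which reports the byte length of the word most recently stemmed, instead of running strlen() on the returned buffer. A minimal standalone sketch of the Snowball (libstemmer) calls involved, not rspamd code; the sample word and the "english"/"UTF_8" arguments are illustrative:

/* Sketch only: compile with -lstemmer; error handling kept minimal. */
#include <stdio.h>
#include <string.h>
#include <libstemmer.h>

int main(void)
{
	const char *word = "running";
	struct sb_stemmer *stem = sb_stemmer_new("english", "UTF_8");

	if (stem == NULL) {
		/* No stemmer available for this language/encoding pair */
		return 1;
	}

	const sb_symbol *stemmed = sb_stemmer_stem(stem,
			(const sb_symbol *) word, (int) strlen(word));
	/* sb_stemmer_length() returns the length of the last stemmed word,
	 * so no separate strlen() pass over the result is needed. */
	int dlen = sb_stemmer_length(stem);

	if (stemmed != NULL && dlen > 0) {
		printf("%.*s\n", dlen, (const char *) stemmed);
	}

	sb_stemmer_delete(stem);
	return 0;
}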
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index e908c359d..d696364e2 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#ifndef TOKENIZERS_H
#define TOKENIZERS_H
@@ -73,7 +89,7 @@ void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
const gchar *language,
- struct rspamd_lang_detector *d);
+ struct rspamd_lang_detector *lang_detector);
void rspamd_tokenize_meta_words(struct rspamd_task *task);
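
One consequence of the tokenizers.c hunk above: the allocation shrinks from dlen + 1 to dlen bytes and no trailing '\0' is appended, so tok->stemmed is a length-delimited slice (begin plus len). A minimal sketch, not rspamd code, of consuming such a slice with an explicit length; the struct and variable names are illustrative:

/* Sketch only: reading a string slice that carries no terminator. */
#include <stdio.h>

struct str_slice {
	const char *begin;
	size_t len;
};

int main(void)
{
	static const char buf[] = {'r', 'u', 'n'}; /* no '\0' at the end */
	struct str_slice stemmed = {buf, sizeof(buf)};

	/* Always pass the explicit length; never call strlen() on .begin. */
	printf("%.*s\n", (int) stemmed.len, stemmed.begin);
	return 0;
}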