diff options
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 21 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 18 |
2 files changed, 27 insertions, 12 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 6e55a33a6..ee7234df7 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2023 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -871,7 +871,7 @@ void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool) void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, const gchar *language, - struct rspamd_lang_detector *d) + struct rspamd_lang_detector *lang_detector) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; @@ -894,7 +894,7 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, if (stem == NULL) { msg_debug_pool( - "<%s> cannot create lemmatizer for %s language", + "cannot create lemmatizer for %s language", language); g_hash_table_insert(stemmers, g_strdup(language), GINT_TO_POINTER(-1)); @@ -919,12 +919,11 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, stemmed = sb_stemmer_stem(stem, tok->normalized.begin, tok->normalized.len); - dlen = stemmed ? strlen(stemmed) : 0; + dlen = sb_stemmer_length(stem); - if (dlen > 0) { - dest = rspamd_mempool_alloc(pool, dlen + 1); + if (stemmed != NULL && dlen > 0) { + dest = rspamd_mempool_alloc(pool, dlen); memcpy(dest, stemmed, dlen); - dest[dlen] = '\0'; tok->stemmed.len = dlen; tok->stemmed.begin = dest; tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED; @@ -940,8 +939,8 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, tok->stemmed.begin = tok->normalized.begin; } - if (tok->stemmed.len > 0 && d != NULL && - rspamd_language_detector_is_stop_word(d, tok->stemmed.begin, tok->stemmed.len)) { + if (tok->stemmed.len > 0 && lang_detector != NULL && + rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) { tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD; } } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index e908c359d..d696364e2 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -1,3 +1,19 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #ifndef TOKENIZERS_H #define TOKENIZERS_H @@ -73,7 +89,7 @@ void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool); void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, const gchar *language, - struct rspamd_lang_detector *d); + struct rspamd_lang_detector *lang_detector); void rspamd_tokenize_meta_words(struct rspamd_task *task); |