UErrorCode uc_err = U_ZERO_ERROR;
ucs_token->flags = utf_token->flags;
- out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1));
- nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
- utf_token->begin, utf_token->len, &uc_err);
+ out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1));
+ nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1),
+ utf_token->normalized.begin, utf_token->normalized.len, &uc_err);
if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
rspamd_language_detector_ucs_lowercase (out, nsym);
- ucs_token->begin = (const gchar *) out;
- ucs_token->len = nsym;
+ ucs_token->normalized.begin = (const gchar *) out;
+ ucs_token->normalized.len = nsym;
}
else {
- ucs_token->len = 0;
+ ucs_token->normalized.len = 0;
}
}
for (;;) {
tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
/* Filter bad tokens */
- if (tok->len >= 2 && u_isalpha (*(UChar *)tok->begin)
- && u_isalpha (*(((UChar *)tok->begin) + (tok->len - 1)))) {
+ if (tok->normalized.len >= 2 &&
+ u_isalpha (*(UChar *)tok->normalized.begin) &&
+ u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) {
offsets_out[out_idx] = sel;
break;
}
window[0] = (UChar)' ';
for (i = 0; i < wlen - 1; i ++) {
- window[i + 1] = *(((UChar *)tok->begin) + i);
+ window[i + 1] = *(((UChar *)tok->normalized.begin) + i);
}
}
- else if (cur_off + wlen == tok->len + 1) {
+ else if (cur_off + wlen == tok->normalized.len + 1) {
/* Add trailing space */
for (i = 0; i < wlen - 1; i ++) {
- window[i] = *(((UChar *)tok->begin) + cur_off + i);
+ window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i);
}
window[wlen - 1] = (UChar)' ';
}
- else if (cur_off + wlen > tok->len + 1) {
+ else if (cur_off + wlen > tok->normalized.len + 1) {
/* No more fun */
return -1;
}
else {
/* Normal case */
for (i = 0; i < wlen; i++) {
- window[i] = *(((UChar *) tok->begin) + cur_off + i);
+ window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i);
}
}
}
else {
- if (tok->len <= cur_off) {
+ if (tok->normalized.len <= cur_off) {
return -1;
}
- window[0] = *(((UChar *)tok->begin) + cur_off);
+ window[0] = *(((UChar *)tok->normalized.begin) + cur_off);
}
return cur_off + 1;
gboolean
rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
- const gchar *word, gsize wlen)
+ const gchar *word, gsize wlen)
{
khiter_t k;
rspamd_ftok_t search;
if (part->utf_words) {
#ifdef WITH_SNOWBALL
- static GHashTable *stemmers = NULL;
+
if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) /* own bit: (1u << 9) is already RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE */
typedef struct rspamd_stat_token_s {
rspamd_ftok_t original;
token_flags = token->flags;
if (task->lang_det) {
- if (rspamd_language_detector_is_stop_word (task->lang_det,
- token->begin, token->len)) {
- /* Skip it */
+ if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+ /* Skip stop word */
continue;
}
}
#include "tokenizers.h"
#include "stat_internal.h"
#include "contrib/mumhash/mum.h"
+#include "libmime/lang_detection.h"
+#include "libstemmer.h"
#include <unicode/utf8.h>
#include <unicode/uchar.h>
}
}
-void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
- const gchar *language);
\ No newline at end of file
+/**
+ * Store a lowercased copy of `src` in `tok->stemmed`.
+ *
+ * The copy is allocated from `pool`. When `is_utf` is TRUE the copy is
+ * lowercased with rspamd_str_lc_utf8, otherwise with rspamd_str_lc.
+ */
+static void
+rspamd_stem_words_set_lowercased (rspamd_stat_token_t *tok,
+		rspamd_mempool_t *pool,
+		const gchar *src, gsize len, gboolean is_utf)
+{
+	gchar *dest;
+
+	dest = rspamd_mempool_alloc (pool, len);
+	memcpy (dest, src, len);
+
+	if (is_utf) {
+		rspamd_str_lc_utf8 (dest, len);
+	}
+	else {
+		rspamd_str_lc (dest, len);
+	}
+
+	tok->stemmed.len = len;
+	tok->stemmed.begin = dest;
+}
+
+/**
+ * Fill the `stemmed` field of every token in `words`.
+ *
+ * Tokens flagged RSPAMD_STAT_TOKEN_FLAG_UTF are stemmed with a snowball
+ * stemmer for `language` when one can be created; otherwise (or when
+ * stemming yields nothing) the lowercased normalized form is used. Such
+ * tokens are then checked against the stop words of `d` and flagged with
+ * RSPAMD_STAT_TOKEN_FLAG_STOP_WORD on a match. Non-UTF tokens flagged
+ * RSPAMD_STAT_TOKEN_FLAG_TEXT get a lowercased copy of the original text.
+ *
+ * @param words array of rspamd_stat_token_t, modified in place
+ * @param pool memory pool used for all produced strings
+ * @param language stemmer language name, may be NULL or empty (no stemming)
+ * @param d language detector used for stop words lookup, may be NULL
+ *          (stop words are not detected then)
+ */
+void
+rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+		const gchar *language,
+		struct rspamd_lang_detector *d)
+{
+	/* Stemmers are created once per language and cached for reuse */
+	static GHashTable *stemmers = NULL;
+	struct sb_stemmer *stem = NULL;
+	guint i;
+	rspamd_stat_token_t *tok;
+	gsize dlen;
+
+	if (!stemmers) {
+		stemmers = g_hash_table_new (rspamd_strcase_hash,
+				rspamd_strcase_equal);
+	}
+
+	if (language && language[0] != '\0') {
+		stem = g_hash_table_lookup (stemmers, language);
+
+		if (stem == NULL) {
+
+			stem = sb_stemmer_new (language, "UTF_8");
+
+			if (stem == NULL) {
+				/* Format has exactly one argument: the language name */
+				msg_debug_pool (
+						"cannot create lemmatizer for %s language",
+						language);
+				/* Remember the failure to avoid retrying on each call */
+				g_hash_table_insert (stemmers, g_strdup (language),
+						GINT_TO_POINTER (-1));
+			}
+			else {
+				g_hash_table_insert (stemmers, g_strdup (language),
+						stem);
+			}
+		}
+		else if (stem == GINT_TO_POINTER (-1)) {
+			/* Negative cache */
+			stem = NULL;
+		}
+	}
+
+	for (i = 0; i < words->len; i++) {
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+			if (stem) {
+				const gchar *stemmed;
+
+				stemmed = (const gchar *) sb_stemmer_stem (stem,
+						(const sb_symbol *) tok->normalized.begin,
+						(int) tok->normalized.len);
+
+				/*
+				 * sb_stemmer_stem returns NULL on allocation failure;
+				 * the length is valid only after a successful call
+				 */
+				dlen = stemmed != NULL ?
+						(gsize) sb_stemmer_length (stem) : 0;
+
+				if (dlen > 0) {
+					rspamd_stem_words_set_lowercased (tok, pool,
+							stemmed, dlen, TRUE);
+					tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
+				}
+				else {
+					/* Fallback */
+					rspamd_stem_words_set_lowercased (tok, pool,
+							tok->normalized.begin, tok->normalized.len, TRUE);
+				}
+			}
+			else {
+				/* No stemmer, utf8 lowercase */
+				rspamd_stem_words_set_lowercased (tok, pool,
+						tok->normalized.begin, tok->normalized.len, TRUE);
+			}
+
+			if (tok->stemmed.len > 0 && d != NULL &&
+					rspamd_language_detector_is_stop_word (d,
+							tok->stemmed.begin, tok->stemmed.len)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+			}
+		}
+		else {
+			if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+				/* Raw text, lowercase */
+				rspamd_stem_words_set_lowercased (tok, pool,
+						tok->original.begin, tok->original.len, FALSE);
+			}
+		}
+	}
+}
\ No newline at end of file
GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_config *cf,
- gsize *len);
+ struct rspamd_tokenizer_config *cf,
+ gsize *len);
+struct rspamd_lang_detector;
void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
-
void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
- const gchar *language);
+ const gchar *language,
+ struct rspamd_lang_detector *d);
GArray * rspamd_tokenize_subject (struct rspamd_task *task);
#endif