Browse Source

[Feature] Ignore bogus whitespace characters inside words

Issue: #2649
tags/1.8.3
Vsevolod Stakhov 5 years ago
parent
commit
05f17c6cd6
2 changed files with 9 additions and 1 deletions
  1. 1
    0
      src/libstat/stat_api.h
  2. 8
    1
      src/libstat/tokenizers/tokenizers.c

+ 1
- 0
src/libstat/stat_api.h View File

@@ -38,6 +38,7 @@
/*
 * Token flag bits, OR-ed into a token's `flags` field.
 *
 * Every flag needs a bit of its own, otherwise setting or testing one flag
 * silently sets or tests another. BROKEN_UNICODE and STOP_WORD must therefore
 * not share bit 9: STOP_WORD and the flags after it each sit one bit higher.
 *
 * NOTE(review): assumes no flag defined further down already uses bit 12 --
 * confirm against the rest of the header.
 */
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
/* Set on a word from which non-graphic (invisible) code points were dropped */
#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)

typedef struct rspamd_stat_token_s {
rspamd_ftok_t original; /* utf8 raw */

+ 8
- 1
src/libstat/tokenizers/tokenizers.c View File

@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,

while (i < srclen) {
U16_NEXT_UNSAFE (src, i, t);
*d++ = u_tolower (t);

if (u_isgraph (t)) {
*d++ = u_tolower (t);
}
else {
/* Invisible spaces ! */
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
}
}

tok->unicode.begin = dest;

Loading…
Cancel
Save