diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 16:33:33 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 16:33:33 +0000 |
commit | 8adf20f620fa9737666044de7c712eac2174b1c4 (patch) | |
tree | dae0d228d312e67f11468118c39b21ded046f530 /src/libstat/stat_api.h | |
parent | 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c (diff) | |
download | rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.tar.gz rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.zip |
[Project] Another try to normalize unicode properly
Diffstat (limited to 'src/libstat/stat_api.h')
-rw-r--r-- | src/libstat/stat_api.h | 9 |
1 files changed, 5 insertions, 4 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 8ab3332b9..b912f8d20 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -37,12 +37,13 @@
 #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
 #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
 
 typedef struct rspamd_stat_token_s {
-	rspamd_ftok_t original;
-	rspamd_ftok_unicode_t unicode;
-	rspamd_ftok_t normalized;
-	rspamd_ftok_t stemmed;
+	rspamd_ftok_t original; /* utf8 raw */
+	rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
+	rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
+	rspamd_ftok_t stemmed; /* stemmed utf8 */
 	guint flags;
 } rspamd_stat_token_t;
 