aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/stat_api.h
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 16:33:33 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 16:33:33 +0000
commit: 8adf20f620fa9737666044de7c712eac2174b1c4 (patch)
tree: dae0d228d312e67f11468118c39b21ded046f530 /src/libstat/stat_api.h
parent: 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c (diff)
download: rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.tar.gz
download: rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.zip
[Project] Another try to normalize unicode properly
Diffstat (limited to 'src/libstat/stat_api.h')
-rw-r--r--src/libstat/stat_api.h9
1 files changed, 5 insertions, 4 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 8ab3332b9..b912f8d20 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -37,12 +37,13 @@
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
-#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) /* was (1u << 9), which collided with BROKEN_UNICODE */
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11) /* next free bit after STOP_WORD */
typedef struct rspamd_stat_token_s {
- rspamd_ftok_t original;
- rspamd_ftok_unicode_t unicode;
- rspamd_ftok_t normalized;
- rspamd_ftok_t stemmed;
+ rspamd_ftok_t original; /* raw token text, UTF-8 */
+ rspamd_ftok_unicode_t unicode; /* array of Unicode characters, normalized and lowercased */
+ rspamd_ftok_t normalized; /* normalized and lowercased UTF-8 */
+ rspamd_ftok_t stemmed; /* stemmed UTF-8 */
guint flags;
} rspamd_stat_token_t;