aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-24 16:26:01 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-24 16:26:01 +0000
commitabd5300a45ff290656926b61603a65e9621e090f (patch)
treee3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src/libstat
parentb522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff)
downloadrspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz
rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src/libstat')
-rw-r--r--src/libstat/stat_api.h3
-rw-r--r--src/libstat/tokenizers/tokenizers.c134
-rw-r--r--src/libstat/tokenizers/tokenizers.h4
3 files changed, 139 insertions, 2 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 645e1f1aa..c046dd227 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -35,11 +35,12 @@
#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
typedef struct rspamd_stat_token_s {
rspamd_ftok_t original;
rspamd_ftok_unicode_t unicode;
- rspamd_ftok_t normalised;
+ rspamd_ftok_t normalized;
rspamd_ftok_t stemmed;
guint flags;
} rspamd_stat_token_t;
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 8664b9e19..247c24dbd 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -20,11 +20,17 @@
#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
-#include "../../../contrib/mumhash/mum.h"
+#include "contrib/mumhash/mum.h"
+
#include <unicode/utf8.h>
#include <unicode/uchar.h>
#include <unicode/uiter.h>
#include <unicode/ubrk.h>
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+
#include <math.h>
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
@@ -534,3 +540,129 @@ rspamd_tokenize_subject (struct rspamd_task *task)
return words;
}
+/* Flags `tok` as having unusable unicode and empties its `normalized` part */
+static void
+rspamd_token_set_broken_unicode (rspamd_stat_token_t *tok,
+		UChar *unicode, gsize ulen)
+{
+	tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+	tok->unicode.begin = unicode;
+	tok->unicode.len = ulen;
+	tok->normalized.begin = NULL;
+	tok->normalized.len = 0;
+}
+
+/**
+ * Converts every token flagged RSPAMD_STAT_TOKEN_FLAG_UTF to UTF-16 and,
+ * when the ICU Normalizer2 API is available (ICU >= 44), normalises it with
+ * the normaliser returned by rspamd_get_unicode_normalizer ().
+ *
+ * For each such token:
+ *  - `unicode` receives the UTF-16 text (the normalised one if it had to be
+ *    rewritten);
+ *  - `normalized` aliases `original` when the text was already normalised
+ *    (and always on the legacy ICU path); otherwise it points to a newly
+ *    converted, NUL terminated UTF-8 copy and
+ *    RSPAMD_STAT_TOKEN_FLAG_NORMALISED is set;
+ *  - on any conversion or normalisation error
+ *    RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE is set and `normalized` is
+ *    emptied.
+ * Tokens without the UTF flag are not touched.
+ *
+ * @param words array of rspamd_stat_token_t, updated in place
+ * @param pool memory pool used for every buffer allocated here
+ */
+void
+rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+{
+	rspamd_stat_token_t *tok;
+	guint i;
+	UErrorCode uc_err;
+	UConverter *utf8_converter;
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
+#endif
+
+	utf8_converter = rspamd_get_utf8_converter ();
+
+	for (i = 0; i < words->len; i++) {
+		UChar *unicode;
+		gint32 ulen;
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+		UChar *dest = NULL;
+		gchar *utf8;
+		gint32 end, dcap, nlen = 0, clen, dlen, r;
+		guint attempt;
+#endif
+
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+			continue;
+		}
+
+		/* One UTF-8 byte never yields more than one UTF-16 unit */
+		uc_err = U_ZERO_ERROR;
+		unicode = rspamd_mempool_alloc (pool,
+				sizeof (*unicode) * (tok->original.len + 1));
+		ulen = ucnv_toUChars (utf8_converter,
+				unicode,
+				(gint32)tok->original.len + 1,
+				tok->original.begin,
+				(gint32)tok->original.len,
+				&uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			rspamd_token_set_broken_unicode (tok, NULL, 0);
+			continue;
+		}
+
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+		/* Find the longest prefix that is already in normal form */
+		end = unorm2_spanQuickCheckYes (norm, unicode, ulen, &uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			rspamd_token_set_broken_unicode (tok, unicode, ulen);
+			continue;
+		}
+
+		if (end == ulen) {
+			/* Already normalised: reuse the original utf8 text */
+			tok->unicode.begin = unicode;
+			tok->unicode.len = ulen;
+			tok->normalized.begin = tok->original.begin;
+			tok->normalized.len = tok->original.len;
+			continue;
+		}
+
+		/*
+		 * Copy the normalised prefix and append the normalised tail.
+		 * Normalisation can expand the text; on U_BUFFER_OVERFLOW_ERROR
+		 * the returned length is the size ICU requires, so retry once
+		 * with a buffer of exactly that size.
+		 */
+		dcap = ulen;
+
+		for (attempt = 0; attempt < 2; attempt++) {
+			dest = rspamd_mempool_alloc (pool, dcap * sizeof (*dest));
+			memcpy (dest, unicode, end * sizeof (*dest));
+			uc_err = U_ZERO_ERROR;
+			nlen = unorm2_normalizeSecondAndAppend (norm, dest, end,
+					dcap,
+					unicode + end, ulen - end, &uc_err);
+
+			if (uc_err != U_BUFFER_OVERFLOW_ERROR || nlen <= dcap) {
+				break;
+			}
+
+			dcap = nlen;
+		}
+
+		if (!U_SUCCESS (uc_err)) {
+			msg_warn_pool_check ("cannot normalise text '%*s': %s",
+					(gint)tok->original.len, tok->original.begin,
+					u_errorName (uc_err));
+			/* Keep the un-normalised text together with its own length */
+			rspamd_token_set_broken_unicode (tok, unicode, ulen);
+			continue;
+		}
+
+		/* Convert the normalised text back to utf8 */
+		clen = ucnv_getMaxCharSize (utf8_converter);
+		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (nlen, clen);
+		utf8 = rspamd_mempool_alloc (pool, sizeof (*utf8) * dlen + 1);
+		r = ucnv_fromUChars (utf8_converter,
+				utf8,
+				dlen,
+				dest,
+				nlen,
+				&uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			/* `r` is not a valid index into utf8 after a failure */
+			rspamd_token_set_broken_unicode (tok, dest, nlen);
+			continue;
+		}
+
+		utf8[r] = '\0';
+
+		tok->unicode.begin = dest;
+		tok->unicode.len = nlen;
+		tok->normalized.begin = utf8;
+		tok->normalized.len = r;
+		tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+#else
+		/* Legacy libicu path: no normalisation available */
+		tok->unicode.begin = unicode;
+		tok->unicode.len = ulen;
+		tok->normalized.begin = tok->original.begin;
+		tok->normalized.len = tok->original.len;
+#endif
+	}
+}
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+ const gchar *language);
\ No newline at end of file
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 668f08cdc..9a5561671 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -57,6 +57,10 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
gsize *len);
+void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+ const gchar *language);
GArray * rspamd_tokenize_subject (struct rspamd_task *task);
#endif