123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- * Common tokenization functions
- */
-
- #include "rspamd.h"
- #include "tokenizers.h"
- #include "stat_internal.h"
- #include "xxhash.h"
-
- typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
- rspamd_ftok_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
-
- const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
- };
-
- gint
- token_node_compare_func (gconstpointer a, gconstpointer b)
- {
- const rspamd_token_t *aa = a, *bb = b;
-
- if (aa->datalen != bb->datalen) {
- return aa->datalen - bb->datalen;
- }
-
- return memcmp (aa->data, bb->data, aa->datalen);
- }
-
- /* Get next word from specified f_str_t buf */
- static gboolean
- rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
- gchar const **cur, rspamd_ftok_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
- {
- gsize remain, pos;
- const gchar *p;
- struct process_exception *ex = NULL;
-
- if (buf == NULL) {
- return FALSE;
- }
-
- g_assert (cur != NULL);
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- if (token->begin == NULL || *cur == NULL) {
- if (ex != NULL) {
- if (ex->pos == 0) {
- token->begin = buf->begin + ex->len;
- token->len = ex->len;
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- *cur = token->begin;
- }
-
- token->len = 0;
-
- pos = *cur - buf->begin;
- if (pos >= buf->len) {
- return FALSE;
- }
-
- remain = buf->len - pos;
- p = *cur;
-
- /* Skip non delimiters symbols */
- do {
- if (ex != NULL && ex->pos == pos) {
- /* Go to the next exception */
- *exceptions = g_list_next (*exceptions);
- *cur = p + ex->len;
- return TRUE;
- }
- pos++;
- p++;
- remain--;
- } while (remain > 0 && t_delimiters[(guchar)*p]);
-
- token->begin = p;
-
- while (remain > 0 && !t_delimiters[(guchar)*p]) {
- if (ex != NULL && ex->pos == pos) {
- *exceptions = g_list_next (*exceptions);
- *cur = p + ex->len;
- return TRUE;
- }
- token->len++;
- pos++;
- remain--;
- p++;
- }
-
- if (remain == 0) {
- return FALSE;
- }
-
- if (rl) {
- if (is_utf) {
- *rl = g_utf8_strlen (token->begin, token->len);
- }
- else {
- *rl = token->len;
- }
- }
-
- *cur = p;
-
- return TRUE;
- }
-
- static gboolean
- rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
- gchar const **cur, rspamd_ftok_t * token,
- GList **exceptions, gboolean is_utf, gsize *rl,
- gboolean check_signature)
- {
- gsize remain, pos, siglen = 0;
- const gchar *p, *next_p, *sig = NULL;
- gunichar uc;
- guint processed = 0;
- struct process_exception *ex = NULL;
- enum {
- skip_delimiters = 0,
- feed_token,
- skip_exception,
- process_signature
- } state = skip_delimiters;
-
- if (buf == NULL) {
- return FALSE;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- g_assert (is_utf);
- g_assert (cur != NULL);
-
- if (*cur == NULL) {
- *cur = buf->begin;
- }
-
- token->len = 0;
-
- pos = *cur - buf->begin;
- if (pos >= buf->len) {
- return FALSE;
- }
-
- remain = buf->len - pos;
- p = *cur;
- token->begin = p;
-
- while (remain > 0) {
- uc = g_utf8_get_char (p);
- next_p = g_utf8_next_char (p);
-
- if (next_p - p > (gint)remain) {
- return FALSE;
- }
-
- switch (state) {
- case skip_delimiters:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- processed = token->len;
- state = skip_exception;
- continue;
- }
- else if (g_unichar_isgraph (uc)) {
- if (!g_unichar_ispunct (uc)) {
- state = feed_token;
- token->begin = p;
- continue;
- }
- else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
- sig = p;
- siglen = remain;
- state = process_signature;
- continue;
- }
- }
- break;
- case feed_token:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- goto set_token;
- }
- else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
- goto set_token;
- }
- processed ++;
- break;
- case skip_exception:
- *cur = p + ex->len;
- *exceptions = g_list_next (*exceptions);
- goto set_token;
- break;
- case process_signature:
- if (*p == '\r' || *p == '\n') {
- msg_debug ("signature found: %*s", (gint)siglen, sig);
- return FALSE;
- }
- else if (*p != ' ' && *p != '-' && *p != '_') {
- state = skip_delimiters;
- continue;
- }
- break;
- }
-
- remain -= next_p - p;
- p = next_p;
- }
-
- set_token:
- if (rl) {
- *rl = processed;
- }
-
- if (token->len == 0) {
- token->len = p - token->begin;
- g_assert (token->len > 0);
- *cur = p;
- }
-
- return TRUE;
- }
-
- GArray *
- rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- struct rspamd_config *cfg, GList *exceptions, gboolean compat,
- guint64 *hash)
- {
- rspamd_ftok_t token, buf;
- const gchar *pos = NULL;
- gsize l;
- GArray *res;
- GList *cur = exceptions;
- token_get_function func;
- guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
- guint64 hv = 0;
- XXH64_state_t *st;
- gboolean decay = FALSE;
- guint64 prob;
-
- if (text == NULL) {
- return NULL;
- }
-
- buf.begin = text;
- buf.len = len;
- token.begin = NULL;
- token.len = 0;
-
- if (compat || !is_utf) {
- func = rspamd_tokenizer_get_word_compat;
- }
- else {
- func = rspamd_tokenizer_get_word;
- }
-
- if (cfg != NULL) {
- min_len = cfg->min_word_len;
- max_len = cfg->max_word_len;
- word_decay = cfg->words_decay;
- initial_size = word_decay * 2;
- }
-
- res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
- st = XXH64_createState ();
- XXH64_reset (st, 0);
-
- while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
- if (l == 0 || (min_len > 0 && l < min_len) ||
- (max_len > 0 && l > max_len)) {
- token.begin = pos;
- continue;
- }
-
- if (!decay) {
- XXH64_update (st, token.begin, token.len);
-
- /* Check for decay */
- if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
- /* Start decay */
- gdouble decay_prob;
-
- decay = TRUE;
- hv = XXH64_digest (st);
-
- /* We assume that word is 6 symbols length in average */
- decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
-
- if (decay_prob >= 1.0) {
- prob = G_MAXUINT64;
- }
- else {
- prob = decay_prob * G_MAXUINT64;
- }
- }
- }
- else {
- /* Decaying probability */
- /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
- hv = 2862933555777941757ULL * hv + 3037000493ULL;
-
- if (hv > prob) {
- token.begin = pos;
- continue;
- }
- }
-
- g_array_append_val (res, token);
- token.begin = pos;
- }
-
- if (!decay) {
- hv = XXH64_digest (st);
- }
-
- if (hash) {
- *hash = hv;
- }
-
- XXH64_freeState (st);
-
- return res;
- }
-
- /*
- * vi:ts=4
- */
|