You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizers.c 7.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*
  17. * Common tokenization functions
  18. */
  19. #include "rspamd.h"
  20. #include "tokenizers.h"
  21. #include "stat_internal.h"
  22. #include "xxhash.h"
  23. typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
  24. rspamd_ftok_t * token,
  25. GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature);
  26. const gchar t_delimiters[255] = {
  27. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
  28. 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
  29. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  30. 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
  31. 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
  32. 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
  33. 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
  34. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  35. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  36. 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
  37. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  38. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  39. 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
  40. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  41. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  42. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  43. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  44. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  45. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  46. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  51. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  52. 0, 0, 0, 0, 0
  53. };
  54. gint
  55. token_node_compare_func (gconstpointer a, gconstpointer b)
  56. {
  57. const rspamd_token_t *aa = a, *bb = b;
  58. if (aa->datalen != bb->datalen) {
  59. return aa->datalen - bb->datalen;
  60. }
  61. return memcmp (aa->data, bb->data, aa->datalen);
  62. }
  63. /* Get next word from specified f_str_t buf */
  64. static gboolean
  65. rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
  66. gchar const **cur, rspamd_ftok_t * token,
  67. GList **exceptions, gboolean is_utf, gsize *rl, gboolean unused)
  68. {
  69. gsize remain, pos;
  70. const gchar *p;
  71. struct process_exception *ex = NULL;
  72. if (buf == NULL) {
  73. return FALSE;
  74. }
  75. g_assert (cur != NULL);
  76. if (exceptions != NULL && *exceptions != NULL) {
  77. ex = (*exceptions)->data;
  78. }
  79. if (token->begin == NULL || *cur == NULL) {
  80. if (ex != NULL) {
  81. if (ex->pos == 0) {
  82. token->begin = buf->begin + ex->len;
  83. token->len = ex->len;
  84. }
  85. else {
  86. token->begin = buf->begin;
  87. token->len = 0;
  88. }
  89. }
  90. else {
  91. token->begin = buf->begin;
  92. token->len = 0;
  93. }
  94. *cur = token->begin;
  95. }
  96. token->len = 0;
  97. pos = *cur - buf->begin;
  98. if (pos >= buf->len) {
  99. return FALSE;
  100. }
  101. remain = buf->len - pos;
  102. p = *cur;
  103. /* Skip non delimiters symbols */
  104. do {
  105. if (ex != NULL && ex->pos == pos) {
  106. /* Go to the next exception */
  107. *exceptions = g_list_next (*exceptions);
  108. *cur = p + ex->len;
  109. return TRUE;
  110. }
  111. pos++;
  112. p++;
  113. remain--;
  114. } while (remain > 0 && t_delimiters[(guchar)*p]);
  115. token->begin = p;
  116. while (remain > 0 && !t_delimiters[(guchar)*p]) {
  117. if (ex != NULL && ex->pos == pos) {
  118. *exceptions = g_list_next (*exceptions);
  119. *cur = p + ex->len;
  120. return TRUE;
  121. }
  122. token->len++;
  123. pos++;
  124. remain--;
  125. p++;
  126. }
  127. if (remain == 0) {
  128. return FALSE;
  129. }
  130. if (rl) {
  131. if (is_utf) {
  132. *rl = g_utf8_strlen (token->begin, token->len);
  133. }
  134. else {
  135. *rl = token->len;
  136. }
  137. }
  138. *cur = p;
  139. return TRUE;
  140. }
  141. static gboolean
  142. rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
  143. gchar const **cur, rspamd_ftok_t * token,
  144. GList **exceptions, gboolean is_utf, gsize *rl,
  145. gboolean check_signature)
  146. {
  147. gsize remain, pos, siglen = 0;
  148. const gchar *p, *next_p, *sig = NULL;
  149. gunichar uc;
  150. guint processed = 0;
  151. struct process_exception *ex = NULL;
  152. enum {
  153. skip_delimiters = 0,
  154. feed_token,
  155. skip_exception,
  156. process_signature
  157. } state = skip_delimiters;
  158. if (buf == NULL) {
  159. return FALSE;
  160. }
  161. if (exceptions != NULL && *exceptions != NULL) {
  162. ex = (*exceptions)->data;
  163. }
  164. g_assert (is_utf);
  165. g_assert (cur != NULL);
  166. if (*cur == NULL) {
  167. *cur = buf->begin;
  168. }
  169. token->len = 0;
  170. pos = *cur - buf->begin;
  171. if (pos >= buf->len) {
  172. return FALSE;
  173. }
  174. remain = buf->len - pos;
  175. p = *cur;
  176. token->begin = p;
  177. while (remain > 0) {
  178. uc = g_utf8_get_char (p);
  179. next_p = g_utf8_next_char (p);
  180. if (next_p - p > (gint)remain) {
  181. return FALSE;
  182. }
  183. switch (state) {
  184. case skip_delimiters:
  185. if (ex != NULL && p - buf->begin == (gint)ex->pos) {
  186. token->begin = "!!EX!!";
  187. token->len = sizeof ("!!EX!!") - 1;
  188. processed = token->len;
  189. state = skip_exception;
  190. continue;
  191. }
  192. else if (g_unichar_isgraph (uc)) {
  193. if (!g_unichar_ispunct (uc)) {
  194. state = feed_token;
  195. token->begin = p;
  196. continue;
  197. }
  198. else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
  199. sig = p;
  200. siglen = remain;
  201. state = process_signature;
  202. continue;
  203. }
  204. }
  205. break;
  206. case feed_token:
  207. if (ex != NULL && p - buf->begin == (gint)ex->pos) {
  208. goto set_token;
  209. }
  210. else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
  211. goto set_token;
  212. }
  213. processed ++;
  214. break;
  215. case skip_exception:
  216. *cur = p + ex->len;
  217. *exceptions = g_list_next (*exceptions);
  218. goto set_token;
  219. break;
  220. case process_signature:
  221. if (*p == '\r' || *p == '\n') {
  222. msg_debug ("signature found: %*s", (gint)siglen, sig);
  223. return FALSE;
  224. }
  225. else if (*p != ' ' && *p != '-' && *p != '_') {
  226. state = skip_delimiters;
  227. continue;
  228. }
  229. break;
  230. }
  231. remain -= next_p - p;
  232. p = next_p;
  233. }
  234. set_token:
  235. if (rl) {
  236. *rl = processed;
  237. }
  238. if (token->len == 0) {
  239. token->len = p - token->begin;
  240. g_assert (token->len > 0);
  241. *cur = p;
  242. }
  243. return TRUE;
  244. }
  245. GArray *
  246. rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
  247. struct rspamd_config *cfg, GList *exceptions, gboolean compat,
  248. guint64 *hash)
  249. {
  250. rspamd_ftok_t token, buf;
  251. const gchar *pos = NULL;
  252. gsize l;
  253. GArray *res;
  254. GList *cur = exceptions;
  255. token_get_function func;
  256. guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
  257. guint64 hv = 0;
  258. XXH64_state_t *st;
  259. gboolean decay = FALSE;
  260. guint64 prob;
  261. if (text == NULL) {
  262. return NULL;
  263. }
  264. buf.begin = text;
  265. buf.len = len;
  266. token.begin = NULL;
  267. token.len = 0;
  268. if (compat || !is_utf) {
  269. func = rspamd_tokenizer_get_word_compat;
  270. }
  271. else {
  272. func = rspamd_tokenizer_get_word;
  273. }
  274. if (cfg != NULL) {
  275. min_len = cfg->min_word_len;
  276. max_len = cfg->max_word_len;
  277. word_decay = cfg->words_decay;
  278. initial_size = word_decay * 2;
  279. }
  280. res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
  281. st = XXH64_createState ();
  282. XXH64_reset (st, 0);
  283. while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
  284. if (l == 0 || (min_len > 0 && l < min_len) ||
  285. (max_len > 0 && l > max_len)) {
  286. token.begin = pos;
  287. continue;
  288. }
  289. if (!decay) {
  290. XXH64_update (st, token.begin, token.len);
  291. /* Check for decay */
  292. if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
  293. /* Start decay */
  294. gdouble decay_prob;
  295. decay = TRUE;
  296. hv = XXH64_digest (st);
  297. /* We assume that word is 6 symbols length in average */
  298. decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
  299. if (decay_prob >= 1.0) {
  300. prob = G_MAXUINT64;
  301. }
  302. else {
  303. prob = decay_prob * G_MAXUINT64;
  304. }
  305. }
  306. }
  307. else {
  308. /* Decaying probability */
  309. /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
  310. hv = 2862933555777941757ULL * hv + 3037000493ULL;
  311. if (hv > prob) {
  312. token.begin = pos;
  313. continue;
  314. }
  315. }
  316. g_array_append_val (res, token);
  317. token.begin = pos;
  318. }
  319. if (!decay) {
  320. hv = XXH64_digest (st);
  321. }
  322. if (hash) {
  323. *hash = hv;
  324. }
  325. XXH64_freeState (st);
  326. return res;
  327. }
  328. /*
  329. * vi:ts=4
  330. */