You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

osb.c 8.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*
  17. * OSB tokenizer
  18. */
  19. #include "tokenizers.h"
  20. #include "stat_internal.h"
  21. #include "xxhash.h"
  22. #include "cryptobox.h"
  23. /* Size for features pipe */
  24. #define DEFAULT_FEATURE_WINDOW_SIZE 5
  25. #define DEFAULT_OSB_VERSION 2
  26. static const int primes[] = {
  27. 1, 7,
  28. 3, 13,
  29. 5, 29,
  30. 11, 51,
  31. 23, 101,
  32. 47, 203,
  33. 97, 407,
  34. 197, 817,
  35. 397, 1637,
  36. 797, 3277,
  37. };
  38. static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'};
  39. enum rspamd_osb_hash_type {
  40. RSPAMD_OSB_HASH_COMPAT = 0,
  41. RSPAMD_OSB_HASH_XXHASH,
  42. RSPAMD_OSB_HASH_SIPHASH
  43. };
  44. struct rspamd_osb_tokenizer_config {
  45. guchar magic[8];
  46. gshort version;
  47. gshort window_size;
  48. enum rspamd_osb_hash_type ht;
  49. guint64 seed;
  50. rspamd_sipkey_t sk;
  51. };
  52. /*
  53. * Return default config
  54. */
  55. static struct rspamd_osb_tokenizer_config *
  56. rspamd_tokenizer_osb_default_config (void)
  57. {
  58. static struct rspamd_osb_tokenizer_config def;
  59. if (memcmp (def.magic, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) {
  60. memset (&def, 0, sizeof (def));
  61. memcpy (def.magic, osb_tokenizer_magic, sizeof (osb_tokenizer_magic));
  62. def.version = DEFAULT_OSB_VERSION;
  63. def.window_size = DEFAULT_FEATURE_WINDOW_SIZE;
  64. def.ht = RSPAMD_OSB_HASH_XXHASH;
  65. def.seed = 0xdeadbabe;
  66. }
  67. return &def;
  68. }
  69. static struct rspamd_osb_tokenizer_config *
  70. rspamd_tokenizer_osb_config_from_ucl (rspamd_mempool_t * pool,
  71. const ucl_object_t *obj)
  72. {
  73. const ucl_object_t *elt;
  74. struct rspamd_osb_tokenizer_config *cf, *def;
  75. guchar *key = NULL;
  76. gsize keylen;
  77. if (pool != NULL) {
  78. cf = rspamd_mempool_alloc (pool, sizeof (*cf));
  79. }
  80. else {
  81. cf = g_slice_alloc (sizeof (*cf));
  82. }
  83. /* Use default config */
  84. def = rspamd_tokenizer_osb_default_config ();
  85. memcpy (cf, def, sizeof (*cf));
  86. elt = ucl_object_find_key (obj, "hash");
  87. if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
  88. if (g_ascii_strncasecmp (ucl_object_tostring (elt), "xxh", 3)
  89. == 0) {
  90. cf->ht = RSPAMD_OSB_HASH_XXHASH;
  91. elt = ucl_object_find_key (obj, "seed");
  92. if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
  93. cf->seed = ucl_object_toint (elt);
  94. }
  95. }
  96. else if (g_ascii_strncasecmp (ucl_object_tostring (elt), "sip", 3)
  97. == 0) {
  98. cf->ht = RSPAMD_OSB_HASH_SIPHASH;
  99. elt = ucl_object_find_key (obj, "key");
  100. if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
  101. key = rspamd_decode_base32 (ucl_object_tostring (elt),
  102. 0, &keylen);
  103. if (keylen < sizeof (rspamd_sipkey_t)) {
  104. msg_warn ("siphash key is too short: %z", keylen);
  105. g_free (key);
  106. }
  107. else {
  108. memcpy (cf->sk, key, sizeof (cf->sk));
  109. g_free (key);
  110. }
  111. }
  112. else {
  113. msg_warn_pool ("siphash cannot be used without key");
  114. }
  115. }
  116. }
  117. else {
  118. elt = ucl_object_find_key (obj, "compat");
  119. if (elt != NULL && ucl_object_toboolean (elt)) {
  120. cf->ht = RSPAMD_OSB_HASH_COMPAT;
  121. }
  122. }
  123. elt = ucl_object_find_key (obj, "window");
  124. if (elt != NULL && ucl_object_type (elt) == UCL_INT) {
  125. cf->window_size = ucl_object_toint (elt);
  126. if (cf->window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
  127. msg_err_pool ("too large window size: %d", cf->window_size);
  128. cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE;
  129. }
  130. }
  131. return cf;
  132. }
  133. gpointer
  134. rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
  135. struct rspamd_tokenizer_config *cf,
  136. gsize *len)
  137. {
  138. struct rspamd_osb_tokenizer_config *osb_cf, *def;
  139. if (cf != NULL && cf->opts != NULL) {
  140. osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, cf->opts);
  141. }
  142. else {
  143. def = rspamd_tokenizer_osb_default_config ();
  144. osb_cf = rspamd_mempool_alloc (pool, sizeof (*osb_cf));
  145. memcpy (osb_cf, def, sizeof (*osb_cf));
  146. /* Do not write sipkey to statfile */
  147. }
  148. if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) {
  149. msg_info_pool ("siphash key is not stored into statfiles, so you'd "
  150. "need to keep it inside the configuration");
  151. }
  152. memset (osb_cf->sk, 0, sizeof (osb_cf->sk));
  153. if (len != NULL) {
  154. *len = sizeof (*osb_cf);
  155. }
  156. return osb_cf;
  157. }
  158. #if 0
  159. gboolean
  160. rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
  161. gpointer ptr, gsize len)
  162. {
  163. struct rspamd_osb_tokenizer_config *osb_cf, *test_cf;
  164. gboolean ret = FALSE;
  165. test_cf = rt->config;
  166. g_assert (test_cf != NULL);
  167. if (len == sizeof (*osb_cf)) {
  168. osb_cf = ptr;
  169. if (memcmp (osb_cf, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) {
  170. ret = test_cf->ht == RSPAMD_OSB_HASH_COMPAT;
  171. }
  172. else {
  173. if (osb_cf->version == DEFAULT_OSB_VERSION) {
  174. /* We can compare them directly now */
  175. ret = (memcmp (osb_cf, test_cf, sizeof (*osb_cf)
  176. - sizeof (osb_cf->sk))) == 0;
  177. }
  178. }
  179. }
  180. else {
  181. /* We are compatible now merely with fallback config */
  182. if (test_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
  183. ret = TRUE;
  184. }
  185. }
  186. return ret;
  187. }
  188. gboolean
  189. rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
  190. struct rspamd_tokenizer_runtime *rt,
  191. gpointer ptr, gsize len)
  192. {
  193. struct rspamd_osb_tokenizer_config *osb_cf;
  194. if (ptr == NULL || len == 0) {
  195. osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
  196. if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
  197. /* Trying to load incompatible configuration */
  198. msg_err_pool ("cannot load tokenizer configuration from a legacy "
  199. "statfile; maybe you have forgotten to set 'compat' option"
  200. " in the tokenizer configuration");
  201. return FALSE;
  202. }
  203. }
  204. else {
  205. g_assert (len == sizeof (*osb_cf));
  206. osb_cf = ptr;
  207. }
  208. rt->config = osb_cf;
  209. rt->conf_len = sizeof (*osb_cf);
  210. return TRUE;
  211. }
  212. gboolean
  213. rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
  214. {
  215. struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
  216. return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
  217. }
  218. #endif
  219. gint
  220. rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
  221. rspamd_mempool_t *pool,
  222. GArray *words,
  223. gboolean is_utf,
  224. const gchar *prefix,
  225. GPtrArray *result)
  226. {
  227. rspamd_token_t *new_tok = NULL;
  228. rspamd_ftok_t *token;
  229. struct rspamd_osb_tokenizer_config *osb_cf;
  230. guint64 *hashpipe, cur, seed;
  231. guint32 h1, h2;
  232. gsize token_size;
  233. guint processed = 0, i, w, window_size;
  234. if (words == NULL) {
  235. return FALSE;
  236. }
  237. osb_cf = ctx->tkcf;
  238. window_size = osb_cf->window_size;
  239. if (prefix) {
  240. seed = XXH64 (prefix, strlen (prefix), osb_cf->seed);
  241. }
  242. else {
  243. seed = osb_cf->seed;
  244. }
  245. hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
  246. memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0]));
  247. token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len;
  248. g_assert (token_size > 0);
  249. for (w = 0; w < words->len; w ++) {
  250. token = &g_array_index (words, rspamd_ftok_t, w);
  251. if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
  252. cur = rspamd_fstrhash_lc (token, is_utf);
  253. }
  254. else {
  255. /* We know that the words are normalized */
  256. if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
  257. cur = XXH64 (token->begin, token->len, osb_cf->seed);
  258. }
  259. else {
  260. rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
  261. token->len, osb_cf->sk);
  262. if (prefix) {
  263. cur ^= seed;
  264. }
  265. }
  266. }
  267. #define ADD_TOKEN do {\
  268. new_tok = rspamd_mempool_alloc0 (pool, token_size); \
  269. new_tok->datalen = sizeof (gint64); \
  270. if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
  271. h1 = ((guint32)hashpipe[0]) * primes[0] + \
  272. ((guint32)hashpipe[i]) * primes[i << 1]; \
  273. h2 = ((guint32)hashpipe[0]) * primes[1] + \
  274. ((guint32)hashpipe[i]) * primes[(i << 1) - 1]; \
  275. memcpy(new_tok->data, &h1, sizeof (h1)); \
  276. memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
  277. } \
  278. else { \
  279. cur = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; \
  280. memcpy (new_tok->data, &cur, sizeof (cur)); \
  281. } \
  282. new_tok->window_idx = i + 1; \
  283. g_ptr_array_add (result, new_tok); \
  284. } while(0)
  285. if (processed < window_size) {
  286. /* Just fill a hashpipe */
  287. hashpipe[window_size - ++processed] = cur;
  288. }
  289. else {
  290. /* Shift hashpipe */
  291. for (i = window_size - 1; i > 0; i--) {
  292. hashpipe[i] = hashpipe[i - 1];
  293. }
  294. hashpipe[0] = cur;
  295. processed++;
  296. for (i = 1; i < window_size; i++) {
  297. ADD_TOKEN;
  298. }
  299. }
  300. }
  301. if (processed <= window_size) {
  302. memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
  303. for (i = 1; i < processed; i++) {
  304. ADD_TOKEN;
  305. }
  306. }
  307. #undef ADD_TOKEN
  308. return TRUE;
  309. }