You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

shingles.c 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "shingles.h"
  17. #include "fstring.h"
  18. #include "cryptobox.h"
  19. #include "images.h"
  20. #include "libstat/stat_api.h"
  /* Width of the rolling token window used when building text shingles. */
  21. #define SHINGLES_WINDOW 3
  /* Number of key bytes hashed, compared and allocated for each shingle key. */
  22. #define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
  23. static guint
  24. rspamd_shingles_keys_hash (gconstpointer k)
  25. {
  26. return rspamd_cryptobox_fast_hash (k, SHINGLES_KEY_SIZE,
  27. rspamd_hash_seed ());
  28. }
  29. static gboolean
  30. rspamd_shingles_keys_equal (gconstpointer k1, gconstpointer k2)
  31. {
  32. return (memcmp (k1, k2, SHINGLES_KEY_SIZE) == 0);
  33. }
  34. static void
  35. rspamd_shingles_keys_free (gpointer p)
  36. {
  37. guchar **k = p;
  38. guint i;
  39. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  40. g_free (k[i]);
  41. }
  42. g_free (k);
  43. }
  44. static guchar **
  45. rspamd_shingles_keys_new (void)
  46. {
  47. guchar **k;
  48. guint i;
  49. k = g_malloc0 (sizeof (guchar *) * RSPAMD_SHINGLE_SIZE);
  50. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  51. k[i] = g_malloc0 (sizeof (guchar) * SHINGLES_KEY_SIZE);
  52. }
  53. return k;
  54. }
  55. static guchar **
  56. rspamd_shingles_get_keys_cached (const guchar key[SHINGLES_KEY_SIZE])
  57. {
  58. static GHashTable *ht = NULL;
  59. guchar **keys = NULL, *key_cpy;
  60. rspamd_cryptobox_hash_state_t bs;
  61. const guchar *cur_key;
  62. guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
  63. guint i;
  64. if (ht == NULL) {
  65. ht = g_hash_table_new_full (rspamd_shingles_keys_hash,
  66. rspamd_shingles_keys_equal, g_free, rspamd_shingles_keys_free);
  67. }
  68. else {
  69. keys = g_hash_table_lookup (ht, key);
  70. }
  71. if (keys == NULL) {
  72. keys = rspamd_shingles_keys_new ();
  73. key_cpy = g_malloc (SHINGLES_KEY_SIZE);
  74. memcpy (key_cpy, key, SHINGLES_KEY_SIZE);
  75. /* Generate keys */
  76. rspamd_cryptobox_hash_init (&bs, NULL, 0);
  77. cur_key = key;
  78. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  79. /*
  80. * To generate a set of hashes we just apply sha256 to the
  81. * initial key as many times as many hashes are required and
  82. * xor left and right parts of sha256 to get a single 16 bytes SIP key.
  83. */
  84. out_key = keys[i];
  85. rspamd_cryptobox_hash_update (&bs, cur_key, 16);
  86. rspamd_cryptobox_hash_final (&bs, shabuf);
  87. memcpy (out_key, shabuf, 16);
  88. rspamd_cryptobox_hash_init (&bs, NULL, 0);
  89. cur_key = out_key;
  90. }
  91. g_hash_table_insert (ht, key_cpy, keys);
  92. }
  93. return keys;
  94. }
  95. struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
  96. rspamd_shingles_from_text (GArray *input,
  97. const guchar key[16],
  98. rspamd_mempool_t *pool,
  99. rspamd_shingles_filter filter,
  100. gpointer filterd,
  101. enum rspamd_shingle_alg alg)
  102. {
  103. struct rspamd_shingle *res;
  104. guint64 **hashes;
  105. guchar **keys;
  106. rspamd_fstring_t *row;
  107. rspamd_stat_token_t *word;
  108. guint64 val;
  109. gint i, j, k;
  110. gsize hlen, ilen = 0, beg = 0, widx = 0;
  111. enum rspamd_cryptobox_fast_hash_type ht;
  112. if (pool != NULL) {
  113. res = rspamd_mempool_alloc (pool, sizeof (*res));
  114. }
  115. else {
  116. res = g_malloc (sizeof (*res));
  117. }
  118. row = rspamd_fstring_sized_new (256);
  119. for (i = 0; i < input->len; i ++) {
  120. word = &g_array_index (input, rspamd_stat_token_t, i);
  121. if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)
  122. || word->stemmed.len == 0)) {
  123. ilen ++;
  124. }
  125. }
  126. /* Init hashes pipes and keys */
  127. hashes = g_malloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
  128. hlen = ilen > SHINGLES_WINDOW ?
  129. (ilen - SHINGLES_WINDOW + 1) : 1;
  130. keys = rspamd_shingles_get_keys_cached (key);
  131. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  132. hashes[i] = g_malloc (hlen * sizeof (guint64));
  133. }
  134. /* Now parse input words into a vector of hashes using rolling window */
  135. if (alg == RSPAMD_SHINGLES_OLD) {
  136. for (i = 0; i <= (gint)ilen; i ++) {
  137. if (i - beg >= SHINGLES_WINDOW || i == (gint)ilen) {
  138. for (j = beg; j < i; j ++) {
  139. word = NULL;
  140. while (widx < input->len) {
  141. word = &g_array_index (input, rspamd_stat_token_t, widx);
  142. if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)
  143. || word->stemmed.len == 0) {
  144. widx++;
  145. }
  146. else {
  147. break;
  148. }
  149. }
  150. if (word == NULL) {
  151. /* Nothing but exceptions */
  152. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  153. g_free (hashes[i]);
  154. }
  155. g_free (hashes);
  156. if (pool == NULL) {
  157. g_free (res);
  158. }
  159. rspamd_fstring_free (row);
  160. return NULL;
  161. }
  162. row = rspamd_fstring_append (row, word->stemmed.begin,
  163. word->stemmed.len);
  164. }
  165. /* Now we need to create a new row here */
  166. for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
  167. rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
  168. keys[j]);
  169. g_assert (hlen > beg);
  170. hashes[j][beg] = val;
  171. }
  172. beg++;
  173. widx ++;
  174. row = rspamd_fstring_assign (row, "", 0);
  175. }
  176. }
  177. }
  178. else {
  179. guint64 window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
  180. switch (alg) {
  181. case RSPAMD_SHINGLES_XXHASH:
  182. ht = RSPAMD_CRYPTOBOX_XXHASH64;
  183. break;
  184. case RSPAMD_SHINGLES_MUMHASH:
  185. ht = RSPAMD_CRYPTOBOX_MUMHASH;
  186. break;
  187. default:
  188. ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
  189. break;
  190. }
  191. memset (window, 0, sizeof (window));
  192. for (i = 0; i <= ilen; i ++) {
  193. if (i - beg >= SHINGLES_WINDOW || i == ilen) {
  194. for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
  195. /* Shift hashes window to right */
  196. for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
  197. window[j * SHINGLES_WINDOW + k] =
  198. window[j * SHINGLES_WINDOW + k + 1];
  199. }
  200. word = NULL;
  201. while (widx < input->len) {
  202. word = &g_array_index (input, rspamd_stat_token_t, widx);
  203. if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)
  204. || word->stemmed.len == 0) {
  205. widx++;
  206. }
  207. else {
  208. break;
  209. }
  210. }
  211. if (word == NULL) {
  212. /* Nothing but exceptions */
  213. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  214. g_free (hashes[i]);
  215. }
  216. if (pool == NULL) {
  217. g_free (res);
  218. }
  219. g_free (hashes);
  220. rspamd_fstring_free (row);
  221. return NULL;
  222. }
  223. /* Insert the last element to the pipe */
  224. memcpy (&seed, keys[j], sizeof (seed));
  225. window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
  226. rspamd_cryptobox_fast_hash_specific (ht,
  227. word->stemmed.begin, word->stemmed.len,
  228. seed);
  229. val = 0;
  230. for (k = 0; k < SHINGLES_WINDOW; k ++) {
  231. val ^= window[j * SHINGLES_WINDOW + k] >>
  232. (8 * (SHINGLES_WINDOW - k - 1));
  233. }
  234. g_assert (hlen > beg);
  235. hashes[j][beg] = val;
  236. }
  237. beg ++;
  238. widx ++;
  239. }
  240. }
  241. }
  242. /* Now we need to filter all hashes and make a shingles result */
  243. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  244. res->hashes[i] = filter (hashes[i], hlen,
  245. i, key, filterd);
  246. g_free (hashes[i]);
  247. }
  248. g_free (hashes);
  249. rspamd_fstring_free (row);
  250. return res;
  251. }
  252. struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
  253. rspamd_shingles_from_image (guchar *dct,
  254. const guchar key[16],
  255. rspamd_mempool_t *pool,
  256. rspamd_shingles_filter filter,
  257. gpointer filterd,
  258. enum rspamd_shingle_alg alg)
  259. {
  260. struct rspamd_shingle *shingle;
  261. guint64 **hashes;
  262. guchar **keys;
  263. guint64 d;
  264. guint64 val;
  265. gint i, j;
  266. gsize hlen, beg = 0;
  267. enum rspamd_cryptobox_fast_hash_type ht;
  268. guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
  269. if (pool != NULL) {
  270. shingle = rspamd_mempool_alloc (pool, sizeof (*shingle));
  271. }
  272. else {
  273. shingle = g_malloc (sizeof (*shingle));
  274. }
  275. /* Init hashes pipes and keys */
  276. hashes = g_malloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
  277. hlen = RSPAMD_DCT_LEN / NBBY + 1;
  278. keys = rspamd_shingles_get_keys_cached (key);
  279. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  280. hashes[i] = g_malloc (hlen * sizeof (guint64));
  281. }
  282. switch (alg) {
  283. case RSPAMD_SHINGLES_OLD:
  284. ht = RSPAMD_CRYPTOBOX_MUMHASH;
  285. break;
  286. case RSPAMD_SHINGLES_XXHASH:
  287. ht = RSPAMD_CRYPTOBOX_XXHASH64;
  288. break;
  289. case RSPAMD_SHINGLES_MUMHASH:
  290. ht = RSPAMD_CRYPTOBOX_MUMHASH;
  291. break;
  292. default:
  293. ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
  294. break;
  295. }
  296. memset (res, 0, sizeof (res));
  297. #define INNER_CYCLE_SHINGLES(s, e) for (j = (s); j < (e); j ++) { \
  298. d = dct[beg]; \
  299. memcpy (&seed, keys[j], sizeof (seed)); \
  300. val = rspamd_cryptobox_fast_hash_specific (ht, \
  301. &d, sizeof (d), \
  302. seed); \
  303. hashes[j][beg] = val; \
  304. }
  305. for (i = 0; i < RSPAMD_DCT_LEN / NBBY; i ++) {
  306. INNER_CYCLE_SHINGLES (0, RSPAMD_SHINGLE_SIZE / 4);
  307. INNER_CYCLE_SHINGLES (RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE / 2);
  308. INNER_CYCLE_SHINGLES (RSPAMD_SHINGLE_SIZE / 2, 3 * RSPAMD_SHINGLE_SIZE / 4);
  309. INNER_CYCLE_SHINGLES (3 * RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE);
  310. beg++;
  311. }
  312. #undef INNER_CYCLE_SHINGLES
  313. /* Now we need to filter all hashes and make a shingles result */
  314. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  315. shingle->hashes[i] = filter (hashes[i], hlen,
  316. i, key, filterd);
  317. g_free (hashes[i]);
  318. }
  319. g_free (hashes);
  320. return shingle;
  321. }
  322. guint64
  323. rspamd_shingles_default_filter (guint64 *input, gsize count,
  324. gint shno, const guchar *key, gpointer ud)
  325. {
  326. guint64 minimal = G_MAXUINT64;
  327. gsize i;
  328. for (i = 0; i < count; i ++) {
  329. if (minimal > input[i]) {
  330. minimal = input[i];
  331. }
  332. }
  333. return minimal;
  334. }
  335. gdouble rspamd_shingles_compare (const struct rspamd_shingle *a,
  336. const struct rspamd_shingle *b)
  337. {
  338. gint i, common = 0;
  339. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  340. if (a->hashes[i] == b->hashes[i]) {
  341. common ++;
  342. }
  343. }
  344. return (gdouble)common / (gdouble)RSPAMD_SHINGLE_SIZE;
  345. }