Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

rspamd_shingles_test.c 7.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "rspamd.h"
  18. #include "shingles.h"
  19. #include "ottery.h"
  20. #include <math.h>
  21. static const gchar *
  22. algorithm_to_string(enum rspamd_shingle_alg alg)
  23. {
  24. const gchar *ret = "unknown";
  25. switch (alg) {
  26. case RSPAMD_SHINGLES_OLD:
  27. ret = "siphash";
  28. break;
  29. case RSPAMD_SHINGLES_XXHASH:
  30. ret = "xxhash";
  31. break;
  32. case RSPAMD_SHINGLES_MUMHASH:
  33. ret = "mumhash";
  34. break;
  35. case RSPAMD_SHINGLES_FAST:
  36. ret = "fasthash";
  37. break;
  38. }
  39. return ret;
  40. }
  41. static void
  42. generate_random_string(char *begin, size_t len)
  43. {
  44. gsize i;
  45. for (i = 0; i < len; i++) {
  46. begin[i] = ottery_rand_range('z' - 'a') + 'a';
  47. }
  48. }
  49. static GArray *
  50. generate_fuzzy_words(gsize cnt, gsize max_len)
  51. {
  52. GArray *res;
  53. gsize i, wlen;
  54. rspamd_ftok_t w;
  55. char *t;
  56. res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), cnt);
  57. for (i = 0; i < cnt; i++) {
  58. wlen = ottery_rand_range(max_len) + 1;
  59. /* wlen = max_len; */
  60. w.len = wlen;
  61. t = g_malloc(wlen);
  62. generate_random_string(t, wlen);
  63. w.begin = t;
  64. g_array_append_val(res, w);
  65. }
  66. return res;
  67. }
  68. static void
  69. permute_vector(GArray *in, gdouble prob)
  70. {
  71. gsize i, total = 0;
  72. rspamd_ftok_t *w;
  73. for (i = 0; i < in->len; i++) {
  74. if (ottery_rand_unsigned() <= G_MAXUINT * prob) {
  75. w = &g_array_index(in, rspamd_ftok_t, i);
  76. generate_random_string((gchar *) w->begin, w->len);
  77. total++;
  78. }
  79. }
  80. msg_debug("generated %z permutations of %ud words", total, in->len);
  81. }
  82. static void
  83. free_fuzzy_words(GArray *ar)
  84. {
  85. gsize i;
  86. rspamd_ftok_t *w;
  87. for (i = 0; i < ar->len; i++) {
  88. w = &g_array_index(ar, rspamd_ftok_t, i);
  89. g_free((gpointer) w->begin);
  90. }
  91. }
  92. static void
  93. test_case(gsize cnt, gsize max_len, gdouble perm_factor,
  94. enum rspamd_shingle_alg alg)
  95. {
  96. GArray *input;
  97. struct rspamd_shingle *sgl, *sgl_permuted;
  98. gdouble res;
  99. guchar key[16];
  100. gdouble ts1, ts2;
  101. ottery_rand_bytes(key, sizeof(key));
  102. input = generate_fuzzy_words(cnt, max_len);
  103. ts1 = rspamd_get_virtual_ticks();
  104. sgl = rspamd_shingles_from_text(input, key, NULL,
  105. rspamd_shingles_default_filter, NULL, alg);
  106. ts2 = rspamd_get_virtual_ticks();
  107. permute_vector(input, perm_factor);
  108. sgl_permuted = rspamd_shingles_from_text(input, key, NULL,
  109. rspamd_shingles_default_filter, NULL, alg);
  110. res = rspamd_shingles_compare(sgl, sgl_permuted);
  111. msg_info("%s (%z words of %z max len, %.2f perm factor):"
  112. " percentage of common shingles: %.3f, generate time: %.4f sec",
  113. algorithm_to_string(alg), cnt, max_len, perm_factor, res, ts2 - ts1);
  114. //g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25);
  115. free_fuzzy_words(input);
  116. g_free(sgl);
  117. g_free(sgl_permuted);
  118. }
  119. static const guint64 expected_old[RSPAMD_SHINGLE_SIZE] = {
  120. 0x2a97e024235cedc5,
  121. 0x46238acbcc55e9e0,
  122. 0x2378ff151af075b3,
  123. 0xde1f29a95cad109,
  124. 0x5d3bbbdb5db5d19f,
  125. 0x4d75a0ec52af10a6,
  126. 0x215ecd6372e755b5,
  127. 0x7b52295758295350,
  128. 0x17387d1beddc7f62,
  129. 0x26264ca879ffcada,
  130. 0x49d4a65ec0ab9914,
  131. 0xa2763e6995350cf,
  132. 0x3f4570231449c13f,
  133. 0x3309f857a0e54ee5,
  134. 0x24e4c5b561b0fce3,
  135. 0x1f153e3b275bfd1b,
  136. 0x4d067dbc97c3fd78,
  137. 0x9ffa2d076fa4f8bc,
  138. 0x3d8907f84b9ffc6c,
  139. 0x1cfd664c5262d256,
  140. 0xcdd7e744b699c15,
  141. 0x5544a2bbe05124f7,
  142. 0x5a4029b5d6a06f7,
  143. 0xd5adfbdc756c0e4,
  144. 0xa504b23d9689a67e,
  145. 0x15d945f7007de115,
  146. 0xbf676c0522a2c51d,
  147. 0x1c8d8163ad4b0f93,
  148. 0xa2c4ba20799344d7,
  149. 0x27c6f13c02134388,
  150. 0xa1d443d31fd5a3,
  151. 0x99fbca9f8563080,
  152. };
  153. static const guint64 expected_xxhash[RSPAMD_SHINGLE_SIZE] = {
  154. 0x33b134be11a705a,
  155. 0x36e2ea657aa36903,
  156. 0x6547b57f7470ce9d,
  157. 0x8253eb6d2f8f158e,
  158. 0x1cc99e3cf22388f,
  159. 0x2396da27ea36ffe8,
  160. 0x1b457d208ad3d96c,
  161. 0x2d6ac733d7a2c107,
  162. 0x17849cbed75cc4d1,
  163. 0x4dd94e772330e804,
  164. 0x39f592fa32014ed4,
  165. 0xa2f6229ad356461,
  166. 0x6dc825879a057b37,
  167. 0x886b12cef4338b05,
  168. 0x8b23af68c186518a,
  169. 0x16932b40339aaf02,
  170. 0x412090c6bb0b719c,
  171. 0x4d4a88cbdf1935f3,
  172. 0x233bcbddb5f67a7,
  173. 0x474719442a33dcca,
  174. 0x2da7ec30563e622,
  175. 0x7ab90086960e1ad2,
  176. 0x3ea2b45582539f75,
  177. 0x108cd9287d95a6c5,
  178. 0x69ba7c67c115597,
  179. 0x10880860eb75e982,
  180. 0x16f3d90e6ab995a6,
  181. 0x5f24ea09379b9f5c,
  182. 0x3c2dc04088e8fe54,
  183. 0x340b8cf1c6f1227,
  184. 0x193bc348ed2e9ce7,
  185. 0x68454ef43da9c748,
  186. };
  187. static const guint64 expected_mumhash[RSPAMD_SHINGLE_SIZE] = {
  188. 0x38d35473b80a7fc3,
  189. 0x1300531adc2d16a1,
  190. 0x26883bc89f78f4bd,
  191. 0x57de365ef6d1a62,
  192. 0x773603185fcbb20a,
  193. 0x39c6cbd7ebbeaa88,
  194. 0x676c7445ad167e70,
  195. 0x432315d1ecc4c0b1,
  196. 0x1380b95756dbb078,
  197. 0x9ee12832fa53b90e,
  198. 0x72970be210f0dd0b,
  199. 0x62909bd520f5956,
  200. 0x66196965a45eb32a,
  201. 0x2466a9ca5436620e,
  202. 0x157b828b10e10f6e,
  203. 0x429bb673a523a7e5,
  204. 0x51a6ace94f320f88,
  205. 0x23f53a30bd7d7147,
  206. 0xbee557664d3bc34c,
  207. 0x65730c88cd212a9,
  208. 0x87e72c0cd05fd0e,
  209. 0x417a744669baeb3d,
  210. 0x78e26f7917829324,
  211. 0x439777dcfc25fdf4,
  212. 0x582eac6ff013f00b,
  213. 0x1e40aa90e367f4af,
  214. 0x301d14a28d6c23a2,
  215. 0x34140ecb21b6c69,
  216. 0x390a091c8b4c31b9,
  217. 0x2e35fecf9fff0ae7,
  218. 0x94322e1a5cf31f1b,
  219. 0x33cb9190905e049a,
  220. };
  221. static const guint64 expected_fasthash[RSPAMD_SHINGLE_SIZE] = {
  222. 0x3843a716f94828a6,
  223. 0x13fd5386dda3b28d,
  224. 0x71cb09de527c40a,
  225. 0x5d6f59ffd839c62,
  226. 0x7ce3633acd568476,
  227. 0x9014298cbd00167,
  228. 0x6708ec29eedb5350,
  229. 0x2882931ff2c5c410,
  230. 0x1839d8b947b12571,
  231. 0x58f7bc3829173302,
  232. 0x4dac8103da51abc4,
  233. 0x6c5cbcc6fb1de28,
  234. 0x31fefcef9bafb755,
  235. 0x6f2d1a0b1feca401,
  236. 0x3e71f3718e520b06,
  237. 0x42f6ba11164ab231,
  238. 0x21164d010bd76f4a,
  239. 0x4c597ccc7b60f620,
  240. 0x2cf1ca3383b77574,
  241. 0x54ff9c01660b8add,
  242. 0x2ca344758f40380d,
  243. 0x1b962321bd37d0f2,
  244. 0x9323bb99c32bc418,
  245. 0x375659d0eef2b8f2,
  246. 0x1dbd23a1030084b7,
  247. 0x83cb978dee06aa0a,
  248. 0x42c97be5b27a7763,
  249. 0x3b6d6b7270ed765,
  250. 0x125c12fdba584aed,
  251. 0x1c826397afe58763,
  252. 0x8bdbe2d43f3eda96,
  253. 0x954cda70edf6591f,
  254. };
  255. void rspamd_shingles_test_func(void)
  256. {
  257. enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
  258. struct rspamd_shingle *sgl;
  259. guchar key[16];
  260. GArray *input;
  261. rspamd_ftok_t tok;
  262. int i;
  263. memset(key, 0, sizeof(key));
  264. input = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), 5);
  265. for (i = 0; i < 5; i++) {
  266. gchar *b = g_alloca(8);
  267. memset(b, 0, 8);
  268. memcpy(b + 1, "test", 4);
  269. b[0] = 'a' + i;
  270. tok.begin = b;
  271. tok.len = 5 + ((i + 1) % 4);
  272. g_array_append_val(input, tok);
  273. }
  274. sgl = rspamd_shingles_from_text(input, key, NULL,
  275. rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_OLD);
  276. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  277. g_assert(sgl->hashes[i] == expected_old[i]);
  278. }
  279. g_free(sgl);
  280. sgl = rspamd_shingles_from_text(input, key, NULL,
  281. rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_XXHASH);
  282. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  283. g_assert(sgl->hashes[i] == expected_xxhash[i]);
  284. }
  285. g_free(sgl);
  286. sgl = rspamd_shingles_from_text(input, key, NULL,
  287. rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_MUMHASH);
  288. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  289. g_assert(sgl->hashes[i] == expected_mumhash[i]);
  290. }
  291. g_free(sgl);
  292. sgl = rspamd_shingles_from_text(input, key, NULL,
  293. rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_FAST);
  294. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  295. g_assert(sgl->hashes[i] == expected_fasthash[i]);
  296. }
  297. g_free(sgl);
  298. for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg++) {
  299. test_case(200, 10, 0.1, alg);
  300. test_case(500, 20, 0.01, alg);
  301. test_case(5000, 20, 0.01, alg);
  302. test_case(5000, 15, 0, alg);
  303. test_case(5000, 30, 1.0, alg);
  304. test_case(50000, 30, 0.02, alg);
  305. test_case(50000, 5, 0.02, alg);
  306. test_case(50000, 16, 0.02, alg);
  307. }
  308. }