You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

rspamd_shingles_test.c 2.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "rspamd.h"
  18. #include "shingles.h"
  19. #include "ottery.h"
  20. #include <math.h>
  /*
   * Fill the buffer [begin, begin + len) with random characters.
   *
   * Every byte is set to 'a' plus the value returned by
   * ottery_rand_range ('z' - 'a'). No terminating NUL is written, so the
   * caller has to track the length separately.
   */
  static void
  generate_random_string (char *begin, size_t len)
  {
      char *cur = begin;
      char *const stop = begin + len;

      while (cur < stop) {
          *cur = ottery_rand_range ('z' - 'a') + 'a';
          cur ++;
      }
  }
  29. static GArray *
  30. generate_fuzzy_words (gsize cnt, gsize max_len)
  31. {
  32. GArray *res;
  33. gsize i, wlen;
  34. rspamd_ftok_t w;
  35. char *t;
  36. res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), cnt);
  37. for (i = 0; i < cnt; i ++) {
  38. wlen = ottery_rand_range (max_len) + 1;
  39. w.len = wlen;
  40. t = g_malloc (wlen);
  41. generate_random_string (t, wlen);
  42. w.begin = t;
  43. g_array_append_val (res, w);
  44. }
  45. return res;
  46. }
  47. static void
  48. permute_vector (GArray *in, gdouble prob)
  49. {
  50. gsize i, total = 0;
  51. rspamd_ftok_t *w;
  52. for (i = 0; i < in->len; i ++) {
  53. if (ottery_rand_unsigned () <= G_MAXUINT * prob) {
  54. w = &g_array_index (in, rspamd_ftok_t, i);
  55. generate_random_string ((gchar *)w->begin, w->len);
  56. total ++;
  57. }
  58. }
  59. msg_debug ("generated %z permutations of %ud words", total, in->len);
  60. }
  61. static void
  62. free_fuzzy_words (GArray *ar)
  63. {
  64. gsize i;
  65. rspamd_ftok_t *w;
  66. for (i = 0; i < ar->len; i ++) {
  67. w = &g_array_index (ar, rspamd_ftok_t, i);
  68. g_free ((gpointer)w->begin);
  69. }
  70. }
  71. static void
  72. test_case (gsize cnt, gsize max_len, gdouble perm_factor)
  73. {
  74. GArray *input;
  75. struct rspamd_shingle *sgl, *sgl_permuted;
  76. gdouble res;
  77. guchar key[16];
  78. gdouble ts1, ts2;
  79. ottery_rand_bytes (key, sizeof (key));
  80. input = generate_fuzzy_words (cnt, max_len);
  81. ts1 = rspamd_get_ticks ();
  82. sgl = rspamd_shingles_generate (input, key, NULL,
  83. rspamd_shingles_default_filter, NULL);
  84. ts2 = rspamd_get_ticks ();
  85. permute_vector (input, perm_factor);
  86. sgl_permuted = rspamd_shingles_generate (input, key, NULL,
  87. rspamd_shingles_default_filter, NULL);
  88. res = rspamd_shingles_compare (sgl, sgl_permuted);
  89. msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec",
  90. res, (gint)(ts1 - ts2) * 1000);
  91. g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20);
  92. free_fuzzy_words (input);
  93. g_free (sgl);
  94. g_free (sgl_permuted);
  95. }
  /*
   * Entry point of the shingles test: runs test_case for a fixed list of
   * (word count, maximum word length, permutation probability) scenarios,
   * in the order they are listed.
   */
  void
  rspamd_shingles_test_func (void)
  {
      static const struct {
          size_t cnt;
          size_t max_len;
          double perm_factor;
      } scenarios[] = {
          /* the (5, 100, 0.5) scenario is disabled in this file */
          { 200, 10, 0.1 },
          { 500, 20, 0.01 },
          { 5000, 20, 0.01 },
          { 5000, 15, 0.0 },
          { 5000, 30, 1.0 },
      };
      size_t n;

      for (n = 0; n < sizeof (scenarios) / sizeof (scenarios[0]); n ++) {
          test_case (scenarios[n].cnt, scenarios[n].max_len,
                  scenarios[n].perm_factor);
      }
  }