You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

shingles.h 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef SHINGLES_H_
  17. #define SHINGLES_H_
  18. #include "config.h"
  19. #include "mem_pool.h"
  /* Number of 64-bit hashes held in one struct rspamd_shingle (size of its `hashes` array). */
  20. #define RSPAMD_SHINGLE_SIZE 32
  21. #ifdef __cplusplus
  22. extern "C" {
  23. #endif
  /**
  * A single shingle set: a fixed-size array of RSPAMD_SHINGLE_SIZE
  * 64-bit hash values. This is the type returned by the
  * rspamd_shingles_from_*() generators and consumed by
  * rspamd_shingles_compare().
  */
  24. struct rspamd_shingle {
  25. uint64_t hashes[RSPAMD_SHINGLE_SIZE];
  26. };
  /**
  * Algorithm selector taken as the `alg` argument by
  * rspamd_shingles_from_text() and rspamd_shingles_from_image().
  * Only RSPAMD_SHINGLES_OLD has an explicit value (0); the remaining
  * enumerators follow sequentially.
  * NOTE(review): the enumerator names suggest which hash function each
  * variant uses - confirm against the implementation.
  */
  27. enum rspamd_shingle_alg {
  28. RSPAMD_SHINGLES_OLD = 0,
  29. RSPAMD_SHINGLES_XXHASH,
  30. RSPAMD_SHINGLES_MUMHASH,
  31. RSPAMD_SHINGLES_FAST
  32. };
  33. /**
  34. * Shingles filtering function
  35. * @param input input array of hashes
  36. * @param count number of hashes in the vector
  * @param shno NOTE(review): presumably the index of the shingle being
  *             computed - confirm against the implementation
  * @param key NOTE(review): presumably the secret key given to the
  *            shingle generator - confirm against the implementation
  * @param ud NOTE(review): presumably the opaque `filterd` pointer given
  *           to the shingle generator - confirm against the implementation
  37. * @return shingle value
  38. */
  39. typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count,
  40. gint shno, const guchar *key, gpointer ud);
  41. /**
  42. * Generate shingles from the input of fixed size strings using lemmatizer
  43. * if needed
  44. * @param input array of `rspamd_fstring_t`
  45. * @param key secret key used to generate shingles
  46. * @param pool pool to allocate shingles array
  47. * @param filter hashes filtering function
  48. * @param filterd opaque data for filtering function
  * @param alg shingle algorithm selector (one of enum rspamd_shingle_alg)
  49. * @return shingles array
  50. */
  51. struct rspamd_shingle *rspamd_shingles_from_text(GArray *input,
  52. const guchar key[16],
  53. rspamd_mempool_t *pool,
  54. rspamd_shingles_filter filter,
  55. gpointer filterd,
  56. enum rspamd_shingle_alg alg);
  57. /**
  58. * Generate shingles from the DCT matrix of an image
  59. * @param dct discrete cosine transform matrix (must be 64x64)
  60. * @param key secret key used to generate shingles
  61. * @param pool pool to allocate shingles array
  62. * @param filter hashes filtering function
  63. * @param filterd opaque data for filtering function
  * @param alg shingle algorithm selector (one of enum rspamd_shingle_alg)
  64. * @return shingles array
  65. */
  66. struct rspamd_shingle *rspamd_shingles_from_image(guchar *dct,
  67. const guchar key[16],
  68. rspamd_mempool_t *pool,
  69. rspamd_shingles_filter filter,
  70. gpointer filterd,
  71. enum rspamd_shingle_alg alg);
  72. /**
  73. * Compares two shingles and returns the result as a floating point value:
  74. * 1.0 for completely similar shingles and 0.0 for completely different ones
  75. * @param a first shingle set to compare (not modified)
  76. * @param b second shingle set to compare (not modified)
  77. * @return similarity value, 1.0 for completely similar shingles and 0.0
  *         for completely different ones
  78. */
  79. gdouble rspamd_shingles_compare(const struct rspamd_shingle *a,
  80. const struct rspamd_shingle *b);
  81. /**
  82. * Default filtering function. Its signature matches the
  * rspamd_shingles_filter typedef, so it can be passed as the `filter`
  * argument of the rspamd_shingles_from_*() generators.
  * @param input input array of hashes
  * @param count number of hashes in the vector
  * @return shingle value
  83. */
  84. uint64_t rspamd_shingles_default_filter(uint64_t *input, gsize count,
  85. gint shno, const guchar *key, gpointer ud);
  86. #ifdef __cplusplus
  87. }
  88. #endif
  89. #endif /* SHINGLES_H_ */