You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

osb.c 3.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. /*
  2. * Copyright (c) 2009-2012, Vsevolod Stakhov
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. *
  13. * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
  14. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  15. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  16. * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
  17. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  18. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  19. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  20. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  21. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  22. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. */
  24. /*
  25. * OSB tokenizer
  26. */
  27. #include <sys/types.h>
  28. #include "tokenizers.h"
  29. /* Minimum length of token */
  30. #define MIN_LEN 4
  31. extern const int primes[];
  32. int
  33. osb_tokenize_text (struct tokenizer *tokenizer,
  34. rspamd_mempool_t * pool,
  35. GArray * input,
  36. GTree ** tree,
  37. gboolean save_token,
  38. gboolean is_utf,
  39. GList *exceptions)
  40. {
  41. token_node_t *new = NULL;
  42. rspamd_fstring_t *token;
  43. guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
  44. gint i, processed = 0;
  45. guint w;
  46. if (input == NULL) {
  47. return FALSE;
  48. }
  49. if (*tree == NULL) {
  50. *tree = g_tree_new (token_node_compare_func);
  51. rspamd_mempool_add_destructor (pool,
  52. (rspamd_mempool_destruct_t) g_tree_destroy,
  53. *tree);
  54. }
  55. memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
  56. for (w = 0; w < input->len; w ++) {
  57. token = &g_array_index (input, rspamd_fstring_t, w);
  58. if (processed < FEATURE_WINDOW_SIZE) {
  59. /* Just fill a hashpipe */
  60. hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
  61. rspamd_fstrhash_lc (token, is_utf);
  62. }
  63. else {
  64. /* Shift hashpipe */
  65. for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
  66. hashpipe[i] = hashpipe[i - 1];
  67. }
  68. hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
  69. processed++;
  70. for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
  71. h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
  72. h2 = hashpipe[0] * primes[1] + hashpipe[i] *
  73. primes[(i << 1) - 1];
  74. new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
  75. new->h1 = h1;
  76. new->h2 = h2;
  77. if (save_token) {
  78. new->extra =
  79. (uintptr_t)rspamd_mempool_fstrdup (pool, token);
  80. }
  81. if (g_tree_lookup (*tree, new) == NULL) {
  82. g_tree_insert (*tree, new, new);
  83. }
  84. }
  85. }
  86. }
  87. if (processed <= FEATURE_WINDOW_SIZE) {
  88. for (i = 1; i < processed; i++) {
  89. h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
  90. h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
  91. new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
  92. new->h1 = h1;
  93. new->h2 = h2;
  94. if (save_token) {
  95. new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
  96. }
  97. if (g_tree_lookup (*tree, new) == NULL) {
  98. g_tree_insert (*tree, new, new);
  99. }
  100. }
  101. }
  102. return TRUE;
  103. }
  104. /*
  105. * vi:ts=4
  106. */