You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizers.h 2.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  /*
   * Copyright 2023 Vsevolod Stakhov
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *    http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  16. #ifndef TOKENIZERS_H
  17. #define TOKENIZERS_H
  18. #include "config.h"
  19. #include "mem_pool.h"
  20. #include "fstring.h"
  21. #include "rspamd.h"
  22. #include "stat_api.h"
  23. #include <unicode/utext.h>
  24. #define RSPAMD_DEFAULT_TOKENIZER "osb"
  25. #ifdef __cplusplus
  26. extern "C" {
  27. #endif
  /*
   * Forward declarations of types that this header only refers to by name
   * or through pointers; no definition is given here.
   */
  struct rspamd_tokenizer_runtime;
  struct rspamd_stat_ctx;
  30. /* Common tokenizer structure */
  31. struct rspamd_stat_tokenizer {
  32. gchar *name;
  33. gpointer (*get_config)(rspamd_mempool_t *pool,
  34. struct rspamd_tokenizer_config *cf, gsize *len);
  35. gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
  36. struct rspamd_task *task,
  37. GArray *words,
  38. gboolean is_utf,
  39. const gchar *prefix,
  40. GPtrArray *result);
  41. };
  /*
   * Tokenization mode; passed as the `how` argument of
   * rspamd_tokenize_text().
   * NOTE(review): going by the names, the members select UTF-8, raw and
   * Unicode (UText) input handling - confirm against the implementation.
   * The numeric values are spelled out; they equal the values the compiler
   * assigns implicitly when only the first member is set to 0.
   */
  enum rspamd_tokenize_type {
      RSPAMD_TOKENIZE_UTF = 0,
      RSPAMD_TOKENIZE_RAW = 1,
      RSPAMD_TOKENIZE_UNICODE = 2
  };
  47. /* Compare two token nodes */
  48. gint token_node_compare_func(gconstpointer a, gconstpointer b);
  49. /* Tokenize text into array of words (rspamd_stat_token_t type) */
  50. GArray *rspamd_tokenize_text(const gchar *text, gsize len,
  51. const UText *utxt,
  52. enum rspamd_tokenize_type how,
  53. struct rspamd_config *cfg,
  54. GList *exceptions,
  55. uint64_t *hash,
  56. GArray *cur_words,
  57. rspamd_mempool_t *pool);
  58. /* OSB tokenize function */
  59. gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
  60. struct rspamd_task *task,
  61. GArray *words,
  62. gboolean is_utf,
  63. const gchar *prefix,
  64. GPtrArray *result);
  65. gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
  66. struct rspamd_tokenizer_config *cf,
  67. gsize *len);
  /*
   * Forward declaration: this header only passes pointers to the language
   * detector (see rspamd_stem_words()).
   */
  struct rspamd_lang_detector;
  69. void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
  70. void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
  71. void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
  72. const gchar *language,
  73. struct rspamd_lang_detector *lang_detector);
  /*
   * Tokenize the meta words of a task.
   * NOTE(review): which task fields count as "meta" and where the result is
   * stored are not visible in this header; the function returns nothing, so
   * any result must be recorded through `task`.
   */
  void rspamd_tokenize_meta_words(struct rspamd_task *task);
  75. #ifdef __cplusplus
  76. }
  77. #endif
  78. #endif