You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizers.h 1.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #ifndef TOKENIZERS_H
  2. #define TOKENIZERS_H
  3. #include "config.h"
  4. #include "mem_pool.h"
  5. #include "fstring.h"
  6. #include "main.h"
  7. /* Size for features pipe */
  8. #define FEATURE_WINDOW_SIZE 5
  9. typedef struct token_node_s { /* One token record; nodes are ordered by token_node_compare_func() */
  10. guint32 h1; /* first 32-bit key; presumably a hash of the token - TODO confirm */
  11. guint32 h2; /* second 32-bit key; presumably a second hash of the token - TODO confirm */
  12. double value; /* numeric value attached to the token; meaning is not documented in this header */
  13. uintptr_t extra; /* pointer-sized auxiliary slot; usage is not defined in this header */
  14. } token_node_t;
  15. /* Common tokenizer structure: a name plus the two callbacks that implement
   * the tokenizer. All instances are exposed through the tokenizers[] array
   * declared at the end of this header. */
  16. struct tokenizer {
  17. gchar *name; /* tokenizer name; presumably the key matched by get_tokenizer() - confirm */
  18. gint (*tokenize_func)(struct tokenizer *tokenizer, /* callback; parameter list matches osb_tokenize_text() */
  19. rspamd_mempool_t *pool,
  20. GArray *words,
  21. GTree **cur,
  22. gboolean save_token,
  23. gboolean is_utf,
  24. GList *exceptions);
  25. gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); /* callback; parameter list matches rspamd_tokenizer_get_word() */
  26. };
  27. /* Compare two token nodes. The signature has the shape of GLib's
   * GCompareFunc; presumably used as the comparator of the GTree that the
   * tokenize functions fill - TODO confirm. */
  28. int token_node_compare_func (gconstpointer a, gconstpointer b);
  29. /* Get the tokenizer structure with the given name, or return NULL if no
   * tokenizer with this name is found. Presumably searches tokenizers[] -
   * confirm against the implementation. */
  30. struct tokenizer * get_tokenizer (const char *name);
  31. /* Get the next word from the specified rspamd_fstring_t buf.
   * The parameter list matches the get_next_word member of struct tokenizer.
   * NOTE(review): presumably `token` receives the word found and `exceptions`
   * is advanced as it is consumed - confirm against the implementation. */
  32. gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
  33. rspamd_fstring_t *token, GList **exceptions);
  34. /* Tokenize text into an array of words (elements are of rspamd_fstring_t type).
   * `text` and `len` give the input buffer and its length.
   * NOTE(review): `is_utf` presumably selects UTF-8 aware handling and
   * `min_len` presumably is the minimum word length to keep - confirm.
   * Ownership of the returned GArray is not documented here. */
  35. GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
  36. gsize min_len, GList **exceptions);
  37. /* OSB tokenize function. Its parameter list matches the tokenize_func
   * member of struct tokenizer, so it can be stored there.
   * NOTE(review): `input` is presumably the word array produced by
   * rspamd_tokenize_text() and `cur` the tree that receives the tokens -
   * confirm against the implementation. */
  38. int osb_tokenize_text (struct tokenizer *tokenizer,
  39. rspamd_mempool_t *pool,
  40. GArray *input,
  41. GTree **cur,
  42. gboolean save_token,
  43. gboolean is_utf,
  44. GList *exceptions);
  45. /* Make tokens for a subject. Takes the task and a pointer to the token
   * tree; presumably the subject is read from `task` and the tokens are
   * added to `*tree` - TODO confirm. */
  46. void tokenize_subject (struct rspamd_task *task, GTree ** tree);
  47. /* Array of all defined tokenizers. Declared here with an incomplete array
   * type and defined elsewhere, so its element count is not visible through
   * this header (sizeof cannot be applied to it from here). */
  48. extern struct tokenizer tokenizers[];
  49. #endif
  50. /*
  51. * vi:ts=4
  52. */