You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizers.h 1.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. #ifndef TOKENIZERS_H
  2. #define TOKENIZERS_H
  3. #include <sys/types.h>
  4. #include "../config.h"
  5. #ifdef HAVE_STDINT_H
  6. #include <stdint.h>
  7. #endif
  8. #include "../mem_pool.h"
  9. #include "../fstring.h"
  10. #include "../main.h"
  11. /* Size for features pipe.
   * NOTE(review): presumably the number of consecutive tokens kept in the
   * window from which features are built -- confirm in the tokenizer sources. */
  12. #define FEATURE_WINDOW_SIZE 5
  /* Token node: a pair of 32-bit unsigned values.
   * NOTE(review): h1 and h2 look like two hash values describing one
   * token/feature -- confirm against the tokenizer implementation. */
  13. typedef struct token_node_s {
  14. uint32_t h1;
  15. uint32_t h2;
  16. } token_node_t;
  17. /* Common tokenizer structure: describes one tokenizer implementation.
   *
   *  name          - tokenizer name (presumably the key that get_tokenizer()
   *                  matches against -- confirm in the implementation)
   *  tokenize_func - callback that receives this tokenizer, a memory pool and
   *                  the input text and returns a GTree; osb_tokenize_text()
   *                  has the same signature
   *  get_next_word - callback that receives a buffer and a token; the free
   *                  function get_next_word() has the same signature
   */
  18. struct tokenizer {
  19. char *name;
  20. GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
  21. f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
  22. };
  23. /* Compare two token nodes.
   * NOTE(review): the signature matches GLib's GCompareFunc, so this is
   * presumably the ordering function for the GTree the tokenizers return;
   * the meaning of the return value is not visible here -- confirm. */
  24. int token_node_compare_func (gconstpointer a, gconstpointer b);
  25. /* Get tokenizer structure by name or return NULL if this name is not found */
  26. struct tokenizer* get_tokenizer (char *name);
  27. /* Get next word from specified f_str_t buf.
   * How `token` is used and what is returned at the end of the buffer are
   * not visible in this header -- see the implementation. */
  28. f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
  29. /* OSB tokenize function: takes the tokenizer, a memory pool and the input
   * text and returns a GTree. Same signature as the tokenize_func member of
   * struct tokenizer. */
  30. GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
  31. /* Array of all defined tokenizers (declaration only; the definition and its
   * length live in another translation unit) */
  32. extern struct tokenizer tokenizers[];
  33. #endif
  34. /*
  35. * vi:ts=4
  36. */