You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizers.c 1.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. /*
  2. * Common tokenization functions
  3. */
  4. #include <sys/types.h>
  5. #include "tokenizers.h"
  6. struct tokenizer tokenizers[] = {
  7. {"osb-text", osb_tokenize_text, get_next_word },
  8. };
  9. struct tokenizer*
  10. get_tokenizer (char *name)
  11. {
  12. int i;
  13. for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i ++) {
  14. if (strcmp (tokenizers[i].name, name) == 0) {
  15. return &tokenizers[i];
  16. }
  17. }
  18. return NULL;
  19. }
  20. int
  21. token_node_compare_func (gconstpointer a, gconstpointer b)
  22. {
  23. const token_node_t *aa = a, *bb = b;
  24. if (aa->h1 == bb->h1) {
  25. return aa->h2 - bb->h2;
  26. }
  27. return aa->h1 - bb->h1;
  28. }
  29. /* Get next word from specified f_str_t buf */
  30. f_str_t *
  31. get_next_word (f_str_t *buf, f_str_t *token)
  32. {
  33. size_t remain;
  34. char *pos;
  35. if (buf == NULL) {
  36. return NULL;
  37. }
  38. if (token->begin == NULL) {
  39. token->begin = buf->begin;
  40. }
  41. remain = buf->len - (token->begin - buf->begin);
  42. if (remain <= 0) {
  43. return NULL;
  44. }
  45. token->begin = token->begin + token->len;
  46. token->len = 0;
  47. pos = token->begin;
  48. /* Skip non graph symbols */
  49. while (remain-- && !g_ascii_isgraph (*pos ++)) {
  50. token->begin ++;
  51. }
  52. while (remain-- && g_ascii_isgraph (*pos ++)) {
  53. token->len ++;
  54. }
  55. return token;
  56. }
  57. /*
  58. * vi:ts=4
  59. */