You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lang_detection.h 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. /*-
  2. * Copyright 2017 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_LANG_DETECTION_H
  17. #define RSPAMD_LANG_DETECTION_H
  18. #include "config.h"
  19. #include "libserver/cfg_file.h"
  20. #include "libstat/stat_api.h"
  21. #include "libmime/message.h"
  22. #ifdef __cplusplus
  23. extern "C" {
  24. #endif
  /* Forward declarations: the declarations in this header only use these types through pointers. */
  25. struct rspamd_lang_detector;
  26. struct rspamd_language_elt;
  27. struct rspamd_task;
  /*
   * Unicode script identifiers.
   * Every constant is a distinct single bit (1 << 0 .. 1 << 16), so several
   * scripts can be combined into one integer mask with bitwise OR.
   */
  28. enum rspamd_unicode_scripts {
  29. RSPAMD_UNICODE_LATIN = (1 << 0),
  30. RSPAMD_UNICODE_GREEK = (1 << 1),
  31. RSPAMD_UNICODE_CYRILLIC = (1 << 2),
  32. RSPAMD_UNICODE_HEBREW = (1 << 3),
  33. RSPAMD_UNICODE_CJK = (1 << 4),
  34. RSPAMD_UNICODE_JP = (1 << 5),
  35. RSPAMD_UNICODE_ARABIC = (1 << 6),
  36. RSPAMD_UNICODE_DEVANAGARI = (1 << 7),
  37. RSPAMD_UNICODE_THAI = (1 << 8),
  38. RSPAMD_UNICODE_ARMENIAN = (1 << 9),
  39. RSPAMD_UNICODE_GEORGIAN = (1 << 10),
  40. RSPAMD_UNICODE_GUJARATI = (1 << 11),
  41. RSPAMD_UNICODE_TAMIL = (1 << 12),
  42. RSPAMD_UNICODE_TELUGU = (1 << 13),
  43. RSPAMD_UNICODE_MALAYALAM = (1 << 14),
  44. RSPAMD_UNICODE_SINHALA = (1 << 15),
  45. RSPAMD_UNICODE_HANGUL = (1 << 16), /* highest bit currently assigned */
  46. };
  /*
   * Per-language flag bits.
   * RS_LANGUAGE_DEFAULT is zero (no bit set); every other constant is a
   * distinct single bit.
   * NOTE(review): bits 1 and 2 are not assigned in this enum - presumably
   * reserved or retired; confirm before reusing them.
   */
  47. enum rspamd_language_elt_flags {
  48. RS_LANGUAGE_DEFAULT = 0,
  49. RS_LANGUAGE_LATIN = (1 << 0),
  50. RS_LANGUAGE_TIER1 = (1 << 3),
  51. RS_LANGUAGE_TIER0 = (1 << 4),
  52. RS_LANGUAGE_DIACRITICS = (1 << 5),
  53. RS_LANGUAGE_ASCII = (1 << 6),
  54. };
  /* One language detection result entry. */
  55. struct rspamd_lang_detector_res {
  56. double prob; /* presumably the probability/score of this candidate - TODO confirm range */
  57. const char *lang; /* presumably the language name or code; ownership not visible here */
  58. struct rspamd_language_elt *elt; /* language element associated with this result */
  59. };
  60. /**
  61. * Create new language detector object using configuration object
  62. * @param cfg configuration object used to build the detector
  63. * @return pointer to the new detector; NOTE(review): failure behaviour (e.g. NULL) is not visible in this header - confirm
  64. */
  65. struct rspamd_lang_detector *rspamd_language_detector_init(struct rspamd_config *cfg);
  /* Takes a detector and returns a detector pointer; presumably increments a reference count - TODO confirm. */
  66. struct rspamd_lang_detector *rspamd_language_detector_ref(struct rspamd_lang_detector *d);
  /* Counterpart of rspamd_language_detector_ref; presumably drops one reference - TODO confirm. */
  67. void rspamd_language_detector_unref(struct rspamd_lang_detector *d);
  68. /**
  69. * Try to detect language of a text part
  70. * @param task task handle (struct rspamd_task)
  71. * @param d language detector object
  72. * @param part text part to run detection on
  73. * @return gboolean status. NOTE(review): earlier documentation described an array of struct rspamd_lang_detector_res sorted by freq descending, but the declared return type is gboolean - presumably the results are stored elsewhere; confirm against the implementation
  74. */
  75. gboolean rspamd_language_detector_detect(struct rspamd_task *task,
  76. struct rspamd_lang_detector *d,
  77. struct rspamd_mime_text_part *part);
  78. /**
  79. * Returns TRUE if the specified word is known to be a stop word
  80. * @param d language detector object
  81. * @param word pointer to the word to look up
  82. * @param wlen length of word (presumably in bytes, so word need not be NUL-terminated - TODO confirm)
  83. * @return TRUE if the word is known to be a stop word
  84. */
  85. gboolean rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d,
  86. const char *word, gsize wlen);
  87. /**
  88. * Return language flags for a specific language elt
  89. * @param elt language element to query (not modified through this pointer)
  90. * @return flags as int; presumably a bitwise combination of enum rspamd_language_elt_flags values - TODO confirm
  91. */
  92. int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt);
  93. #ifdef __cplusplus
  94. }
  95. #endif
  96. #endif