Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

lang_detection.h 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. /*-
  2. * Copyright 2017 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_LANG_DETECTION_H
  17. #define RSPAMD_LANG_DETECTION_H
  18. #include "config.h"
  19. #include "libserver/cfg_file.h"
  20. #include "libstat/stat_api.h"
  21. #include "libmime/message.h"
  22. #ifdef __cplusplus
  23. extern "C" {
  24. #endif
  /*
   * Forward declarations: this header only uses pointers to these types,
   * so their full definitions are not required here.
   */
  struct rspamd_lang_detector;
  struct rspamd_language_elt;
  struct rspamd_task;
  /**
   * Unicode script identifiers expressed as bit flags.
   *
   * Every enumerator occupies its own bit (bits 0..16), so several scripts
   * can be represented together in a single integer mask.
   */
  enum rspamd_unicode_scripts {
      RSPAMD_UNICODE_LATIN = (1 << 0),
      RSPAMD_UNICODE_GREEK = (1 << 1),
      RSPAMD_UNICODE_CYRILLIC = (1 << 2),
      RSPAMD_UNICODE_HEBREW = (1 << 3),
      RSPAMD_UNICODE_CJK = (1 << 4),
      RSPAMD_UNICODE_JP = (1 << 5),
      RSPAMD_UNICODE_ARABIC = (1 << 6),
      RSPAMD_UNICODE_DEVANAGARI = (1 << 7),
      RSPAMD_UNICODE_THAI = (1 << 8),
      RSPAMD_UNICODE_ARMENIAN = (1 << 9),
      RSPAMD_UNICODE_GEORGIAN = (1 << 10),
      RSPAMD_UNICODE_GUJARATI = (1 << 11),
      RSPAMD_UNICODE_TAMIL = (1 << 12),
      RSPAMD_UNICODE_TELUGU = (1 << 13),
      RSPAMD_UNICODE_MALAYALAM = (1 << 14),
      RSPAMD_UNICODE_SINHALA = (1 << 15),
      RSPAMD_UNICODE_HANGUL = (1 << 16),
  };
  /**
   * Flags attached to a language element, expressed as bit flags.
   *
   * RS_LANGUAGE_DEFAULT is the zero value (no flag set). The remaining
   * enumerators use bits 0, 3, 4 and 5; bits 1 and 2 are not assigned
   * in this enum.
   */
  enum rspamd_language_elt_flags {
      RS_LANGUAGE_DEFAULT = 0,
      RS_LANGUAGE_LATIN = (1 << 0),
      RS_LANGUAGE_TIER1 = (1 << 3),
      RS_LANGUAGE_TIER0 = (1 << 4),
      RS_LANGUAGE_DIACRITICS = (1 << 5),
  };
  54. struct rspamd_lang_detector_res {
  55. gdouble prob;
  56. const gchar *lang;
  57. struct rspamd_language_elt *elt;
  58. };
  /**
   * Create new language detector object using configuration object
   * @param cfg configuration object used to set up the detector
   * @return pointer to the new detector
   *         (NOTE(review): failure behaviour is not visible from this header - confirm)
   */
  struct rspamd_lang_detector *rspamd_language_detector_init (struct rspamd_config *cfg);
  /**
   * Take a reference to a language detector
   * (NOTE(review): presumably increments a reference count - confirm in the implementation)
   * @param d detector object
   * @return pointer to a detector (presumably `d` itself - confirm)
   */
  struct rspamd_lang_detector *rspamd_language_detector_ref (struct rspamd_lang_detector *d);
  /**
   * Release a reference to a language detector
   * (NOTE(review): presumably the counterpart of rspamd_language_detector_ref;
   * whether the detector is destroyed on the last release is not visible here - confirm)
   * @param d detector object
   */
  void rspamd_language_detector_unref (struct rspamd_lang_detector *d);
  67. /**
  68. * Try to detect language of words
  69. * @param d
  70. * @param ucs_tokens
  71. * @param words_len
  72. * @return array of struct rspamd_lang_detector_res sorted by freq descending
  73. */
  74. gboolean rspamd_language_detector_detect (struct rspamd_task *task,
  75. struct rspamd_lang_detector *d,
  76. struct rspamd_mime_text_part *part);
  77. /**
  78. * Returns TRUE if the specified word is known to be a stop word
  79. * @param d
  80. * @param word
  81. * @param wlen
  82. * @return
  83. */
  84. gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
  85. const gchar *word, gsize wlen);
  86. /**
  87. * Return language flags for a specific language elt
  88. * @param elt
  89. * @return
  90. */
  91. gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt);
  92. #ifdef __cplusplus
  93. }
  94. #endif
  95. #endif