You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lang_detection.h 2.5KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. /*-
  2. * Copyright 2017 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_LANG_DETECTION_H
  17. #define RSPAMD_LANG_DETECTION_H
  18. #include "config.h"
  19. #include "libserver/cfg_file.h"
  20. #include "libstat/stat_api.h"
  21. #include "libmime/message.h"
  /*
   * Forward declarations: the declarations in this header only use pointers
   * to these types, so their full definitions are not needed here.
   */
  struct rspamd_lang_detector;
  struct rspamd_language_elt;
  struct rspamd_task;
  /**
   * Unicode script flags declared alongside the language detector API.
   *
   * Every constant occupies its own bit (bits 0 through 16), so no two
   * values overlap.
   * NOTE(review): presumably meant to be OR-ed together into a script
   * mask - confirm against the detector implementation.
   */
  enum rspamd_unicode_scripts {
      RSPAMD_UNICODE_LATIN = (1 << 0),
      RSPAMD_UNICODE_GREEK = (1 << 1),
      RSPAMD_UNICODE_CYRILLIC = (1 << 2),
      RSPAMD_UNICODE_HEBREW = (1 << 3),
      RSPAMD_UNICODE_CJK = (1 << 4),
      RSPAMD_UNICODE_JP = (1 << 5),
      RSPAMD_UNICODE_ARABIC = (1 << 6),
      RSPAMD_UNICODE_DEVANAGARI = (1 << 7),
      RSPAMD_UNICODE_THAI = (1 << 8),
      RSPAMD_UNICODE_ARMENIAN = (1 << 9),
      RSPAMD_UNICODE_GEORGIAN = (1 << 10),
      RSPAMD_UNICODE_GUJARATI = (1 << 11),
      RSPAMD_UNICODE_TAMIL = (1 << 12),
      RSPAMD_UNICODE_TELUGU = (1 << 13),
      RSPAMD_UNICODE_MALAYALAM = (1 << 14),
      RSPAMD_UNICODE_SINHALA = (1 << 15),
      RSPAMD_UNICODE_HANGUL = (1 << 16),
  };
  44. struct rspamd_lang_detector_res {
  45. gdouble prob;
  46. const gchar *lang;
  47. struct rspamd_language_elt *elt;
  48. };
  /**
   * Create new language detector object using configuration object
   * @param cfg configuration object the detector is created from
   * @return pointer to the new detector
   *         NOTE(review): the failure value (NULL?) and ownership rules are
   *         not visible from this header; presumably the result is released
   *         with rspamd_language_detector_unref - confirm.
   */
  struct rspamd_lang_detector *rspamd_language_detector_init (struct rspamd_config *cfg);
  /**
   * NOTE(review): judging by the name, takes an additional reference on the
   * detector - confirm against the implementation.
   * @param d language detector
   * @return detector pointer (presumably `d` itself - TODO confirm)
   */
  struct rspamd_lang_detector *rspamd_language_detector_ref (struct rspamd_lang_detector *d);
  /**
   * NOTE(review): judging by the name, drops one reference on the detector;
   * whether and when the detector is destroyed is not visible from this
   * header - confirm against the implementation.
   * @param d language detector
   */
  void rspamd_language_detector_unref (struct rspamd_lang_detector *d);
  57. /**
  58. * Try to detect language of words
  59. * @param d
  60. * @param ucs_tokens
  61. * @param words_len
  62. * @return array of struct rspamd_lang_detector_res sorted by freq descending
  63. */
  64. gboolean rspamd_language_detector_detect (struct rspamd_task *task,
  65. struct rspamd_lang_detector *d,
  66. struct rspamd_mime_text_part *part);
  67. /**
  68. * Returns TRUE if the specified word is known to be a stop word
  69. * @param d
  70. * @param word
  71. * @param wlen
  72. * @return
  73. */
  74. gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
  75. const gchar *word, gsize wlen);
  76. #endif