You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

compact_enc_det.h 3.5KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. // Copyright 2016 Google Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. ////////////////////////////////////////////////////////////////////////////////
  16. #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
  17. #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
  18. #include "util/encodings/encodings.h" // for Encoding
  19. #include "util/languages/languages.h" // for Language
  20. #include <string.h>
  21. namespace CompactEncDet {
  22. // We may want different statistics, depending on whether the text being
  23. // identified is from the web, from email, etc. This is currently ignored,
  24. // except WEB_CORPUS enables ignoring chars inside tags.
  25. enum TextCorpusType { // Kind of text being identified; passed to DetectEncoding as corpus_type
  26. WEB_CORPUS, // Text from the web; the one value that currently changes behavior (chars inside tags are ignored)
  27. XML_CORPUS, // NOTE(review): presumably XML documents -- not described in this header, confirm
  28. QUERY_CORPUS, // Use this for vanilla plaintext
  29. EMAIL_CORPUS, // Text from email
  30. NUM_CORPA, // always last, so its value equals the number of corpus types listed above
  31. };
  32. // Scan raw bytes and detect most likely encoding
  33. // Design goals:
  34. // Skip over big initial stretches of seven-bit ASCII bytes very quickly
  35. // Thread safe
  36. // Works equally well on
  37. // 50-byte queries,
  38. // 5000-byte email and
  39. // 50000-byte web pages
  40. // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
  41. //
  42. // Inputs: text and text_length
  43. // web page's url (preferred) or just
  44. // top-level domain name (e.g. "com") or NULL as a hint
  45. // web page's HTTPheader charset= string (e.g. "Latin1") or NULL as a hint
  46. // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
  47. // an Encoding or UNKNOWN_ENCODING as a hint
  48. // a Language or UNKNOWN_LANGUAGE as a hint
  49. // corpus type from the list above. Currently ignored, except that WEB_CORPUS
  50. // ignores chars inside tags; may select different probability tables in the future
  51. // ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit encodings
  52. // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
  53. // This may save a little scoring time on pure printable ASCII input text
  54. // Outputs: bytes_consumed says how much of text_length was actually examined
  55. // is_reliable set true if the returned encoding is at least 2**10 times more
  56. // probable than the second-best encoding
  57. // Return value: the most likely encoding for the input text
  58. //
  59. // Setting ignore_7bit_mail_encodings effectively turns off detection of
  60. // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
  61. // when corpus_type is QUERY_CORPUS.
  62. Encoding DetectEncoding( // Returns the most likely encoding for the input text
  63. const char* text, int text_length, const char* url_hint, // bytes to scan and their length; url_hint is the page URL or a TLD such as "com", or NULL
  64. const char* http_charset_hint, const char* meta_charset_hint, // charset= strings from the HTTP header and the <meta> tag; either may be NULL
  65. const int encoding_hint, // an Encoding value or UNKNOWN_ENCODING, passed as a plain int
  66. const Language language_hint, // User interface lang, or UNKNOWN_LANGUAGE
  67. const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, // flag true: do not return ISO-2022-xx, HZ, or UTF-7
  68. int* bytes_consumed, bool* is_reliable); // outputs: how much of text_length was examined; whether the result is at least 2**10 times more probable than the second-best
  69. // Support functions for unit test program
  70. int BackmapEncodingToRankedEncoding(Encoding enc); // NOTE(review): presumably maps an Encoding to its index in the detector's internal ranked list -- confirm in the implementation
  71. Encoding TopEncodingOfLangHint(const char* name); // NOTE(review): presumably the highest-ranked encoding for a language-name hint -- confirm
  72. Encoding TopEncodingOfTLDHint(const char* name); // NOTE(review): presumably the highest-ranked encoding for a top-level-domain hint (e.g. "com") -- confirm
  73. Encoding TopEncodingOfCharsetHint(const char* name); // NOTE(review): presumably the highest-ranked encoding for a charset-name hint (e.g. "utf-8") -- confirm
  74. const char* Version(void); // Returns a version string; format and ownership are not specified in this header
  75. } // End namespace CompactEncDet
  76. #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_