1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- // Copyright 2016 Google Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ////////////////////////////////////////////////////////////////////////////////
-
- #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
- #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
-
- #include "util/encodings/encodings.h" // for Encoding
- #include "util/languages/languages.h" // for Language
-
- #include <string.h>
-
- namespace CompactEncDet {
- // We may want different statistics, depending on whether the text being
- // identfied is from the web, from email, etc. This is currently ignored,
- // except WEB_CORPUS enables ignoring chars inside tags.
- enum TextCorpusType {
- WEB_CORPUS,
- XML_CORPUS,
- QUERY_CORPUS, // Use this for vanilla plaintext
- EMAIL_CORPUS,
- NUM_CORPA, // always last
- };
-
- // Scan raw bytes and detect most likely encoding
- // Design goals:
- // Skip over big initial stretches of seven-bit ASCII bytes very quickly
- // Thread safe
- // Works equally well on
- // 50-byte queries,
- // 5000-byte email and
- // 50000-byte web pages
- // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
- //
- // Inputs: text and text_length
- // web page's url (preferred) or just
- // top-level domain name (e.g. "com") or NULL as a hint
- // web page's HTTPheader charset= string (e.g. "Latin1") or NULL as a hint
- // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
- // an Encoding or UNKNOWN_ENCODING as a hint
- // a Language or UNKNOWN_LANGUAGE as a hint
- // corpus type from the list above. Currently ignored; may select
- // different probability tables in the future
- // ignore_7bit if true says to NOT return the pure seven-bit encodings
- // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
- // This may save a little scoring time on pure printable ASCII input text
- // Outputs: bytes_consumed says how much of text_length was actually examined
- // is_reliable set true if the returned encoding is at least 2**10 time more
- // probable then the second-best encoding
- // Return value: the most likely encoding for the input text
- //
- // Setting ignore_7bit_mail_encodings effectively turns off detection of
- // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
- // when corpus_type is QUERY_CORPUS.
- Encoding DetectEncoding(
- const char* text, int text_length, const char* url_hint,
- const char* http_charset_hint, const char* meta_charset_hint,
- const int encoding_hint,
- const Language language_hint, // User interface lang
- const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
- int* bytes_consumed, bool* is_reliable);
-
- // Support functions for unit test program
- int BackmapEncodingToRankedEncoding(Encoding enc);
- Encoding TopEncodingOfLangHint(const char* name);
- Encoding TopEncodingOfTLDHint(const char* name);
- Encoding TopEncodingOfCharsetHint(const char* name);
- const char* Version(void);
- } // End namespace CompactEncDet
-
- #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|