123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
- // Copyright 2016 Google Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ////////////////////////////////////////////////////////////////////////////////
-
- #ifndef UTIL_ENCODINGS_ENCODINGS_H_
- #define UTIL_ENCODINGS_ENCODINGS_H_
-
- // This interface defines the Encoding enum and various functions that
- // depend only on Encoding values.
-
- // A hash-function for Encoding, hash<Encoding>, is defined in
- // i18n/encodings/public/encodings-hash.h
-
- // On some Windows projects, UNICODE may be defined, which would prevent the
- // Encoding enum below from compiling. Note that this is a quick fix that does
- // not break any existing projects. The UNICODE enum may someday be changed
- // to something more specific and non-colliding, but this involves careful
- // testing of changes in many other projects.
- #undef UNICODE
-
- // NOTE: The Encoding enum must always start at 0. This assumption has
- // been made and used.
-
- #ifndef SWIG
-
- #include "util/encodings/encodings.pb.h"
-
- #else
-
- // TODO: Include a SWIG workaround header file.
-
- #endif
-
- const int kNumEncodings = NUM_ENCODINGS;  // NOTE(review): presumably the number of Encoding values; NUM_ENCODINGS is not declared in this header — confirm.
-
- // Convenience aliases for some of the popular encodings.
- // TODO: Make these static const Encoding values instead of macros.
- #define LATIN1 ISO_8859_1
- #define LATIN2 ISO_8859_2
- #define LATIN3 ISO_8859_3
- #define LATIN4 ISO_8859_4
- #define CYRILLIC ISO_8859_5
- #define ARABIC_ENCODING ISO_8859_6 // _ENCODING suffix avoids the same name as the language
- #define GREEK_ENCODING ISO_8859_7 // _ENCODING suffix avoids the same name as the language
- #define HEBREW_ENCODING ISO_8859_8 // _ENCODING suffix avoids the same name as the language
- #define LATIN5 ISO_8859_9
- #define LATIN6 ISO_8859_10
- #define KOREAN_HANGUL KOREAN_EUC_KR
-
- // Returns the default Encoding (LATIN1, i.e. ISO_8859_1).
- Encoding default_encoding();
-
-
-
- // *************************************************************
- // Encoding predicates
- // IsValidEncoding()
- // IsEncEncCompatible
- // IsSupersetOfAscii7Bit
- // Is8BitEncoding
- // IsCJKEncoding
- // IsHebrewEncoding
- // IsRightToLeftEncoding
- // IsLogicalRightToLeftEncoding
- // IsVisualRightToLeftEncoding
- // IsIso2022Encoding
- // IsIso2022JpOrVariant
- // IsShiftJisOrVariant
- // IsJapaneseCellPhoneCarrierSpecificEncoding
- // *************************************************************
-
- // IsValidEncoding
- // ---------------
- //
- // Returns true if the input Encoding enum value is within range.
- //
-
- bool IsValidEncoding(Encoding enc);
-
- //
- // IsEncEncCompatible
- // ------------------
- //
- // Determines whether converting text from encoding `from` to encoding `to`
- // requires any changes to the underlying text (e.g. ASCII_7BIT is a subset
- // of UTF8). NOTE(review): presumably true means "no changes needed" — confirm.
- //
- // TODO: the current implementation is likely incomplete. It would be
- // good to consider the full matrix of all pairs of encodings and to fish out
- // all compatible pairs.
- //
- bool IsEncEncCompatible(const Encoding from, const Encoding to);
-
- // IsSupersetOfAscii7Bit: an encoding is a superset of 7-bit ASCII when bytes
- // 0...127 in that encoding represent the same characters as in ISO_8859_1.
- //
- // WARNING: This function does not currently return true for all encodings that
- // are supersets of 7-bit ASCII.
- bool IsSupersetOfAscii7Bit(Encoding e);
-
- // Is8BitEncoding: an 8-bit encoding has fewer than 256 symbols; each byte
- // determines a new character and there are no multi-byte sequences.
- //
- // WARNING: This function does not currently return true for all encodings that
- // are 8-bit encodings.
- bool Is8BitEncoding(Encoding e);
-
- // IsCJKEncoding
- // -------------
- //
- // Returns true if the encoding is Chinese (simplified or traditional),
- // Japanese, or Korean.
- // Note: UTF8 is not considered a CJK encoding.
- bool IsCJKEncoding(Encoding e);
-
- // IsHebrewEncoding
- // ----------------
- //
- // Returns true if the encoding is a Hebrew-specific encoding
- // (not UTF8, etc.).
- bool IsHebrewEncoding(Encoding e);
-
- // IsRightToLeftEncoding
- // ---------------------
- //
- // Returns true if the encoding is a right-to-left (RTL) encoding.
- //
- // Note that the name of this function is somewhat misleading: there is nothing
- // inherently "right to left" about these encodings. They merely contain code
- // points for characters in RTL languages such as Hebrew and Arabic, which is
- // equally true of UTF-8.
- //
- // TODO: Get rid of this function. The only special case we should need to
- // worry about is visual encodings. Anything we need to do for all 'RTL'
- // encodings we need to do for UTF-8 as well.
- bool IsRightToLeftEncoding(Encoding enc);
-
- // IsLogicalRightToLeftEncoding
- // ----------------------------
- //
- // Returns true if the encoding is a logical right-to-left encoding.
- // Logical right-to-left encodings are those that the browser renders
- // right-to-left and applies the BiDi algorithm to. Therefore, the characters
- // appear in reading order in the file, and indexing, snippet generation, etc.
- // should all just work with no special processing.
- //
- // TODO: Get rid of this function. The only special case we should need to
- // worry about is visual encodings.
- bool IsLogicalRightToLeftEncoding(Encoding enc);
-
- // IsVisualRightToLeftEncoding
- // ---------------------------
- //
- // Returns true if the encoding is a visual right-to-left encoding.
- // Visual right-to-left encodings are those that the browser renders
- // left-to-right and does not apply the BiDi algorithm to. Therefore, each
- // line appears in reverse order in the file, lines are manually wrapped
- // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
- // the prehistoric days when browsers couldn't render right-to-left, but
- // unfortunately some visual pages persist to this day. These documents need
- // special processing so that we don't index or snippet them with each line
- // reversed.
- bool IsVisualRightToLeftEncoding(Encoding enc);
-
- // IsIso2022Encoding
- // -----------------
- //
- // Returns true if the encoding is a kind of ISO 2022 encoding, such as
- // ISO-2022-JP.
- bool IsIso2022Encoding(Encoding enc);
-
- // IsIso2022JpOrVariant
- // --------------------
- //
- // Returns true if the encoding is ISO-2022-JP or one of its variants, such
- // as KDDI's ISO-2022-JP.
- bool IsIso2022JpOrVariant(Encoding enc);
-
- // IsShiftJisOrVariant
- // -------------------
- //
- // Returns true if the encoding is Shift_JIS or one of its variants, such
- // as KDDI's Shift_JIS.
- bool IsShiftJisOrVariant(Encoding enc);
-
- // IsJapaneseCellPhoneCarrierSpecificEncoding
- // ------------------------------------------
- //
- // Returns true if the encoding is specific to a Japanese cell phone
- // carrier, such as KDDI_SHIFT_JIS.
- bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
-
-
-
- // *************************************************************
- // ENCODING NAMES
- //
- // This interface defines a standard name for each valid encoding, and
- // a standard name for invalid encodings. (Some names use all upper
- // case, but others use mixed case.)
- //
- // EncodingName() [Encoding to name]
- // MimeEncodingName() [Encoding to name]
- // EncodingFromName() [name to Encoding]
- // EncodingNameAliasToEncoding() [name to Encoding]
- // default_encoding_name()
- // invalid_encoding_name()
- // *************************************************************
-
- // EncodingName
- // ------------
- //
- // Returns the standard name of the given encoding, or
- // invalid_encoding_name() if the encoding is invalid.
- //
- const char* EncodingName(Encoding enc);
-
- //
- // MimeEncodingName
- // ----------------
- //
- // Returns the "preferred MIME name" of an encoding.
- //
- // This name is suitable for use in HTTP headers, HTML tags,
- // and as the "charset" parameter of a MIME Content-Type.
- const char* MimeEncodingName(Encoding enc);
-
-
- // The maximum length of an encoding name.
- const int kMaxEncodingNameSize = 50;  // NOTE(review): not stated whether this counts the terminating NUL — confirm.
-
- // Returns the standard name of the default encoding.
- const char* default_encoding_name();
-
- // Returns the name used for an invalid encoding.
- const char* invalid_encoding_name();
-
- // EncodingFromName
- // ----------------
- //
- // If enc_name matches the standard name of an Encoding, using a
- // case-insensitive comparison, sets *encoding to that Encoding and
- // returns true. Otherwise, sets *encoding to UNKNOWN_ENCODING and
- // returns false.
- //
- // REQUIRES: encoding must not be NULL.
- //
- bool EncodingFromName(const char* enc_name, Encoding *encoding);
-
- //
- // EncodingNameAliasToEncoding
- // ---------------------------
- //
- // If enc_name matches the standard name or an alias of an Encoding,
- // using a case-insensitive comparison, returns that Encoding.
- // Otherwise, returns UNKNOWN_ENCODING.
- //
- // Aliases include most MIME encoding names (e.g., "ISO-8859-7" for
- // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
- // common variations with hyphens and underscores (e.g., "koi8-u" and
- // "koi8u" for RUSSIAN_KOI8_R).
- //
- Encoding EncodingNameAliasToEncoding(const char *enc_name);
-
- // *************************************************************
- // Miscellany
- // *************************************************************
-
- // PreferredWebOutputEncoding
- // --------------------------
- //
- // Maps a requested encoding to the one we prefer to send in web output.
- // Some multi-byte encodings use byte values that coincide with the ASCII
- // codes for HTML syntax characters <>"&', and browsers like MSIE can
- // misinterpret these, as indicated in an external XSS report from
- // 2007-02-15; such dangerous encodings are mapped to safer ones. UTF8 is
- // also used instead of encodings that we don't support in our output, and
- // we generally try to be conservative in what we send out. Where the
- // client asks for a less common single- or double-byte encoding, we
- // substitute a more common single- or double-byte encoding, if there is
- // one, thereby preserving the client's intent to use less space than
- // UTF-8. This also means that characters outside the destination set
- // will be converted to HTML NCRs (&#NNN;) if requested.
- Encoding PreferredWebOutputEncoding(Encoding enc);
-
-
- #endif // UTIL_ENCODINGS_ENCODINGS_H_
|