diff options
Diffstat (limited to 'contrib/google-ced/util/encodings/encodings.h')
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.h | 299 |
1 files changed, 299 insertions, 0 deletions
diff --git a/contrib/google-ced/util/encodings/encodings.h b/contrib/google-ced/util/encodings/encodings.h new file mode 100644 index 000000000..647797432 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.h @@ -0,0 +1,299 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_ENCODINGS_ENCODINGS_H_ +#define UTIL_ENCODINGS_ENCODINGS_H_ + +// This interface defines the Encoding enum and various functions that +// depend only on Encoding values. + +// A hash-function for Encoding, hash<Encoding>, is defined in +// i18n/encodings/public/encodings-hash.h + +// On some Windows projects, UNICODE may be defined, which would prevent the +// Encoding enum below from compiling. Note that this is a quick fix that does +// not break any existing projects. The UNICODE enum may someday be changed +// to something more specific and non-colliding, but this involves careful +// testing of changes in many other projects. +#undef UNICODE + +// NOTE: The Encoding enum must always start at 0. This assumption has +// been made and used. + +#ifndef SWIG + +#include "util/encodings/encodings.pb.h" + +#else + +// TODO: Include a SWIG workaround header file. + +#endif + +const int kNumEncodings = NUM_ENCODINGS; + +// some of the popular encoding aliases +// TODO: Make these static const Encoding values instead of macros. +#define LATIN1 ISO_8859_1 +#define LATIN2 ISO_8859_2 +#define LATIN3 ISO_8859_3 +#define LATIN4 ISO_8859_4 +#define CYRILLIC ISO_8859_5 +#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language +#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language +#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language +#define LATIN5 ISO_8859_9 +#define LATIN6 ISO_8859_10 +#define KOREAN_HANGUL KOREAN_EUC_KR + +// The default Encoding (LATIN1). +Encoding default_encoding(); + + + +// ************************************************************* +// Encoding predicates +// IsValidEncoding() +// IsEncEncCompatible +// IsSupersetOfAscii7Bit +// Is8BitEncoding +// IsCJKEncoding +// IsHebrewEncoding +// IsRightToLeftEncoding +// IsLogicalRightToLeftEncoding +// IsVisualRightToLeftEncoding +// IsIso2022Encoding +// IsIso2022JpOrVariant +// IsShiftJisOrVariant +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ************************************************************* + +// IsValidEncoding +// =================================== +// +// Function to check if the input language enum is within range. +// + +bool IsValidEncoding(Encoding enc); + +// +// IsEncEncCompatible +// ------------------ +// +// This function is to determine whether or not converting from the +// first encoding to the second requires any changes to the underlying +// text (e.g. ASCII_7BIT is a subset of UTF8). +// +// TODO: the current implementation is likely incomplete. It would be +// good to consider the full matrix of all pairs of encodings and to fish out +// all compatible pairs. +// +bool IsEncEncCompatible(const Encoding from, const Encoding to); + +// To be a superset of 7-bit Ascii means that bytes 0...127 in the given +// encoding represent the same characters as they do in ISO_8859_1. + +// WARNING: This function does not currently return true for all encodings that +// are supersets of Ascii 7-bit. +bool IsSupersetOfAscii7Bit(Encoding e); + +// To be an 8-bit encoding means that there are fewer than 256 symbols. +// Each byte determines a new character; there are no multi-byte sequences. + +// WARNING: This function does not currently return true for all encodings that +// are 8-bit encodings. +bool Is8BitEncoding(Encoding e); + +// IsCJKEncoding +// ------------- +// +// This function returns true if the encoding is either Chinese +// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not +// considered a CJK encoding. +bool IsCJKEncoding(Encoding e); + +// IsHebrewEncoding +// ------------- +// +// This function returns true if the encoding is a Hebrew specific +// encoding (not UTF8, etc). +bool IsHebrewEncoding(Encoding e); + +// IsRightToLeftEncoding +// --------------------- +// +// Returns true if the encoding is a right-to-left encoding. +// +// Note that the name of this function is somewhat misleading. There is nothing +// "right to left" about these encodings. They merely contain code points for +// characters in RTL languages such as Hebrew and Arabic. But this is also +// true for UTF-8. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. Anything we +// need to do for all 'RTL' encodings we need to do for UTF-8 as well. +bool IsRightToLeftEncoding(Encoding enc); + +// IsLogicalRightToLeftEncoding +// ---------------------------- +// +// Returns true if the encoding is a logical right-to-left encoding. +// Logical right-to-left encodings are those that the browser renders +// right-to-left and applies the BiDi algorithm to. Therefore the characters +// appear in reading order in the file, and indexing, snippet generation etc. +// should all just work with no special processing. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. +bool IsLogicalRightToLeftEncoding(Encoding enc); + +// IsVisualRightToLeftEncoding +// --------------------------- +// +// Returns true if the encoding is a visual right-to-left encoding. +// Visual right-to-left encodings are those that the browser renders +// left-to-right and does not apply the BiDi algorithm to. Therefore each +// line appears in reverse order in the file, lines are manually wrapped +// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of +// the prehistoric days when browsers couldn't render right-to-left, but +// unfortunately some visual pages persist to this day. These documents require +// special processing so that we don't index or snippet them with each line +// reversed. +bool IsVisualRightToLeftEncoding(Encoding enc); + +// IsIso2022Encoding +// ----------------- +// +// Returns true if the encoding is a kind of ISO 2022 such as +// ISO-2022-JP. +bool IsIso2022Encoding(Encoding enc); + +// IsIso2022JpOrVariant +// -------------------- +// +// Returns true if the encoding is ISO-2022-JP or a variant such as +// KDDI's ISO-2022-JP. +bool IsIso2022JpOrVariant(Encoding enc); + +// IsShiftJisOrVariant +// -------------------- +// +// Returns true if the encoding is Shift_JIS or a variant such as +// KDDI's Shift_JIS. +bool IsShiftJisOrVariant(Encoding enc); + +// IsJapanesCellPhoneCarrierSpecificEncoding +// ----------------------------------------- +// +// Returns true if it's Japanese cell phone carrier specific encoding +// such as KDDI_SHIFT_JIS. +bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc); + + + +// ************************************************************* +// ENCODING NAMES +// +// This interface defines a standard name for each valid encoding, and +// a standard name for invalid encodings. (Some names use all upper +// case, but others use mixed case.) +// +// EncodingName() [Encoding to name] +// MimeEncodingName() [Encoding to name] +// EncodingFromName() [name to Encoding] +// EncodingNameAliasToEncoding() [name to Encoding] +// default_encoding_name() +// invalid_encoding_name() +// ************************************************************* + +// EncodingName +// ------------ +// +// Given the encoding, returns its standard name. +// Return invalid_encoding_name() if the encoding is invalid. +// +const char* EncodingName(Encoding enc); + +// +// MimeEncodingName +// ---------------- +// +// Return the "preferred MIME name" of an encoding. +// +// This name is suitable for using in HTTP headers, HTML tags, +// and as the "charset" parameter of a MIME Content-Type. +const char* MimeEncodingName(Encoding enc); + + +// The maximum length of an encoding name +const int kMaxEncodingNameSize = 50; + +// The standard name of the default encoding. +const char* default_encoding_name(); + +// The name used for an invalid encoding. +const char* invalid_encoding_name(); + +// EncodingFromName +// ---------------- +// +// If enc_name matches the standard name of an Encoding, using a +// case-insensitive comparison, set *encoding to that Encoding and +// return true. Otherwise set *encoding to UNKNOWN_ENCODING and +// return false. +// +// REQUIRES: encoding must not be NULL. +// +bool EncodingFromName(const char* enc_name, Encoding *encoding); + +// +// EncodingNameAliasToEncoding +// --------------------------- +// +// If enc_name matches the standard name or an alias of an Encoding, +// using a case-insensitive comparison, return that +// Encoding. Otherwise, return UNKNOWN_ENCODING. +// +// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for +// GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and +// common variations with hyphens and underscores (e.g., "koi8-u" and +// "koi8u" for RUSSIAN_KOI8_R). + +Encoding EncodingNameAliasToEncoding(const char *enc_name); + +// ************************************************************* +// Miscellany +// ************************************************************* + +// PreferredWebOutputEncoding +// -------------------------- +// +// Some multi-byte encodings use byte values that coincide with the +// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE +// can misinterpret these, as indicated in an external XSS report from +// 2007-02-15. Here, we map these dangerous encodings to safer ones. We +// also use UTF8 instead of encodings that we don't support in our +// output, and we generally try to be conservative in what we send out. +// Where the client asks for single- or double-byte encodings that are +// not as common, we substitute a more common single- or double-byte +// encoding, if there is one, thereby preserving the client's intent +// to use less space than UTF-8. This also means that characters +// outside the destination set will be converted to HTML NCRs (&#NNN;) +// if requested. +Encoding PreferredWebOutputEncoding(Encoding enc); + + +#endif // UTIL_ENCODINGS_ENCODINGS_H_ |