diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-05-26 11:31:47 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-05-26 11:31:47 +0100 |
commit | 19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3 (patch) | |
tree | 6d0f43f3cd9ede27eb578562480633e27f042934 /contrib/google-ced/util | |
parent | c11838dcbacbfd0a75e98f95a63a026217c88c51 (diff) | |
download | rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.tar.gz rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.zip |
[Rework] Use google-ced instead of libicu chardet as the former sucks
Diffstat (limited to 'contrib/google-ced/util')
-rw-r--r-- | contrib/google-ced/util/basictypes.h | 331 | ||||
-rw-r--r-- | contrib/google-ced/util/case_insensitive_hash.h | 88 | ||||
-rw-r--r-- | contrib/google-ced/util/commandlineflags.h | 39 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.cc | 891 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.h | 299 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.pb.h | 181 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings_unittest.cc | 34 | ||||
-rw-r--r-- | contrib/google-ced/util/languages/languages.cc | 349 | ||||
-rw-r--r-- | contrib/google-ced/util/languages/languages.h | 381 | ||||
-rw-r--r-- | contrib/google-ced/util/languages/languages.pb.h | 191 | ||||
-rw-r--r-- | contrib/google-ced/util/logging.h | 25 | ||||
-rw-r--r-- | contrib/google-ced/util/port.h | 53 | ||||
-rw-r--r-- | contrib/google-ced/util/string_util.h | 61 | ||||
-rw-r--r-- | contrib/google-ced/util/varsetter.h | 66 |
14 files changed, 2989 insertions, 0 deletions
diff --git a/contrib/google-ced/util/basictypes.h b/contrib/google-ced/util/basictypes.h new file mode 100644 index 000000000..af391c742 --- /dev/null +++ b/contrib/google-ced/util/basictypes.h @@ -0,0 +1,331 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_BASICTYPES_H_ +#define UTIL_BASICTYPES_H_ + +#include <limits.h> // So we can set the bounds of our types +#include <stddef.h> // For size_t +#include <string.h> // for memcpy + +#include "util/port.h" // Types that only need exist on certain systems + +#ifndef COMPILER_MSVC +// stdint.h is part of C99 but MSVC doesn't have it. +#include <stdint.h> // For intptr_t. +#endif + +typedef signed char schar; +typedef signed char int8; +typedef short int16; +// TODO(mbelshe) Remove these type guards. These are +// temporary to avoid conflicts with npapi.h. +#ifndef _INT32 +#define _INT32 +typedef int int32; +#endif + +// The NSPR system headers define 64-bit as |long| when possible. In order to +// not have typedef mismatches, we do the same on LP64. +#if __LP64__ +typedef long int64; +#else +typedef long long int64; +#endif + +// NOTE: unsigned types are DANGEROUS in loops and other arithmetical +// places. Use the signed types unless your variable represents a bit +// pattern (eg a hash value) or you really need the extra bit. 
// Do NOT
// use 'unsigned' to express "this value should always be positive";
// use assertions for this.

// Unsigned counterparts of the signed aliases.  The names assume the usual
// 8/16/32/64-bit widths; nothing in this part of the header verifies them.
typedef unsigned char uint8;
typedef unsigned short uint16;
// TODO(mbelshe) Remove these type guards.  These are
//     temporary to avoid conflicts with npapi.h.
#ifndef _UINT32
#define _UINT32
typedef unsigned int uint32;
#endif

// See the comment above about NSPR and 64-bit.
#if __LP64__
typedef unsigned long uint64;
#else
typedef unsigned long long uint64;
#endif

// A type to represent a Unicode code-point value. As of Unicode 4.0,
// such values require up to 21 bits.
// (For type-checking on pointers, make this explicitly signed,
// and it should always be the signed version of whatever int32 is.)
typedef signed int char32;

// Smallest and largest values of each alias, spelled as bit patterns and
// cast to the target type.
// NOTE(review): GG_LONGLONG is not defined here; presumably it comes from
// util/port.h, which this header includes -- confirm.
const uint8 kuint8max = (( uint8) 0xFF);
const uint16 kuint16max = ((uint16) 0xFFFF);
const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
const int8 kint8min = (( int8) 0x80);
const int8 kint8max = (( int8) 0x7F);
const int16 kint16min = (( int16) 0x8000);
const int16 kint16max = (( int16) 0x7FFF);
const int32 kint32min = (( int32) 0x80000000);
const int32 kint32max = (( int32) 0x7FFFFFFF);
const int64 kint64min = (( int64) GG_LONGLONG(0x8000000000000000));
const int64 kint64max = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF));

// A macro to disallow the copy constructor and operator= functions.
// This should be used in the private: declarations for a class.
// It expands to declarations only (no definitions are provided), and the
// last declaration has no trailing ';' -- the user supplies it.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&); \
  void operator=(const TypeName&)

// An older, deprecated, politically incorrect name for the above.
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)

// A macro to disallow all the implicit constructors, namely the
// default constructor, copy constructor and operator= functions.
//
// This should be used in the private: declarations for a class
// that wants to prevent anyone from instantiating it. This is
// especially useful for classes containing only static methods.
// Expands to a default-constructor declaration followed by
// DISALLOW_COPY_AND_ASSIGN(TypeName).
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
  TypeName(); \
  DISALLOW_COPY_AND_ASSIGN(TypeName)

// The arraysize(arr) macro returns the # of elements in an array arr.
// The expression is a compile-time constant, and therefore can be
// used in defining new arrays, for example.  If you use arraysize on
// a pointer by mistake, you will get a compile-time error.
//
// Example:
//
//   int values[7];
//   char copy[arraysize(values)];   // copy has 7 elements

// This template function declaration is used in defining arraysize.
// Note that the function doesn't need an implementation, as we only
// use its type.
//
// It takes a reference to an array of N elements and "returns" a reference
// to char[N], so sizeof() of a call expression is exactly N.
template <typename T, size_t N>
char (&ArraySizeHelper(T (&array)[N]))[N];

// That gcc wants both of these prototypes seems mysterious. VC, for
// its part, can't decide which to use (another mystery). Matching of
// template overloads: the final frontier.
#ifndef _MSC_VER
template <typename T, size_t N>
char (&ArraySizeHelper(const T (&array)[N]))[N];
#endif

// The helper is only named inside sizeof(), so it is never actually called.
#define arraysize(array) (sizeof(ArraySizeHelper(array)))


// Use implicit_cast as a safe version of static_cast or const_cast
// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
// a const pointer to Foo).
// When you use implicit_cast, the compiler checks that the cast is safe.
// Such explicit implicit_casts are necessary in surprisingly many
// situations where C++ demands an exact type match instead of an
// argument type convertible to a target type.
//
// The From type can be inferred, so the preferred syntax for using
// implicit_cast is the same as for static_cast etc.:
//
//   implicit_cast<ToType>(expr)
//
// implicit_cast would have been part of the C++ standard library,
// but the proposal was submitted too late.
// It will probably make
// its way into the language in the future.
//
// The body is a plain `return f;`, so only conversions the compiler would
// apply implicitly are accepted; anything else fails to compile.
template<typename To, typename From>
inline To implicit_cast(From const &f) {
  return f;
}

// The COMPILE_ASSERT macro can be used to verify that a compile time
// expression is true. For example, you could use it to verify the
// size of a static array:
//
//   COMPILE_ASSERT(arraysize(content_type_names) == CONTENT_NUM_TYPES,
//                  content_type_names_incorrect_size);
//
// or to make sure a struct is smaller than a certain size:
//
//   COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
//
// The second argument to the macro is the name of the variable. If
// the expression is false, most compilers will issue a warning/error
// containing the name of the variable.
//
// When the compiler provides static_assert (C++11 and later, or MSVC 2010
// and later) the macro expands to it, using the stringified second argument
// as the diagnostic message.  That avoids the unused-typedef warnings the
// fallback triggers at block scope and yields a readable error.  Older
// compilers keep the original typedef-based implementation.

// Kept unconditionally: the pre-C++11 fallback below instantiates it.
template <bool>
struct CompileAssert {
};

#undef COMPILE_ASSERT
#if (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (defined(_MSC_VER) && _MSC_VER >= 1600)
#define COMPILE_ASSERT(expr, msg) static_assert(bool(expr), #msg)
#else
#define COMPILE_ASSERT(expr, msg) \
  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
#endif

// Implementation details of the pre-C++11 COMPILE_ASSERT fallback:
//
// - COMPILE_ASSERT works by defining an array type that has -1
//   elements (and thus is invalid) when the expression is false.
//
// - The simpler definition
//
//     #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
//
//   does not work, as gcc supports variable-length arrays whose sizes
//   are determined at run-time (this is gcc's extension and not part
//   of the C++ standard).  As a result, gcc fails to reject the
//   following code with the simple definition:
//
//     int foo;
//     COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
//                               // not a compile-time constant.
//
// - By using the type CompileAssert<(bool(expr))>, we ensure that
//   expr is a compile-time constant.  (Template arguments must be
//   determined at compile-time.)
//
// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
//   to work around a bug in gcc 3.4.4 and 4.0.1.
//   If we had written
//
//     CompileAssert<bool(expr)>
//
//   instead, these compilers will refuse to compile
//
//     COMPILE_ASSERT(5 > 0, some_message);
//
//   (They seem to think the ">" in "5 > 0" marks the end of the
//   template argument list.)
//
// - The array size is (bool(expr) ? 1 : -1), instead of simply
//
//     ((expr) ? 1 : -1).
//
//   This is to avoid running into a bug in MS VC 7.1, which
//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.


// MetatagId refers to metatag-id that we assign to
// each metatag <name, value> pair.
typedef uint32 MetatagId;

// Argument type used in interfaces that can optionally take ownership
// of a passed in argument.  If TAKE_OWNERSHIP is passed, the called
// object takes ownership of the argument.  Otherwise it does not.
enum Ownership {
  DO_NOT_TAKE_OWNERSHIP,  // the called object does not take ownership
  TAKE_OWNERSHIP          // the called object takes ownership
};

// bit_cast<Dest,Source> is a template function that implements the
// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
// very low-level functions like the protobuf library and fast math
// support.
//
//   float f = 3.14159265358979;
//   int i = bit_cast<int32>(f);
//   // i = 0x40490fdb
//
// The classical address-casting method is:
//
//   // WRONG
//   float f = 3.14159265358979;            // WRONG
//   int i = * reinterpret_cast<int*>(&f);  // WRONG
//
// The address-casting method actually produces undefined behavior
// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
// section says: if an object in memory has one type, and a program
// accesses it with a different type, then the result is undefined
// behavior for most values of "different type".
//
// This is true for any cast syntax, either *(int*)&f or
// *reinterpret_cast<int*>(&f).  And it is particularly true for
// conversions between integral lvalues and floating-point lvalues.
//
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
// that expressions with different types refer to different memory.  gcc
// 4.0.1 has an optimizer that takes advantage of this.  So a
// non-conforming program quietly produces wildly incorrect output.
//
// The problem is not the use of reinterpret_cast.  The problem is type
// punning: holding an object in memory of one type and reading its bits
// back using a different type.
//
// The C++ standard is more subtle and complex than this, but that
// is the basic idea.
//
// Anyways ...
//
// bit_cast<> calls memcpy() which is blessed by the standard,
// especially by the example in section 3.9 .  Also, of course,
// bit_cast<> wraps up the nasty logic in one place.
//
// Fortunately memcpy() is very fast.  In optimized mode, with a
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
// code with the minimal amount of data movement.  On a 32-bit system,
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
// compiles to two loads and two stores.
//
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
//
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
// is likely to surprise you.
//
// Dest and Source must have the same size; this is checked at compile time.

template <class Dest, class Source>
inline Dest bit_cast(const Source& source) {
  // Compile time assertion: sizeof(Dest) == sizeof(Source)
  // A compile error here means your Dest and Source have different sizes.
  // Without the check, a larger Dest would make memcpy() read past the end
  // of `source`, and a smaller Dest would silently drop part of it.
  typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
  // Name the typedef once so compilers do not report it as unused.
  (void)sizeof(VerifySizesAreEqual);

  Dest dest;
  memcpy(&dest, &source, sizeof(dest));
  return dest;
}

// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state.
It indicates to the reader that it is legal to +// declare a static instance of the class, provided the constructor is given +// the base::LINKER_INITIALIZED argument. Normally, it is unsafe to declare a +// static variable that has a constructor or a destructor because invocation +// order is undefined. However, IF the type can be initialized by filling with +// zeroes (which the loader does for static variables), AND the destructor also +// does nothing to the storage, AND there are no virtual methods, then a +// constructor declared as +// explicit MyClass(base::LinkerInitialized x) {} +// and invoked as +// static MyClass my_variable_name(base::LINKER_INITIALIZED); +namespace base { +enum LinkerInitialized { LINKER_INITIALIZED }; +} // base + +// UnaligndLoad32 is put here instead of util/port.h to +// avoid the circular dependency between port.h and basictypes.h +// ARM does not support unaligned memory access. +#if defined(ARCH_CPU_X86_FAMILY) +// x86 and x86-64 can perform unaligned loads/stores directly; +inline uint32 UnalignedLoad32(const void* p) { + return *reinterpret_cast<const uint32*>(p); +} +#else +#define NEED_ALIGNED_LOADS +// If target architecture does not support unaligned loads and stores, +// use memcpy version of UNALIGNED_LOAD32. +inline uint32 UnalignedLoad32(const void* p) { + uint32 t; + memcpy(&t, reinterpret_cast<const uint8*>(p), sizeof(t)); + return t; +} + +#endif +#endif // UTIL_BASICTYPES_H_ diff --git a/contrib/google-ced/util/case_insensitive_hash.h b/contrib/google-ced/util/case_insensitive_hash.h new file mode 100644 index 000000000..7b0c9db76 --- /dev/null +++ b/contrib/google-ced/util/case_insensitive_hash.h @@ -0,0 +1,88 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_CASE_INSENSITIVE_HASH_H_ +#define UTIL_CASE_INSENSITIVE_HASH_H_ + +#include <ctype.h> +#include <stddef.h> +#ifndef _MSC_VER +#include <strings.h> +#endif + +#include <string> + +#include "util/basictypes.h" +#include "util/string_util.h" + +// Functors for hashing c-strings with case-insensitive semantics. +struct CStringCaseHash { + size_t operator()(const char *str) const { + unsigned long hash_val = 0; + while (*str) { + hash_val = 5*hash_val + tolower(*str); + str++; + } + return (size_t)hash_val; + } +}; + +struct CStringCaseEqual { + bool operator()(const char *str1, const char *str2) const { + return !base::strcasecmp(str1, str2); + } +}; + +// These functors, in addition to being case-insensitive, ignore all +// non-alphanumeric characters. This is useful when we want all variants of +// a string -- where variants can differ in puncutation and whitespace -- to +// map to the same value. 
+struct CStringAlnumCaseHash { + size_t operator()(const char *str) const { + unsigned long hash_val = 0; + while (*str) { + if (isalnum(*str)) { + hash_val = 5*hash_val + tolower(*str); + } + str++; + } + return (size_t)hash_val; + } +}; + +struct CStringAlnumCaseEqual { + bool operator()(const char *str1, const char *str2) const { + while (true) { + // Skip until each pointer is pointing to an alphanumeric char or '\0' + while (!isalnum(*str1) && (*str1 != '\0')) { + str1++; + } + while (!isalnum(*str2) && (*str2 != '\0')) { + str2++; + } + if (tolower(*str1) != tolower(*str2)) { + return false; // mismatch on alphanumeric char or '\0' + } + if (*str1 == '\0') { // in which case *str2 must be '\0' as well + return true; // reached '\0' in both strings without mismatch + } + str1++; + str2++; + } + } +}; + +#endif // UTIL_CASE_INSENSITIVE_HASH_H_ diff --git a/contrib/google-ced/util/commandlineflags.h b/contrib/google-ced/util/commandlineflags.h new file mode 100644 index 000000000..341a659ba --- /dev/null +++ b/contrib/google-ced/util/commandlineflags.h @@ -0,0 +1,39 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_COMMANDLINEFLAGS_H_
#define UTIL_COMMANDLINEFLAGS_H_

// Minimal flag macros.  Each flag is an ordinary global variable named
// FLAGS_<name>; nothing in this header parses a command line.

// DEFINE_<type>(name, default_value, comment) defines the variable
// FLAGS_<name> initialized to default_value.  The `comment` argument is
// accepted but not used in the expansion.  Any previous definition of each
// macro is dropped first via #undef.
// NOTE(review): the int32 and string variants use the unqualified names
// `int32` and `string`, so those must be visible where the macro is
// expanded; this header does not declare them.
#undef DEFINE_bool
#define DEFINE_bool(name, default_value, comment) \
  bool FLAGS_##name = default_value
#undef DEFINE_int32
#define DEFINE_int32(name, default_value, comment) \
  int32 FLAGS_##name = default_value
#undef DEFINE_string
#define DEFINE_string(name, default_value, comment) \
  string FLAGS_##name = default_value

// DECLARE_<type>(name) emits the matching `extern` declaration of
// FLAGS_<name>.
#undef DECLARE_bool
#define DECLARE_bool(name) extern bool FLAGS_##name
#undef DECLARE_int32
#define DECLARE_int32(name) extern int32 FLAGS_##name
#undef DECLARE_string
#define DECLARE_string(name) extern string FLAGS_##name


#endif  // UTIL_COMMANDLINEFLAGS_H_
diff --git a/contrib/google-ced/util/encodings/encodings.cc b/contrib/google-ced/util/encodings/encodings.cc new file mode 100644 index 000000000..b5f8dc5fa --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.cc @@ -0,0 +1,891 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/encodings/encodings.h" + +#include <string.h> // for strcasecmp +#include <unordered_map> +#include <utility> // for pair + +#include "util/basictypes.h" +#include "util/string_util.h" +#include "util/case_insensitive_hash.h" + +struct EncodingInfo { + // The standard name for this encoding. + // + const char* encoding_name_; + + // The "preferred MIME name" of an encoding as specified by the IANA at: + // http://www.iana.org/assignments/character-sets + // + // Note that the preferred MIME name may differ slightly from the + // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987 + // + const char* mime_encoding_name_; + + // It is an internal policy that if an encoding has an IANA name, + // then encoding_name_ and mime_encoding_name_ must be the same string. + // + // However, there can be exceptions if there are compelling reasons. + // For example, Japanese mobile handsets require the name + // "Shift_JIS" in charset=... parameter in Content-Type headers to + // process emoji (emoticons) in their private encodings. In that + // case, mime_encoding_name_ should be "Shift_JIS", despite + // encoding_name_ actually is "X-KDDI-Shift_JIS". + + // Some multi-byte encodings use byte values that coincide with the + // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE + // can misinterpret these, as indicated in an external XSS report from + // 2007-02-15. Here, we map these dangerous encodings to safer ones. We + // also use UTF8 instead of encodings that we don't support in our + // output, and we generally try to be conservative in what we send out. + // Where the client asks for single- or double-byte encodings that are + // not as common, we substitute a more common single- or double-byte + // encoding, if there is one, thereby preserving the client's intent + // to use less space than UTF-8. 
This also means that characters + // outside the destination set will be converted to HTML NCRs (&#NNN;) + // if requested. + + Encoding preferred_web_output_encoding_; +}; + +static const EncodingInfo kEncodingInfoTable[] = { + { "ASCII", "ISO-8859-1", ISO_8859_1}, + { "Latin2", "ISO-8859-2", ISO_8859_2}, + { "Latin3", "ISO-8859-3", UTF8}, + // MSIE 6 does not support ISO-8859-3 (XSS issue) + { "Latin4", "ISO-8859-4", ISO_8859_4}, + { "ISO-8859-5", "ISO-8859-5", ISO_8859_5}, + { "Arabic", "ISO-8859-6", ISO_8859_6}, + { "Greek", "ISO-8859-7", ISO_8859_7}, + { "Hebrew", "ISO-8859-8", MSFT_CP1255}, + // we do not endorse the visual order + { "Latin5", "ISO-8859-9", ISO_8859_9}, + { "Latin6", "ISO-8859-10", UTF8}, + // MSIE does not support ISO-8859-10 (XSS issue) + { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP}, + { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // due to potential confusion with HTML syntax chars + { "BIG5", "Big5", CHINESE_BIG5}, + { "GB", "GB2312", CHINESE_GB}, + { "EUC-CN", + "EUC-CN", + // Misnamed. Should be EUC-TW. + CHINESE_BIG5}, + // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW, + // and EUC-TW is rare, so we prefer Big5 for output. + { "KSC", "EUC-KR", KOREAN_EUC_KR}, + { "Unicode", + "UTF-16LE", + // Internet Explorer doesn't recognize "ISO-10646-UCS-2" + UTF8 + // due to potential confusion with HTML syntax chars + }, + { "EUC", + "EUC", // Misnamed. Should be EUC-TW. + CHINESE_BIG5 + // MSIE does not recognize "EUC" (XSS issue), + // and EUC-TW is rare, so we prefer Big5 for output. + }, + { "CNS", + "CNS", // Misnamed. Should be EUC-TW. + CHINESE_BIG5}, + // MSIE does not recognize "CNS" (XSS issue), + // and EUC-TW is rare, so we prefer Big5 for output. 
+ { "BIG5-CP950", + "BIG5-CP950", // Not an IANA name + CHINESE_BIG5 + // MSIE does not recognize "BIG5-CP950" (XSS issue) + }, + { "CP932", "CP932", // Not an IANA name + JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue) + { "UTF8", "UTF-8", UTF8}, + { "Unknown", + "x-unknown", // Not an IANA name + UTF8}, // UTF-8 is our default output encoding + { "ASCII-7-bit", "US-ASCII", ASCII_7BIT}, + { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R}, + { "CP1251", "windows-1251", RUSSIAN_CP1251}, + { "CP1252", "windows-1252", MSFT_CP1252}, + { "KOI8U", + "KOI8-U", + ISO_8859_5}, // because koi8-u is not as common + { "CP1250", "windows-1250", MSFT_CP1250}, + { "ISO-8859-15", "ISO-8859-15", ISO_8859_15}, + { "CP1254", "windows-1254", MSFT_CP1254}, + { "CP1257", "windows-1257", MSFT_CP1257}, + { "ISO-8859-11", "ISO-8859-11", ISO_8859_11}, + { "CP874", "windows-874", MSFT_CP874}, + { "CP1256", "windows-1256", MSFT_CP1256}, + { "CP1255", "windows-1255", MSFT_CP1255}, + { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255}, + // Java does not support iso-8859-8-i + { "VISUAL", "ISO-8859-8", MSFT_CP1255}, + // we do not endorse the visual order + { "CP852", "cp852", MSFT_CP1250}, + // because cp852 is not as common + { "CSN_369103", "csn_369103", MSFT_CP1250}, + // MSIE does not recognize "csn_369103" (XSS issue) + { "CP1253", "windows-1253", MSFT_CP1253}, + { "CP866", "IBM866", RUSSIAN_CP1251}, + // because cp866 is not as common + { "ISO-8859-13", "ISO-8859-13", UTF8}, + // because iso-8859-13 is not widely supported + { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR}, + // due to potential confusion with HTML syntax chars + { "GBK", "GBK", GBK}, + { "GB18030", "GB18030", GBK}, + // because gb18030 is not widely supported + { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5}, + // because Big5-HKSCS is not widely supported + { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB}, + // due to potential confusion with HTML syntax chars + { "TSCII", "tscii", UTF8}, + // we do not have an output converter 
for this font encoding + { "TAM", "tam", UTF8}, + // we do not have an output converter for this font encoding + { "TAB", "tab", UTF8}, + // we do not have an output converter for this font encoding + { "JAGRAN", "jagran", UTF8}, + // we do not have an output converter for this font encoding + { "MACINTOSH", "MACINTOSH", ISO_8859_1}, + // because macintosh is relatively uncommon + { "UTF7", "UTF-7", + UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated + { "BHASKAR", "bhaskar", + UTF8}, // we do not have an output converter for this font encoding + { "HTCHANAKYA", "htchanakya", // not an IANA charset name. + UTF8}, // we do not have an output converter for this font encoding + { "UTF-16BE", "UTF-16BE", + UTF8}, // due to potential confusion with HTML syntax chars + { "UTF-16LE", "UTF-16LE", + UTF8}, // due to potential confusion with HTML syntax chars + { "UTF-32BE", "UTF-32BE", + UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web + { "UTF-32LE", "UTF-32LE", + UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web + { "X-BINARYENC", "x-binaryenc", // Not an IANA name + UTF8}, // because this one is not intended for output (just input) + { "HZ-GB-2312", "HZ-GB-2312", + CHINESE_GB}, // due to potential confusion with HTML syntax chars + { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name + UTF8}, // because this one is not intended for output (just input) + { "X-TAM-ELANGO", "x-tam-elango", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-LTTMBARANI", "x-tam-lttmbarani", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-SHREE", "x-tam-shree", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-TBOOMIS", "x-tam-tboomis", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-TMNEWS", "x-tam-tmnews", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-WEBTAMIL", "x-tam-webtamil", + UTF8}, 
// we do not have an output converter for this font encoding + + { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // KDDI version of Shift_JIS with Google Emoji PUA mappings. + // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses + // "Shift_JIS" in HTTP headers and email messages. + + { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // DoCoMo version of Shift_JIS with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + + { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // SoftBank version of Shift_JIS with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + + { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // KDDI version of ISO-2022-JP with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + // The preferred Web encoding is due to potential confusion with + // HTML syntax chars. + + { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + // The preferred Web encoding is due to potential confusion with + // HTML syntax chars. + + // Please refer to NOTE: section in the comments in the definition + // of "struct I18NInfoByEncoding", before adding new encodings. 
+ +}; + + + +COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS, + kEncodingInfoTable_has_incorrect_size); + +Encoding default_encoding() {return LATIN1;} + +// ************************************************************* +// Encoding predicates +// IsValidEncoding() +// IsEncEncCompatible +// IsEncodingWithSupportedLanguage +// IsSupersetOfAscii7Bit +// Is8BitEncoding +// IsCJKEncoding +// IsHebrewEncoding +// IsRightToLeftEncoding +// IsLogicalRightToLeftEncoding +// IsVisualRightToLeftEncoding +// IsIso2022Encoding +// IsIso2022JpOrVariant +// IsShiftJisOrVariant +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ************************************************************* + +bool IsValidEncoding(Encoding enc) { + return ((enc >= 0) && (enc < kNumEncodings)); +} + +bool IsEncEncCompatible(const Encoding from, const Encoding to) { + // Tests compatibility between the "from" and "to" encodings; in + // the typical case -- when both are valid known encodings -- this + // returns true iff converting from first to second is a no-op. + if (!IsValidEncoding(from) || !IsValidEncoding(to)) { + return false; // we only work with valid encodings... 
+ } else if (to == from) { + return true; // the trivial common case + } + + if (to == UNKNOWN_ENCODING) { + return true; // all valid encodings are compatible with the unknown + } + + if (from == UNKNOWN_ENCODING) { + return false; // no unknown encoding is compatible with one that is + } + + if (from == ASCII_7BIT) { + return IsSupersetOfAscii7Bit(to); + } + + return (from == ISO_8859_1 && to == MSFT_CP1252) || + (from == ISO_8859_8 && to == HEBREW_VISUAL) || + (from == HEBREW_VISUAL && to == ISO_8859_8) || + (from == ISO_8859_9 && to == MSFT_CP1254) || + (from == ISO_8859_11 && to == MSFT_CP874) || + (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) || + (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) || + (from == CHINESE_GB && to == GBK) || + (from == CHINESE_GB && to == GB18030) || + (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) || + (from == CHINESE_EUC_CN && to == CHINESE_CNS) || + (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) || + (from == CHINESE_EUC_DEC && to == CHINESE_CNS) || + (from == CHINESE_CNS && to == CHINESE_EUC_CN) || + (from == CHINESE_CNS && to == CHINESE_EUC_DEC); +} + +// To be a superset of 7-bit Ascii means that bytes 0...127 in the given +// encoding represent the same characters as they do in ISO_8859_1. + +// TODO: This list could be expanded. Many other encodings are supersets +// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two +// encodings that I know for a fact should *not* be in this list. 
+bool IsSupersetOfAscii7Bit(Encoding e) { + switch (e) { + case ISO_8859_1: + case ISO_8859_2: + case ISO_8859_3: + case ISO_8859_4: + case ISO_8859_5: + case ISO_8859_6: + case ISO_8859_7: + case ISO_8859_8: + case ISO_8859_9: + case ISO_8859_10: + case JAPANESE_EUC_JP: + case JAPANESE_SHIFT_JIS: + case CHINESE_BIG5: + case CHINESE_GB: + case CHINESE_EUC_CN: + case KOREAN_EUC_KR: + case CHINESE_EUC_DEC: + case CHINESE_CNS: + case CHINESE_BIG5_CP950: + case JAPANESE_CP932: + case UTF8: + case UNKNOWN_ENCODING: + case ASCII_7BIT: + case RUSSIAN_KOI8_R: + case RUSSIAN_CP1251: + case MSFT_CP1252: + case RUSSIAN_KOI8_RU: + case MSFT_CP1250: + case ISO_8859_15: + case MSFT_CP1254: + case MSFT_CP1257: + case ISO_8859_11: + case MSFT_CP874: + case MSFT_CP1256: + case MSFT_CP1255: + case ISO_8859_8_I: + case HEBREW_VISUAL: + case CZECH_CP852: + case MSFT_CP1253: + case RUSSIAN_CP866: + case ISO_8859_13: + case GBK: + case GB18030: + case BIG5_HKSCS: + case MACINTOSH_ROMAN: + return true; + default: + return false; + } +} + +// To be an 8-bit encoding means that there are fewer than 256 symbols. +// Each byte determines a new character; there are no multi-byte sequences. + +// TODO: This list could maybe be expanded. Other encodings may be 8-bit. 
+bool Is8BitEncoding(Encoding e) { + switch (e) { + case ASCII_7BIT: + case ISO_8859_1: + case ISO_8859_2: + case ISO_8859_3: + case ISO_8859_4: + case ISO_8859_5: + case ISO_8859_6: + case ISO_8859_7: + case ISO_8859_8: + case ISO_8859_8_I: + case ISO_8859_9: + case ISO_8859_10: + case ISO_8859_11: + case ISO_8859_13: + case ISO_8859_15: + case MSFT_CP1252: + case MSFT_CP1253: + case MSFT_CP1254: + case MSFT_CP1255: + case MSFT_CP1256: + case MSFT_CP1257: + case RUSSIAN_KOI8_R: + case RUSSIAN_KOI8_RU: + case RUSSIAN_CP866: + return true; + default: + return false; + } +} + +bool IsCJKEncoding(Encoding e) { + switch (e) { + case JAPANESE_EUC_JP: + case JAPANESE_SHIFT_JIS: + case JAPANESE_JIS: + case CHINESE_BIG5: + case CHINESE_GB: + case CHINESE_EUC_CN: + case KOREAN_EUC_KR: + case CHINESE_EUC_DEC: + case CHINESE_CNS: + case CHINESE_BIG5_CP950: + case JAPANESE_CP932: + case ISO_2022_KR: + case GBK: + case GB18030: + case BIG5_HKSCS: + case ISO_2022_CN: + case HZ_GB_2312: + return true; + default: + return false; + } +} + +bool IsHebrewEncoding(Encoding e) { + return (e == ISO_8859_8 || + e == ISO_8859_8_I || + e == MSFT_CP1255 || + e == HEBREW_VISUAL); +} + + + +bool IsRightToLeftEncoding(Encoding enc) { + switch (enc) { + case MSFT_CP1255: + case MSFT_CP1256: + case ARABIC_ENCODING: + case HEBREW_ENCODING: + case ISO_8859_8_I: + case HEBREW_VISUAL: + return true; + default: + return false; + } +} + +bool IsLogicalRightToLeftEncoding(Encoding enc) { + return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc); +} + +// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6) +// is NOT visual. 
+bool IsVisualRightToLeftEncoding(Encoding enc) { + switch (enc) { + case HEBREW_ENCODING: + case HEBREW_VISUAL: + return true; + default: + return false; + } +} + + + + + +bool IsIso2022Encoding(Encoding enc) { + return (IsIso2022JpOrVariant(enc) || + enc == ISO_2022_KR || + enc == ISO_2022_CN); +} + +bool IsIso2022JpOrVariant(Encoding enc) { + return (enc == JAPANESE_JIS || + enc == KDDI_ISO_2022_JP || + enc == SOFTBANK_ISO_2022_JP); +} + +bool IsShiftJisOrVariant(Encoding enc) { + return (enc == JAPANESE_SHIFT_JIS || + enc == JAPANESE_CP932 || + enc == KDDI_SHIFT_JIS || + enc == DOCOMO_SHIFT_JIS || + enc == SOFTBANK_SHIFT_JIS); +} + +bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { + return (enc == KDDI_ISO_2022_JP || + enc == KDDI_SHIFT_JIS || + enc == DOCOMO_SHIFT_JIS || + enc == SOFTBANK_SHIFT_JIS || + enc == SOFTBANK_ISO_2022_JP); +} + + +// ************************************************************* +// ENCODING NAMES +// EncodingName() [Encoding to name] +// MimeEncodingName() [Encoding to name] +// EncodingFromName() [name to Encoding] +// EncodingNameAliasToEncoding() [name to Encoding] +// default_encoding_name() +// invalid_encoding_name() +// ************************************************************* + +const char * EncodingName(const Encoding enc) { + if ( (enc < 0) || (enc >= kNumEncodings) ) + return invalid_encoding_name(); + return kEncodingInfoTable[enc].encoding_name_; +} + +// TODO: Unify MimeEncodingName and EncodingName, or determine why +// such a unification is not possible. + +const char * MimeEncodingName(Encoding enc) { + if ( (enc < 0) || (enc >= kNumEncodings) ) + return ""; // TODO: Should this be invalid_encoding_name()? 
+ return kEncodingInfoTable[enc].mime_encoding_name_; +} + +bool EncodingFromName(const char* enc_name, Encoding *encoding) { + *encoding = UNKNOWN_ENCODING; + if ( enc_name == NULL ) return false; + + for ( int i = 0; i < kNumEncodings; i++ ) { + if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) { + *encoding = static_cast<Encoding>(i); + return true; + } + } + return false; +} + +// The encoding_map maps standard and non-standard encoding-names +// (strings) to Encoding enums. It is used only by +// EncodingNameAliasToEncoding. Note that the map uses +// case-insensitive hash and comparison functions. + +typedef std::unordered_map<const char *, Encoding, + CStringAlnumCaseHash, + CStringAlnumCaseEqual> EncodingMap; + +static const EncodingMap& GetEncodingMap() { + static EncodingMap encoding_map; + if (!encoding_map.empty()) { + // Already initialized + return encoding_map; + } + + // Initialize the map with all the "standard" encoding names, + // i.e., the ones returned by EncodingName and MimeEncodingName. + // + // First, add internal encoding names returned by EncodingName(). + for (int i = 0; i < NUM_ENCODINGS; ++i) { + Encoding e = static_cast<Encoding>(i); + // Internal encoding names must be unique. + // The internal names are guaranteed to be unique by the CHECK_EQ. + const char *encoding_name = EncodingName(e); + // CHECK_EQ(0, encoding_map.count(encoding_name)) + // << "Duplicate found for " << encoding_name; + encoding_map[encoding_name] = e; + } + // Then, add mime encoding names returned by MimeEncodingName(). + // We don't override existing entries, to give precedence to entries + // added earlier. + for (int i = 0; i < NUM_ENCODINGS; ++i) { + Encoding e = static_cast<Encoding>(i); + // Note that MimeEncodingName() can return the same mime encoding + // name for different encoding enums like JAPANESE_SHIFT_JIS and + // KDDI_SHIFT_JIS. 
In that case, the encoding enum first seen + // will be the value for the encoding name in the map. + const char *mime_encoding_name = MimeEncodingName(e); + if (encoding_map.count(mime_encoding_name) == 0) { + encoding_map[mime_encoding_name] = e; + } + } + + // Add some non-standard names: alternate spellings, common typos, + // etc. (It does no harm to add names already in the map.) Note + // that although the map is case-insensitive, by convention the + // keys are written here in lower case. For ease of maintenance, + // they are listed in alphabetical order. + encoding_map["5601"] = KOREAN_EUC_KR; + encoding_map["646"] = ASCII_7BIT; + encoding_map["852"] = CZECH_CP852; + encoding_map["866"] = RUSSIAN_CP866; + encoding_map["8859-1"] = ISO_8859_1; + encoding_map["ansi-1251"] = RUSSIAN_CP1251; + encoding_map["ansi_x3.4-1968"] = ASCII_7BIT; + encoding_map["arabic"] = ISO_8859_6; + encoding_map["ascii"] = ISO_8859_1; + encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard + encoding_map["asmo-708"] = ISO_8859_6; + encoding_map["bhaskar"] = BHASKAR; + encoding_map["big5"] = CHINESE_BIG5; + encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard + encoding_map["big5-hkscs"] = BIG5_HKSCS; + encoding_map["chinese"] = CHINESE_GB; + encoding_map["cns"] = CHINESE_CNS; // not iana standard + encoding_map["cns11643"] = CHINESE_CNS; + encoding_map["cp1250"] = MSFT_CP1250; // not iana standard + encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard + encoding_map["cp1252"] = MSFT_CP1252; // not iana standard + encoding_map["cp1253"] = MSFT_CP1253; // not iana standard + encoding_map["cp1254"] = MSFT_CP1254; // not iana standard + encoding_map["cp1255"] = MSFT_CP1255; + encoding_map["cp1256"] = MSFT_CP1256; + encoding_map["cp1257"] = MSFT_CP1257; // not iana standard + encoding_map["cp819"] = ISO_8859_1; + encoding_map["cp852"] = CZECH_CP852; + encoding_map["cp866"] = RUSSIAN_CP866; + encoding_map["cp-866"] = RUSSIAN_CP866; + 
encoding_map["cp874"] = MSFT_CP874; + encoding_map["cp932"] = JAPANESE_CP932; // not iana standard + encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard + encoding_map["csbig5"] = CHINESE_BIG5; + encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP; + encoding_map["cseuckr"] = KOREAN_EUC_KR; + encoding_map["csgb2312"] = CHINESE_GB; + encoding_map["csibm852"] = CZECH_CP852; + encoding_map["csibm866"] = RUSSIAN_CP866; + encoding_map["csiso2022jp"] = JAPANESE_JIS; + encoding_map["csiso2022kr"] = ISO_2022_KR; + encoding_map["csiso58gb231280"] = CHINESE_GB; + encoding_map["csiso88598i"] = ISO_8859_8_I; + encoding_map["csisolatin1"] = ISO_8859_1; + encoding_map["csisolatin2"] = ISO_8859_2; + encoding_map["csisolatin3"] = ISO_8859_3; + encoding_map["csisolatin4"] = ISO_8859_4; + encoding_map["csisolatin5"] = ISO_8859_9; + encoding_map["csisolatin6"] = ISO_8859_10; + encoding_map["csisolatinarabic"] = ISO_8859_6; + encoding_map["csisolatincyrillic"] = ISO_8859_5; + encoding_map["csisolatingreek"] = ISO_8859_7; + encoding_map["csisolatinhebrew"] = ISO_8859_8; + encoding_map["csksc56011987"] = KOREAN_EUC_KR; + encoding_map["csmacintosh"] = MACINTOSH_ROMAN; + encoding_map["csn-369103"] = CZECH_CSN_369103; + encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS; + encoding_map["csunicode"] = UTF16BE; + encoding_map["csunicode11"] = UTF16BE; + encoding_map["csunicode11utf7"] = UTF7; + encoding_map["csunicodeascii"] = UTF16BE; + encoding_map["csunicodelatin1"] = UTF16BE; + encoding_map["cyrillic"] = ISO_8859_5; + encoding_map["ecma-114"] = ISO_8859_6; + encoding_map["ecma-118"] = ISO_8859_7; + encoding_map["elot_928"] = ISO_8859_7; + encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard + encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard + encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard + encoding_map["euc-jp"] = JAPANESE_EUC_JP; + encoding_map["euc-kr"] = KOREAN_EUC_KR; + encoding_map["eucgb2312_cn"] = CHINESE_GB; + encoding_map["gb"] 
= CHINESE_GB; // not iana standard + encoding_map["gb18030"] = GB18030; + encoding_map["gb2132"] = CHINESE_GB; // common typo + encoding_map["gb2312"] = CHINESE_GB; + encoding_map["gb_2312-80"] = CHINESE_GB; + encoding_map["gbk"] = GBK; + encoding_map["greek"] = ISO_8859_7; + encoding_map["greek8"] = ISO_8859_7; + encoding_map["hebrew"] = ISO_8859_8; + encoding_map["htchanakya"] = HTCHANAKYA; + encoding_map["hz-gb-2312"] = HZ_GB_2312; + encoding_map["ibm819"] = ISO_8859_1; + encoding_map["ibm852"] = CZECH_CP852; + encoding_map["ibm874"] = MSFT_CP874; + encoding_map["iso-10646"] = UTF16BE; + encoding_map["iso-10646-j-1"] = UTF16BE; + encoding_map["iso-10646-ucs-2"] = UNICODE; + encoding_map["iso-10646-ucs-4"] = UTF32BE; + encoding_map["iso-10646-ucs-basic"] = UTF16BE; + encoding_map["iso-10646-unicode-latin1"] = UTF16BE; + encoding_map["iso-2022-cn"] = ISO_2022_CN; + encoding_map["iso-2022-jp"] = JAPANESE_JIS; + encoding_map["iso-2022-kr"] = ISO_2022_KR; + encoding_map["iso-8559-1"] = ISO_8859_1; // common typo + encoding_map["iso-874"] = MSFT_CP874; + encoding_map["iso-8858-1"] = ISO_8859_1; // common typo + // iso-8859-0 was a temporary name, eventually renamed iso-8859-15 + encoding_map["iso-8859-0"] = ISO_8859_15; + encoding_map["iso-8859-1"] = ISO_8859_1; + encoding_map["iso-8859-10"] = ISO_8859_10; + encoding_map["iso-8859-11"] = ISO_8859_11; + encoding_map["iso-8859-13"] = ISO_8859_13; + encoding_map["iso-8859-15"] = ISO_8859_15; + encoding_map["iso-8859-2"] = ISO_8859_2; + encoding_map["iso-8859-3"] = ISO_8859_3; + encoding_map["iso-8859-4"] = ISO_8859_4; + encoding_map["iso-8859-5"] = ISO_8859_5; + encoding_map["iso-8859-6"] = ISO_8859_6; + encoding_map["iso-8859-7"] = ISO_8859_7; + encoding_map["iso-8859-8"] = ISO_8859_8; + encoding_map["iso-8859-8-i"] = ISO_8859_8_I; + encoding_map["iso-8859-9"] = ISO_8859_9; + encoding_map["iso-9959-1"] = ISO_8859_1; // common typo + encoding_map["iso-ir-100"] = ISO_8859_1; + encoding_map["iso-ir-101"] = ISO_8859_2; + 
encoding_map["iso-ir-109"] = ISO_8859_3; + encoding_map["iso-ir-110"] = ISO_8859_4; + encoding_map["iso-ir-126"] = ISO_8859_7; + encoding_map["iso-ir-127"] = ISO_8859_6; + encoding_map["iso-ir-138"] = ISO_8859_8; + encoding_map["iso-ir-144"] = ISO_8859_5; + encoding_map["iso-ir-148"] = ISO_8859_9; + encoding_map["iso-ir-149"] = KOREAN_EUC_KR; + encoding_map["iso-ir-157"] = ISO_8859_10; + encoding_map["iso-ir-58"] = CHINESE_GB; + encoding_map["iso-latin-1"] = ISO_8859_1; + encoding_map["iso_2022-cn"] = ISO_2022_CN; + encoding_map["iso_2022-kr"] = ISO_2022_KR; + encoding_map["iso_8859-1"] = ISO_8859_1; + encoding_map["iso_8859-10:1992"] = ISO_8859_10; + encoding_map["iso_8859-11"] = ISO_8859_11; + encoding_map["iso_8859-13"] = ISO_8859_13; + encoding_map["iso_8859-15"] = ISO_8859_15; + encoding_map["iso_8859-1:1987"] = ISO_8859_1; + encoding_map["iso_8859-2"] = ISO_8859_2; + encoding_map["iso_8859-2:1987"] = ISO_8859_2; + encoding_map["iso_8859-3"] = ISO_8859_3; + encoding_map["iso_8859-3:1988"] = ISO_8859_3; + encoding_map["iso_8859-4"] = ISO_8859_4; + encoding_map["iso_8859-4:1988"] = ISO_8859_4; + encoding_map["iso_8859-5"] = ISO_8859_5; + encoding_map["iso_8859-5:1988"] = ISO_8859_5; + encoding_map["iso_8859-6"] = ISO_8859_6; + encoding_map["iso_8859-6:1987"] = ISO_8859_6; + encoding_map["iso_8859-7"] = ISO_8859_7; + encoding_map["iso_8859-7:1987"] = ISO_8859_7; + encoding_map["iso_8859-8"] = ISO_8859_8; + encoding_map["iso_8859-8:1988:"] = ISO_8859_8; + encoding_map["iso_8859-9"] = ISO_8859_9; + encoding_map["iso_8859-9:1989"] = ISO_8859_9; + encoding_map["jagran"] = JAGRAN; + encoding_map["jis"] = JAPANESE_JIS; // not iana standard + encoding_map["koi8-cs"] = CZECH_CSN_369103; + encoding_map["koi8-r"] = RUSSIAN_KOI8_R; + encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard + encoding_map["koi8-u"] = RUSSIAN_KOI8_RU; + encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard + encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard + 
encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant + encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard + encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard + encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR; + encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard + encoding_map["l1"] = ISO_8859_1; + encoding_map["l2"] = ISO_8859_2; + encoding_map["l3"] = ISO_8859_3; + encoding_map["l4"] = ISO_8859_4; + encoding_map["l5"] = ISO_8859_9; + encoding_map["l6"] = ISO_8859_10; + encoding_map["latin-1"] = ISO_8859_1; // not iana standard + encoding_map["latin1"] = ISO_8859_1; + encoding_map["latin2"] = ISO_8859_2; + encoding_map["latin3"] = ISO_8859_3; + encoding_map["latin4"] = ISO_8859_4; + encoding_map["latin5"] = ISO_8859_9; + encoding_map["latin6"] = ISO_8859_10; + encoding_map["mac"] = MACINTOSH_ROMAN; + encoding_map["macintosh"] = MACINTOSH_ROMAN; + encoding_map["macintosh-roman"] = MACINTOSH_ROMAN; + encoding_map["ms932"] = JAPANESE_CP932; // not iana standard + encoding_map["ms_kanji"] = JAPANESE_CP932; + encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS; + encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS; + encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["sun_eu_greek"] = ISO_8859_7; + encoding_map["tab"] = TAMIL_BI; + encoding_map["tam"] = TAMIL_MONO; + encoding_map["tis-620"] = ISO_8859_11; + encoding_map["tscii"] = TSCII; + encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard + encoding_map["unicode"] = UNICODE; // not iana standard + encoding_map["unicode-1-1-utf-7"] = UTF7; + encoding_map["unicode-1-1-utf-8"] = UTF8; + encoding_map["unicode-2-0-utf-7"] = UTF7; + encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard + encoding_map["us"] = ISO_8859_1; + encoding_map["us-ascii"] = ISO_8859_1; + encoding_map["utf-16be"] = UTF16BE; + encoding_map["utf-16le"] = UTF16LE; + encoding_map["utf-32be"] = 
UTF32BE; + encoding_map["utf-32le"] = UTF32LE; + encoding_map["utf-7"] = UTF7; + encoding_map["utf-8"] = UTF8; + encoding_map["utf7"] = UTF7; + encoding_map["utf8"] = UTF8; // not iana standard + encoding_map["visual"] = HEBREW_VISUAL; + encoding_map["win-1250"] = MSFT_CP1250; // not iana standard + encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard + encoding_map["window-874"] = MSFT_CP874; + encoding_map["windows-1250"] = MSFT_CP1250; + encoding_map["windows-1251"] = RUSSIAN_CP1251; + encoding_map["windows-1252"] = MSFT_CP1252; + encoding_map["windows-1253"] = MSFT_CP1253; + encoding_map["windows-1254"] = MSFT_CP1254; + encoding_map["windows-1255"] = MSFT_CP1255; + encoding_map["windows-1256"] = MSFT_CP1256; + encoding_map["windows-1257"] = MSFT_CP1257; + encoding_map["windows-31j"] = JAPANESE_CP932; + encoding_map["windows-874"] = MSFT_CP874; + encoding_map["windows-936"] = GBK; + encoding_map["x-big5"] = CHINESE_BIG5; + encoding_map["x-binaryenc"] = BINARYENC; // not iana standard + encoding_map["x-cp1250"] = MSFT_CP1250; + encoding_map["x-cp1251"] = RUSSIAN_CP1251; + encoding_map["x-cp1252"] = MSFT_CP1252; + encoding_map["x-cp1253"] = MSFT_CP1253; + encoding_map["x-cp1254"] = MSFT_CP1254; + encoding_map["x-cp1255"] = MSFT_CP1255; + encoding_map["x-cp1256"] = MSFT_CP1256; + encoding_map["x-cp1257"] = MSFT_CP1257; + encoding_map["x-euc-jp"] = JAPANESE_EUC_JP; + encoding_map["x-euc-tw"] = CHINESE_CNS; + encoding_map["x-gbk"] = GBK; + encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE; + encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE; + encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE; + encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE; + encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard + encoding_map["x-mac-roman"] = MACINTOSH_ROMAN; + encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS; + encoding_map["x-unicode-2-0-utf-7"] = UTF7; + encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana 
standard + encoding_map["x-x-big5"] = CHINESE_BIG5; + encoding_map["zh_cn.euc"] = CHINESE_GB; + encoding_map["zh_tw-big5"] = CHINESE_BIG5; + encoding_map["zh_tw-euc"] = CHINESE_CNS; + + // Remove they entry for the empty string, if any. + encoding_map.erase(""); + + return encoding_map; +} + +// ---------------------------------------------------------------------- +// EncodingNameAliasToEncoding() +// +// This function takes an encoding name/alias and returns the Encoding +// enum. The input is case insensitive. It is the union of the common +// IANA standard names, the charset names used in Netscape Navigator, +// and some common names we have been using. +// See: http://www.iana.org/assignments/character-sets +// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html +// +// UNKNOWN_ENCODING is returned if none matches. +// +// TODO: Check if it is possible to remove the non-standard, +// non-netscape-use names. It is because this routine is used for +// encoding detections from html meta info. Non-standard names may +// introduce noise on encoding detection. +// +// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName, +// or determine why such a unification is not possible. 
+// ---------------------------------------------------------------------- +Encoding EncodingNameAliasToEncoding(const char *encoding_name) { + if (!encoding_name) { + return UNKNOWN_ENCODING; + } + + const EncodingMap& encoding_map = GetEncodingMap(); + + EncodingMap::const_iterator emi = encoding_map.find(encoding_name); + if (emi != encoding_map.end()) { + return emi->second; + } else { + return UNKNOWN_ENCODING; + } +} + +const char* default_encoding_name() { + return kEncodingInfoTable[LATIN1].encoding_name_; +} + +static const char* const kInvalidEncodingName = "invalid_encoding"; + +const char *invalid_encoding_name() { + return kInvalidEncodingName; +} + + + +// ************************************************************* +// Miscellany +// ************************************************************* + + +Encoding PreferredWebOutputEncoding(Encoding enc) { + return IsValidEncoding(enc) + ? kEncodingInfoTable[enc].preferred_web_output_encoding_ + : UTF8; +} diff --git a/contrib/google-ced/util/encodings/encodings.h b/contrib/google-ced/util/encodings/encodings.h new file mode 100644 index 000000000..647797432 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.h @@ -0,0 +1,299 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_ENCODINGS_ENCODINGS_H_ +#define UTIL_ENCODINGS_ENCODINGS_H_ + +// This interface defines the Encoding enum and various functions that +// depend only on Encoding values. + +// A hash-function for Encoding, hash<Encoding>, is defined in +// i18n/encodings/public/encodings-hash.h + +// On some Windows projects, UNICODE may be defined, which would prevent the +// Encoding enum below from compiling. Note that this is a quick fix that does +// not break any existing projects. The UNICODE enum may someday be changed +// to something more specific and non-colliding, but this involves careful +// testing of changes in many other projects. +#undef UNICODE + +// NOTE: The Encoding enum must always start at 0. This assumption has +// been made and used. + +#ifndef SWIG + +#include "util/encodings/encodings.pb.h" + +#else + +// TODO: Include a SWIG workaround header file. + +#endif + +const int kNumEncodings = NUM_ENCODINGS; + +// some of the popular encoding aliases +// TODO: Make these static const Encoding values instead of macros. +#define LATIN1 ISO_8859_1 +#define LATIN2 ISO_8859_2 +#define LATIN3 ISO_8859_3 +#define LATIN4 ISO_8859_4 +#define CYRILLIC ISO_8859_5 +#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language +#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language +#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language +#define LATIN5 ISO_8859_9 +#define LATIN6 ISO_8859_10 +#define KOREAN_HANGUL KOREAN_EUC_KR + +// The default Encoding (LATIN1). 
+Encoding default_encoding(); + + + +// ************************************************************* +// Encoding predicates +// IsValidEncoding() +// IsEncEncCompatible +// IsSupersetOfAscii7Bit +// Is8BitEncoding +// IsCJKEncoding +// IsHebrewEncoding +// IsRightToLeftEncoding +// IsLogicalRightToLeftEncoding +// IsVisualRightToLeftEncoding +// IsIso2022Encoding +// IsIso2022JpOrVariant +// IsShiftJisOrVariant +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ************************************************************* + +// IsValidEncoding +// =================================== +// +// Function to check if the input encoding enum is within range. +// + +bool IsValidEncoding(Encoding enc); + +// +// IsEncEncCompatible +// ------------------ +// +// This function is to determine whether or not converting from the +// first encoding to the second requires any changes to the underlying +// text (e.g. ASCII_7BIT is a subset of UTF8). +// +// TODO: the current implementation is likely incomplete. It would be +// good to consider the full matrix of all pairs of encodings and to fish out +// all compatible pairs. +// +bool IsEncEncCompatible(const Encoding from, const Encoding to); + +// To be a superset of 7-bit Ascii means that bytes 0...127 in the given +// encoding represent the same characters as they do in ISO_8859_1. + +// WARNING: This function does not currently return true for all encodings that +// are supersets of Ascii 7-bit. +bool IsSupersetOfAscii7Bit(Encoding e); + +// To be an 8-bit encoding means that there are fewer than 256 symbols. +// Each byte determines a new character; there are no multi-byte sequences. + +// WARNING: This function does not currently return true for all encodings that +// are 8-bit encodings. +bool Is8BitEncoding(Encoding e); + +// IsCJKEncoding +// ------------- +// +// This function returns true if the encoding is either Chinese +// (simplified or traditional), Japanese, or Korean. 
Note: UTF8 is not +// considered a CJK encoding. +bool IsCJKEncoding(Encoding e); + +// IsHebrewEncoding +// ------------- +// +// This function returns true if the encoding is a Hebrew specific +// encoding (not UTF8, etc). +bool IsHebrewEncoding(Encoding e); + +// IsRightToLeftEncoding +// --------------------- +// +// Returns true if the encoding is a right-to-left encoding. +// +// Note that the name of this function is somewhat misleading. There is nothing +// "right to left" about these encodings. They merely contain code points for +// characters in RTL languages such as Hebrew and Arabic. But this is also +// true for UTF-8. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. Anything we +// need to do for all 'RTL' encodings we need to do for UTF-8 as well. +bool IsRightToLeftEncoding(Encoding enc); + +// IsLogicalRightToLeftEncoding +// ---------------------------- +// +// Returns true if the encoding is a logical right-to-left encoding. +// Logical right-to-left encodings are those that the browser renders +// right-to-left and applies the BiDi algorithm to. Therefore the characters +// appear in reading order in the file, and indexing, snippet generation etc. +// should all just work with no special processing. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. +bool IsLogicalRightToLeftEncoding(Encoding enc); + +// IsVisualRightToLeftEncoding +// --------------------------- +// +// Returns true if the encoding is a visual right-to-left encoding. +// Visual right-to-left encodings are those that the browser renders +// left-to-right and does not apply the BiDi algorithm to. Therefore each +// line appears in reverse order in the file, lines are manually wrapped +// by abusing <br> or <p> tags, etc. 
Visual RTL encoding is a relic of +// the prehistoric days when browsers couldn't render right-to-left, but +// unfortunately some visual pages persist to this day. These documents require +// special processing so that we don't index or snippet them with each line +// reversed. +bool IsVisualRightToLeftEncoding(Encoding enc); + +// IsIso2022Encoding +// ----------------- +// +// Returns true if the encoding is a kind of ISO 2022 such as +// ISO-2022-JP. +bool IsIso2022Encoding(Encoding enc); + +// IsIso2022JpOrVariant +// -------------------- +// +// Returns true if the encoding is ISO-2022-JP or a variant such as +// KDDI's ISO-2022-JP. +bool IsIso2022JpOrVariant(Encoding enc); + +// IsShiftJisOrVariant +// -------------------- +// +// Returns true if the encoding is Shift_JIS or a variant such as +// KDDI's Shift_JIS. +bool IsShiftJisOrVariant(Encoding enc); + +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ------------------------------------------ +// +// Returns true if the encoding is specific to a Japanese cell phone +// carrier, such as KDDI_SHIFT_JIS. +bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc); + + + +// ************************************************************* +// ENCODING NAMES +// +// This interface defines a standard name for each valid encoding, and +// a standard name for invalid encodings. (Some names use all upper +// case, but others use mixed case.) +// +// EncodingName() [Encoding to name] +// MimeEncodingName() [Encoding to name] +// EncodingFromName() [name to Encoding] +// EncodingNameAliasToEncoding() [name to Encoding] +// default_encoding_name() +// invalid_encoding_name() +// ************************************************************* + +// EncodingName +// ------------ +// +// Given the encoding, returns its standard name. +// Return invalid_encoding_name() if the encoding is invalid. 
+// +const char* EncodingName(Encoding enc); + +// +// MimeEncodingName +// ---------------- +// +// Return the "preferred MIME name" of an encoding. +// +// This name is suitable for using in HTTP headers, HTML tags, +// and as the "charset" parameter of a MIME Content-Type. +const char* MimeEncodingName(Encoding enc); + + +// The maximum length of an encoding name +const int kMaxEncodingNameSize = 50; + +// The standard name of the default encoding. +const char* default_encoding_name(); + +// The name used for an invalid encoding. +const char* invalid_encoding_name(); + +// EncodingFromName +// ---------------- +// +// If enc_name matches the standard name of an Encoding, using a +// case-insensitive comparison, set *encoding to that Encoding and +// return true. Otherwise set *encoding to UNKNOWN_ENCODING and +// return false. +// +// REQUIRES: encoding must not be NULL. +// +bool EncodingFromName(const char* enc_name, Encoding *encoding); + +// +// EncodingNameAliasToEncoding +// --------------------------- +// +// If enc_name matches the standard name or an alias of an Encoding, +// using a case-insensitive comparison, return that +// Encoding. Otherwise, return UNKNOWN_ENCODING. +// +// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for +// GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and +// common variations with hyphens and underscores (e.g., "koi8-u" and +// "koi8u" for RUSSIAN_KOI8_R). + +Encoding EncodingNameAliasToEncoding(const char *enc_name); + +// ************************************************************* +// Miscellany +// ************************************************************* + +// PreferredWebOutputEncoding +// -------------------------- +// +// Some multi-byte encodings use byte values that coincide with the +// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE +// can misinterpret these, as indicated in an external XSS report from +// 2007-02-15. 
Here, we map these dangerous encodings to safer ones. We +// also use UTF8 instead of encodings that we don't support in our +// output, and we generally try to be conservative in what we send out. +// Where the client asks for single- or double-byte encodings that are +// not as common, we substitute a more common single- or double-byte +// encoding, if there is one, thereby preserving the client's intent +// to use less space than UTF-8. This also means that characters +// outside the destination set will be converted to HTML NCRs (&#NNN;) +// if requested. +Encoding PreferredWebOutputEncoding(Encoding enc); + + +#endif // UTIL_ENCODINGS_ENCODINGS_H_ diff --git a/contrib/google-ced/util/encodings/encodings.pb.h b/contrib/google-ced/util/encodings/encodings.pb.h new file mode 100644 index 000000000..ffbd716ec --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.pb.h @@ -0,0 +1,181 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_ +#define UTIL_ENCODINGS_ENCODINGS_PB_H_ + +enum Encoding { + ISO_8859_1 = 0, // Teragram ASCII + ISO_8859_2 = 1, // Teragram Latin2 + ISO_8859_3 = 2, // in BasisTech but not in Teragram + ISO_8859_4 = 3, // Teragram Latin4 + ISO_8859_5 = 4, // Teragram ISO-8859-5 + ISO_8859_6 = 5, // Teragram Arabic + ISO_8859_7 = 6, // Teragram Greek + ISO_8859_8 = 7, // Teragram Hebrew + ISO_8859_9 = 8, // in BasisTech but not in Teragram + ISO_8859_10 = 9, // in BasisTech but not in Teragram + JAPANESE_EUC_JP = 10, // Teragram EUC_JP + JAPANESE_SHIFT_JIS = 11, // Teragram SJS + JAPANESE_JIS = 12, // Teragram JIS + CHINESE_BIG5 = 13, // Teragram BIG5 + CHINESE_GB = 14, // Teragram GB + CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram EUC-CN(!) + // See //i18n/basistech/basistech_encodings.h + KOREAN_EUC_KR = 16, // Teragram KSC + UNICODE = 17, // Teragram Unicode + CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram EUC. + CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram CNS. + CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950 + JAPANESE_CP932 = 21, // Teragram CP932 + UTF8 = 22, + UNKNOWN_ENCODING = 23, + ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127. + // Should be present only in the crawler + // and in the repository, + // *never* as a result of Document::encoding(). + RUSSIAN_KOI8_R = 25, // Teragram KOI8R + RUSSIAN_CP1251 = 26, // Teragram CP1251 + + //---------------------------------------------------------- + // These are _not_ output from teragram. Instead, they are as + // detected in the headers of usenet articles. + MSFT_CP1252 = 27, // CP1252 aka MSFT euro ascii + RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U. + // KOI8-U is used much more often than KOI8-RU. + MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european + ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized + //---------------------------------------------------------- + + //---------------------------------------------------------- + // These are in BasisTech but not in Teragram. They are + // needed for new interface languages. Now detected by + // research langid + MSFT_CP1254 = 31, // used for Turkish + MSFT_CP1257 = 32, // used in Baltic countries + //---------------------------------------------------------- + + //---------------------------------------------------------- + //---------------------------------------------------------- + // New encodings detected by Teragram + ISO_8859_11 = 33, // aka TIS-620, used for Thai + MSFT_CP874 = 34, // used for Thai + MSFT_CP1256 = 35, // used for Arabic + + //---------------------------------------------------------- + // Detected as ISO_8859_8 by Teragram, but can be found in META tags + MSFT_CP1255 = 36, // Logical Hebrew Microsoft + ISO_8859_8_I = 37, // Iso Hebrew Logical + HEBREW_VISUAL = 38, // Iso Hebrew Visual + //---------------------------------------------------------- + + //---------------------------------------------------------- + // Detected by research langid + CZECH_CP852 = 39, + CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS + MSFT_CP1253 = 41, // used for Greek + RUSSIAN_CP866 = 42, + //---------------------------------------------------------- + + //---------------------------------------------------------- + // Handled by iconv in glibc + ISO_8859_13 = 43, + ISO_2022_KR = 44, + GBK = 45, + GB18030 = 46, + BIG5_HKSCS = 47, + ISO_2022_CN = 48, + + //----------------------------------------------------------- + // Detected by xin liu's detector + // Handled by transcoder + // (Indic encodings) + + TSCII = 49, + TAMIL_MONO = 50, + TAMIL_BI = 51, + JAGRAN = 52, + + + MACINTOSH_ROMAN = 
53, + UTF7 = 54, + BHASKAR = 55, // Indic encoding - Devanagari + HTCHANAKYA = 56, // Indic encoding - Devanagari + + //----------------------------------------------------------- + // These allow a single place (inputconverter and outputconverter) + // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8 + // bulk conversions, with interchange-valid checking on input and + // fallback if needed on output. + UTF16BE = 57, // big-endian UTF-16 + UTF16LE = 58, // little-endian UTF-16 + UTF32BE = 59, // big-endian UTF-32 + UTF32LE = 60, // little-endian UTF-32 + //----------------------------------------------------------- + + //----------------------------------------------------------- + // An encoding that means "This is not text, but it may have some + // simple ASCII text embedded". Intended input conversion (not yet + // implemented) is to keep strings of >=4 seven-bit ASCII characters + // (follow each kept string with an ASCII space), delete the rest of + // the bytes. This will pick up and allow indexing of e.g. captions + // in JPEGs. No output conversion needed. + BINARYENC = 61, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some Web pages allow a mixture of HZ-GB and GB-2312 by using + // ~{ ... ~} for 2-byte pairs, and the browsers support this. + HZ_GB_2312 = 62, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some external vendors make the common input error of + // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed. + UTF8UTF8 = 63, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Handled by transcoder for Tamil-language-specific font + // encodings without the support for detection at present.
+ TAM_ELANGO = 64, // Elango - Tamil + TAM_LTTMBARANI = 65, // Barani - Tamil + TAM_SHREE = 66, // Shree - Tamil + TAM_TBOOMIS = 67, // TBoomis - Tamil + TAM_TMNEWS = 68, // TMNews - Tamil + TAM_WEBTAMIL = 69, // Webtamil - Tamil + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Shift_JIS variants used by Japanese cell phone carriers. + KDDI_SHIFT_JIS = 70, + DOCOMO_SHIFT_JIS = 71, + SOFTBANK_SHIFT_JIS = 72, + // ISO-2022-JP variants used by KDDI and SoftBank. + KDDI_ISO_2022_JP = 73, + SOFTBANK_ISO_2022_JP = 74, + //----------------------------------------------------------- + + NUM_ENCODINGS = 75, // Always keep this at the end. It is not a + // valid Encoding enum, it is only used to + // indicate the total number of Encodings. +}; + +#endif // UTIL_ENCODINGS_ENCODINGS_PB_H_ diff --git a/contrib/google-ced/util/encodings/encodings_unittest.cc b/contrib/google-ced/util/encodings/encodings_unittest.cc new file mode 100644 index 000000000..223e3e45b --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings_unittest.cc @@ -0,0 +1,34 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/encodings/encodings.h" + +#include "gtest/gtest.h" + +TEST(EncodingsTest, EncodingNameAliasToEncoding) { + // Test that cases, non-alpha-numeric chars are ignored. + EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1")); + EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1")); + + // Test that spaces are ignored. + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8")); + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8")); + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8")); + + // Test alphanumeric differences are counted. + EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7")); + EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp")); +} diff --git a/contrib/google-ced/util/languages/languages.cc b/contrib/google-ced/util/languages/languages.cc new file mode 100644 index 000000000..852351fc6 --- /dev/null +++ b/contrib/google-ced/util/languages/languages.cc @@ -0,0 +1,349 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/languages/languages.h" + +#include "util/basictypes.h" +#include "util/string_util.h" + + +Language default_language() {return ENGLISH;} + + +// Language names and codes + +struct LanguageInfo { + const char * language_name_; + const char * language_code_639_1_; // the ISO-639-1 code for the language + const char * language_code_639_2_; // the ISO-639-2 code for the language + const char * language_code_other_; // some nonstandard code for the language +}; + +static const LanguageInfo kLanguageInfoTable[] = { + { "ENGLISH", "en", "eng", NULL}, + { "DANISH", "da", "dan", NULL}, + { "DUTCH", "nl", "dut", NULL}, + { "FINNISH", "fi", "fin", NULL}, + { "FRENCH", "fr", "fre", NULL}, + { "GERMAN", "de", "ger", NULL}, + { "HEBREW", "he", "heb", NULL}, + { "ITALIAN", "it", "ita", NULL}, + { "Japanese", "ja", "jpn", NULL}, + { "Korean", "ko", "kor", NULL}, + { "NORWEGIAN", "nb", "nor", NULL}, + { "POLISH", "pl", "pol", NULL}, + { "PORTUGUESE", "pt", "por", NULL}, + { "RUSSIAN", "ru", "rus", NULL}, + { "SPANISH", "es", "spa", NULL}, + { "SWEDISH", "sv", "swe", NULL}, + { "Chinese", "zh", "chi", "zh-CN"}, + { "CZECH", "cs", "cze", NULL}, + { "GREEK", "el", "gre", NULL}, + { "ICELANDIC", "is", "ice", NULL}, + { "LATVIAN", "lv", "lav", NULL}, + { "LITHUANIAN", "lt", "lit", NULL}, + { "ROMANIAN", "ro", "rum", NULL}, + { "HUNGARIAN", "hu", "hun", NULL}, + { "ESTONIAN", "et", "est", NULL}, + // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE" + // and "Unknown", they are essentially the same. Need to unify them. + // "un" and "ut" are invented by us, not from ISO-639. 
+ // + { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"}, + { "Unknown", NULL, NULL, "un"}, + { "BULGARIAN", "bg", "bul", NULL}, + { "CROATIAN", "hr", "scr", NULL}, + { "SERBIAN", "sr", "scc", NULL}, + { "IRISH", "ga", "gle", NULL}, + { "GALICIAN", "gl", "glg", NULL}, + // Impossible to tell Tagalog from Filipino at the moment. + // Use ISO 639-2 code for Filipino here. + { "TAGALOG", NULL, "fil", NULL}, + { "TURKISH", "tr", "tur", NULL}, + { "UKRAINIAN", "uk", "ukr", NULL}, + { "HINDI", "hi", "hin", NULL}, + { "MACEDONIAN", "mk", "mac", NULL}, + { "BENGALI", "bn", "ben", NULL}, + { "INDONESIAN", "id", "ind", NULL}, + { "LATIN", "la", "lat", NULL}, + { "MALAY", "ms", "may", NULL}, + { "MALAYALAM", "ml", "mal", NULL}, + { "WELSH", "cy", "wel", NULL}, + { "NEPALI", "ne", "nep", NULL}, + { "TELUGU", "te", "tel", NULL}, + { "ALBANIAN", "sq", "alb", NULL}, + { "TAMIL", "ta", "tam", NULL}, + { "BELARUSIAN", "be", "bel", NULL}, + { "JAVANESE", "jw", "jav", NULL}, + { "OCCITAN", "oc", "oci", NULL}, + { "URDU", "ur", "urd", NULL}, + { "BIHARI", "bh", "bih", NULL}, + { "GUJARATI", "gu", "guj", NULL}, + { "THAI", "th", "tha", NULL}, + { "ARABIC", "ar", "ara", NULL}, + { "CATALAN", "ca", "cat", NULL}, + { "ESPERANTO", "eo", "epo", NULL}, + { "BASQUE", "eu", "baq", NULL}, + { "INTERLINGUA", "ia", "ina", NULL}, + { "KANNADA", "kn", "kan", NULL}, + { "PUNJABI", "pa", "pan", NULL}, + { "SCOTS_GAELIC", "gd", "gla", NULL}, + { "SWAHILI", "sw", "swa", NULL}, + { "SLOVENIAN", "sl", "slv", NULL}, + { "MARATHI", "mr", "mar", NULL}, + { "MALTESE", "mt", "mlt", NULL}, + { "VIETNAMESE", "vi", "vie", NULL}, + { "FRISIAN", "fy", "fry", NULL}, + { "SLOVAK", "sk", "slo", NULL}, + { "ChineseT", + NULL, NULL, // We intentionally set these 2 fields to NULL to avoid + // confusion between CHINESE_T and CHINESE. 
+ "zh-TW"}, + { "FAROESE", "fo", "fao", NULL}, + { "SUNDANESE", "su", "sun", NULL}, + { "UZBEK", "uz", "uzb", NULL}, + { "AMHARIC", "am", "amh", NULL}, + { "AZERBAIJANI", "az", "aze", NULL}, + { "GEORGIAN", "ka", "geo", NULL}, + { "TIGRINYA", "ti", "tir", NULL}, + { "PERSIAN", "fa", "per", NULL}, + { "BOSNIAN", "bs", "bos", NULL}, + { "SINHALESE", "si", "sin", NULL}, + { "NORWEGIAN_N", "nn", "nno", NULL}, + { "PORTUGUESE_P", NULL, NULL, "pt-PT"}, + { "PORTUGUESE_B", NULL, NULL, "pt-BR"}, + { "XHOSA", "xh", "xho", NULL}, + { "ZULU", "zu", "zul", NULL}, + { "GUARANI", "gn", "grn", NULL}, + { "SESOTHO", "st", "sot", NULL}, + { "TURKMEN", "tk", "tuk", NULL}, + { "KYRGYZ", "ky", "kir", NULL}, + { "BRETON", "br", "bre", NULL}, + { "TWI", "tw", "twi", NULL}, + { "YIDDISH", "yi", "yid", NULL}, + { "SERBO_CROATIAN", "sh", NULL, NULL}, + { "SOMALI", "so", "som", NULL}, + { "UIGHUR", "ug", "uig", NULL}, + { "KURDISH", "ku", "kur", NULL}, + { "MONGOLIAN", "mn", "mon", NULL}, + { "ARMENIAN", "hy", "arm", NULL}, + { "LAOTHIAN", "lo", "lao", NULL}, + { "SINDHI", "sd", "snd", NULL}, + { "RHAETO_ROMANCE", "rm", "roh", NULL}, + { "AFRIKAANS", "af", "afr", NULL}, + { "LUXEMBOURGISH", "lb", "ltz", NULL}, + { "BURMESE", "my", "bur", NULL}, + // KHMER is known as Cambodian for Google user interfaces. 
+ { "KHMER", "km", "khm", NULL}, + { "TIBETAN", "bo", "tib", NULL}, + { "DHIVEHI", "dv", "div", NULL}, + { "CHEROKEE", NULL, "chr", NULL}, + { "SYRIAC", NULL, "syr", NULL}, + { "LIMBU", NULL, NULL, "sit-NP"}, + { "ORIYA", "or", "ori", NULL}, + { "ASSAMESE", "as", "asm", NULL}, + { "CORSICAN", "co", "cos", NULL}, + { "INTERLINGUE", "ie", "ine", NULL}, + { "KAZAKH", "kk", "kaz", NULL}, + { "LINGALA", "ln", "lin", NULL}, + { "MOLDAVIAN", "mo", "mol", NULL}, + { "PASHTO", "ps", "pus", NULL}, + { "QUECHUA", "qu", "que", NULL}, + { "SHONA", "sn", "sna", NULL}, + { "TAJIK", "tg", "tgk", NULL}, + { "TATAR", "tt", "tat", NULL}, + { "TONGA", "to", "tog", NULL}, + { "YORUBA", "yo", "yor", NULL}, + { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL}, + { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL}, + { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL}, + { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL}, + { "MAORI", "mi", "mao", NULL}, + { "WOLOF", "wo", "wol", NULL}, + { "ABKHAZIAN", "ab", "abk", NULL}, + { "AFAR", "aa", "aar", NULL}, + { "AYMARA", "ay", "aym", NULL}, + { "BASHKIR", "ba", "bak", NULL}, + { "BISLAMA", "bi", "bis", NULL}, + { "DZONGKHA", "dz", "dzo", NULL}, + { "FIJIAN", "fj", "fij", NULL}, + { "GREENLANDIC", "kl", "kal", NULL}, + { "HAUSA", "ha", "hau", NULL}, + { "HAITIAN_CREOLE", "ht", NULL, NULL}, + { "INUPIAK", "ik", "ipk", NULL}, + { "INUKTITUT", "iu", "iku", NULL}, + { "KASHMIRI", "ks", "kas", NULL}, + { "KINYARWANDA", "rw", "kin", NULL}, + { "MALAGASY", "mg", "mlg", NULL}, + { "NAURU", "na", "nau", NULL}, + { "OROMO", "om", "orm", NULL}, + { "RUNDI", "rn", "run", NULL}, + { "SAMOAN", "sm", "smo", NULL}, + { "SANGO", "sg", "sag", NULL}, + { "SANSKRIT", "sa", "san", NULL}, + { "SISWANT", "ss", "ssw", NULL}, + { "TSONGA", "ts", "tso", NULL}, + { "TSWANA", "tn", "tsn", NULL}, + { "VOLAPUK", "vo", "vol", NULL}, + { "ZHUANG", "za", "zha", NULL}, + { "KHASI", NULL, "kha", NULL}, + { "SCOTS", NULL, "sco", NULL}, + { "GANDA", "lg", 
"lug", NULL}, + { "MANX", "gv", "glv", NULL}, + { "MONTENEGRIN", NULL, NULL, "sr-ME"}, + { "XX", NULL, NULL, "XX"}, +}; + +COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1, + kLanguageInfoTable_has_incorrect_length); + + +// LANGUAGE NAMES + +const char* default_language_name() { + return kLanguageInfoTable[ENGLISH].language_name_; +} + +static const char* const kInvalidLanguageName = "invalid_language"; + +const char *invalid_language_name() { + return kInvalidLanguageName; +} + +const char* LanguageName(Language lang) { + return IsValidLanguage(lang) + ? kLanguageInfoTable[lang].language_name_ + : kInvalidLanguageName; +} + + + +// LANGUAGE CODES + + +// The space before invalid_language_code is intentional. It is used +// to prevent it matching any two letter language code. +// +static const char* const kInvalidLanguageCode = " invalid_language_code"; + +const char *invalid_language_code() { + return kInvalidLanguageCode; +} + +const char * LanguageCode(Language lang) { + if (! IsValidLanguage(lang)) + return kInvalidLanguageCode; + const LanguageInfo& info = kLanguageInfoTable[lang]; + if (info.language_code_639_1_) { + return info.language_code_639_1_; + } else if (info.language_code_639_2_) { + return info.language_code_639_2_; + } else if (info.language_code_other_) { + return info.language_code_other_; + } else { + return kInvalidLanguageCode; + } +} + +const char* default_language_code() { + return kLanguageInfoTable[ENGLISH].language_code_639_1_; +} + +const char* LanguageCodeISO639_1(Language lang) { + if (! IsValidLanguage(lang)) + return kInvalidLanguageCode; + if (const char* code = kLanguageInfoTable[lang].language_code_639_1_) + return code; + return kInvalidLanguageCode; +} + +const char* LanguageCodeISO639_2(Language lang) { + if (! 
IsValidLanguage(lang)) + return kInvalidLanguageCode; + if (const char* code = kLanguageInfoTable[lang].language_code_639_2_) + return code; + return kInvalidLanguageCode; +} + +const char* LanguageCodeWithDialects(Language lang) { + if (lang == CHINESE) + return "zh-CN"; + return LanguageCode(lang); +} + + + +bool LanguageFromCode(const char* lang_code, Language *language) { + *language = UNKNOWN_LANGUAGE; + if ( lang_code == NULL ) return false; + + for ( int i = 0 ; i < kNumLanguages ; i++ ) { + const LanguageInfo& info = kLanguageInfoTable[i]; + if ((info.language_code_639_1_ && + !base::strcasecmp(lang_code, info.language_code_639_1_)) || + (info.language_code_639_2_ && + !base::strcasecmp(lang_code, info.language_code_639_2_)) || + (info.language_code_other_ && + !base::strcasecmp(lang_code, info.language_code_other_))) { + *language = static_cast<Language>(i); + return true; + } + } + + // For convenience, this function can also parse the non-standard + // five-letter language codes "zh-cn" and "zh-tw" which are used by + // front-ends such as GWS to distinguish Simplified from Traditional + // Chinese. + if (!base::strcasecmp(lang_code, "zh-cn") || + !base::strcasecmp(lang_code, "zh_cn")) { + *language = CHINESE; + return true; + } + if (!base::strcasecmp(lang_code, "zh-tw") || + !base::strcasecmp(lang_code, "zh_tw")) { + *language = CHINESE_T; + return true; + } + if (!base::strcasecmp(lang_code, "sr-me") || + !base::strcasecmp(lang_code, "sr_me")) { + *language = MONTENEGRIN; + return true; + } + + // Process language-code synonyms. + if (!base::strcasecmp(lang_code, "he")) { + *language = HEBREW; // Use "iw". + return true; + } + if (!base::strcasecmp(lang_code, "in")) { + *language = INDONESIAN; // Use "id". + return true; + } + if (!base::strcasecmp(lang_code, "ji")) { + *language = YIDDISH; // Use "yi". + return true; + } + + // Process language-detection synonyms. 
+ // These distinct languages cannot be differentiated by our current + // language-detection algorithms. + if (!base::strcasecmp(lang_code, "fil")) { + *language = TAGALOG; + return true; + } + + return false; +} diff --git a/contrib/google-ced/util/languages/languages.h b/contrib/google-ced/util/languages/languages.h new file mode 100644 index 000000000..4237961e3 --- /dev/null +++ b/contrib/google-ced/util/languages/languages.h @@ -0,0 +1,381 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_LANGUAGES_LANGUAGES_H_ +#define UTIL_LANGUAGES_LANGUAGES_H_ + +// This interface defines the Language enum and functions that depend +// only on Language values. + +// A hash-function for Language, hash<Language>, is defined in +// i18n/languages/public/languages-hash.h + +#ifndef SWIG +// Language enum defined in languages.proto +// Also description on how to add languages. +#include "util/languages/languages.pb.h" + +#else + +// TODO: Include a header containing swig-compatible enum. + +#endif + +const int kNumLanguages = NUM_LANGUAGES; + +// Return the default language (ENGLISH). 
+Language default_language(); + + +// ******************************************* +// Language predicates +// IsValidLanguage() +// IS_LANGUAGE_UNKNOWN() +// IsCJKLanguage() +// IsChineseLanguage() +// IsNorwegianLanguage() +// IsPortugueseLanguage() +// IsRightToLeftLanguage() +// IsMaybeRightToLeftLanguage() +// IsSameLanguage() +// IsScriptRequiringLongerSnippets() +// ******************************************* + +// IsValidLanguage +// =============== +// +// Function to check if the input is within range of the Language enum. If +// IsValidLanguage(lang) returns true, it is safe to call +// static_cast<Language>(lang). +// +inline bool IsValidLanguage(int lang) { + return ((lang >= 0) && (lang < kNumLanguages)); +} + +// Return true if the language is "unknown". (This function was +// previously a macro, hence the spelling in all caps.) +// +inline bool IS_LANGUAGE_UNKNOWN(Language lang) { + return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE; +} + +// IsCJKLanguage +// ------------- +// +// This function returns true if the language is either Chinese +// (simplified or traditional), Japanese, or Korean. +bool IsCJKLanguage(Language lang); + +// IsChineseLanguage +// ----------------- +// +// This function returns true if the language is either Chinese +// (simplified or traditional) +bool IsChineseLanguage(Language lang); + +// IsNorwegianLanguage +// -------------------- +// +// This function returns true if the language is any of the Norwegian +// (regular or Nynorsk). +bool IsNorwegianLanguage(Language lang); + +// IsPortugueseLanguage +// -------------------- +// +// This function returns true if the language is any of the Portuguese +// languages (regular, Portugal or Brazil) +bool IsPortugueseLanguage(Language lang); + +// IsSameLanguage +// -------------- +// +// WARNING: This function provides only a simple test on the values of +// the two Language arguments. It returns false if either language is +// invalid. 
It returns true if the language arguments are equal, or +// if they are both Chinese languages, both Norwegian languages, or +// both Portuguese languages, as defined by IsChineseLanguage, +// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns +// false. +bool IsSameLanguage(Language lang1, Language lang2); + + +// IsRightToLeftLanguage +// --------------------- +// +// This function returns true if the language is only written right-to-left +// (E.g., Hebrew, Arabic, Persian etc.) +// +// IMPORTANT NOTE: Technically we're talking about scripts, not languages. +// There are languages that can be written in more than one script. +// Examples: +// - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in +// Latin or Cyrillic script, and right-to-left in Arabic script. +// - Sindhi and Punjabi are written in different scripts, depending on +// region and dialect. +// - Turkmen used an Arabic script historically, but not any more. +// - Pashto and Uyghur can use Arabic script, but use a Roman script +// on the Internet. +// - Kashmiri and Urdu are written either with Arabic or Devanagari script. +// +// This function only returns true for languages that are always, unequivocally +// written in right-to-left script. +// +// TODO: If we want to do anything special with multi-script languages +// we should create new 'languages' for each language+script, as we do for +// traditional vs. simplified Chinese. However most such languages are rare in +// use and even rarer on the web, so this is unlikely to be something we'll +// be concerned with for a while. +bool IsRightToLeftLanguage(Language lang); + +// IsMaybeRightToLeftLanguage +// -------------------------- +// +// This function returns true if the language may appear on the web in a +// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.) +// +// NOTE: See important notes under IsRightToLeftLanguage(...). 
+// +// This function returns true for languages that *may* appear on the web in a +// right-to-left script, even if they may also appear in a left-to-right +// script. +// +// This function should typically be used in cases where doing some work on +// left-to-right text would be OK (usually a no-op), and this function is used +// just to cut down on unnecessary work on regular, LTR text. +bool IsMaybeRightToLeftLanguage(Language lang); + +// IsScriptRequiringLongerSnippets +// -------------------- +// +// This function returns true if the script chracteristics require longer +// snippet length (Devanagari, Bengali, Gurmukhi, +// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam). +// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE +// bool IsScriptRequiringLongerSnippets(UnicodeScript script); + + +// ******************************************* +// LANGUAGE NAMES +// +// This interface defines a standard name for each valid Language, +// and a standard name for invalid languages. Some language names use all +// uppercase letters, but others use mixed case. +// LanguageName() [Language to name] +// LanguageEnumName() [language to enum name] +// LanguageFromName() [name to Language] +// default_language_name() +// invalid_language_name() +// ******************************************* + +// Given a Language, returns its standard name. +// Return invalid_language_name() if the language is invalid. +const char* LanguageName(Language lang); + +// Given a Language, return the name of the enum constant for that +// language. In all but a few cases, this is the same as its standard +// name. For example, LanguageName(CHINESE) returns "Chinese", but +// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for +// code that is generating C++ code, where the enum constant is more +// useful than its integer value. Return "NUM_LANGUAGES" if +// the language is invalid. 
+const char* LanguageEnumName(Language lang); + +// The maximum length of a standard language name. +const int kMaxLanguageNameSize = 50; + +// The standard name for the default language. +const char* default_language_name(); + +// The standard name for all invalid languages. +const char* invalid_language_name(); + +// If lang_name matches the standard name of a Language, using a +// case-insensitive comparison, set *language to that Language and +// return true. +// Otherwise, set *language to UNKNOWN_LANGUAGE and return false. +// +// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name +// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA. +// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed +// as a name for UNKNOWN_LANGUAGE (the return value is true in this case, +// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for +// CHINESE_T (i.e., a synonym for "ChineseT"). +// +// REQUIRES: language must not be NULL. +// +bool LanguageFromName(const char* lang_name, Language *language); + + + +// ******************************************* +// LANGUAGE CODES +// +// This interface defines a standard code for each valid language, and +// a standard code for invalid languages. These are derived from ISO codes, +// with some Google additions. +// LanguageCode() +// default_language_code() +// invalid_language_code() +// LanguageCodeWithDialects() +// LanguageCodeISO639_1() +// LanguageCodeISO639_2() +// ******************************************* + +// Given a Language, return its standard code. There are Google-specific codes: +// For CHINESE_T, return "zh-TW". +// For TG_UNKNOWN_LANGUAGE, return "ut". +// For UNKNOWN_LANGUAGE, return "un". +// For PORTUGUESE_P, return "pt-PT". +// For PORTUGUESE_B, return "pt-BR". +// For LIMBU, return "sit-NP". +// For CHEROKEE, return "chr". +// For SYRIAC, return "syr". +// Otherwise return the ISO 639-1 two-letter language code for lang. 
+// If lang is invalid, return invalid_language_code(). +// +// NOTE: See the note below about the codes for Chinese languages. +// +const char* LanguageCode(Language lang); + +// The maximum length of a language code. +const int kMaxLanguageCodeSize = 50; + +// The standard code for the default language. +const char* default_language_code(); + +// The standard code for all invalid languages. +const char* invalid_language_code(); + + +// -------------------------------------------- +// NOTE: CHINESE LANGUAGE CODES +// +// There are three functions that return codes for Chinese languages. +// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here. +// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h. +// The following list shows the different results. +// +// LanguageCode(CHINESE) returns "zh". +// LanguageCode(CHINESE_T) returns "zh-TW". +// +// LanguageCodeWithDialects(CHINESE) returns "zh-CN". +// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW". +// +// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW". +// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW". +// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN". +// +// -------------------------------------------- + +// LanguageCodeWithDialects +// ------------------------ +// +// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang). +const char* LanguageCodeWithDialects(Language lang); + +// LanguageCodeISO639_1 +// -------------------- +// +// Return the ISO 639-1 two-letter language code for lang. +// Return invalid_language_code() if lang is invalid or does not have +// an ISO 639-1 two-letter language code. +const char* LanguageCodeISO639_1(Language lang); + +// LanguageCodeISO639_2 +// -------------------- +// +// Return the ISO 639-2 three-letter language code for lang. +// Return invalid_language_code() if lang is invalid or does not have +// an ISO 639-2 three-letter language code. 
+const char* LanguageCodeISO639_2(Language lang); + +// LanguageFromCode +// ---------------- +// +// If lang_code matches the code for a Language, using a case-insensitive +// comparison, set *language to that Language and return true. +// Otherwise, set *language to UNKNOWN_LANGUAGE and return false. +// +// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2 +// (three-letter) code, or a Google-specific code (see LanguageCode). +// +// Certain language-code aliases are also allowed: +// For "zh-cn" and "zh_cn", set *language to CHINESE. +// For "zh-tw" and "zh_tw", set *language to CHINESE_T. +// For "he", set *language to HEBREW. +// For "in", set *language to INDONESIAN. +// For "ji", set *language to YIDDISH. +// For "fil", set *language to TAGALOG. +// +// REQUIRES: 'language' must not be NULL. +bool LanguageFromCode(const char* lang_code, Language *language); + + +// LanguageFromCodeOrName +// ---------------------- +// +// If lang_code_or_name is a language code or a language name, +// set *language to the corresponding Language and return true. +// Otherwise set *language to UNKNOWN_LANGUAGE and return false. +// +bool LanguageFromCodeOrName(const char* lang_code_or_name, + Language* language); + +// LanguageNameFromCode +// -------------------- +// +// If language_code is the code for a Language (see LanguageFromCode), +// return the standard name of that language (see LanguageName). +// Otherwise return invalid_language_name(). +// +const char* LanguageNameFromCode(const char* language_code); + + +// Miscellany + +// LanguageCodeToUnderscoreForm +// ---------------------------- +// +// Given a language code, convert the dash "-" to underscore "_". +// +// Specifically, if result_length <= strlen(lang_code), set result[0] +// to '\0' and return false.
Otherwise, copy lang_code to result, +// converting every dash to an underscore, converting every character +// before the first dash or underscore to lower case, and converting +// every character after the first dash or underscore to upper +// case. If there is no dash or underscore, convert the entire string +// to lower case. +// +// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL. + +bool LanguageCodeToUnderscoreForm(const char* lang_code, + char* result, + int result_length); + +// +// AlwaysPutInExpectedRestrict +// --------------------------- +// +// For Web pages in certain top-level domains, Web Search always +// applies a "country restrict". If 'tld' matches one of those, using +// a case-SENSITIVE comparison, set *expected_language to the Language +// most commonly found in that top-level domain and return true. +// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false. +bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language); + + +#endif // UTIL_LANGUAGES_LANGUAGES_H_ diff --git a/contrib/google-ced/util/languages/languages.pb.h b/contrib/google-ced/util/languages/languages.pb.h new file mode 100644 index 000000000..84f1d6a79 --- /dev/null +++ b/contrib/google-ced/util/languages/languages.pb.h @@ -0,0 +1,191 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_LANGUAGES_LANGUAGES_PB_H_ +#define UTIL_LANGUAGES_LANGUAGES_PB_H_ + +enum Language { + ENGLISH = 0, + DANISH = 1, + DUTCH = 2, + FINNISH = 3, + FRENCH = 4, + GERMAN = 5, + HEBREW = 6, + ITALIAN = 7, + JAPANESE = 8, + KOREAN = 9, + NORWEGIAN = 10, + POLISH = 11, + PORTUGUESE = 12, + RUSSIAN = 13, + SPANISH = 14, + SWEDISH = 15, + CHINESE = 16, + CZECH = 17, + GREEK = 18, + ICELANDIC = 19, + LATVIAN = 20, + LITHUANIAN = 21, + ROMANIAN = 22, + HUNGARIAN = 23, + ESTONIAN = 24, + TG_UNKNOWN_LANGUAGE = 25, + UNKNOWN_LANGUAGE = 26, + BULGARIAN = 27, + CROATIAN = 28, + SERBIAN = 29, + IRISH = 30, // UI only. + GALICIAN = 31, + TAGALOG = 32, // Tagalog (tl) + Filipino (fil), + TURKISH = 33, + UKRAINIAN = 34, + HINDI = 35, + MACEDONIAN = 36, + BENGALI = 37, + INDONESIAN = 38, + LATIN = 39, // UI only. + MALAY = 40, + MALAYALAM = 41, + WELSH = 42, // UI only. + NEPALI = 43, + TELUGU = 44, + ALBANIAN = 45, + TAMIL = 46, + BELARUSIAN = 47, + JAVANESE = 48, // UI only. + OCCITAN = 49, // UI only. + URDU = 50, + BIHARI = 51, + GUJARATI = 52, + THAI = 53, + ARABIC = 54, + CATALAN = 55, + ESPERANTO = 56, + BASQUE = 57, + INTERLINGUA = 58, // UI only. + KANNADA = 59, + PUNJABI = 60, + SCOTS_GAELIC = 61, // UI only. + SWAHILI = 62, + SLOVENIAN = 63, + MARATHI = 64, + MALTESE = 65, + VIETNAMESE = 66, + FRISIAN = 67, // UI only. + SLOVAK = 68, + CHINESE_T = 69, // This is added to solve the problem of + // distinguishing Traditional and Simplified + // Chinese when the encoding is UTF8. + FAROESE = 70, // UI only. + SUNDANESE = 71, // UI only. + UZBEK = 72, + AMHARIC = 73, + AZERBAIJANI = 74, + GEORGIAN = 75, + TIGRINYA = 76, // UI only. + PERSIAN = 77, + BOSNIAN = 78, // UI only. LangId language: CROATIAN (28) + SINHALESE = 79, + NORWEGIAN_N = 80, // UI only. LangId language: NORWEGIAN (10) + PORTUGUESE_P = 81, // UI only. 
LangId language: PORTUGUESE (12) + PORTUGUESE_B = 82, // UI only. LangId language: PORTUGUESE (12) + XHOSA = 83, // UI only. + ZULU = 84, // UI only. + GUARANI = 85, + SESOTHO = 86, // UI only. + TURKMEN = 87, // UI only. + KYRGYZ = 88, + BRETON = 89, // UI only. + TWI = 90, // UI only. + YIDDISH = 91, // UI only. + SERBO_CROATIAN= 92, // UI only. LangId language: SERBIAN (29) + SOMALI = 93, // UI only. + UIGHUR = 94, + KURDISH = 95, + MONGOLIAN = 96, + ARMENIAN = 97, + LAOTHIAN = 98, + SINDHI = 99, + RHAETO_ROMANCE= 100, // UI only. + AFRIKAANS = 101, + LUXEMBOURGISH = 102, // UI only. + BURMESE = 103, + KHMER = 104, + TIBETAN = 105, + DHIVEHI = 106, // sometimes spelled Divehi, lang of Maldives + CHEROKEE = 107, + SYRIAC = 108, // UI only. + LIMBU = 109, // UI only. + ORIYA = 110, + ASSAMESE = 111, // UI only. + CORSICAN = 112, // UI only. + INTERLINGUE = 113, // UI only. + KAZAKH = 114, + LINGALA = 115, // UI only. + MOLDAVIAN = 116, // UI only. LangId language: ROMANIAN (22) + PASHTO = 117, + QUECHUA = 118, // UI only. + SHONA = 119, // UI only. + TAJIK = 120, + TATAR = 121, // UI only. + TONGA = 122, // UI only. + YORUBA = 123, // UI only. + CREOLES_AND_PIDGINS_ENGLISH_BASED = 124, // UI only. + CREOLES_AND_PIDGINS_FRENCH_BASED = 125, // UI only. + CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126, // UI only. + CREOLES_AND_PIDGINS_OTHER = 127, // UI only. + MAORI = 128, // UI only. + WOLOF = 129, // UI only. + ABKHAZIAN = 130, // UI only. + AFAR = 131, // UI only. + AYMARA = 132, // UI only. + BASHKIR = 133, // UI only. + BISLAMA = 134, // UI only. + DZONGKHA = 135, // UI only. + FIJIAN = 136, // UI only. + GREENLANDIC = 137, // UI only. + HAUSA = 138, // UI only. + HAITIAN_CREOLE= 139, // UI only. + INUPIAK = 140, // UI only. + INUKTITUT = 141, + KASHMIRI = 142, // UI only. + KINYARWANDA = 143, // UI only. + MALAGASY = 144, // UI only. + NAURU = 145, // UI only. + OROMO = 146, // UI only. + RUNDI = 147, // UI only. + SAMOAN = 148, // UI only. 
+ SANGO = 149, // UI only. + SANSKRIT = 150, + SISWANT = 151, // UI only. + TSONGA = 152, // UI only. + TSWANA = 153, // UI only. + VOLAPUK = 154, // UI only. + ZHUANG = 155, // UI only. + KHASI = 156, // UI only. + SCOTS = 157, // UI only. + GANDA = 158, // UI only. + MANX = 159, // UI only. + MONTENEGRIN = 160, // UI only. LangId language: SERBIAN (29) + NUM_LANGUAGES = 161, // Always keep this at the end. It is not a + // valid Language enum. It is only used to + // indicate the total number of Languages. + // NOTE: If you add a language, you will break a unittest. See the note + // at the top of this enum. +}; + +#endif // UTIL_LANGUAGES_LANGUAGES_PB_H_ diff --git a/contrib/google-ced/util/logging.h b/contrib/google-ced/util/logging.h new file mode 100644 index 000000000..16e50f209 --- /dev/null +++ b/contrib/google-ced/util/logging.h @@ -0,0 +1,25 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_LOGGING_H_ +#define UTIL_LOGGING_H_ + +#undef CHECK +#define CHECK(expr) +#undef DCHECK +#define DCHECK(expr) + +#endif // UTIL_LOGGING_H_ diff --git a/contrib/google-ced/util/port.h b/contrib/google-ced/util/port.h new file mode 100644 index 000000000..3799b1696 --- /dev/null +++ b/contrib/google-ced/util/port.h @@ -0,0 +1,53 @@ +// Copyright 2016 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_PORT_H_ +#define UTIL_PORT_H_ + +#include <stdarg.h> + +#if defined(_MSC_VER) +#define GG_LONGLONG(x) x##I64 +#define GG_ULONGLONG(x) x##UI64 +#else +#define GG_LONGLONG(x) x##LL +#define GG_ULONGLONG(x) x##ULL +#endif + +// Per C99 7.8.14, define __STDC_CONSTANT_MACROS before including <stdint.h> +// to get the INTn_C and UINTn_C macros for integer constants. It's difficult +// to guarantee any specific ordering of header includes, so it's difficult to +// guarantee that the INTn_C macros can be defined by including <stdint.h> at +// any specific point. Provide GG_INTn_C macros instead. + +#define GG_INT8_C(x) (x) +#define GG_INT16_C(x) (x) +#define GG_INT32_C(x) (x) +#define GG_INT64_C(x) GG_LONGLONG(x) + +#define GG_UINT8_C(x) (x ## U) +#define GG_UINT16_C(x) (x ## U) +#define GG_UINT32_C(x) (x ## U) +#define GG_UINT64_C(x) GG_ULONGLONG(x) + +// Define an OS-neutral wrapper for shared library entry points +#if defined(_WIN32) +#define API_CALL __stdcall +#else +#define API_CALL +#endif + +#endif // UTIL_PORT_H_ diff --git a/contrib/google-ced/util/string_util.h b/contrib/google-ced/util/string_util.h new file mode 100644 index 000000000..5977f4fd8 --- /dev/null +++ b/contrib/google-ced/util/string_util.h @@ -0,0 +1,61 @@ +// Copyright 2016 Google Inc. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_STRING_UTIL_H_
#define UTIL_STRING_UTIL_H_

#include <string.h>
#if !defined(_WIN32)
// POSIX declares strcasecmp()/strncasecmp() in <strings.h>; do not rely on
// <string.h> pulling it in as a side effect.
#include <strings.h>
#endif

namespace base {

// Case-insensitive comparison of two NUL-terminated strings.
//
// strcasecmp(s1, s2) compares the whole strings; strncasecmp(s1, s2, n)
// compares at most the first n characters. Both return zero when the strings
// compare equal, a negative value when s1 sorts before s2 and a positive value
// when s1 sorts after s2. Only the sign of a non-zero result is meaningful.
#if defined(_WIN32)
// Windows has no strcasecmp(); forward to the CRT equivalents, which compare
// using the current locale.
inline int strcasecmp(const char* s1, const char* s2) {
  return _stricmp(s1, s2);
}
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
  return _strnicmp(s1, s2, n);
}
#else
// Elsewhere, forward to the global (C library) functions of the same name.
inline int strcasecmp(const char* s1, const char* s2) {
  return ::strcasecmp(s1, s2);
}
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
  return ::strncasecmp(s1, s2, n);
}
#endif
}  // namespace base

// glibc 2.2 and later ship memrchr(); note that so the fallback below is
// skipped. A build system may also predefine HAVE_MEMRCHR itself.
#ifndef HAVE_MEMRCHR
#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
#define HAVE_MEMRCHR
#endif
#endif

#ifndef HAVE_MEMRCHR
// Fallback memrchr(): scans the first n bytes of s backwards and returns a
// pointer to the last byte equal to c, or NULL when no byte matches (always
// NULL for n == 0).
//
// As with memchr(), c is converted to unsigned char before comparing, so a
// byte >= 0x80 passed as a (sign-extended) char is still found.
inline void* memrchr(const void* s, int c, size_t n) {
  const unsigned char wanted = static_cast<unsigned char>(c);
  const unsigned char* p = static_cast<const unsigned char*>(s) + n;
  for (; n > 0; n--) {
    if (*--p == wanted)
      return const_cast<unsigned char*>(p);
  }
  return NULL;
}
#endif

#endif  // UTIL_STRING_UTIL_H_

// Patch metadata for the next file, kept verbatim from the original page:
// diff --git a/contrib/google-ced/util/varsetter.h b/contrib/google-ced/util/varsetter.h new file mode 100644 index 000000000..8e8cbf2c0 --- /dev/null +++ b/contrib/google-ced/util/varsetter.h @@ -0,0
// Patch hunk range kept verbatim from the original page: +1,66 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_VARSETTER_H_
#define UTIL_VARSETTER_H_

#include <stddef.h>  // size_t, needed by the operator new declarations below

//
// Use a VarSetter object to temporarily set an object of some sort to
// a particular value.  When the VarSetter object is destructed, the
// underlying object will revert to its former value.
//
// Sample code:
//
#if 0
{
  bool b = true;
  {
    VarSetter<bool> bool_setter(&b, false);
    // Now b == false.
  }
  // Now b == true again.
}
#endif

// Scoped "set and restore" guard for a variable of type C.
//
// C must be copy-constructible and assignable: the guard keeps a copy of the
// previous value and assigns it back in its destructor. The guarded object
// must outlive the guard, because the destructor writes through the stored
// pointer.
template <class C>
class VarSetter {
 public:
  // Remembers the current value of *object, then overwrites it with value.
  // object must not be NULL; it is dereferenced immediately.
  VarSetter(C* object, const C& value) : object_(object), old_value_(*object) {
    *object = value;
  }

  // Puts back the value that *object held when this guard was constructed.
  ~VarSetter() { *object_ = old_value_; }

 private:
  C* const object_;  // The variable being overridden; not owned.
  C old_value_;      // Snapshot taken before the override.

  // Copying is disallowed: two guards restoring the same variable would
  // fight over the final value. Declared private and left undefined here.
  VarSetter(const VarSetter&);
  VarSetter& operator=(const VarSetter&);

  // VarSetters always live on the stack, so heap allocation is made
  // inaccessible as well.
  static void* operator new(size_t);
  static void* operator new[](size_t);  // Redundant, no default ctor

  static void operator delete(void*);
  static void operator delete[](void*);
};

#endif  // UTIL_VARSETTER_H_