[Rework] Use google-ced instead of libicu chardet as the former sucks

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-05-26 11:31:47 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-05-26 11:31:47 +0100
commit: 19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3 (patch)
tree: 6d0f43f3cd9ede27eb578562480633e27f042934 /contrib/google-ced/util
parent: c11838dcbacbfd0a75e98f95a63a026217c88c51 (diff)
download: rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.tar.gz
rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.zip
14 files changed, 2989 insertions, 0 deletions
diff --git a/contrib/google-ced/util/basictypes.h b/contrib/google-ced/util/basictypes.h
new file mode 100644
index 000000000..af391c742
--- /dev/null
+++ b/contrib/google-ced/util/basictypes.h
@@ -0,0 +1,331 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_BASICTYPES_H_
+#define UTIL_BASICTYPES_H_
+
+#include <limits.h>         // So we can set the bounds of our types
+#include <stddef.h>         // For size_t
+#include <string.h>         // for memcpy
+
+#include "util/port.h"    // Types that only need exist on certain systems
+
+#ifndef COMPILER_MSVC
+// stdint.h is part of C99 but MSVC doesn't have it.
+#include <stdint.h>         // For intptr_t.
+#endif
+
+using schar = signed char;
+using int8  = signed char;
+using int16 = short;
+// TODO(mbelshe) Drop the _INT32 guard once it is no longer needed; it only
+//               exists to avoid clashing with the int32 from npapi.h.
+#ifndef _INT32
+#define _INT32
+using int32 = int;
+#endif
+
+// NSPR's system headers use |long| for 64-bit integers whenever they can, so
+// on LP64 we pick the same underlying type to keep the typedefs compatible.
+#if __LP64__
+using int64 = long;
+#else
+using int64 = long long;
+#endif
+
+// NOTE: unsigned types are DANGEROUS in loops and other arithmetical
+// places.  Use the signed types unless your variable represents a bit
+// pattern (eg a hash value) or you really need the extra bit.  Do NOT
+// use 'unsigned' to express "this value should always be positive";
+// use assertions for this.
+
+using uint8  = unsigned char;
+using uint16 = unsigned short;
+// TODO(mbelshe) Drop the _UINT32 guard once it is no longer needed; it only
+//               exists to avoid clashing with the uint32 from npapi.h.
+#ifndef _UINT32
+#define _UINT32
+using uint32 = unsigned int;
+#endif
+
+// Same NSPR consideration as for the signed 64-bit type: |long| on LP64.
+#if __LP64__
+using uint64 = unsigned long;
+#else
+using uint64 = unsigned long long;
+#endif
+
+// A type to represent a Unicode code-point value. As of Unicode 4.0,
+// such values require up to 21 bits.
+// (For type-checking on pointers, make this explicitly signed,
+// and it should always be the signed version of whatever int32 is.)
+typedef signed int         char32;
+
+const uint8  kuint8max  = static_cast<uint8>(0xFF);
+const uint16 kuint16max = static_cast<uint16>(0xFFFF);
+const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
+const uint64 kuint64max = static_cast<uint64>(GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
+const int8   kint8min   = static_cast<int8>(0x80);
+const int8   kint8max   = static_cast<int8>(0x7F);
+const int16  kint16min  = static_cast<int16>(0x8000);
+const int16  kint16max  = static_cast<int16>(0x7FFF);
+const int32  kint32min  = static_cast<int32>(0x80000000);
+const int32  kint32max  = static_cast<int32>(0x7FFFFFFF);
+const int64  kint64min  = static_cast<int64>(GG_LONGLONG(0x8000000000000000));
+const int64  kint64max  = static_cast<int64>(GG_LONGLONG(0x7FFFFFFFFFFFFFFF));
+
+// Disallows the copy constructor and operator= (both are deleted, so misuse
+// fails at compile time).  Use in the private: declarations for a class.
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;      \
+  void operator=(const TypeName&) = delete
+
+// An older, deprecated, politically incorrect name for the above.
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// A macro to disallow all the implicit constructors, namely the
+// default constructor, copy constructor and operator= functions.
+//
+// This should be used in the private: declarations for a class
+// that wants to prevent anyone from instantiating it. This is
+// especially useful for classes containing only static methods.
+#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
+  TypeName();                                    \
+  DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];  // returns a reference to char[N]
+
+// Overload for arrays of const elements.  Per the original authors, gcc
+// wants both prototypes while VC cannot decide which one to use, hence the
+// _MSC_VER guard around this declaration.
+#ifndef _MSC_VER
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))  // == N, as sizeof(char) is 1
+
+
+// Use implicit_cast as a safe version of static_cast or const_cast
+// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
+// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
+// a const pointer to Foo).
+// When you use implicit_cast, the compiler checks that the cast is safe.
+// Such explicit implicit_casts are necessary in surprisingly many
+// situations where C++ demands an exact type match instead of an
+// argument type convertible to a target type.
+//
+// The From type can be inferred, so the preferred syntax for using
+// implicit_cast is the same as for static_cast etc.:
+//
+//   implicit_cast<ToType>(expr)
+//
+// implicit_cast would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+template <typename To, typename From>
+inline To implicit_cast(const From& value) {
+  return value;  // only the implicit From -> To conversion is applied
+}
+
+// The COMPILE_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   COMPILE_ASSERT(arraysize(content_type_names) == CONTENT_NUM_TYPES,
+//                  content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {  // empty type: only its bool template argument is used
+};
+
+#undef COMPILE_ASSERT  // drop any earlier definition before redefining it
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// Implementation details of COMPILE_ASSERT:
+//
+// - COMPILE_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
+//                               // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     COMPILE_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+
+// MetatagId refers to the metatag-id that we assign to
+// each metatag <name, value> pair.
+typedef uint32 MetatagId;
+
+// Argument type used in interfaces that can optionally take ownership
+// of a passed in argument.  If TAKE_OWNERSHIP is passed, the called
+// object takes ownership of the argument.  Otherwise it does not.
+enum Ownership {
+  DO_NOT_TAKE_OWNERSHIP,  // the called object does not take ownership
+  TAKE_OWNERSHIP          // the called object takes ownership of the argument
+};
+
+// bit_cast<Dest,Source> is a template function that implements the
+// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
+// very low-level functions like the protobuf library and fast math
+// support.
+//
+//   float f = 3.14159265358979;
+//   int i = bit_cast<int32>(f);
+//   // i = 0x40490fdb
+//
+// The classical address-casting method is:
+//
+//   // WRONG
+//   float f = 3.14159265358979;            // WRONG
+//   int i = * reinterpret_cast<int*>(&f);  // WRONG
+//
+// The address-casting method actually produces undefined behavior
+// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
+// section says: if an object in memory has one type, and a program
+// accesses it with a different type, then the result is undefined
+// behavior for most values of "different type".
+//
+// This is true for any cast syntax, either *(int*)&f or
+// *reinterpret_cast<int*>(&f).  And it is particularly true for
+// conversions between integral lvalues and floating-point lvalues.
+//
+// The purpose of 3.10 -15- is to allow optimizing compilers to assume
+// that expressions with different types refer to different memory.  gcc
+// 4.0.1 has an optimizer that takes advantage of this.  So a
+// non-conforming program quietly produces wildly incorrect output.
+//
+// The problem is not the use of reinterpret_cast.  The problem is type
+// punning: holding an object in memory of one type and reading its bits
+// back using a different type.
+//
+// The C++ standard is more subtle and complex than this, but that
+// is the basic idea.
+//
+// Anyways ...
+//
+// bit_cast<> calls memcpy() which is blessed by the standard,
+// especially by the example in section 3.9 .  Also, of course,
+// bit_cast<> wraps up the nasty logic in one place.
+//
+// Fortunately memcpy() is very fast.  In optimized mode, with a
+// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
+// code with the minimal amount of data movement.  On a 32-bit system,
+// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
+// compiles to two loads and two stores.
+//
+// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
+//
+// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
+// is likely to surprise you.
+
+template <class Dest, class Source>
+inline Dest bit_cast(const Source& source) {
+  // Reject mismatched sizes at compile time: memcpy below copies sizeof(Dest)
+  // bytes, so a smaller Source would be over-read and a larger one truncated.
+  static_assert(sizeof(Dest) == sizeof(Source), "bit_cast: size mismatch");
+
+  Dest dest;
+  memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+// The following enum should be used only as a constructor argument to indicate
+// that the variable has static storage class, and that the constructor should
+// do nothing to its state.  It indicates to the reader that it is legal to
+// declare a static instance of the class, provided the constructor is given
+// the base::LINKER_INITIALIZED argument.  Normally, it is unsafe to declare a
+// static variable that has a constructor or a destructor because invocation
+// order is undefined.  However, IF the type can be initialized by filling with
+// zeroes (which the loader does for static variables), AND the destructor also
+// does nothing to the storage, AND there are no virtual methods, then a
+// constructor declared as
+//       explicit MyClass(base::LinkerInitialized x) {}
+// and invoked as
+//       static MyClass my_variable_name(base::LINKER_INITIALIZED);
+namespace base {
+enum LinkerInitialized { LINKER_INITIALIZED };  // tag for do-nothing constructors
+}  // namespace base
+
+// UnalignedLoad32 is put here instead of util/port.h to
+// avoid the circular dependency between port.h and basictypes.h
+// ARM does not support unaligned memory access.
+// NEED_ALIGNED_LOADS is still defined for every non-x86 target, exactly as
+// before, because other code may key off it.
+#if !defined(ARCH_CPU_X86_FAMILY)
+#define NEED_ALIGNED_LOADS
+#endif
+
+// Reads a 32-bit value from a possibly unaligned address p in host byte
+// order.  memcpy is used on every architecture: dereferencing a casted
+// pointer is undefined behavior (alignment and aliasing rules), while
+// compilers turn this fixed-size memcpy into a single load where the
+// hardware allows it.
+inline uint32 UnalignedLoad32(const void* p) {
+  uint32 t;
+  memcpy(&t, p, sizeof(t));
+  return t;
+}
+#endif  // UTIL_BASICTYPES_H_
diff --git a/contrib/google-ced/util/case_insensitive_hash.h b/contrib/google-ced/util/case_insensitive_hash.h
new file mode 100644
index 000000000..7b0c9db76
--- /dev/null
+++ b/contrib/google-ced/util/case_insensitive_hash.h
@@ -0,0 +1,88 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_CASE_INSENSITIVE_HASH_H_
+#define UTIL_CASE_INSENSITIVE_HASH_H_
+
+#include <ctype.h>
+#include <stddef.h>
+#ifndef _MSC_VER
+#include <strings.h>
+#endif
+
+#include <string>
+
+#include "util/basictypes.h"
+#include "util/string_util.h"
+
+// Functors for hashing c-strings with case-insensitive semantics.
+struct CStringCaseHash {
+  size_t operator()(const char *str) const {
+    unsigned long hash_val = 0;
+    for (; *str; ++str) {
+      // <ctype.h> functions are undefined for negative char values.
+      hash_val = 5*hash_val + tolower(static_cast<unsigned char>(*str));
+    }
+    return static_cast<size_t>(hash_val);
+  }
+};
+
+struct CStringCaseEqual {
+  bool operator()(const char *str1, const char *str2) const {
+    return base::strcasecmp(str1, str2) == 0;  // equal iff the result is 0
+  }
+};
+
+// These functors, in addition to being case-insensitive, ignore all
+// non-alphanumeric characters.  This is useful when we want all variants of
+// a string -- where variants can differ in punctuation and whitespace -- to
+// map to the same value.
+// Hashes only the alphanumeric bytes of str, lower-cased.
+struct CStringAlnumCaseHash {
+  size_t operator()(const char *str) const {
+    unsigned long hash_val = 0;
+    for (; *str; ++str) {
+      // <ctype.h> functions are undefined for negative char values.
+      const unsigned char c = static_cast<unsigned char>(*str);
+      if (isalnum(c)) hash_val = 5*hash_val + tolower(c);
+    }
+    return static_cast<size_t>(hash_val);
+  }
+};
+
+struct CStringAlnumCaseEqual {
+  bool operator()(const char *str1, const char *str2) const {
+    // Compares only alphanumeric bytes, ignoring case.  Walks the strings as
+    // unsigned char because <ctype.h> is undefined for negative values.
+    const unsigned char *a = reinterpret_cast<const unsigned char *>(str1);
+    const unsigned char *b = reinterpret_cast<const unsigned char *>(str2);
+    while (true) {
+      // Skip until each pointer is pointing to an alphanumeric char or '\0'
+      while (*a != '\0' && !isalnum(*a)) a++;
+      while (*b != '\0' && !isalnum(*b)) b++;
+      if (tolower(*a) != tolower(*b)) {
+        return false;       // mismatch on alphanumeric char or '\0'
+      }
+      if (*a == '\0') {     // in which case *b must be '\0' as well
+        return true;        // reached '\0' in both strings without mismatch
+      }
+      a++;
+      b++;
+    }
+  }
+};
+
+#endif  // UTIL_CASE_INSENSITIVE_HASH_H_
diff --git a/contrib/google-ced/util/commandlineflags.h b/contrib/google-ced/util/commandlineflags.h
new file mode 100644
index 000000000..341a659ba
--- /dev/null
+++ b/contrib/google-ced/util/commandlineflags.h
@@ -0,0 +1,39 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_COMMANDLINEFLAGS_H_
+#define UTIL_COMMANDLINEFLAGS_H_
+
+// Flag macros: each flag is a plain variable named FLAGS_<name>; the comment argument is discarded.
+#undef DEFINE_bool
+#define DEFINE_bool(name, default_value, comment) \
+    bool FLAGS_##name = default_value
+#undef DEFINE_int32
+#define DEFINE_int32(name, default_value, comment) \
+    int32 FLAGS_##name = default_value
+#undef DEFINE_string  // std:: qualified: works without a using-declaration
+#define DEFINE_string(name, default_value, comment) \
+    std::string FLAGS_##name = default_value
+
+#undef DECLARE_bool
+#define DECLARE_bool(name) extern bool FLAGS_##name
+#undef DECLARE_int32
+#define DECLARE_int32(name) extern int32 FLAGS_##name
+#undef DECLARE_string  // must name the same type as DEFINE_string
+#define DECLARE_string(name) extern std::string FLAGS_##name
+
+
+#endif  // UTIL_COMMANDLINEFLAGS_H_
diff --git a/contrib/google-ced/util/encodings/encodings.cc b/contrib/google-ced/util/encodings/encodings.cc
new file mode 100644
index 000000000..b5f8dc5fa
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.cc
@@ -0,0 +1,891 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/encodings/encodings.h"
+
+#include <string.h>                     // for strcasecmp
+#include <unordered_map>
+#include <utility>                      // for pair
+
+#include "util/basictypes.h"
+#include "util/string_util.h"
+#include "util/case_insensitive_hash.h"
+
+struct EncodingInfo {  // one row of kEncodingInfoTable
+  // The standard name for this encoding; the same string as
+  // mime_encoding_name_ whenever an IANA name exists (see policy below).
+  const char* encoding_name_;
+
+  // The "preferred MIME name" of an encoding as specified by the IANA at:
+  //     http://www.iana.org/assignments/character-sets
+  //
+  //   Note that the preferred MIME name may differ slightly from the
+  //   official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
+  //
+  const char* mime_encoding_name_;
+
+  // It is an internal policy that if an encoding has an IANA name,
+  // then encoding_name_ and mime_encoding_name_ must be the same string.
+  //
+  // However, there can be exceptions if there are compelling reasons.
+  // For example, Japanese mobile handsets require the name
+  // "Shift_JIS" in charset=... parameter in Content-Type headers to
+  // process emoji (emoticons) in their private encodings.  In that
+  // case, mime_encoding_name_ should be "Shift_JIS", despite
+  // encoding_name_ actually is "X-KDDI-Shift_JIS".
+
+  // Some multi-byte encodings use byte values that coincide with the
+  // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
+  // can misinterpret these, as indicated in an external XSS report from
+  // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
+  // also use UTF8 instead of encodings that we don't support in our
+  // output, and we generally try to be conservative in what we send out.
+  // Where the client asks for single- or double-byte encodings that are
+  // not as common, we substitute a more common single- or double-byte
+  // encoding, if there is one, thereby preserving the client's intent
+  // to use less space than UTF-8. This also means that characters
+  // outside the destination set will be converted to HTML NCRs (&#NNN;)
+  // if requested.
+  // The encoding chosen by the policy above for web output.
+  Encoding preferred_web_output_encoding_;
+};
+
+static const EncodingInfo kEncodingInfoTable[] = {  // {name, MIME name, preferred web output}
+  { "ASCII", "ISO-8859-1", ISO_8859_1},
+  { "Latin2", "ISO-8859-2", ISO_8859_2},
+  { "Latin3", "ISO-8859-3", UTF8},
+      // MSIE 6 does not support ISO-8859-3 (XSS issue)
+  { "Latin4", "ISO-8859-4", ISO_8859_4},
+  { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
+  { "Arabic", "ISO-8859-6", ISO_8859_6},
+  { "Greek", "ISO-8859-7", ISO_8859_7},
+  { "Hebrew", "ISO-8859-8", MSFT_CP1255},
+      // we do not endorse the visual order
+  { "Latin5", "ISO-8859-9", ISO_8859_9},
+  { "Latin6", "ISO-8859-10", UTF8},
+      // MSIE does not support ISO-8859-10 (XSS issue)
+  { "EUC-JP",  "EUC-JP", JAPANESE_EUC_JP},
+  { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+  { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+      // due to potential confusion with HTML syntax chars
+  { "BIG5", "Big5", CHINESE_BIG5},
+  { "GB",  "GB2312", CHINESE_GB},
+  { "EUC-CN",
+        "EUC-CN",
+        // Misnamed. Should be EUC-TW.
+        CHINESE_BIG5},
+      // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
+      // and EUC-TW is rare, so we prefer Big5 for output.
+  { "KSC", "EUC-KR", KOREAN_EUC_KR},
+  { "Unicode",
+    "UTF-16LE",
+        // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
+        UTF8
+        // due to potential confusion with HTML syntax chars
+        },
+  { "EUC",
+        "EUC",  // Misnamed. Should be EUC-TW.
+        CHINESE_BIG5
+        // MSIE does not recognize "EUC" (XSS issue),
+        // and EUC-TW is rare, so we prefer Big5 for output.
+        },
+  { "CNS",
+        "CNS",  // Misnamed. Should be EUC-TW.
+        CHINESE_BIG5},
+      // MSIE does not recognize "CNS" (XSS issue),
+      // and EUC-TW is rare, so we prefer Big5 for output.
+  { "BIG5-CP950",
+        "BIG5-CP950",  // Not an IANA name
+        CHINESE_BIG5
+        // MSIE does not recognize "BIG5-CP950" (XSS issue)
+        },
+  { "CP932", "CP932",  // Not an IANA name
+        JAPANESE_SHIFT_JIS},  // MSIE does not recognize "CP932" (XSS issue)
+  { "UTF8", "UTF-8", UTF8},
+  { "Unknown",
+        "x-unknown",  // Not an IANA name
+        UTF8},  // UTF-8 is our default output encoding
+  { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
+  { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
+  { "CP1251", "windows-1251", RUSSIAN_CP1251},
+  { "CP1252", "windows-1252", MSFT_CP1252},
+  { "KOI8U",
+        "KOI8-U",
+        ISO_8859_5},  // because koi8-u is not as common
+  { "CP1250", "windows-1250", MSFT_CP1250},
+  { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
+  { "CP1254", "windows-1254", MSFT_CP1254},
+  { "CP1257", "windows-1257", MSFT_CP1257},
+  { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
+  { "CP874", "windows-874", MSFT_CP874},
+  { "CP1256", "windows-1256", MSFT_CP1256},
+  { "CP1255", "windows-1255", MSFT_CP1255},
+  { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
+      // Java does not support iso-8859-8-i
+  { "VISUAL", "ISO-8859-8", MSFT_CP1255},
+      // we do not endorse the visual order
+  { "CP852", "cp852", MSFT_CP1250},
+      // because cp852 is not as common
+  { "CSN_369103", "csn_369103", MSFT_CP1250},
+      // MSIE does not recognize "csn_369103" (XSS issue)
+  { "CP1253", "windows-1253", MSFT_CP1253},
+  { "CP866", "IBM866", RUSSIAN_CP1251},
+      // because cp866 is not as common
+  { "ISO-8859-13", "ISO-8859-13", UTF8},
+      // because iso-8859-13 is not widely supported
+  { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
+      // due to potential confusion with HTML syntax chars
+  { "GBK", "GBK", GBK},
+  { "GB18030", "GB18030", GBK},
+      // because gb18030 is not widely supported
+  { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
+      // because Big5-HKSCS is not widely supported
+  { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
+      // due to potential confusion with HTML syntax chars
+  { "TSCII", "tscii", UTF8},
+      // we do not have an output converter for this font encoding
+  { "TAM", "tam", UTF8},
+      // we do not have an output converter for this font encoding
+  { "TAB", "tab", UTF8},
+      // we do not have an output converter for this font encoding
+  { "JAGRAN", "jagran", UTF8},
+      // we do not have an output converter for this font encoding
+  { "MACINTOSH", "MACINTOSH", ISO_8859_1},
+      // because macintosh is relatively uncommon
+  { "UTF7", "UTF-7",
+        UTF8},  // UTF-7 has been the subject of XSS attacks and is deprecated
+  { "BHASKAR", "bhaskar",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "HTCHANAKYA", "htchanakya",  // not an IANA charset name.
+        UTF8},  // we do not have an output converter for this font encoding
+  { "UTF-16BE", "UTF-16BE",
+        UTF8},  // due to potential confusion with HTML syntax chars
+  { "UTF-16LE", "UTF-16LE",
+        UTF8},  // due to potential confusion with HTML syntax chars
+  { "UTF-32BE", "UTF-32BE",
+        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
+  { "UTF-32LE", "UTF-32LE",
+        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
+  { "X-BINARYENC", "x-binaryenc",  // Not an IANA name
+        UTF8},  // because this one is not intended for output (just input)
+  { "HZ-GB-2312", "HZ-GB-2312",
+        CHINESE_GB},  // due to potential confusion with HTML syntax chars
+  { "X-UTF8UTF8", "x-utf8utf8",  // Not an IANA name
+        UTF8},  // because this one is not intended for output (just input)
+  { "X-TAM-ELANGO", "x-tam-elango",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "X-TAM-SHREE", "x-tam-shree",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "X-TAM-TBOOMIS", "x-tam-tboomis",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "X-TAM-TMNEWS", "x-tam-tmnews",
+        UTF8},  // we do not have an output converter for this font encoding
+  { "X-TAM-WEBTAMIL", "x-tam-webtamil",
+        UTF8},  // we do not have an output converter for this font encoding
+
+  { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+      // KDDI version of Shift_JIS with Google Emoji PUA mappings.
+      // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
+      // "Shift_JIS" in HTTP headers and email messages.
+
+  { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+      // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
+      // See the comment at KDDI_SHIFT_JIS for other issues.
+
+  { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+      // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
+      // See the comment at KDDI_SHIFT_JIS for other issues.
+
+  { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+      // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
+      // See the comment at KDDI_SHIFT_JIS for other issues.
+      // The preferred Web encoding is due to potential confusion with
+      // HTML syntax chars.
+
+  { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+      // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
+      // See the comment at KDDI_SHIFT_JIS for other issues.
+      // The preferred Web encoding is due to potential confusion with
+      // HTML syntax chars.
+
+      // Please refer to NOTE: section in the comments in the definition
+      // of "struct I18NInfoByEncoding", before adding new encodings.
+      // The row count must stay equal to NUM_ENCODINGS (asserted below).
+};
+
+
+
+COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
+               kEncodingInfoTable_has_incorrect_size);
+// Always returns LATIN1.
+Encoding default_encoding() {return LATIN1;}
+
+// *************************************************************
+// Encoding predicates
+//   IsValidEncoding()
+//   IsEncEncCompatible
+//   IsEncodingWithSupportedLanguage
+//   IsSupersetOfAscii7Bit
+//   Is8BitEncoding
+//   IsCJKEncoding
+//   IsHebrewEncoding
+//   IsRightToLeftEncoding
+//   IsLogicalRightToLeftEncoding
+//   IsVisualRightToLeftEncoding
+//   IsIso2022Encoding
+//   IsIso2022JpOrVariant
+//   IsShiftJisOrVariant
+//   IsJapaneseCellPhoneCarrierSpecificEncoding
+// *************************************************************
+
+bool IsValidEncoding(Encoding enc) {
+  return !(enc < 0 || enc >= kNumEncodings);  // i.e. 0 <= enc < kNumEncodings
+}
+
+bool IsEncEncCompatible(const Encoding from, const Encoding to) {
+  // Reports whether "from" is compatible with "to"; in the typical case --
+  // both being valid known encodings -- that holds iff converting from the
+  // first to the second is a no-op.
+  if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
+    return false;  // invalid encodings are never compatible
+  }
+
+  // Identical encodings are trivially compatible, and every valid encoding
+  // is compatible with the unknown one.
+  if (to == from || to == UNKNOWN_ENCODING) {
+    return true;
+  }
+
+  // From here on "to" is a known encoding that differs from "from".
+  switch (from) {
+    // An unknown source is not compatible with any known target.
+    case UNKNOWN_ENCODING:   return false;
+
+    // 7-bit ASCII is compatible with whatever IsSupersetOfAscii7Bit accepts.
+    case ASCII_7BIT:         return IsSupersetOfAscii7Bit(to);
+
+    // Explicitly listed (from, to) pairs.
+    case ISO_8859_1:         return to == MSFT_CP1252;
+    case ISO_8859_8:         return to == HEBREW_VISUAL;
+    case HEBREW_VISUAL:      return to == ISO_8859_8;
+    case ISO_8859_9:         return to == MSFT_CP1254;
+    case ISO_8859_11:        return to == MSFT_CP874;
+    case JAPANESE_SHIFT_JIS: return to == JAPANESE_CP932;
+    case CHINESE_BIG5:       return to == CHINESE_BIG5_CP950;
+    case CHINESE_GB:         return to == GBK || to == GB18030;
+    case CHINESE_EUC_CN:     return to == CHINESE_EUC_DEC || to == CHINESE_CNS;
+    case CHINESE_EUC_DEC:    return to == CHINESE_EUC_CN || to == CHINESE_CNS;
+    case CHINESE_CNS:        return to == CHINESE_EUC_CN || to == CHINESE_EUC_DEC;
+
+    default:                 return false;  // every other pair is incompatible
+  }
+}
+
+// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
+// encoding represent the same characters as they do in ISO_8859_1.
+
+// TODO: This list could be expanded.  Many other encodings are supersets
+// of 7-bit Ascii.  In fact, Japanese JIS and Unicode are the only two
+// encodings that I know for a fact should *not* be in this list.
+bool IsSupersetOfAscii7Bit(Encoding e) {
+  switch (e) {  // explicit list; anything not named here yields false
+    case ISO_8859_1:
+    case ISO_8859_2:
+    case ISO_8859_3:
+    case ISO_8859_4:
+    case ISO_8859_5:
+    case ISO_8859_6:
+    case ISO_8859_7:
+    case ISO_8859_8:
+    case ISO_8859_9:
+    case ISO_8859_10:
+    case JAPANESE_EUC_JP:
+    case JAPANESE_SHIFT_JIS:
+    case CHINESE_BIG5:
+    case CHINESE_GB:
+    case CHINESE_EUC_CN:
+    case KOREAN_EUC_KR:
+    case CHINESE_EUC_DEC:
+    case CHINESE_CNS:
+    case CHINESE_BIG5_CP950:
+    case JAPANESE_CP932:
+    case UTF8:
+    case UNKNOWN_ENCODING:
+    case ASCII_7BIT:
+    case RUSSIAN_KOI8_R:
+    case RUSSIAN_CP1251:
+    case MSFT_CP1252:
+    case RUSSIAN_KOI8_RU:
+    case MSFT_CP1250:
+    case ISO_8859_15:
+    case MSFT_CP1254:
+    case MSFT_CP1257:
+    case ISO_8859_11:
+    case MSFT_CP874:
+    case MSFT_CP1256:
+    case MSFT_CP1255:
+    case ISO_8859_8_I:
+    case HEBREW_VISUAL:
+    case CZECH_CP852:
+    case MSFT_CP1253:
+    case RUSSIAN_CP866:
+    case ISO_8859_13:
+    case GBK:
+    case GB18030:
+    case BIG5_HKSCS:
+    case MACINTOSH_ROMAN:
+      return true;
+    default:  // not listed above (the TODO notes the list is incomplete)
+      return false;
+  }
+}
+
+// To be an 8-bit encoding means that there are fewer than 256 symbols.
+// Each byte determines a new character; there are no multi-byte sequences.
+
+// TODO: This list could maybe be expanded.  Other encodings may be 8-bit.
+bool Is8BitEncoding(Encoding e) {
+  // Whitelist of encodings reported as single-byte.  Encodings that are
+  // not named here are reported as not 8-bit, whether or not they really
+  // are (see the TODO above this function).
+  bool single_byte = false;
+  switch (e) {
+    // Plain ASCII.
+    case ASCII_7BIT:
+    // ISO 8859 family.
+    case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: case ISO_8859_4:
+    case ISO_8859_5: case ISO_8859_6: case ISO_8859_7: case ISO_8859_8:
+    case ISO_8859_8_I: case ISO_8859_9: case ISO_8859_10: case ISO_8859_11:
+    case ISO_8859_13: case ISO_8859_15:
+    // Windows code pages.
+    case MSFT_CP1252: case MSFT_CP1253: case MSFT_CP1254:
+    case MSFT_CP1255: case MSFT_CP1256: case MSFT_CP1257:
+    // Cyrillic.
+    case RUSSIAN_KOI8_R: case RUSSIAN_KOI8_RU: case RUSSIAN_CP866:
+      single_byte = true;
+      break;
+    default:
+      break;
+  }
+  return single_byte;
+}
+
+// Returns true for the Chinese, Japanese and Korean encodings listed
+// below and false for every other value.
+bool IsCJKEncoding(Encoding e) {
+  bool is_cjk = false;
+  switch (e) {
+    // Japanese.
+    case JAPANESE_EUC_JP: case JAPANESE_SHIFT_JIS: case JAPANESE_JIS:
+    case JAPANESE_CP932:
+    // Chinese.
+    case CHINESE_BIG5: case CHINESE_BIG5_CP950: case BIG5_HKSCS:
+    case CHINESE_GB: case CHINESE_EUC_CN: case CHINESE_EUC_DEC:
+    case CHINESE_CNS: case GBK: case GB18030:
+    case ISO_2022_CN: case HZ_GB_2312:
+    // Korean.
+    case KOREAN_EUC_KR: case ISO_2022_KR:
+      is_cjk = true;
+      break;
+    default:
+      break;
+  }
+  return is_cjk;
+}
+
+// Returns true only for the four Hebrew-specific encodings listed below.
+bool IsHebrewEncoding(Encoding e) {
+  switch (e) {
+    case ISO_8859_8:
+    case ISO_8859_8_I:
+    case MSFT_CP1255:
+    case HEBREW_VISUAL:
+      return true;
+    default:
+      return false;
+  }
+}
+
+
+
+// Returns true for the Hebrew and Arabic encodings listed below (logical
+// and visual alike) and false for every other value.
+bool IsRightToLeftEncoding(Encoding enc) {
+  return enc == MSFT_CP1255 ||
+         enc == MSFT_CP1256 ||
+         enc == ARABIC_ENCODING ||
+         enc == HEBREW_ENCODING ||
+         enc == ISO_8859_8_I ||
+         enc == HEBREW_VISUAL;
+}
+
+// A logical RTL encoding is any RTL encoding that is not a visual one.
+bool IsLogicalRightToLeftEncoding(Encoding enc) {
+  if (!IsRightToLeftEncoding(enc)) {
+    return false;
+  }
+  return !IsVisualRightToLeftEncoding(enc);
+}
+
+// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
+// is NOT visual.
+bool IsVisualRightToLeftEncoding(Encoding enc) {
+  // Only these two Hebrew encodings are treated as visual.
+  return enc == HEBREW_ENCODING || enc == HEBREW_VISUAL;
+}
+
+
+
+
+
+// True for ISO-2022-JP (and its variants), ISO-2022-KR and ISO-2022-CN.
+bool IsIso2022Encoding(Encoding enc) {
+  if (IsIso2022JpOrVariant(enc)) {
+    return true;
+  }
+  return enc == ISO_2022_KR || enc == ISO_2022_CN;
+}
+
+// True for ISO-2022-JP itself and for the KDDI and SoftBank variants.
+bool IsIso2022JpOrVariant(Encoding enc) {
+  switch (enc) {
+    case JAPANESE_JIS:
+    case KDDI_ISO_2022_JP:
+    case SOFTBANK_ISO_2022_JP:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True for Shift_JIS, CP932 and the carrier-specific Shift_JIS variants.
+bool IsShiftJisOrVariant(Encoding enc) {
+  switch (enc) {
+    case JAPANESE_SHIFT_JIS:
+    case JAPANESE_CP932:
+    case KDDI_SHIFT_JIS:
+    case DOCOMO_SHIFT_JIS:
+    case SOFTBANK_SHIFT_JIS:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True for the KDDI, DoCoMo and SoftBank carrier-specific encodings.
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
+  switch (enc) {
+    case KDDI_ISO_2022_JP:
+    case KDDI_SHIFT_JIS:
+    case DOCOMO_SHIFT_JIS:
+    case SOFTBANK_SHIFT_JIS:
+    case SOFTBANK_ISO_2022_JP:
+      return true;
+    default:
+      return false;
+  }
+}
+
+
+// *************************************************************
+// ENCODING NAMES
+//   EncodingName() [Encoding to name]
+//   MimeEncodingName() [Encoding to name]
+//   EncodingFromName() [name to Encoding]
+//   EncodingNameAliasToEncoding() [name to Encoding]
+//   default_encoding_name()
+//   invalid_encoding_name()
+// *************************************************************
+
+// Returns the internal name stored in kEncodingInfoTable for enc, or
+// invalid_encoding_name() when enc is outside [0, kNumEncodings).
+const char * EncodingName(const Encoding enc) {
+  const bool in_range = (enc >= 0) && (enc < kNumEncodings);
+  if (!in_range) {
+    return invalid_encoding_name();
+  }
+  return kEncodingInfoTable[enc].encoding_name_;
+}
+
+// TODO: Unify MimeEncodingName and EncodingName, or determine why
+// such a unification is not possible.
+
+// Returns the MIME name stored in kEncodingInfoTable for enc, or the
+// empty string when enc is outside [0, kNumEncodings).
+const char * MimeEncodingName(Encoding enc) {
+  const bool in_range = (enc >= 0) && (enc < kNumEncodings);
+  if (!in_range) {
+    return "";  // TODO: Should this be invalid_encoding_name()?
+  }
+  return kEncodingInfoTable[enc].mime_encoding_name_;
+}
+
+// Looks enc_name up among the internal encoding names, ignoring case.
+// On a match stores the encoding in *encoding and returns true; otherwise
+// leaves *encoding set to UNKNOWN_ENCODING and returns false.
+// encoding must not be NULL: it is written before any other check.
+bool EncodingFromName(const char* enc_name, Encoding *encoding) {
+  *encoding = UNKNOWN_ENCODING;
+
+  if (enc_name != NULL) {
+    int idx = 0;
+    while (idx < kNumEncodings) {
+      const char* candidate = kEncodingInfoTable[idx].encoding_name_;
+      if (base::strcasecmp(enc_name, candidate) == 0) {
+        *encoding = static_cast<Encoding>(idx);
+        return true;
+      }
+      idx++;
+    }
+  }
+
+  return false;
+}
+
+// The encoding_map maps standard and non-standard encoding-names
+// (strings) to Encoding enums. It is used only by
+// EncodingNameAliasToEncoding. Note that the map uses
+// case-insensitive hash and comparison functions.
+
+using EncodingMap = std::unordered_map<const char *, Encoding,
+                                       CStringAlnumCaseHash,
+                                       CStringAlnumCaseEqual>;
+
+// Builds the name -> Encoding table used by GetEncodingMap().
+//
+// The table is filled in three passes: internal names from EncodingName(),
+// then MIME names from MimeEncodingName() (never overriding an existing
+// entry), then a hand-maintained list of aliases and common typos (which
+// do override).  Keys are stored as raw const char* pointers, not copies.
+static EncodingMap BuildEncodingMap() {
+  EncodingMap encoding_map;
+
+  // Initialize the map with all the "standard" encoding names,
+  // i.e., the ones returned by EncodingName and MimeEncodingName.
+  //
+  // First, add internal encoding names returned by EncodingName().
+  for (int i = 0; i < NUM_ENCODINGS; ++i) {
+    Encoding e = static_cast<Encoding>(i);
+    // Internal encoding names are expected to be unique.  The CHECK_EQ
+    // that used to enforce this is disabled, so a duplicate name would
+    // silently end up mapped to the later enum value.
+    const char *encoding_name = EncodingName(e);
+    // CHECK_EQ(0, encoding_map.count(encoding_name))
+    //  << "Duplicate found for " << encoding_name;
+    encoding_map[encoding_name] = e;
+  }
+  // Then, add mime encoding names returned by MimeEncodingName().
+  // We don't override existing entries, to give precedence to entries
+  // added earlier.
+  for (int i = 0; i < NUM_ENCODINGS; ++i) {
+    Encoding e = static_cast<Encoding>(i);
+    // Note that MimeEncodingName() can return the same mime encoding
+    // name for different encoding enums like JAPANESE_SHIFT_JIS and
+    // KDDI_SHIFT_JIS.  In that case, the encoding enum first seen
+    // will be the value for the encoding name in the map.
+    const char *mime_encoding_name = MimeEncodingName(e);
+    if (encoding_map.count(mime_encoding_name) == 0) {
+      encoding_map[mime_encoding_name] = e;
+    }
+  }
+
+  // Add some non-standard names: alternate spellings, common typos,
+  // etc. (It does no harm to add names already in the map.) Note
+  // that although the map is case-insensitive, by convention the
+  // keys are written here in lower case. For ease of maintenance,
+  // they are listed in alphabetical order.
+  encoding_map["5601"] = KOREAN_EUC_KR;
+  encoding_map["646"] = ASCII_7BIT;
+  encoding_map["852"] = CZECH_CP852;
+  encoding_map["866"] = RUSSIAN_CP866;
+  encoding_map["8859-1"] = ISO_8859_1;
+  encoding_map["ansi-1251"] = RUSSIAN_CP1251;
+  encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
+  encoding_map["arabic"] = ISO_8859_6;
+  encoding_map["ascii"] = ISO_8859_1;
+  encoding_map["ascii-7-bit"] = ASCII_7BIT;  // not iana standard
+  encoding_map["asmo-708"] = ISO_8859_6;
+  encoding_map["bhaskar"] = BHASKAR;
+  encoding_map["big5"] = CHINESE_BIG5;
+  encoding_map["big5-cp950"] = CHINESE_BIG5_CP950;  // not iana standard
+  encoding_map["big5-hkscs"] = BIG5_HKSCS;
+  encoding_map["chinese"] = CHINESE_GB;
+  encoding_map["cns"] = CHINESE_CNS;  // not iana standard
+  encoding_map["cns11643"] = CHINESE_CNS;
+  encoding_map["cp1250"] = MSFT_CP1250;  // not iana standard
+  encoding_map["cp1251"] = RUSSIAN_CP1251;  // not iana standard
+  encoding_map["cp1252"] = MSFT_CP1252;  // not iana standard
+  encoding_map["cp1253"] = MSFT_CP1253;  // not iana standard
+  encoding_map["cp1254"] = MSFT_CP1254;  // not iana standard
+  encoding_map["cp1255"] = MSFT_CP1255;
+  encoding_map["cp1256"] = MSFT_CP1256;
+  encoding_map["cp1257"] = MSFT_CP1257;  // not iana standard
+  encoding_map["cp819"] = ISO_8859_1;
+  encoding_map["cp852"] = CZECH_CP852;
+  encoding_map["cp866"] = RUSSIAN_CP866;
+  encoding_map["cp-866"] = RUSSIAN_CP866;
+  encoding_map["cp874"] = MSFT_CP874;
+  encoding_map["cp932"] = JAPANESE_CP932;  // not iana standard
+  encoding_map["cp950"] = CHINESE_BIG5_CP950;   // not iana standard
+  encoding_map["csbig5"] = CHINESE_BIG5;
+  encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
+  encoding_map["cseuckr"] = KOREAN_EUC_KR;
+  encoding_map["csgb2312"] = CHINESE_GB;
+  encoding_map["csibm852"] = CZECH_CP852;
+  encoding_map["csibm866"] = RUSSIAN_CP866;
+  encoding_map["csiso2022jp"] = JAPANESE_JIS;
+  encoding_map["csiso2022kr"] = ISO_2022_KR;
+  encoding_map["csiso58gb231280"] = CHINESE_GB;
+  encoding_map["csiso88598i"] = ISO_8859_8_I;
+  encoding_map["csisolatin1"] = ISO_8859_1;
+  encoding_map["csisolatin2"] = ISO_8859_2;
+  encoding_map["csisolatin3"] = ISO_8859_3;
+  encoding_map["csisolatin4"] = ISO_8859_4;
+  encoding_map["csisolatin5"] = ISO_8859_9;
+  encoding_map["csisolatin6"] = ISO_8859_10;
+  encoding_map["csisolatinarabic"] = ISO_8859_6;
+  encoding_map["csisolatincyrillic"] = ISO_8859_5;
+  encoding_map["csisolatingreek"] = ISO_8859_7;
+  encoding_map["csisolatinhebrew"] = ISO_8859_8;
+  encoding_map["csksc56011987"] = KOREAN_EUC_KR;
+  encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
+  encoding_map["csn-369103"] = CZECH_CSN_369103;
+  encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["csunicode"] = UTF16BE;
+  encoding_map["csunicode11"] = UTF16BE;
+  encoding_map["csunicode11utf7"] = UTF7;
+  encoding_map["csunicodeascii"] = UTF16BE;
+  encoding_map["csunicodelatin1"] = UTF16BE;
+  encoding_map["cyrillic"] = ISO_8859_5;
+  encoding_map["ecma-114"] = ISO_8859_6;
+  encoding_map["ecma-118"] = ISO_8859_7;
+  encoding_map["elot_928"] = ISO_8859_7;
+  encoding_map["euc"] = CHINESE_EUC_DEC;  // not iana standard
+  encoding_map["euc-cn"] = CHINESE_EUC_CN;  // not iana standard
+  encoding_map["euc-dec"] = CHINESE_EUC_DEC;  // not iana standard
+  encoding_map["euc-jp"] = JAPANESE_EUC_JP;
+  encoding_map["euc-kr"] = KOREAN_EUC_KR;
+  encoding_map["eucgb2312_cn"] = CHINESE_GB;
+  encoding_map["gb"] = CHINESE_GB;  // not iana standard
+  encoding_map["gb18030"] = GB18030;
+  encoding_map["gb2132"] = CHINESE_GB;  // common typo
+  encoding_map["gb2312"] = CHINESE_GB;
+  encoding_map["gb_2312-80"] = CHINESE_GB;
+  encoding_map["gbk"] = GBK;
+  encoding_map["greek"] = ISO_8859_7;
+  encoding_map["greek8"] = ISO_8859_7;
+  encoding_map["hebrew"] = ISO_8859_8;
+  encoding_map["htchanakya"] = HTCHANAKYA;
+  encoding_map["hz-gb-2312"] = HZ_GB_2312;
+  encoding_map["ibm819"] = ISO_8859_1;
+  encoding_map["ibm852"] = CZECH_CP852;
+  encoding_map["ibm874"] = MSFT_CP874;
+  encoding_map["iso-10646"] = UTF16BE;
+  encoding_map["iso-10646-j-1"] = UTF16BE;
+  encoding_map["iso-10646-ucs-2"] = UNICODE;
+  encoding_map["iso-10646-ucs-4"] = UTF32BE;
+  encoding_map["iso-10646-ucs-basic"] = UTF16BE;
+  encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
+  encoding_map["iso-2022-cn"] = ISO_2022_CN;
+  encoding_map["iso-2022-jp"] = JAPANESE_JIS;
+  encoding_map["iso-2022-kr"] = ISO_2022_KR;
+  encoding_map["iso-8559-1"] = ISO_8859_1;   // common typo
+  encoding_map["iso-874"] = MSFT_CP874;
+  encoding_map["iso-8858-1"] = ISO_8859_1;   // common typo
+  // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
+  encoding_map["iso-8859-0"] = ISO_8859_15;
+  encoding_map["iso-8859-1"] = ISO_8859_1;
+  encoding_map["iso-8859-10"] = ISO_8859_10;
+  encoding_map["iso-8859-11"] = ISO_8859_11;
+  encoding_map["iso-8859-13"] = ISO_8859_13;
+  encoding_map["iso-8859-15"] = ISO_8859_15;
+  encoding_map["iso-8859-2"] = ISO_8859_2;
+  encoding_map["iso-8859-3"] = ISO_8859_3;
+  encoding_map["iso-8859-4"] = ISO_8859_4;
+  encoding_map["iso-8859-5"] = ISO_8859_5;
+  encoding_map["iso-8859-6"] = ISO_8859_6;
+  encoding_map["iso-8859-7"] = ISO_8859_7;
+  encoding_map["iso-8859-8"] = ISO_8859_8;
+  encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
+  encoding_map["iso-8859-9"] = ISO_8859_9;
+  encoding_map["iso-9959-1"] = ISO_8859_1;   // common typo
+  encoding_map["iso-ir-100"] = ISO_8859_1;
+  encoding_map["iso-ir-101"] = ISO_8859_2;
+  encoding_map["iso-ir-109"] = ISO_8859_3;
+  encoding_map["iso-ir-110"] = ISO_8859_4;
+  encoding_map["iso-ir-126"] = ISO_8859_7;
+  encoding_map["iso-ir-127"] = ISO_8859_6;
+  encoding_map["iso-ir-138"] = ISO_8859_8;
+  encoding_map["iso-ir-144"] = ISO_8859_5;
+  encoding_map["iso-ir-148"] = ISO_8859_9;
+  encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
+  encoding_map["iso-ir-157"] = ISO_8859_10;
+  encoding_map["iso-ir-58"] = CHINESE_GB;
+  encoding_map["iso-latin-1"] = ISO_8859_1;
+  encoding_map["iso_2022-cn"] = ISO_2022_CN;
+  encoding_map["iso_2022-kr"] = ISO_2022_KR;
+  encoding_map["iso_8859-1"] = ISO_8859_1;
+  encoding_map["iso_8859-10:1992"] = ISO_8859_10;
+  encoding_map["iso_8859-11"] = ISO_8859_11;
+  encoding_map["iso_8859-13"] = ISO_8859_13;
+  encoding_map["iso_8859-15"] = ISO_8859_15;
+  encoding_map["iso_8859-1:1987"] = ISO_8859_1;
+  encoding_map["iso_8859-2"] = ISO_8859_2;
+  encoding_map["iso_8859-2:1987"] = ISO_8859_2;
+  encoding_map["iso_8859-3"] = ISO_8859_3;
+  encoding_map["iso_8859-3:1988"] = ISO_8859_3;
+  encoding_map["iso_8859-4"] = ISO_8859_4;
+  encoding_map["iso_8859-4:1988"] = ISO_8859_4;
+  encoding_map["iso_8859-5"] = ISO_8859_5;
+  encoding_map["iso_8859-5:1988"] = ISO_8859_5;
+  encoding_map["iso_8859-6"] = ISO_8859_6;
+  encoding_map["iso_8859-6:1987"] = ISO_8859_6;
+  encoding_map["iso_8859-7"] = ISO_8859_7;
+  encoding_map["iso_8859-7:1987"] = ISO_8859_7;
+  encoding_map["iso_8859-8"] = ISO_8859_8;
+  encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
+  encoding_map["iso_8859-9"] = ISO_8859_9;
+  encoding_map["iso_8859-9:1989"] = ISO_8859_9;
+  encoding_map["jagran"] = JAGRAN;
+  encoding_map["jis"] = JAPANESE_JIS;   // not iana standard
+  encoding_map["koi8-cs"] = CZECH_CSN_369103;
+  encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
+  encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU;  // not iana standard
+  encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
+  encoding_map["koi8r"] = RUSSIAN_KOI8_R;  // not iana standard
+  encoding_map["koi8u"] = RUSSIAN_KOI8_RU;  // not iana standard
+  encoding_map["korean"] = KOREAN_EUC_KR;  // i assume this is what is meant
+  encoding_map["ks-c-5601"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
+  encoding_map["ksc"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["l1"] = ISO_8859_1;
+  encoding_map["l2"] = ISO_8859_2;
+  encoding_map["l3"] = ISO_8859_3;
+  encoding_map["l4"] = ISO_8859_4;
+  encoding_map["l5"] = ISO_8859_9;
+  encoding_map["l6"] = ISO_8859_10;
+  encoding_map["latin-1"] = ISO_8859_1;  // not iana standard
+  encoding_map["latin1"] = ISO_8859_1;
+  encoding_map["latin2"] = ISO_8859_2;
+  encoding_map["latin3"] = ISO_8859_3;
+  encoding_map["latin4"] = ISO_8859_4;
+  encoding_map["latin5"] = ISO_8859_9;
+  encoding_map["latin6"] = ISO_8859_10;
+  encoding_map["mac"] = MACINTOSH_ROMAN;
+  encoding_map["macintosh"] = MACINTOSH_ROMAN;
+  encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
+  encoding_map["ms932"] = JAPANESE_CP932;  // not iana standard
+  encoding_map["ms_kanji"] = JAPANESE_CP932;
+  encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["sjis"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["sjs"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["sun_eu_greek"] = ISO_8859_7;
+  encoding_map["tab"] = TAMIL_BI;
+  encoding_map["tam"] = TAMIL_MONO;
+  encoding_map["tis-620"] = ISO_8859_11;
+  encoding_map["tscii"] = TSCII;
+  encoding_map["un"] = UNKNOWN_ENCODING;  // not iana standard
+  encoding_map["unicode"] = UNICODE;  // not iana standard
+  encoding_map["unicode-1-1-utf-7"] = UTF7;
+  encoding_map["unicode-1-1-utf-8"] = UTF8;
+  encoding_map["unicode-2-0-utf-7"] = UTF7;
+  encoding_map["unknown"] = UNKNOWN_ENCODING;   // not iana standard
+  encoding_map["us"] = ISO_8859_1;
+  encoding_map["us-ascii"] = ISO_8859_1;
+  encoding_map["utf-16be"] = UTF16BE;
+  encoding_map["utf-16le"] = UTF16LE;
+  encoding_map["utf-32be"] = UTF32BE;
+  encoding_map["utf-32le"] = UTF32LE;
+  encoding_map["utf-7"] = UTF7;
+  encoding_map["utf-8"] = UTF8;
+  encoding_map["utf7"] = UTF7;
+  encoding_map["utf8"] = UTF8;  // not iana standard
+  encoding_map["visual"] = HEBREW_VISUAL;
+  encoding_map["win-1250"] = MSFT_CP1250;  // not iana standard
+  encoding_map["win-1251"] = RUSSIAN_CP1251;  // not iana standard
+  encoding_map["window-874"] = MSFT_CP874;
+  encoding_map["windows-1250"] = MSFT_CP1250;
+  encoding_map["windows-1251"] = RUSSIAN_CP1251;
+  encoding_map["windows-1252"] = MSFT_CP1252;
+  encoding_map["windows-1253"] = MSFT_CP1253;
+  encoding_map["windows-1254"] = MSFT_CP1254;
+  encoding_map["windows-1255"] = MSFT_CP1255;
+  encoding_map["windows-1256"] = MSFT_CP1256;
+  encoding_map["windows-1257"] = MSFT_CP1257;
+  encoding_map["windows-31j"] = JAPANESE_CP932;
+  encoding_map["windows-874"] = MSFT_CP874;
+  encoding_map["windows-936"] = GBK;
+  encoding_map["x-big5"] = CHINESE_BIG5;
+  encoding_map["x-binaryenc"] = BINARYENC;  // not iana standard
+  encoding_map["x-cp1250"] = MSFT_CP1250;
+  encoding_map["x-cp1251"] = RUSSIAN_CP1251;
+  encoding_map["x-cp1252"] = MSFT_CP1252;
+  encoding_map["x-cp1253"] = MSFT_CP1253;
+  encoding_map["x-cp1254"] = MSFT_CP1254;
+  encoding_map["x-cp1255"] = MSFT_CP1255;
+  encoding_map["x-cp1256"] = MSFT_CP1256;
+  encoding_map["x-cp1257"] = MSFT_CP1257;
+  encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
+  encoding_map["x-euc-tw"] = CHINESE_CNS;
+  encoding_map["x-gbk"] = GBK;
+  encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
+  encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
+  encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
+  encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
+  encoding_map["x-jis"] = JAPANESE_JIS;  // not iana standard
+  encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
+  encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["x-unicode-2-0-utf-7"] = UTF7;
+  encoding_map["x-utf8utf8"] = UTF8UTF8;  // not iana standard
+  encoding_map["x-x-big5"] = CHINESE_BIG5;
+  encoding_map["zh_cn.euc"] = CHINESE_GB;
+  encoding_map["zh_tw-big5"] = CHINESE_BIG5;
+  encoding_map["zh_tw-euc"] = CHINESE_CNS;
+
+  // Remove the entry for the empty string, if any.
+  encoding_map.erase("");
+
+  return encoding_map;
+}
+
+// Returns the shared name -> Encoding map, building it on first use.
+//
+// The map is fully built by BuildEncodingMap() before it is stored in the
+// function-local static, and is never modified afterwards.  The previous
+// version filled the static in place and used !empty() as its
+// "initialized" flag, so a second caller arriving during the fill could
+// read a partially built map.
+static const EncodingMap& GetEncodingMap() {
+  static const EncodingMap encoding_map = BuildEncodingMap();
+  return encoding_map;
+}
+
+// ----------------------------------------------------------------------
+// EncodingNameAliasToEncoding()
+//
+// This function takes an encoding name/alias and returns the Encoding
+// enum. The input is case insensitive. It is the union of the common
+// IANA standard names, the charset names used in Netscape Navigator,
+// and some common names we have been using.
+// See: http://www.iana.org/assignments/character-sets
+// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
+//
+// UNKNOWN_ENCODING is returned if none matches.
+//
+// TODO: Check if it is possible to remove the non-standard,
+// non-netscape-use names. It is because this routine is used for
+// encoding detections from html meta info. Non-standard names may
+// introduce noise on encoding detection.
+//
+// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
+// or determine why such a unification is not possible.
+// ----------------------------------------------------------------------
+Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
+  // A NULL name never matches anything.
+  if (encoding_name == NULL) {
+    return UNKNOWN_ENCODING;
+  }
+
+  // Look the name up in the alias table; a miss means "unknown".
+  const EncodingMap& aliases = GetEncodingMap();
+  const EncodingMap::const_iterator hit = aliases.find(encoding_name);
+  if (hit == aliases.end()) {
+    return UNKNOWN_ENCODING;
+  }
+  return hit->second;
+}
+
+// Returns the internal name of the default encoding (LATIN1), read
+// straight from kEncodingInfoTable.
+const char* default_encoding_name() {
+  const Encoding default_enc = LATIN1;
+  return kEncodingInfoTable[default_enc].encoding_name_;
+}
+
+// Name reported for any Encoding value that is out of range.
+static const char* const kInvalidEncodingName = "invalid_encoding";
+
+// Returns the fixed name used for invalid encodings.
+const char *invalid_encoding_name() {
+  const char* const name = kInvalidEncodingName;
+  return name;
+}
+
+
+
+// *************************************************************
+// Miscellany
+// *************************************************************
+
+
+// Returns the preferred web output encoding recorded in
+// kEncodingInfoTable for enc; invalid encodings map to UTF8.
+Encoding PreferredWebOutputEncoding(Encoding enc) {
+  if (!IsValidEncoding(enc)) {
+    return UTF8;
+  }
+  return kEncodingInfoTable[enc].preferred_web_output_encoding_;
+}
diff --git a/contrib/google-ced/util/encodings/encodings.h b/contrib/google-ced/util/encodings/encodings.h
new file mode 100644
index 000000000..647797432
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.h
@@ -0,0 +1,299 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_ENCODINGS_ENCODINGS_H_
+#define UTIL_ENCODINGS_ENCODINGS_H_
+
+// This interface defines the Encoding enum and various functions that
+// depend only on Encoding values.
+
+// A hash-function for Encoding, hash<Encoding>, is defined in
+// i18n/encodings/public/encodings-hash.h
+
+// On some Windows projects, UNICODE may be defined, which would prevent the
+// Encoding enum below from compiling. Note that this is a quick fix that does
+// not break any existing projects. The UNICODE enum may someday be changed
+// to something more specific and non-colliding, but this involves careful
+// testing of changes in many other projects.
+#undef UNICODE
+
+// NOTE: The Encoding enum must always start at 0. This assumption has
+// been made and used.
+
+#ifndef SWIG
+
+#include "util/encodings/encodings.pb.h"
+
+#else
+
+// TODO: Include a SWIG workaround header file.
+
+#endif
+
+const int kNumEncodings = NUM_ENCODINGS;
+
+// some of the popular encoding aliases
+// TODO: Make these static const Encoding values instead of macros.
+#define LATIN1           ISO_8859_1
+#define LATIN2           ISO_8859_2
+#define LATIN3           ISO_8859_3
+#define LATIN4           ISO_8859_4
+#define CYRILLIC         ISO_8859_5
+#define ARABIC_ENCODING  ISO_8859_6     // avoiding the same name as language
+#define GREEK_ENCODING   ISO_8859_7     // avoiding the same name as language
+#define HEBREW_ENCODING  ISO_8859_8     // avoiding the same name as language
+#define LATIN5           ISO_8859_9
+#define LATIN6           ISO_8859_10
+#define KOREAN_HANGUL    KOREAN_EUC_KR
+
+// The default Encoding (LATIN1).
+Encoding default_encoding();
+
+
+
+// *************************************************************
+// Encoding predicates
+//   IsValidEncoding()
+//   IsEncEncCompatible
+//   IsSupersetOfAscii7Bit
+//   Is8BitEncoding
+//   IsCJKEncoding
+//   IsHebrewEncoding
+//   IsRightToLeftEncoding
+//   IsLogicalRightToLeftEncoding
+//   IsVisualRightToLeftEncoding
+//   IsIso2022Encoding
+//   IsIso2022JpOrVariant
+//   IsShiftJisOrVariant
+//   IsJapaneseCellPhoneCarrierSpecificEncoding
+// *************************************************************
+
+// IsValidEncoding
+// ===================================
+//
+// Function to check if the input encoding enum is within range.
+//
+
+bool IsValidEncoding(Encoding enc);
+
+//
+// IsEncEncCompatible
+// ------------------
+//
+// This function is to determine whether or not converting from the
+// first encoding to the second requires any changes to the underlying
+// text (e.g.  ASCII_7BIT is a subset of UTF8).
+//
+// TODO: the current implementation is likely incomplete.  It would be
+// good to consider the full matrix of all pairs of encodings and to fish out
+// all compatible pairs.
+//
+bool IsEncEncCompatible(const Encoding from, const Encoding to);
+
+// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
+// encoding represent the same characters as they do in ISO_8859_1.
+
+// WARNING: This function does not currently return true for all encodings that
+// are supersets of Ascii 7-bit.
+bool IsSupersetOfAscii7Bit(Encoding e);
+
+// To be an 8-bit encoding means that there are fewer than 256 symbols.
+// Each byte determines a new character; there are no multi-byte sequences.
+
+// WARNING: This function does not currently return true for all encodings that
+// are 8-bit encodings.
+bool Is8BitEncoding(Encoding e);
+
+// IsCJKEncoding
+// -------------
+//
+// This function returns true if the encoding is either Chinese
+// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
+// considered a CJK encoding.
+bool IsCJKEncoding(Encoding e);
+
+// IsHebrewEncoding
+// -------------
+//
+// This function returns true if the encoding is a Hebrew specific
+// encoding (not UTF8, etc).
+bool IsHebrewEncoding(Encoding e);
+
+// IsRightToLeftEncoding
+// ---------------------
+//
+// Returns true if the encoding is a right-to-left encoding.
+//
+// Note that the name of this function is somewhat misleading. There is nothing
+// "right to left" about these encodings. They merely contain code points for
+// characters in RTL languages such as Hebrew and Arabic. But this is also
+// true for UTF-8.
+//
+// TODO: Get rid of this function. The only special-case we
+// should need to worry about are visual encodings. Anything we
+// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
+bool IsRightToLeftEncoding(Encoding enc);
+
+// IsLogicalRightToLeftEncoding
+// ----------------------------
+//
+// Returns true if the encoding is a logical right-to-left encoding.
+// Logical right-to-left encodings are those that the browser renders
+// right-to-left and applies the BiDi algorithm to. Therefore the characters
+// appear in reading order in the file, and indexing, snippet generation etc.
+// should all just work with no special processing.
+//
+// TODO: Get rid of this function. The only special-case we
+// should need to worry about are visual encodings.
+bool IsLogicalRightToLeftEncoding(Encoding enc);
+
+// IsVisualRightToLeftEncoding
+// ---------------------------
+//
+// Returns true if the encoding is a visual right-to-left encoding.
+// Visual right-to-left encodings are those that the browser renders
+// left-to-right and does not apply the BiDi algorithm to. Therefore each
+// line appears in reverse order in the file, lines are manually wrapped
+// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
+// the prehistoric days when browsers couldn't render right-to-left, but
+// unfortunately some visual pages persist to this day. These documents require
+// special processing so that we don't index or snippet them with each line
+// reversed.
+bool IsVisualRightToLeftEncoding(Encoding enc);
+
+// IsIso2022Encoding
+// -----------------
+//
+// Returns true if the encoding is a kind of ISO 2022 such as
+// ISO-2022-JP.
+bool IsIso2022Encoding(Encoding enc);
+
+// IsIso2022JpOrVariant
+// --------------------
+//
+// Returns true if the encoding is ISO-2022-JP or a variant such as
+// KDDI's ISO-2022-JP.
+bool IsIso2022JpOrVariant(Encoding enc);
+
+// IsShiftJisOrVariant
+// --------------------
+//
+// Returns true if the encoding is Shift_JIS or a variant such as
+// KDDI's Shift_JIS.
+bool IsShiftJisOrVariant(Encoding enc);
+
+// IsJapaneseCellPhoneCarrierSpecificEncoding
+// -----------------------------------------
+//
+// Returns true if it's Japanese cell phone carrier specific encoding
+// such as KDDI_SHIFT_JIS.
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
+
+
+
+// *************************************************************
+// ENCODING NAMES
+//
+// This interface defines a standard name for each valid encoding, and
+// a standard name for invalid encodings. (Some names use all upper
+// case, but others use mixed case.)
+//
+//   EncodingName() [Encoding to name]
+//   MimeEncodingName() [Encoding to name]
+//   EncodingFromName() [name to Encoding]
+//   EncodingNameAliasToEncoding() [name to Encoding]
+//   default_encoding_name()
+//   invalid_encoding_name()
+// *************************************************************
+
+// EncodingName
+// ------------
+//
+// Given the encoding, returns its standard name.
+// Return invalid_encoding_name() if the encoding is invalid.
+//
+const char* EncodingName(Encoding enc);
+
+//
+// MimeEncodingName
+// ----------------
+//
+// Return the "preferred MIME name" of an encoding.
+//
+// This name is suitable for using in HTTP headers, HTML tags,
+// and as the "charset" parameter of a MIME Content-Type.
+const char* MimeEncodingName(Encoding enc);
+
+
+// The maximum length of an encoding name
+const int kMaxEncodingNameSize = 50;
+
+// The standard name of the default encoding.
+const char* default_encoding_name();
+
+// The name used for an invalid encoding.
+const char* invalid_encoding_name();
+
+// EncodingFromName
+// ----------------
+//
+// If enc_name matches the standard name of an Encoding, using a
+// case-insensitive comparison, set *encoding to that Encoding and
+// return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
+// return false.
+//
+// REQUIRES: encoding must not be NULL.
+//
+bool EncodingFromName(const char* enc_name, Encoding *encoding);
+
+//
+// EncodingNameAliasToEncoding
+// ---------------------------
+//
+// If enc_name matches the standard name or an alias of an Encoding,
+// using a case-insensitive comparison, return that
+// Encoding. Otherwise, return UNKNOWN_ENCODING.
+//
+// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
+// GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
+// common variations with hyphens and underscores (e.g., "koi8-u" and
+// "koi8u" for RUSSIAN_KOI8_RU).
+
+Encoding EncodingNameAliasToEncoding(const char *enc_name);
+
+// *************************************************************
+// Miscellany
+// *************************************************************
+
+// PreferredWebOutputEncoding
+// --------------------------
+//
+// Some multi-byte encodings use byte values that coincide with the
+// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
+// can misinterpret these, as indicated in an external XSS report from
+// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
+// also use UTF8 instead of encodings that we don't support in our
+// output, and we generally try to be conservative in what we send out.
+// Where the client asks for single- or double-byte encodings that are
+// not as common, we substitute a more common single- or double-byte
+// encoding, if there is one, thereby preserving the client's intent
+// to use less space than UTF-8. This also means that characters
+// outside the destination set will be converted to HTML NCRs (&#NNN;)
+// if requested.
+Encoding PreferredWebOutputEncoding(Encoding enc);
+
+
+#endif  // UTIL_ENCODINGS_ENCODINGS_H_
diff --git a/contrib/google-ced/util/encodings/encodings.pb.h b/contrib/google-ced/util/encodings/encodings.pb.h
new file mode 100644
index 000000000..ffbd716ec
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.pb.h
@@ -0,0 +1,181 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
+#define UTIL_ENCODINGS_ENCODINGS_PB_H_
+
+enum Encoding {
+  ISO_8859_1           =  0,  // Teragram ASCII
+  ISO_8859_2           =  1,  // Teragram Latin2
+  ISO_8859_3           =  2,  // in BasisTech but not in Teragram
+  ISO_8859_4           =  3,  // Teragram Latin4
+  ISO_8859_5           =  4,  // Teragram ISO-8859-5
+  ISO_8859_6           =  5,  // Teragram Arabic
+  ISO_8859_7           =  6,  // Teragram Greek
+  ISO_8859_8           =  7,  // Teragram Hebrew
+  ISO_8859_9           =  8,  // in BasisTech but not in Teragram
+  ISO_8859_10          =  9,  // in BasisTech but not in Teragram
+  JAPANESE_EUC_JP      = 10,  // Teragram EUC_JP
+  JAPANESE_SHIFT_JIS   = 11,  // Teragram SJS
+  JAPANESE_JIS         = 12,  // Teragram JIS
+  CHINESE_BIG5         = 13,  // Teragram BIG5
+  CHINESE_GB           = 14,  // Teragram GB
+  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram EUC-CN(!)
+                              // See //i18n/basistech/basistech_encodings.h
+  KOREAN_EUC_KR        = 16,  // Teragram KSC
+  UNICODE              = 17,  // Teragram Unicode
+  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram EUC.
+  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was Basis Tech
+                              // CNS11643EUC, before that Teragram CNS.
+  CHINESE_BIG5_CP950   = 20,  // Teragram BIG5_CP950
+  JAPANESE_CP932       = 21,  // Teragram CP932
+  UTF8                 = 22,
+  UNKNOWN_ENCODING     = 23,
+  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
+                              // Should be present only in the crawler
+                              // and in the repository,
+                              // *never* as a result of Document::encoding().
+  RUSSIAN_KOI8_R       = 25,  // Teragram KOI8R
+  RUSSIAN_CP1251       = 26,  // Teragram CP1251
+
+  //----------------------------------------------------------
+  // These are _not_ output from teragram. Instead, they are as
+  // detected in the headers of usenet articles.
+  MSFT_CP1252          = 27,  // 27: CP1252 aka MSFT euro ascii
+  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
+                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
+                              // KOI8-U is used much more often than KOI8-RU.
+  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
+  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
+  //----------------------------------------------------------
+
+  //----------------------------------------------------------
+  // These are in BasisTech but not in Teragram. They are
+  // needed for new interface languages. Now detected by
+  // research langid
+  MSFT_CP1254          = 31,  // used for Turkish
+  MSFT_CP1257          = 32,  // used in Baltic countries
+  //----------------------------------------------------------
+
+  //----------------------------------------------------------
+  //----------------------------------------------------------
+  // New encodings detected by Teragram
+  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
+  MSFT_CP874           = 34,  // used for Thai
+  MSFT_CP1256          = 35,  // used for Arabic
+
+  //----------------------------------------------------------
+  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
+  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
+  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
+  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
+  //----------------------------------------------------------
+
+  //----------------------------------------------------------
+  // Detected by research langid
+  CZECH_CP852          = 39,
+  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
+  MSFT_CP1253          = 41,  // used for Greek
+  RUSSIAN_CP866        = 42,
+  //----------------------------------------------------------
+
+  //----------------------------------------------------------
+  // Handled by iconv in glibc
+  ISO_8859_13          = 43,
+  ISO_2022_KR          = 44,
+  GBK                  = 45,
+  GB18030              = 46,
+  BIG5_HKSCS           = 47,
+  ISO_2022_CN          = 48,
+
+  //-----------------------------------------------------------
+  // Detected by xin liu's detector
+  // Handled by transcoder
+  // (Indic encodings)
+
+  TSCII                = 49,
+  TAMIL_MONO           = 50,
+  TAMIL_BI             = 51,
+  JAGRAN               = 52,
+
+
+  MACINTOSH_ROMAN      = 53,
+  UTF7                 = 54,
+  BHASKAR              = 55,  // Indic encoding - Devanagari
+  HTCHANAKYA           = 56,  // 56 Indic encoding - Devanagari
+
+  //-----------------------------------------------------------
+  // These allow a single place (inputconverter and outputconverter)
+  // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
+  // bulk conversions, with interchange-valid checking on input and
+  // fallback if needed on output.
+  UTF16BE              = 57,  // big-endian UTF-16
+  UTF16LE              = 58,  // little-endian UTF-16
+  UTF32BE              = 59,  // big-endian UTF-32
+  UTF32LE              = 60,  // little-endian UTF-32
+  //-----------------------------------------------------------
+
+  //-----------------------------------------------------------
+  // An encoding that means "This is not text, but it may have some
+  // simple ASCII text embedded". Intended input conversion (not yet
+  // implemented) is to keep strings of >=4 seven-bit ASCII characters
+  // (follow each kept string with an ASCII space), delete the rest of
+  // the bytes. This will pick up and allow indexing of e.g. captions
+  // in JPEGs. No output conversion needed.
+  BINARYENC            = 61,
+  //-----------------------------------------------------------
+
+  //-----------------------------------------------------------
+  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
+  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
+  HZ_GB_2312           = 62,
+  //-----------------------------------------------------------
+
+  //-----------------------------------------------------------
+  // Some external vendors make the common input error of
+  // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
+  UTF8UTF8             = 63,
+  //-----------------------------------------------------------
+
+  //-----------------------------------------------------------
+  // Handled by transcoder for tamil language specific font
+  // encodings without the support for detection at present.
+  TAM_ELANGO           = 64,  // Elango - Tamil
+  TAM_LTTMBARANI       = 65,  // Barani - Tamil
+  TAM_SHREE            = 66,  // Shree - Tamil
+  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
+  TAM_TMNEWS           = 68,  // TMNews - Tamil
+  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
+  //-----------------------------------------------------------
+
+  //-----------------------------------------------------------
+  // Shift_JIS variants used by Japanese cell phone carriers.
+  KDDI_SHIFT_JIS       = 70,
+  DOCOMO_SHIFT_JIS     = 71,
+  SOFTBANK_SHIFT_JIS   = 72,
+  // ISO-2022-JP variants used by KDDI and SoftBank.
+  KDDI_ISO_2022_JP     = 73,
+  SOFTBANK_ISO_2022_JP = 74,
+  //-----------------------------------------------------------
+
+  NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
+                              // valid Encoding enum, it is only used to
+                              // indicate the total number of Encodings.
+};
+
+#endif  // UTIL_ENCODINGS_ENCODINGS_PB_H_
diff --git a/contrib/google-ced/util/encodings/encodings_unittest.cc b/contrib/google-ced/util/encodings/encodings_unittest.cc
new file mode 100644
index 000000000..223e3e45b
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings_unittest.cc
@@ -0,0 +1,34 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/encodings/encodings.h"
+
+#include "gtest/gtest.h"
+
+TEST(EncodingsTest, EncodingNameAliasToEncoding) {
+  // Case and non-alphanumeric characters ('_' vs. '-') are ignored.
+  EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1"));
+  EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1"));
+
+  // Embedded spaces are ignored as well.
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8"));
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8"));
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8"));
+
+  // Differences in alphanumeric characters still distinguish names.
+  EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7"));
+  EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp"));
+}
diff --git a/contrib/google-ced/util/languages/languages.cc b/contrib/google-ced/util/languages/languages.cc
new file mode 100644
index 000000000..852351fc6
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.cc
@@ -0,0 +1,349 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/languages/languages.h"
+
+#include "util/basictypes.h"
+#include "util/string_util.h"
+
+// The default language is ENGLISH.
+Language default_language() { return ENGLISH; }
+
+
+// Language names and codes
+
+struct LanguageInfo {
+  const char * language_name_;         // the standard name of the language
+  const char * language_code_639_1_;   // the ISO-639-1 code, or NULL if none
+  const char * language_code_639_2_;   // the ISO-639-2 code, or NULL if none
+  const char * language_code_other_;   // a nonstandard code, or NULL if none
+};
+
+static const LanguageInfo kLanguageInfoTable[] = {  // indexed by Language
+  { "ENGLISH",             "en", "eng", NULL},
+  { "DANISH",              "da", "dan", NULL},
+  { "DUTCH",               "nl", "dut", NULL},
+  { "FINNISH",             "fi", "fin", NULL},
+  { "FRENCH",              "fr", "fre", NULL},
+  { "GERMAN",              "de", "ger", NULL},
+  { "HEBREW",              "iw", "heb", NULL},  // "he" parses as a synonym
+  { "ITALIAN",             "it", "ita", NULL},
+  { "Japanese",            "ja", "jpn", NULL},
+  { "Korean",              "ko", "kor", NULL},
+  { "NORWEGIAN",           "nb", "nor", NULL},
+  { "POLISH",              "pl", "pol", NULL},
+  { "PORTUGUESE",          "pt", "por", NULL},
+  { "RUSSIAN",             "ru", "rus", NULL},
+  { "SPANISH",             "es", "spa", NULL},
+  { "SWEDISH",             "sv", "swe", NULL},
+  { "Chinese",             "zh", "chi", "zh-CN"},
+  { "CZECH",               "cs", "cze", NULL},
+  { "GREEK",               "el", "gre", NULL},
+  { "ICELANDIC",           "is", "ice", NULL},
+  { "LATVIAN",             "lv", "lav", NULL},
+  { "LITHUANIAN",          "lt", "lit", NULL},
+  { "ROMANIAN",            "ro", "rum", NULL},
+  { "HUNGARIAN",           "hu", "hun", NULL},
+  { "ESTONIAN",            "et", "est", NULL},
+  // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
+  // and "Unknown", they are essentially the same. Need to unify them.
+  // "un" and "ut" are invented by us, not from ISO-639.
+  //
+  { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
+  { "Unknown",             NULL, NULL, "un"},
+  { "BULGARIAN",           "bg", "bul", NULL},
+  { "CROATIAN",            "hr", "scr", NULL},
+  { "SERBIAN",             "sr", "scc", NULL},
+  { "IRISH",               "ga", "gle", NULL},
+  { "GALICIAN",            "gl", "glg", NULL},
+  // Impossible to tell Tagalog from Filipino at the moment.
+  // Use ISO 639-2 code for Filipino here.
+  { "TAGALOG",             NULL, "fil", NULL},
+  { "TURKISH",             "tr", "tur", NULL},
+  { "UKRAINIAN",           "uk", "ukr", NULL},
+  { "HINDI",               "hi", "hin", NULL},
+  { "MACEDONIAN",          "mk", "mac", NULL},
+  { "BENGALI",             "bn", "ben", NULL},
+  { "INDONESIAN",          "id", "ind", NULL},
+  { "LATIN",               "la", "lat", NULL},
+  { "MALAY",               "ms", "may", NULL},
+  { "MALAYALAM",           "ml", "mal", NULL},
+  { "WELSH",               "cy", "wel", NULL},
+  { "NEPALI",              "ne", "nep", NULL},
+  { "TELUGU",              "te", "tel", NULL},
+  { "ALBANIAN",            "sq", "alb", NULL},
+  { "TAMIL",               "ta", "tam", NULL},
+  { "BELARUSIAN",          "be", "bel", NULL},
+  { "JAVANESE",            "jw", "jav", NULL},
+  { "OCCITAN",             "oc", "oci", NULL},
+  { "URDU",                "ur", "urd", NULL},
+  { "BIHARI",              "bh", "bih", NULL},
+  { "GUJARATI",            "gu", "guj", NULL},
+  { "THAI",                "th", "tha", NULL},
+  { "ARABIC",              "ar", "ara", NULL},
+  { "CATALAN",             "ca", "cat", NULL},
+  { "ESPERANTO",           "eo", "epo", NULL},
+  { "BASQUE",              "eu", "baq", NULL},
+  { "INTERLINGUA",         "ia", "ina", NULL},
+  { "KANNADA",             "kn", "kan", NULL},
+  { "PUNJABI",             "pa", "pan", NULL},
+  { "SCOTS_GAELIC",        "gd", "gla", NULL},
+  { "SWAHILI",             "sw", "swa", NULL},
+  { "SLOVENIAN",           "sl", "slv", NULL},
+  { "MARATHI",             "mr", "mar", NULL},
+  { "MALTESE",             "mt", "mlt", NULL},
+  { "VIETNAMESE",          "vi", "vie", NULL},
+  { "FRISIAN",             "fy", "fry", NULL},
+  { "SLOVAK",              "sk", "slo", NULL},
+  { "ChineseT",
+    NULL,  NULL,  // We intentionally set these 2 fields to NULL to avoid
+                  // confusion between CHINESE_T and CHINESE.
+    "zh-TW"},
+  { "FAROESE",             "fo", "fao", NULL},
+  { "SUNDANESE",           "su", "sun", NULL},
+  { "UZBEK",               "uz", "uzb", NULL},
+  { "AMHARIC",             "am", "amh", NULL},
+  { "AZERBAIJANI",         "az", "aze", NULL},
+  { "GEORGIAN",            "ka", "geo", NULL},
+  { "TIGRINYA",            "ti", "tir", NULL},
+  { "PERSIAN",             "fa", "per", NULL},
+  { "BOSNIAN",             "bs", "bos", NULL},
+  { "SINHALESE",           "si", "sin", NULL},
+  { "NORWEGIAN_N",         "nn", "nno", NULL},
+  { "PORTUGUESE_P",        NULL, NULL, "pt-PT"},
+  { "PORTUGUESE_B",        NULL, NULL, "pt-BR"},
+  { "XHOSA",               "xh", "xho", NULL},
+  { "ZULU",                "zu", "zul", NULL},
+  { "GUARANI",             "gn", "grn", NULL},
+  { "SESOTHO",             "st", "sot", NULL},
+  { "TURKMEN",             "tk", "tuk", NULL},
+  { "KYRGYZ",              "ky", "kir", NULL},
+  { "BRETON",              "br", "bre", NULL},
+  { "TWI",                 "tw", "twi", NULL},
+  { "YIDDISH",             "yi", "yid", NULL},
+  { "SERBO_CROATIAN",      "sh", NULL, NULL},
+  { "SOMALI",              "so", "som", NULL},
+  { "UIGHUR",              "ug", "uig", NULL},
+  { "KURDISH",             "ku", "kur", NULL},
+  { "MONGOLIAN",           "mn", "mon", NULL},
+  { "ARMENIAN",            "hy", "arm", NULL},
+  { "LAOTHIAN",            "lo", "lao", NULL},
+  { "SINDHI",              "sd", "snd", NULL},
+  { "RHAETO_ROMANCE",      "rm", "roh", NULL},
+  { "AFRIKAANS",           "af", "afr", NULL},
+  { "LUXEMBOURGISH",       "lb", "ltz", NULL},
+  { "BURMESE",             "my", "bur", NULL},
+  // KHMER is known as Cambodian for Google user interfaces.
+  { "KHMER",               "km", "khm", NULL},
+  { "TIBETAN",             "bo", "tib", NULL},
+  { "DHIVEHI",             "dv", "div", NULL},
+  { "CHEROKEE",            NULL, "chr", NULL},
+  { "SYRIAC",              NULL, "syr", NULL},
+  { "LIMBU",               NULL, NULL, "sit-NP"},
+  { "ORIYA",               "or", "ori", NULL},
+  { "ASSAMESE",            "as", "asm", NULL},
+  { "CORSICAN",            "co", "cos", NULL},
+  { "INTERLINGUE",         "ie", "ine", NULL},
+  { "KAZAKH",              "kk", "kaz", NULL},
+  { "LINGALA",             "ln", "lin", NULL},
+  { "MOLDAVIAN",           "mo", "mol", NULL},
+  { "PASHTO",              "ps", "pus", NULL},
+  { "QUECHUA",             "qu", "que", NULL},
+  { "SHONA",               "sn", "sna", NULL},
+  { "TAJIK",               "tg", "tgk", NULL},
+  { "TATAR",               "tt", "tat", NULL},
+  { "TONGA",               "to", "tog", NULL},
+  { "YORUBA",              "yo", "yor", NULL},
+  { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
+  { "CREOLES_AND_PIDGINS_FRENCH_BASED",  NULL, "cpf", NULL},
+  { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
+  { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
+  { "MAORI",               "mi", "mao", NULL},
+  { "WOLOF",               "wo", "wol", NULL},
+  { "ABKHAZIAN",           "ab", "abk", NULL},
+  { "AFAR",                "aa", "aar", NULL},
+  { "AYMARA",              "ay", "aym", NULL},
+  { "BASHKIR",             "ba", "bak", NULL},
+  { "BISLAMA",             "bi", "bis", NULL},
+  { "DZONGKHA",            "dz", "dzo", NULL},
+  { "FIJIAN",              "fj", "fij", NULL},
+  { "GREENLANDIC",         "kl", "kal", NULL},
+  { "HAUSA",               "ha", "hau", NULL},
+  { "HAITIAN_CREOLE",       "ht", NULL, NULL},
+  { "INUPIAK",             "ik", "ipk", NULL},
+  { "INUKTITUT",           "iu", "iku", NULL},
+  { "KASHMIRI",            "ks", "kas", NULL},
+  { "KINYARWANDA",         "rw", "kin", NULL},
+  { "MALAGASY",            "mg", "mlg", NULL},
+  { "NAURU",               "na", "nau", NULL},
+  { "OROMO",               "om", "orm", NULL},
+  { "RUNDI",               "rn", "run", NULL},
+  { "SAMOAN",              "sm", "smo", NULL},
+  { "SANGO",               "sg", "sag", NULL},
+  { "SANSKRIT",            "sa", "san", NULL},
+  { "SISWANT",             "ss", "ssw", NULL},
+  { "TSONGA",              "ts", "tso", NULL},
+  { "TSWANA",              "tn", "tsn", NULL},
+  { "VOLAPUK",             "vo", "vol", NULL},
+  { "ZHUANG",              "za", "zha", NULL},
+  { "KHASI",               NULL, "kha", NULL},
+  { "SCOTS",               NULL, "sco", NULL},
+  { "GANDA",               "lg", "lug", NULL},
+  { "MANX",                "gv", "glv", NULL},
+  { "MONTENEGRIN",         NULL, NULL, "sr-ME"},
+  { "XX",                  NULL, NULL, "XX"},
+};
+
+COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
+               kLanguageInfoTable_has_incorrect_length);
+
+
+// LANGUAGE NAMES
+
+const char* default_language_name() {  // name of the default language's row
+  return kLanguageInfoTable[default_language()].language_name_;
+}
+
+static const char* const kInvalidLanguageName = "invalid_language";
+// Standard name reported for every invalid Language value.
+const char *invalid_language_name() {
+  return kInvalidLanguageName;
+}
+
+const char* LanguageName(Language lang) {
+  // Out-of-range values map to the shared invalid-language name.
+  if (!IsValidLanguage(lang)) return kInvalidLanguageName;
+  return kLanguageInfoTable[lang].language_name_;
+}
+
+
+
+// LANGUAGE CODES
+
+
+// The leading space in " invalid_language_code" is intentional: it keeps
+// the sentinel from matching any two-letter language code.
+//
+static const char* const kInvalidLanguageCode = " invalid_language_code";
+// Standard code reported for every invalid Language value.
+const char *invalid_language_code() {
+  return kInvalidLanguageCode;
+}
+
+const char * LanguageCode(Language lang) {
+  // Out-of-range values get the invalid-code sentinel.
+  if (!IsValidLanguage(lang)) {
+    return kInvalidLanguageCode;
+  }
+  // Prefer ISO-639-1, then ISO-639-2, then the nonstandard code; a row
+  // with no code at all also yields the sentinel.
+  const LanguageInfo& row = kLanguageInfoTable[lang];
+  const char* code = row.language_code_639_1_;
+  if (code == NULL) code = row.language_code_639_2_;
+  if (code == NULL) code = row.language_code_other_;
+  if (code == NULL) code = kInvalidLanguageCode;
+  return code;
+}
+
+const char* default_language_code() {  // ISO-639-1 code of the default row
+  return kLanguageInfoTable[default_language()].language_code_639_1_;
+}
+
+const char* LanguageCodeISO639_1(Language lang) {
+  // Invalid languages and rows without an ISO-639-1 code both yield the
+  // invalid-code sentinel.
+  const char* code = IsValidLanguage(lang)
+      ? kLanguageInfoTable[lang].language_code_639_1_ : NULL;
+  return code ? code : kInvalidLanguageCode;
+}
+
+const char* LanguageCodeISO639_2(Language lang) {
+  // Invalid languages and rows without an ISO-639-2 code both yield the
+  // invalid-code sentinel.
+  const char* code = IsValidLanguage(lang)
+      ? kLanguageInfoTable[lang].language_code_639_2_ : NULL;
+  return code ? code : kInvalidLanguageCode;
+}
+
+const char* LanguageCodeWithDialects(Language lang) {
+  // Only CHINESE gets a dialect-qualified code here; every other language
+  // falls through to its standard code.
+  return (lang == CHINESE) ? "zh-CN" : LanguageCode(lang);
+}
+
+
+
+bool LanguageFromCode(const char* lang_code, Language *language) {
+  // Parses lang_code, case-insensitively, into *language and returns true;
+  // if nothing matches, *language is UNKNOWN_LANGUAGE and false is returned.
+  *language = UNKNOWN_LANGUAGE;
+  if ( lang_code == NULL ) return false;
+  for ( int i = 0 ; i < kNumLanguages ; i++ ) {
+    const LanguageInfo& info = kLanguageInfoTable[i];
+    if ((info.language_code_639_1_ &&
+         !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
+        (info.language_code_639_2_ &&
+         !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
+        (info.language_code_other_ &&
+         !base::strcasecmp(lang_code, info.language_code_other_))) {
+      *language = static_cast<Language>(i);
+      return true;
+    }
+  }
+
+  // For convenience, also parse the non-standard five-letter codes "zh-cn"
+  // and "zh-tw" (used by front-ends such as GWS to distinguish Simplified
+  // from Traditional Chinese) and "sr-me", with '-' or '_' as separator.
+  if (!base::strcasecmp(lang_code, "zh-cn") ||
+      !base::strcasecmp(lang_code, "zh_cn")) {
+    *language = CHINESE;
+    return true;
+  }
+  if (!base::strcasecmp(lang_code, "zh-tw") ||
+      !base::strcasecmp(lang_code, "zh_tw")) {
+    *language = CHINESE_T;
+    return true;
+  }
+  if (!base::strcasecmp(lang_code, "sr-me") ||
+      !base::strcasecmp(lang_code, "sr_me")) {
+    *language = MONTENEGRIN;
+    return true;
+  }
+
+  // Process language-code synonyms. Hebrew is accepted as both "he" and
+  // the legacy "iw", whichever of the two the table itself stores.
+  if (!base::strcasecmp(lang_code, "he") ||
+      !base::strcasecmp(lang_code, "iw")) {
+    *language = HEBREW;
+    return true;
+  }
+  if (!base::strcasecmp(lang_code, "in")) {
+    *language = INDONESIAN;  // Use "id".
+    return true;
+  }
+  if (!base::strcasecmp(lang_code, "ji")) {
+    *language = YIDDISH;  // Use "yi".
+    return true;
+  }
+
+  // Process language-detection synonyms: distinct languages that our current
+  // language-detection algorithms cannot differentiate.
+  if (!base::strcasecmp(lang_code, "fil")) {
+    *language = TAGALOG;
+    return true;
+  }
+  return false;
+}
diff --git a/contrib/google-ced/util/languages/languages.h b/contrib/google-ced/util/languages/languages.h
new file mode 100644
index 000000000..4237961e3
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.h
@@ -0,0 +1,381 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_LANGUAGES_LANGUAGES_H_
+#define UTIL_LANGUAGES_LANGUAGES_H_
+
+// This interface defines the Language enum and functions that depend
+// only on Language values.
+
+// A hash-function for Language, hash<Language>, is defined in
+// i18n/languages/public/languages-hash.h
+
+#ifndef SWIG
+// Language enum defined in languages.proto
+// Also description on how to add languages.
+#include "util/languages/languages.pb.h"
+
+#else
+
+// TODO: Include a header containing swig-compatible enum.
+
+#endif
+
+const int kNumLanguages = NUM_LANGUAGES;
+
+// Return the default language (ENGLISH).
+Language default_language();
+
+
+// *******************************************
+// Language predicates
+//   IsValidLanguage()
+//   IS_LANGUAGE_UNKNOWN()
+//   IsCJKLanguage()
+//   IsChineseLanguage()
+//   IsNorwegianLanguage()
+//   IsPortugueseLanguage()
+//   IsRightToLeftLanguage()
+//   IsMaybeRightToLeftLanguage()
+//   IsSameLanguage()
+//   IsScriptRequiringLongerSnippets()
+// *******************************************
+
+// IsValidLanguage
+// ===============
+//
+// Function to check if the input is within range of the Language enum. If
+// IsValidLanguage(lang) returns true, it is safe to call
+// static_cast<Language>(lang).
+//
+inline bool IsValidLanguage(int lang) {
+  return !(lang < 0 || lang >= kNumLanguages);
+}
+
+// Return true if the language is "unknown". (Formerly a macro, hence the
+// all-caps spelling.)
+inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
+  if (lang == UNKNOWN_LANGUAGE) return true;
+  return lang == TG_UNKNOWN_LANGUAGE;
+}
+
+// IsCJKLanguage
+// -------------
+//
+// This function returns true if the language is either Chinese
+// (simplified or traditional), Japanese, or Korean.
+bool IsCJKLanguage(Language lang);
+
+// IsChineseLanguage
+// -----------------
+//
+// This function returns true if the language is either Chinese
+// (simplified or traditional)
+bool IsChineseLanguage(Language lang);
+
+// IsNorwegianLanguage
+// --------------------
+//
+// This function returns true if the language is any of the Norwegian
+// languages (regular or Nynorsk).
+bool IsNorwegianLanguage(Language lang);
+
+// IsPortugueseLanguage
+// --------------------
+//
+// This function returns true if the language is any of the Portuguese
+// languages (regular, Portugal or Brazil)
+bool IsPortugueseLanguage(Language lang);
+
+// IsSameLanguage
+// --------------
+//
+// WARNING: This function provides only a simple test on the values of
+// the two Language arguments. It returns false if either language is
+// invalid. It returns true if the language arguments are equal, or
+// if they are both Chinese languages, both Norwegian languages, or
+// both Portuguese languages, as defined by IsChineseLanguage,
+// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
+// false.
+bool IsSameLanguage(Language lang1, Language lang2);
+
+
+// IsRightToLeftLanguage
+// ---------------------
+//
+// This function returns true if the language is only written right-to-left
+// (E.g., Hebrew, Arabic, Persian etc.)
+//
+// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
+// There are languages that can be written in more than one script.
+// Examples:
+//   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
+//     Latin or Cyrillic script, and right-to-left in Arabic script.
+//   - Sindhi and Punjabi are written in different scripts, depending on
+//     region and dialect.
+//   - Turkmen used an Arabic script historically, but not any more.
+//   - Pashto and Uyghur can use Arabic script, but use a Roman script
+//     on the Internet.
+//   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
+//
+// This function only returns true for languages that are always, unequivocally
+// written in right-to-left script.
+//
+// TODO: If we want to do anything special with multi-script languages
+// we should create new 'languages' for each language+script, as we do for
+// traditional vs. simplified Chinese. However most such languages are rare in
+// use and even rarer on the web, so this is unlikely to be something we'll
+// be concerned with for a while.
+bool IsRightToLeftLanguage(Language lang);
+
+// IsMaybeRightToLeftLanguage
+// --------------------------
+//
+// This function returns true if the language may appear on the web in a
+// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
+//
+// NOTE: See important notes under IsRightToLeftLanguage(...).
+//
+// This function returns true for languages that *may* appear on the web in a
+// right-to-left script, even if they may also appear in a left-to-right
+// script.
+//
+// This function should typically be used in cases where doing some work on
+// left-to-right text would be OK (usually a no-op), and this function is used
+// just to cut down on unnecessary work on regular, LTR text.
+bool IsMaybeRightToLeftLanguage(Language lang);
+
+// IsScriptRequiringLongerSnippets
+// --------------------
+//
+// This function returns true if the script characteristics require longer
+// snippet length (Devanagari, Bengali, Gurmukhi,
+// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
+// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
+// bool IsScriptRequiringLongerSnippets(UnicodeScript script);
+
+
+// *******************************************
+// LANGUAGE NAMES
+//
+// This interface defines a standard name for each valid Language,
+// and a standard name for invalid languages. Some language names use all
+// uppercase letters, but others use mixed case.
+//   LanguageName() [Language to name]
+//   LanguageEnumName() [language to enum name]
+//   LanguageFromName() [name to Language]
+//   default_language_name()
+//   invalid_language_name()
+// *******************************************
+
+// Given a Language, returns its standard name.
+// Return invalid_language_name() if the language is invalid.
+const char* LanguageName(Language lang);
+
+// Given a Language, return the name of the enum constant for that
+// language. In all but a few cases, this is the same as its standard
+// name. For example, LanguageName(CHINESE) returns "Chinese", but
+// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
+// code that is generating C++ code, where the enum constant is more
+// useful than its integer value.  Return "NUM_LANGUAGES" if
+// the language is invalid.
+const char* LanguageEnumName(Language lang);
+
+// The maximum length of a standard language name.
+const int kMaxLanguageNameSize = 50;
+
+// The standard name for the default language.
+const char* default_language_name();
+
+// The standard name for all invalid languages.
+const char* invalid_language_name();
+
+// If lang_name matches the standard name of a Language, using a
+// case-insensitive comparison, set *language to that Language and
+// return true.
+// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
+//
+// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
+// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
+// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
+// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
+// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
+// CHINESE_T (i.e., a synonym for "ChineseT").
+//
+// REQUIRES: language must not be NULL.
+//
+bool LanguageFromName(const char* lang_name, Language *language);
+
+
+
+// *******************************************
+// LANGUAGE CODES
+//
+// This interface defines a standard code for each valid language, and
+// a standard code for invalid languages. These are derived from ISO codes,
+// with some Google additions.
+//   LanguageCode()
+//   default_language_code()
+//   invalid_language_code()
+//   LanguageCodeWithDialects()
+//   LanguageCodeISO639_1()
+//   LanguageCodeISO639_2()
+// *******************************************
+
+// Given a Language, return its standard code. There are Google-specific codes:
+//     For CHINESE_T, return "zh-TW".
+//     For TG_UNKNOWN_LANGUAGE, return "ut".
+//     For UNKNOWN_LANGUAGE, return "un".
+//     For PORTUGUESE_P, return "pt-PT".
+//     For PORTUGUESE_B, return "pt-BR".
+//     For LIMBU, return "sit-NP".
+//     For CHEROKEE, return "chr".
+//     For SYRIAC, return "syr".
+// Otherwise return the ISO 639-1 two-letter language code for lang.
+// If lang is invalid, return invalid_language_code().
+//
+// NOTE: See the note below about the codes for Chinese languages.
+//
+const char* LanguageCode(Language lang);
+
+// The maximum length of a language code.
+const int kMaxLanguageCodeSize = 50;
+
+// The standard code for the default language.
+const char* default_language_code();
+
+// The standard code for all invalid languages.
+const char* invalid_language_code();
+
+
+// --------------------------------------------
+// NOTE: CHINESE LANGUAGE CODES
+//
+// There are three functions that return codes for Chinese languages.
+// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
+// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
+// The following list shows the different results.
+//
+// LanguageCode(CHINESE) returns "zh"
+// LanguageCode(CHINESE_T) returns "zh-TW".
+//
+// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
+// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
+//
+// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
+// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
+// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
+//
+// --------------------------------------------
+
+// LanguageCodeWithDialects
+// ------------------------
+//
+// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
+const char* LanguageCodeWithDialects(Language lang);
+
+// LanguageCodeISO639_1
+// --------------------
+//
+// Return the ISO 639-1 two-letter language code for lang.
+// Return invalid_language_code() if lang is invalid or does not have
+// an ISO 639-1 two-letter language code.
+const char* LanguageCodeISO639_1(Language lang);
+
+// LanguageCodeISO639_2
+// --------------------
+//
+// Return the ISO 639-2 three-letter language code for lang.
+// Return invalid_language_code() if lang is invalid or does not have
+// an ISO 639-2 three-letter language code.
+const char* LanguageCodeISO639_2(Language lang);
+
+// LanguageFromCode
+// ----------------
+//
+// If lang_code matches the code for a Language, using a case-insensitive
+// comparison, set *language to that Language and return true.
+// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
+//
+// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
+// (three-letter) code, or a Google-specific code (see LanguageCode).
+//
+// Certain language-code aliases are also allowed:
+//   For "zh-cn" and "zh_cn", set *language to CHINESE.
+//   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
+//   For "he", set *language to HEBREW.
+//   For "in", set *language to INDONESIAN.
+//   For "ji", set *language to YIDDISH.
+//   For "fil", set *language to TAGALOG.
+//
+// REQUIRES: 'language' must not be NULL.
+bool LanguageFromCode(const char* lang_code, Language *language);
+
+
+// LanguageFromCodeOrName
+// ----------------------
+//
+// If lang_code_or_name is a language code or a language name,
+// set *language to the corresponding Language and return true.
+// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
+//
+bool LanguageFromCodeOrName(const char* lang_code_or_name,
+                            Language* language);
+
+// LanguageNameFromCode
+// --------------------
+//
+// If language_code is the code for a Language (see LanguageFromCode),
+// return the standard name of that language (see LanguageName).
+// Otherwise return invalid_language_name().
+//
+const char* LanguageNameFromCode(const char* language_code);
+
+
+// Miscellany
+
+// LanguageCodeToUnderscoreForm
+// ----------------------------
+//
+// Given a language code, convert the dash "-" to underscore "_".
+//
+// Specifically, if result_length <= strlen(lang_code), set result[0]
+// to '\0' and return false. Otherwise, copy lang_code to result,
+// converting every dash to an underscore, converting every character
+// before the first dash or underscore to lower case, and converting
+// every character after the first dash or underscore to upper
+// case. If there is no dash or underscore, convert the entire string
+// to lower case.
+//
+// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
+
+bool LanguageCodeToUnderscoreForm(const char* lang_code,
+                                  char* result,
+                                  int result_length);
+
+//
+// AlwaysPutInExpectedRestrict
+// ---------------------------
+//
+// For Web pages in certain top-level domains, Web Search always
+// applies a "country restrict". If 'tld' matches one of those, using
+// a case-SENSITIVE comparison, set *expected_language to the Language
+// most commonly found in that top-level domain and return true.
+// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
+bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
+
+
+#endif  // UTIL_LANGUAGES_LANGUAGES_H_
diff --git a/contrib/google-ced/util/languages/languages.pb.h b/contrib/google-ced/util/languages/languages.pb.h
new file mode 100644
index 000000000..84f1d6a79
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.pb.h
@@ -0,0 +1,191 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_LANGUAGES_LANGUAGES_PB_H_
+#define UTIL_LANGUAGES_LANGUAGES_PB_H_
+
+// Identifier for each language known to this library. Every enumerator has
+// an explicit value; the values run contiguously from ENGLISH (0) up to
+// NUM_LANGUAGES, which is a count sentinel rather than a language.
+// The "UI only." and "LangId language: X (n)" tags are carried over from the
+// upstream source; their exact meaning is not defined in this header.
+enum Language {
+  ENGLISH       = 0,
+  DANISH        = 1,
+  DUTCH         = 2,
+  FINNISH       = 3,
+  FRENCH        = 4,
+  GERMAN        = 5,
+  HEBREW        = 6,
+  ITALIAN       = 7,
+  JAPANESE      = 8,
+  KOREAN        = 9,
+  NORWEGIAN     = 10,
+  POLISH        = 11,
+  PORTUGUESE    = 12,
+  RUSSIAN       = 13,
+  SPANISH       = 14,
+  SWEDISH       = 15,
+  CHINESE       = 16,
+  CZECH         = 17,
+  GREEK         = 18,
+  ICELANDIC     = 19,
+  LATVIAN       = 20,
+  LITHUANIAN    = 21,
+  ROMANIAN      = 22,
+  HUNGARIAN     = 23,
+  ESTONIAN      = 24,
+  TG_UNKNOWN_LANGUAGE   = 25,
+  UNKNOWN_LANGUAGE      = 26,
+  BULGARIAN     = 27,
+  CROATIAN      = 28,
+  SERBIAN       = 29,
+  IRISH         = 30,      // UI only.
+  GALICIAN      = 31,
+  TAGALOG       = 32,      // Tagalog (tl) + Filipino (fil),
+  TURKISH       = 33,
+  UKRAINIAN     = 34,
+  HINDI         = 35,
+  MACEDONIAN    = 36,
+  BENGALI       = 37,
+  INDONESIAN    = 38,
+  LATIN         = 39,      // UI only.
+  MALAY         = 40,
+  MALAYALAM     = 41,
+  WELSH         = 42,      // UI only.
+  NEPALI        = 43,
+  TELUGU        = 44,
+  ALBANIAN      = 45,
+  TAMIL         = 46,
+  BELARUSIAN    = 47,
+  JAVANESE      = 48,      // UI only.
+  OCCITAN       = 49,      // UI only.
+  URDU          = 50,
+  BIHARI        = 51,
+  GUJARATI      = 52,
+  THAI          = 53,
+  ARABIC        = 54,
+  CATALAN       = 55,
+  ESPERANTO     = 56,
+  BASQUE        = 57,
+  INTERLINGUA   = 58,      // UI only.
+  KANNADA       = 59,
+  PUNJABI       = 60,
+  SCOTS_GAELIC  = 61,      // UI only.
+  SWAHILI       = 62,
+  SLOVENIAN     = 63,
+  MARATHI       = 64,
+  MALTESE       = 65,
+  VIETNAMESE    = 66,
+  FRISIAN       = 67,      // UI only.
+  SLOVAK        = 68,
+  CHINESE_T     = 69,      // This is added to solve the problem of
+                           // distinguishing Traditional and Simplified
+                           // Chinese when the encoding is UTF8.
+  FAROESE       = 70,      // UI only.
+  SUNDANESE     = 71,      // UI only.
+  UZBEK         = 72,
+  AMHARIC       = 73,
+  AZERBAIJANI   = 74,
+  GEORGIAN      = 75,
+  TIGRINYA      = 76,      // UI only.
+  PERSIAN       = 77,
+  BOSNIAN       = 78,      // UI only. LangId language: CROATIAN (28)
+  SINHALESE     = 79,
+  NORWEGIAN_N   = 80,      // UI only. LangId language: NORWEGIAN (10)
+  PORTUGUESE_P  = 81,      // UI only. LangId language: PORTUGUESE (12)
+  PORTUGUESE_B  = 82,      // UI only. LangId language: PORTUGUESE (12)
+  XHOSA         = 83,      // UI only.
+  ZULU          = 84,      // UI only.
+  GUARANI       = 85,
+  SESOTHO       = 86,      // UI only.
+  TURKMEN       = 87,      // UI only.
+  KYRGYZ        = 88,
+  BRETON        = 89,      // UI only.
+  TWI           = 90,      // UI only.
+  YIDDISH       = 91,      // UI only.
+  SERBO_CROATIAN= 92,      // UI only. LangId language: SERBIAN (29)
+  SOMALI        = 93,      // UI only.
+  UIGHUR        = 94,
+  KURDISH       = 95,
+  MONGOLIAN     = 96,
+  ARMENIAN      = 97,
+  LAOTHIAN      = 98,
+  SINDHI        = 99,
+  RHAETO_ROMANCE= 100,     // UI only.
+  AFRIKAANS     = 101,
+  LUXEMBOURGISH = 102,     // UI only.
+  BURMESE       = 103,
+  KHMER         = 104,
+  TIBETAN       = 105,
+  DHIVEHI       = 106,     // sometimes spelled Divehi, lang of Maldives
+  CHEROKEE      = 107,
+  SYRIAC        = 108,     // UI only.
+  LIMBU         = 109,     // UI only.
+  ORIYA         = 110,
+  ASSAMESE      = 111,     // UI only.
+  CORSICAN      = 112,     // UI only.
+  INTERLINGUE   = 113,     // UI only.
+  KAZAKH        = 114,
+  LINGALA       = 115,     // UI only.
+  MOLDAVIAN     = 116,     // UI only. LangId language: ROMANIAN (22)
+  PASHTO        = 117,
+  QUECHUA       = 118,     // UI only.
+  SHONA         = 119,     // UI only.
+  TAJIK         = 120,
+  TATAR         = 121,     // UI only.
+  TONGA         = 122,     // UI only.
+  YORUBA        = 123,     // UI only.
+  CREOLES_AND_PIDGINS_ENGLISH_BASED       = 124,   // UI only.
+  CREOLES_AND_PIDGINS_FRENCH_BASED        = 125,   // UI only.
+  CREOLES_AND_PIDGINS_PORTUGUESE_BASED    = 126,   // UI only.
+  CREOLES_AND_PIDGINS_OTHER               = 127,   // UI only.
+  MAORI         = 128,     // UI only.
+  WOLOF         = 129,     // UI only.
+  ABKHAZIAN     = 130,     // UI only.
+  AFAR          = 131,     // UI only.
+  AYMARA        = 132,     // UI only.
+  BASHKIR       = 133,     // UI only.
+  BISLAMA       = 134,     // UI only.
+  DZONGKHA      = 135,     // UI only.
+  FIJIAN        = 136,     // UI only.
+  GREENLANDIC   = 137,     // UI only.
+  HAUSA         = 138,     // UI only.
+  HAITIAN_CREOLE= 139,     // UI only.
+  INUPIAK       = 140,     // UI only.
+  INUKTITUT     = 141,
+  KASHMIRI      = 142,     // UI only.
+  KINYARWANDA   = 143,     // UI only.
+  MALAGASY      = 144,     // UI only.
+  NAURU         = 145,     // UI only.
+  OROMO         = 146,     // UI only.
+  RUNDI         = 147,     // UI only.
+  SAMOAN        = 148,     // UI only.
+  SANGO         = 149,     // UI only.
+  SANSKRIT      = 150,
+  SISWANT       = 151,     // UI only.
+  TSONGA        = 152,     // UI only.
+  TSWANA        = 153,     // UI only.
+  VOLAPUK       = 154,     // UI only.
+  ZHUANG        = 155,     // UI only.
+  KHASI         = 156,     // UI only.
+  SCOTS         = 157,     // UI only.
+  GANDA         = 158,     // UI only.
+  MANX          = 159,     // UI only.
+  MONTENEGRIN   = 160,     // UI only. LangId language: SERBIAN (29)
+  NUM_LANGUAGES = 161,        // Always keep this at the end. It is not a
+                              // valid Language enum. It is only used to
+                              // indicate the total number of Languages.
+  // NOTE: If you add a language, you will break a unittest. The note at the
+  // top of the enum that this refers to is not present in this copy.
+};
+
+#endif  // UTIL_LANGUAGES_LANGUAGES_PB_H_
diff --git a/contrib/google-ced/util/logging.h b/contrib/google-ced/util/logging.h
new file mode 100644
index 000000000..16e50f209
--- /dev/null
+++ b/contrib/google-ced/util/logging.h
@@ -0,0 +1,25 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_LOGGING_H_
+#define UTIL_LOGGING_H_
+
+// Stub out CHECK/DCHECK: both macros expand to nothing, so the checked
+// expression is discarded by the preprocessor and never evaluated. Any side
+// effect placed inside CHECK()/DCHECK() therefore does not happen.
+// Each #undef first drops any definition already in effect.
+#undef CHECK
+#define CHECK(expr)
+#undef DCHECK
+#define DCHECK(expr)
+
+#endif  // UTIL_LOGGING_H_
diff --git a/contrib/google-ced/util/port.h b/contrib/google-ced/util/port.h
new file mode 100644
index 000000000..3799b1696
--- /dev/null
+++ b/contrib/google-ced/util/port.h
@@ -0,0 +1,53 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_PORT_H_
+#define UTIL_PORT_H_
+
+#include <stdarg.h>
+
+// GG_LONGLONG / GG_ULONGLONG paste an integer-literal suffix onto x:
+// I64 / UI64 when _MSC_VER is defined, LL / ULL otherwise.
+// The suffix is token-pasted, so x must be a bare integer literal.
+#if defined(_MSC_VER)
+#define GG_LONGLONG(x) x##I64
+#define GG_ULONGLONG(x) x##UI64
+#else
+#define GG_LONGLONG(x) x##LL
+#define GG_ULONGLONG(x) x##ULL
+#endif
+
+// Per C99 7.18.4, define __STDC_CONSTANT_MACROS before including <stdint.h>
+// to get the INTn_C and UINTn_C macros for integer constants.  It's difficult
+// to guarantee any specific ordering of header includes, so it's difficult to
+// guarantee that the INTn_C macros can be defined by including <stdint.h> at
+// any specific point.  Provide GG_INTn_C macros instead.
+//
+// The 8/16/32-bit signed forms only parenthesize x; the unsigned forms paste
+// a U suffix onto x; the 64-bit forms defer to GG_LONGLONG / GG_ULONGLONG.
+// Because of the token pasting, x must be a bare integer literal.
+
+#define GG_INT8_C(x)    (x)
+#define GG_INT16_C(x)   (x)
+#define GG_INT32_C(x)   (x)
+#define GG_INT64_C(x)   GG_LONGLONG(x)
+
+#define GG_UINT8_C(x)   (x ## U)
+#define GG_UINT16_C(x)  (x ## U)
+#define GG_UINT32_C(x)  (x ## U)
+#define GG_UINT64_C(x)  GG_ULONGLONG(x)
+
+// Define an OS-neutral wrapper for shared library entry points:
+// API_CALL expands to __stdcall when _WIN32 is defined and to nothing
+// elsewhere.
+#if defined(_WIN32)
+#define API_CALL __stdcall
+#else
+#define API_CALL
+#endif
+
+#endif  // UTIL_PORT_H_
diff --git a/contrib/google-ced/util/string_util.h b/contrib/google-ced/util/string_util.h
new file mode 100644
index 000000000..5977f4fd8
--- /dev/null
+++ b/contrib/google-ced/util/string_util.h
@@ -0,0 +1,61 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_STRING_UTIL_H_
+#define UTIL_STRING_UTIL_H_
+
+#include <string.h>
+
+namespace base {
+
+// Case-insensitive string comparison helpers. Each one forwards to the
+// platform's C library routine: _stricmp / _strnicmp when _WIN32 is defined,
+// the global ::strcasecmp / ::strncasecmp otherwise. Per those routines'
+// documentation the result is zero when the strings compare equal, negative
+// when s1 sorts before s2 and positive when s1 sorts after s2.
+
+// Compares the whole of s1 and s2, ignoring case.
+inline int strcasecmp(const char* s1, const char* s2) {
+#if defined(_WIN32)
+  return _stricmp(s1, s2);
+#else
+  return ::strcasecmp(s1, s2);
+#endif
+}
+
+// Compares at most the first n characters of s1 and s2, ignoring case.
+inline int strncasecmp(const char* s1, const char* s2, size_t n) {
+#if defined(_WIN32)
+  return _strnicmp(s1, s2, n);
+#else
+  return ::strncasecmp(s1, s2, n);
+#endif
+}
+
+}  // namespace base
+
+// Unless the build already defined HAVE_MEMRCHR, treat glibc 2.2 or newer as
+// providing memrchr() and skip the fallback below.
+#ifndef HAVE_MEMRCHR
+#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
+#define HAVE_MEMRCHR
+#endif
+#endif
+
+#ifndef HAVE_MEMRCHR
+// Fallback memrchr(): scans the n bytes starting at s from the end towards
+// the beginning and returns a pointer to the last byte equal to c, or NULL
+// when no byte matches (which includes n == 0).
+inline void* memrchr(const void* s, int c, size_t n) {
+  const unsigned char* const start = (const unsigned char*) s;
+  // Like memchr(), compare bytes as unsigned char. Without this conversion a
+  // negative or >255 c (for example a sign-extended char holding 0xFF) would
+  // be compared as an int and could never match any byte.
+  const unsigned char wanted = (unsigned char) c;
+  for (const unsigned char* p = start + n; p != start; ) {
+    if (*--p == wanted)
+      return (void*) p;
+  }
+  return NULL;
+}
+#endif
+
+#endif  // UTIL_STRING_UTIL_H_
diff --git a/contrib/google-ced/util/varsetter.h b/contrib/google-ced/util/varsetter.h
new file mode 100644
index 000000000..8e8cbf2c0
--- /dev/null
+++ b/contrib/google-ced/util/varsetter.h
@@ -0,0 +1,66 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_VARSETTER_H_
+#define UTIL_VARSETTER_H_
+
+//
+// Use a VarSetter object to temporarily set an object of some sort to
+// a particular value.  When the VarSetter object is destructed, the
+// underlying object will revert to its former value.
+//
+// Sample code:
+//
+#if 0
+{
+  bool b = true;
+  {
+    VarSetter<bool> bool_setter(&b, false);
+    // Now b == false.
+  }
+  // Now b == true again.
+}
+#endif
+
+// Scoped "set and restore" helper: the constructor remembers the current value
+// of *object and overwrites it with value; the destructor writes the
+// remembered value back.
+template <typename C>
+class VarSetter {
+ public:
+  // Saves the current *object, then assigns value to it. The same pointer is
+  // dereferenced again in the destructor.
+  VarSetter(C* object, const C& value)
+      : object_(object), old_value_(*object) {
+    *object_ = value;
+  }
+
+  // Puts the saved value back.
+  ~VarSetter() {
+    *object_ = old_value_;
+  }
+
+ private:
+  C* const object_;  // Variable being overridden.
+  C old_value_;      // Copy of *object_ taken in the constructor.
+
+  // Copying is disallowed: declared private, with no definition in this header.
+  VarSetter(const VarSetter&);
+  VarSetter& operator=(const VarSetter&);
+
+  // VarSetters always live on the stack: the allocation functions are
+  // declared private as well. (The array form is redundant because there is
+  // no default constructor.)
+  static void* operator new(size_t);
+  static void* operator new[](size_t);
+  static void operator delete(void*);
+  static void operator delete[](void*);
+};
+
+#endif  // UTIL_VARSETTER_H_
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2020-05-26 11:31:47 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2020-05-26 11:31:47 +0100
commit	19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3 (patch)
tree	6d0f43f3cd9ede27eb578562480633e27f042934 /contrib/google-ced/util
parent	c11838dcbacbfd0a75e98f95a63a026217c88c51 (diff)
download	rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.tar.gz rspamd-19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3.zip