You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

encodings.h 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. // Copyright 2016 Google Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. ////////////////////////////////////////////////////////////////////////////////
  16. #ifndef UTIL_ENCODINGS_ENCODINGS_H_
  17. #define UTIL_ENCODINGS_ENCODINGS_H_
  18. // This interface defines the Encoding enum and various functions that
  19. // depend only on Encoding values.
  20. // A hash-function for Encoding, hash<Encoding>, is defined in
  21. // i18n/encodings/public/encodings-hash.h
  22. // On some Windows projects, UNICODE may be defined, which would prevent the
  23. // Encoding enum below from compiling. Note that this is a quick fix that does
  24. // not break any existing projects. The UNICODE enum may someday be changed
  25. // to something more specific and non-colliding, but this involves careful
  26. // testing of changes in many other projects.
  27. #undef UNICODE
  28. // NOTE: The Encoding enum must always start at 0. This assumption has
  29. // been made and used.
  30. #ifndef SWIG
  31. #include "util/encodings/encodings.pb.h"
  32. #else
  33. // TODO: Include a SWIG workaround header file.
  34. #endif
  35. const int kNumEncodings = NUM_ENCODINGS;
  36. // some of the popular encoding aliases
  37. // TODO: Make these static const Encoding values instead of macros.
  38. #define LATIN1 ISO_8859_1
  39. #define LATIN2 ISO_8859_2
  40. #define LATIN3 ISO_8859_3
  41. #define LATIN4 ISO_8859_4
  42. #define CYRILLIC ISO_8859_5
  43. #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
  44. #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
  45. #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
  46. #define LATIN5 ISO_8859_9
  47. #define LATIN6 ISO_8859_10
  48. #define KOREAN_HANGUL KOREAN_EUC_KR
  49. // The default Encoding (LATIN1).
  50. Encoding default_encoding();
  51. // *************************************************************
  52. // Encoding predicates
  53. // IsValidEncoding()
  54. // IsEncEncCompatible
  55. // IsSupersetOfAscii7Bit
  56. // Is8BitEncoding
  57. // IsCJKEncoding
  58. // IsHebrewEncoding
  59. // IsRightToLeftEncoding
  60. // IsLogicalRightToLeftEncoding
  61. // IsVisualRightToLeftEncoding
  62. // IsIso2022Encoding
  63. // IsIso2022JpOrVariant
  64. // IsShiftJisOrVariant
  65. // IsJapaneseCellPhoneCarrierSpecificEncoding
  66. // *************************************************************
  67. // IsValidEncoding
  68. // ---------------
  69. //
  70. // Function to check if the input encoding enum value is within range.
  71. //
  72. bool IsValidEncoding(Encoding enc);
  73. //
  74. // IsEncEncCompatible
  75. // ------------------
  76. //
  77. // This function is to determine whether or not converting from the
  78. // first encoding to the second requires any changes to the underlying
  79. // text (e.g. ASCII_7BIT is a subset of UTF8).
  80. //
  81. // TODO: the current implementation is likely incomplete. It would be
  82. // good to consider the full matrix of all pairs of encodings and to fish out
  83. // all compatible pairs.
  84. //
  85. bool IsEncEncCompatible(const Encoding from, const Encoding to);
  86. // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
  87. // encoding represent the same characters as they do in ISO_8859_1.
  88. // WARNING: This function does not currently return true for all encodings that
  89. // are supersets of Ascii 7-bit.
  90. bool IsSupersetOfAscii7Bit(Encoding e);
  91. // To be an 8-bit encoding means that there are at most 256 symbols.
  92. // Each byte determines a new character; there are no multi-byte sequences.
  93. // WARNING: This function does not currently return true for all encodings that
  94. // are 8-bit encodings.
  95. bool Is8BitEncoding(Encoding e);
  96. // IsCJKEncoding
  97. // -------------
  98. //
  99. // This function returns true if the encoding is either Chinese
  100. // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
  101. // considered a CJK encoding.
  102. bool IsCJKEncoding(Encoding e);
  103. // IsHebrewEncoding
  104. // ----------------
  105. //
  106. // This function returns true if the encoding is a Hebrew-specific
  107. // encoding (not UTF8, etc.).
  108. bool IsHebrewEncoding(Encoding e);
  109. // IsRightToLeftEncoding
  110. // ---------------------
  111. //
  112. // Returns true if the encoding is a right-to-left encoding.
  113. //
  114. // Note that the name of this function is somewhat misleading. There is nothing
  115. // "right to left" about these encodings. They merely contain code points for
  116. // characters in RTL languages such as Hebrew and Arabic. But this is also
  117. // true for UTF-8.
  118. //
  119. // TODO: Get rid of this function. The only special cases we
  120. // should need to worry about are visual encodings. Anything we
  121. // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
  122. bool IsRightToLeftEncoding(Encoding enc);
  123. // IsLogicalRightToLeftEncoding
  124. // ----------------------------
  125. //
  126. // Returns true if the encoding is a logical right-to-left encoding.
  127. // Logical right-to-left encodings are those that the browser renders
  128. // right-to-left and applies the BiDi algorithm to. Therefore the characters
  129. // appear in reading order in the file, and indexing, snippet generation etc.
  130. // should all just work with no special processing.
  131. //
  132. // TODO: Get rid of this function. The only special cases we
  133. // should need to worry about are visual encodings.
  134. bool IsLogicalRightToLeftEncoding(Encoding enc);
  135. // IsVisualRightToLeftEncoding
  136. // ---------------------------
  137. //
  138. // Returns true if the encoding is a visual right-to-left encoding.
  139. // Visual right-to-left encodings are those that the browser renders
  140. // left-to-right and does not apply the BiDi algorithm to. Therefore each
  141. // line appears in reverse order in the file, lines are manually wrapped
  142. // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
  143. // the prehistoric days when browsers couldn't render right-to-left, but
  144. // unfortunately some visual pages persist to this day. These documents require
  145. // special processing so that we don't index or snippet them with each line
  146. // reversed.
  147. bool IsVisualRightToLeftEncoding(Encoding enc);
  148. // IsIso2022Encoding
  149. // -----------------
  150. //
  151. // Returns true if the encoding is a kind of ISO 2022 such as
  152. // ISO-2022-JP.
  153. bool IsIso2022Encoding(Encoding enc);
  154. // IsIso2022JpOrVariant
  155. // --------------------
  156. //
  157. // Returns true if the encoding is ISO-2022-JP or a variant such as
  158. // KDDI's ISO-2022-JP.
  159. bool IsIso2022JpOrVariant(Encoding enc);
  160. // IsShiftJisOrVariant
  161. // -------------------
  162. //
  163. // Returns true if the encoding is Shift_JIS or a variant such as
  164. // KDDI's Shift_JIS.
  165. bool IsShiftJisOrVariant(Encoding enc);
  166. // IsJapaneseCellPhoneCarrierSpecificEncoding
  167. // ------------------------------------------
  168. //
  169. // Returns true if the encoding is a Japanese cell phone carrier-specific
  170. // encoding such as KDDI_SHIFT_JIS.
  171. bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
  172. // *************************************************************
  173. // ENCODING NAMES
  174. //
  175. // This interface defines a standard name for each valid encoding, and
  176. // a standard name for invalid encodings. (Some names use all upper
  177. // case, but others use mixed case.)
  178. //
  179. // EncodingName() [Encoding to name]
  180. // MimeEncodingName() [Encoding to name]
  181. // EncodingFromName() [name to Encoding]
  182. // EncodingNameAliasToEncoding() [name to Encoding]
  183. // default_encoding_name()
  184. // invalid_encoding_name()
  185. // *************************************************************
  186. // EncodingName
  187. // ------------
  188. //
  189. // Given the encoding, returns its standard name.
  190. // Return invalid_encoding_name() if the encoding is invalid.
  191. //
  192. const char* EncodingName(Encoding enc);
  193. //
  194. // MimeEncodingName
  195. // ----------------
  196. //
  197. // Return the "preferred MIME name" of an encoding.
  198. //
  199. // This name is suitable for using in HTTP headers, HTML tags,
  200. // and as the "charset" parameter of a MIME Content-Type.
  201. const char* MimeEncodingName(Encoding enc);
  202. // The maximum length of an encoding name
  203. const int kMaxEncodingNameSize = 50;
  204. // The standard name of the default encoding.
  205. const char* default_encoding_name();
  206. // The name used for an invalid encoding.
  207. const char* invalid_encoding_name();
  208. // EncodingFromName
  209. // ----------------
  210. //
  211. // If enc_name matches the standard name of an Encoding, using a
  212. // case-insensitive comparison, set *encoding to that Encoding and
  213. // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
  214. // return false.
  215. //
  216. // REQUIRES: encoding must not be NULL.
  217. //
  218. bool EncodingFromName(const char* enc_name, Encoding *encoding);
  219. //
  220. // EncodingNameAliasToEncoding
  221. // ---------------------------
  222. //
  223. // If enc_name matches the standard name or an alias of an Encoding,
  224. // using a case-insensitive comparison, return that
  225. // Encoding. Otherwise, return UNKNOWN_ENCODING.
  226. //
  227. // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
  228. // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
  229. // common variations with hyphens and underscores (e.g., "koi8-u" and
  230. // "koi8u" for RUSSIAN_KOI8_R).
  231. Encoding EncodingNameAliasToEncoding(const char *enc_name);
  232. // *************************************************************
  233. // Miscellany
  234. // *************************************************************
  235. // PreferredWebOutputEncoding
  236. // --------------------------
  237. //
  238. // Some multi-byte encodings use byte values that coincide with the
  239. // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
  240. // can misinterpret these, as indicated in an external XSS report from
  241. // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
  242. // also use UTF8 instead of encodings that we don't support in our
  243. // output, and we generally try to be conservative in what we send out.
  244. // Where the client asks for single- or double-byte encodings that are
  245. // not as common, we substitute a more common single- or double-byte
  246. // encoding, if there is one, thereby preserving the client's intent
  247. // to use less space than UTF-8. This also means that characters
  248. // outside the destination set will be converted to HTML NCRs (&#NNN;)
  249. // if requested.
  250. Encoding PreferredWebOutputEncoding(Encoding enc);
  251. #endif // UTIL_ENCODINGS_ENCODINGS_H_