You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

fastutf8.c 5.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. /*
  2. * MIT License
  3. *
  4. * Copyright (c) 2019 Yibo Cai
  5. * Copyright (c) 2019 Vsevolod Stakhov
  6. * Permission is hereby granted, free of charge, to any person obtaining a copy
  7. * of this software and associated documentation files (the "Software"), to deal
  8. * in the Software without restriction, including without limitation the rights
  9. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. * copies of the Software, and to permit persons to whom the Software is
  11. * furnished to do so, subject to the following conditions:
  12. *
  13. * The above copyright notice and this permission notice shall be included in all
  14. * copies or substantial portions of the Software.
  15. *
  16. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22. * SOFTWARE.
  23. */
  24. #include "fastutf8.h"
  25. #include "libcryptobox/platform_config.h"
  26. /*
  27. * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
  28. *
  29. * Table 3-7. Well-Formed UTF-8 Byte Sequences
  30. *
  31. * +--------------------+------------+-------------+------------+-------------+
  32. * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
  33. * +--------------------+------------+-------------+------------+-------------+
  34. * | U+0000..U+007F | 00..7F | | | |
  35. * +--------------------+------------+-------------+------------+-------------+
  36. * | U+0080..U+07FF | C2..DF | 80..BF | | |
  37. * +--------------------+------------+-------------+------------+-------------+
  38. * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
  39. * +--------------------+------------+-------------+------------+-------------+
  40. * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
  41. * +--------------------+------------+-------------+------------+-------------+
  42. * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
  43. * +--------------------+------------+-------------+------------+-------------+
  44. * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
  45. * +--------------------+------------+-------------+------------+-------------+
  46. * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
  47. * +--------------------+------------+-------------+------------+-------------+
  48. * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
  49. * +--------------------+------------+-------------+------------+-------------+
  50. * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
  51. * +--------------------+------------+-------------+------------+-------------+
  52. */
  /* True when c has the 10xxxxxx bit pattern, i.e. lies in 80..BF */
  static int
  utf8_is_continuation (unsigned char c)
  {
      return (c & 0xC0) == 0x80;
  }

  /*
   * Measure the well-formed UTF-8 sequence that starts at p[0].
   *
   * p     - first byte of the candidate sequence
   * avail - number of readable bytes at p (must be at least 1)
   *
   * Returns the sequence length (1..4), or 0 when the bytes at p do not start
   * a well-formed sequence or the sequence is cut short by avail.
   */
  static unsigned
  utf8_sequence_length (const unsigned char *p, size_t avail)
  {
      const unsigned char lead = p[0];
      unsigned char second;

      /* 00..7F */
      if (lead < 0x80) {
          return 1;
      }

      /* Every multi-byte form needs a second byte in 80..BF */
      if (avail < 2) {
          return 0;
      }

      second = p[1];

      if (!utf8_is_continuation (second)) {
          return 0;
      }

      /* C2..DF, 80..BF */
      if (lead >= 0xC2 && lead <= 0xDF) {
          return 2;
      }

      /* Three and four byte forms both need a third byte in 80..BF */
      if (avail < 3 || !utf8_is_continuation (p[2])) {
          return 0;
      }

      if (lead >= 0xE0 && lead <= 0xEF) {
          /* E0, A0..BF, 80..BF */
          if (lead == 0xE0) {
              return second >= 0xA0 ? 3 : 0;
          }

          /* ED, 80..9F, 80..BF */
          if (lead == 0xED) {
              return second <= 0x9F ? 3 : 0;
          }

          /* E1..EC and EE..EF, 80..BF, 80..BF */
          return 3;
      }

      /* Only the four byte forms remain; they need a fourth byte in 80..BF */
      if (avail < 4 || !utf8_is_continuation (p[3])) {
          return 0;
      }

      /* F0, 90..BF, 80..BF, 80..BF */
      if (lead == 0xF0) {
          return second >= 0x90 ? 4 : 0;
      }

      /* F1..F3, 80..BF, 80..BF, 80..BF */
      if (lead >= 0xF1 && lead <= 0xF3) {
          return 4;
      }

      /* F4, 80..8F, 80..BF, 80..BF */
      if (lead == 0xF4) {
          return second <= 0x8F ? 4 : 0;
      }

      /* 80..C1 and F5..FF never start a well-formed sequence */
      return 0;
  }

  /*
   * Portable UTF-8 validator.
   *
   * data - bytes to check
   * len  - number of bytes at data
   *
   * Returns 0 when the whole buffer is well-formed; otherwise the 1-based
   * index of the first byte of the first sequence that is not well-formed.
   */
  off_t
  rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len)
  {
      size_t consumed = 0;

      while (consumed < len) {
          const unsigned step = utf8_sequence_length (data + consumed,
                  len - consumed);

          if (step == 0) {
              /* Error positions are reported 1-based */
              return (off_t) (consumed + 1);
          }

          consumed += step;
      }

      return 0;
  }
  115. /* Prototypes */
  116. #if defined(HAVE_SSE41) && defined(__x86_64__)
  117. extern off_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
  118. #endif
  119. #if defined(HAVE_AVX2) && defined(__x86_64__)
  120. extern off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
  121. #endif
  122. static off_t (*validate_func) (const unsigned char *data, size_t len) =
  123. rspamd_fast_utf8_validate_ref;
  /*
   * Choose the validator stored in validate_func according to flags.
   *
   * flags - bit set of RSPAMD_FAST_UTF8_FLAG_* values
   *
   * The AVX2 check runs after the SSE4.1 check, so AVX2 wins when both bits
   * are set and both variants are compiled in. When no compiled-in variant is
   * requested, validate_func is left unchanged.
   */
  void
  rspamd_fast_utf8_library_init (unsigned flags)
  {
  #if defined(__x86_64__)
  #  if defined(HAVE_SSE41)
      if ((flags & RSPAMD_FAST_UTF8_FLAG_SSE41) != 0) {
          validate_func = rspamd_fast_utf8_validate_sse41;
      }
  #  endif
  #  if defined(HAVE_AVX2)
      if ((flags & RSPAMD_FAST_UTF8_FLAG_AVX2) != 0) {
          validate_func = rspamd_fast_utf8_validate_avx2;
      }
  #  endif
  #endif
  }
  /*
   * Validate a buffer as UTF-8.
   *
   * data - bytes to check
   * len  - number of bytes at data
   *
   * Returns whatever the chosen validator returns; for the reference
   * validator that is 0 on success or the 1-based index of the first
   * offending byte.
   */
  off_t
  rspamd_fast_utf8_validate (const unsigned char *data, size_t len)
  {
      /* Inputs shorter than this always go through the reference validator.
       * NOTE(review): presumably the accelerated variants only pay off on
       * longer inputs - confirm. */
      enum { min_len_for_selected_validator = 64 };

      if (len < min_len_for_selected_validator) {
          return rspamd_fast_utf8_validate_ref (data, len);
      }

      return validate_func (data, len);
  }