diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-04-03 11:37:54 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-04-03 11:37:54 +0100 |
commit | 0a822b38c6c7177633c81d373946c7ba12639ec7 (patch) | |
tree | 7bcb7dc0c5aa329d76eb7acd50e0bd7745cc0a49 | |
parent | 92563dbf0220d91dc4be0a418fb62ccc220dd447 (diff) | |
download | rspamd-0a822b38c6c7177633c81d373946c7ba12639ec7.tar.gz rspamd-0a822b38c6c7177633c81d373946c7ba12639ec7.zip |
[Minor] Remove strange conversion sources from replxx
Issue: #3317
-rw-r--r-- | contrib/replxx/CMakeLists.txt | 1 | ||||
-rw-r--r-- | contrib/replxx/src/ConvertUTF.cpp | 271 | ||||
-rw-r--r-- | contrib/replxx/src/ConvertUTF.h | 139 | ||||
-rw-r--r-- | contrib/replxx/src/conversion.cxx | 61 | ||||
-rw-r--r-- | contrib/replxx/src/conversion.hxx | 9 | ||||
-rw-r--r-- | contrib/replxx/src/unicodestring.hxx | 1 |
6 files changed, 44 insertions, 438 deletions
diff --git a/contrib/replxx/CMakeLists.txt b/contrib/replxx/CMakeLists.txt index b391639f5..9225fd8f4 100644 --- a/contrib/replxx/CMakeLists.txt +++ b/contrib/replxx/CMakeLists.txt @@ -52,7 +52,6 @@ endif() set( REPLXX_SOURCES src/conversion.cxx - src/ConvertUTF.cpp src/escape.cxx src/history.cxx src/replxx_impl.cxx diff --git a/contrib/replxx/src/ConvertUTF.cpp b/contrib/replxx/src/ConvertUTF.cpp deleted file mode 100644 index 3609c6249..000000000 --- a/contrib/replxx/src/ConvertUTF.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* --------------------------------------------------------------------- - - Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. - June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. - July 2003: slight mods to back out aggressive FFFE detection. - Jan 2004: updated switches in from-UTF8 conversions. - Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - - See the header file "ConvertUTF.h" for complete documentation. - ------------------------------------------------------------------------- */ - -#include "ConvertUTF.h" -#ifdef CVTUTF_DEBUG -#include <stdio.h> -#endif - -namespace replxx { - -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_LOW_END (UTF32)0xDFFF - -/* --------------------------------------------------------------------- */ - -/* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -/* --------------------------------------------------------------------- */ - -/* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - -/* --------------------------------------------------------------------- */ - -/* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - -static bool isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: { if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; } /* fall through */ - case 3: { if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; } /* fall through */ - case 2: { - if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - } /* fall through */ - case 1: { if (*source >= 0x80 && *source < 0xC2) return false; } /* fall through */ - } - if (*source > 0xF4) return false; - return true; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: { *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; } /* fall through */ - case 3: { *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; } /* fall through */ - case 2: { *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; } /* fall through */ - case 1: { *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); } /* fall through */ - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: { ch += *source++; ch <<= 6; } /* fall through */ - case 4: { ch += *source++; ch <<= 6; } /* fall through */ - case 3: { ch += *source++; ch <<= 6; } /* fall through */ - case 2: { ch += *source++; ch <<= 6; } /* fall through */ - case 1: { ch += *source++; ch <<= 6; } /* fall through */ - case 0: { ch += *source++; } /* fall through */ - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -} - -/* --------------------------------------------------------------------- - - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. - - --------------------------------------------------------------------- */ diff --git a/contrib/replxx/src/ConvertUTF.h b/contrib/replxx/src/ConvertUTF.h deleted file mode 100644 index f91d55732..000000000 --- a/contrib/replxx/src/ConvertUTF.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* --------------------------------------------------------------------- - - Conversions between UTF32, UTF-16, and UTF-8. Header file. - - Several funtions are included here, forming a complete set of - conversions between the three formats. UTF-7 is not included - here, but is handled in a separate source file. - - Each of these routines takes pointers to input buffers and output - buffers. The input buffers are const. - - Each routine converts the text between *sourceStart and sourceEnd, - putting the result into the buffer between *targetStart and - targetEnd. Note: the end pointers are *after* the last item: e.g. - *(sourceEnd - 1) is the last item. - - The return result indicates whether the conversion was successful, - and if not, whether the problem was in the source or target buffers. - (Only the first encountered problem is indicated.) - - After the conversion, *sourceStart and *targetStart are both - updated to point to the end of last text successfully converted in - the respective buffers. - - Input parameters: - sourceStart - pointer to a pointer to the source buffer. - The contents of this are modified on return so that - it points at the next thing to be converted. - targetStart - similarly, pointer to pointer to the target buffer. - sourceEnd, targetEnd - respectively pointers to the ends of the - two buffers, for overflow checking only. - - These conversion functions take a ConversionFlags argument. When this - flag is set to strict, both irregular sequences and isolated surrogates - will cause an error. When the flag is set to lenient, both irregular - sequences and isolated surrogates are converted. - - Whether the flag is strict or lenient, all illegal sequences will cause - an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, - or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code - must check for illegal sequences. - - When the flag is set to lenient, characters over 0x10FFFF are converted - to the replacement character; otherwise (when the flag is set to strict) - they constitute an error. - - Output parameters: - The value "sourceIllegal" is returned from some routines if the input - sequence is malformed. When "sourceIllegal" is returned, the source - value will point to the illegal value that caused the problem. E.g., - in UTF-8 when a sequence is malformed, it points to the start of the - malformed sequence. - - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Fixes & updates, Sept 2001. - ------------------------------------------------------------------------- */ - -/* --------------------------------------------------------------------- - The following 4 definitions are compiler-specific. - The C standard does not guarantee that wchar_t has at least - 16 bits, so wchar_t is no less portable than unsigned short! - All should be unsigned values to avoid sign extension during - bit mask & shift operations. ------------------------------------------------------------------------- */ - -#ifndef REPLXX_CONVERT_UTF8_H_INCLUDED -#define REPLXX_CONVERT_UTF8_H_INCLUDED 1 - -#if 0 -typedef unsigned long UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -#endif - -#include <stdint.h> -#include <string> - -namespace replxx { - -typedef uint32_t UTF32; -typedef uint16_t UTF16; -typedef uint8_t UTF8; - -/* Some fundamental constants */ -#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD -#define UNI_MAX_BMP (UTF32)0x0000FFFF -#define UNI_MAX_UTF16 (UTF32)0x0010FFFF -#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF -#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF - -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -} - -#endif /* REPLXX_CONVERT_UTF8_H_INCLUDED */ - -/* --------------------------------------------------------------------- */ diff --git a/contrib/replxx/src/conversion.cxx b/contrib/replxx/src/conversion.cxx index 0b4c5fa48..b7fa3da4d 100644 --- a/contrib/replxx/src/conversion.cxx +++ b/contrib/replxx/src/conversion.cxx @@ -2,7 +2,10 @@ #include <string> #include <cstring> #include <cctype> -#include <locale.h> +#include <clocale> + +#include "unicode/utf8.h" + #include "conversion.hxx" @@ -44,20 +47,26 @@ bool is8BitEncoding( is_8bit_encoding() ); ConversionResult copyString8to32(char32_t* dst, int dstSize, int& dstCount, const char* src) { ConversionResult res = ConversionResult::conversionOK; if ( ! locale::is8BitEncoding ) { - const UTF8* sourceStart = reinterpret_cast<const UTF8*>(src); - const UTF8* sourceEnd = sourceStart + strlen(src); - UTF32* targetStart = reinterpret_cast<UTF32*>(dst); - UTF32* targetEnd = targetStart + dstSize; - - res = ConvertUTF8toUTF32( - &sourceStart, sourceEnd, &targetStart, targetEnd, lenientConversion); + auto sourceStart = reinterpret_cast<const unsigned char*>(src); + auto slen = strlen(src); + auto targetStart = reinterpret_cast<UChar32*>(dst); + int i = 0, j = 0; + + while (i < slen && j < dstSize) { + UChar32 uc; + U8_NEXT (sourceStart, i, slen, uc); + + if (uc <= 0) { + /* Replace with 0xFFFD */ + uc = 0x0000FFFD; + } + targetStart[j++] = uc; + } - if (res == conversionOK) { - dstCount = targetStart - reinterpret_cast<UTF32*>(dst); + dstCount = j; - if (dstCount < dstSize) { - *targetStart = 0; - } + if (j < dstSize) { + targetStart[j] = 0; } } else { for ( dstCount = 0; ( dstCount < dstSize ) && src[dstCount]; ++ dstCount ) { @@ -77,22 +86,24 @@ void copyString32to8( char* dst, int dstSize, const char32_t* src, int srcSize, int* dstCount ) { if ( ! locale::is8BitEncoding ) { - const UTF32* sourceStart = reinterpret_cast<const UTF32*>(src); - const UTF32* sourceEnd = sourceStart + srcSize; - UTF8* targetStart = reinterpret_cast<UTF8*>(dst); - UTF8* targetEnd = targetStart + dstSize; + int j = 0; + UBool is_error = 0; - ConversionResult res = ConvertUTF32toUTF8( - &sourceStart, sourceEnd, &targetStart, targetEnd, lenientConversion); + for (auto i = 0; i < srcSize; i ++) { + U8_APPEND (dst, j, dstSize, src[i], is_error); - if (res == conversionOK) { - int resCount( targetStart - reinterpret_cast<UTF8*>( dst ) ); + if (is_error) { + break; + } + } - if ( resCount < dstSize ) { - *targetStart = 0; + if (!is_error) { + if (dstCount) { + *dstCount = j; } - if ( dstCount ) { - *dstCount = resCount; + + if (j < dstSize) { + dst[j] = '\0'; } } } else { diff --git a/contrib/replxx/src/conversion.hxx b/contrib/replxx/src/conversion.hxx index 45d251a6c..1cb2d450d 100644 --- a/contrib/replxx/src/conversion.hxx +++ b/contrib/replxx/src/conversion.hxx @@ -1,12 +1,17 @@ #ifndef REPLXX_CONVERSION_HXX_INCLUDED #define REPLXX_CONVERSION_HXX_INCLUDED 1 -#include "ConvertUTF.h" - namespace replxx { typedef unsigned char char8_t; +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + ConversionResult copyString8to32( char32_t* dst, int dstSize, int& dstCount, char const* src ); ConversionResult copyString8to32( char32_t* dst, int dstSize, int& dstCount, char8_t const* src ); void copyString32to8( char* dst, int dstSize, char32_t const* src, int srcSize, int* dstCount = nullptr ); diff --git a/contrib/replxx/src/unicodestring.hxx b/contrib/replxx/src/unicodestring.hxx index 53a7372c6..1607ede66 100644 --- a/contrib/replxx/src/unicodestring.hxx +++ b/contrib/replxx/src/unicodestring.hxx @@ -2,6 +2,7 @@ #define REPLXX_UNICODESTRING_HXX_INCLUDED #include <vector> +#include <string> #include <cstring> #include "conversion.hxx" |