5 yıl önce · 8a4c5ca57a
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void)
 }


 gboolean
 enum rspamd_normalise_result
 rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 		guint *len)
 {
@@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
 	gint32 nsym, end;
 	UChar *src = NULL, *dest = NULL;
 	gboolean ret = FALSE;
 	enum rspamd_normalise_result ret = 0;
 	gboolean has_invisible = FALSE;

 	/* We first need to convert data to UChars :( */
 	src = g_malloc ((*len + 1) * sizeof (*src));
@@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	if (!U_SUCCESS (uc_err)) {
 		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
 				u_errorName (uc_err));
 		ret |= RSPAMD_UNICODE_NORM_ERROR;
 		goto out;
 	}

@@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	if (!U_SUCCESS (uc_err)) {
 		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
 				u_errorName (uc_err));
 		ret |= RSPAMD_UNICODE_NORM_ERROR;
 		goto out;
 	}

 	if (end == nsym) {
 		/* No normalisation needed */
 	for (gint32 i = 0; i < nsym; i ++) {
 		if (IS_ZERO_WIDTH_SPACE (src[i])) {
 			has_invisible = TRUE;
 			break;
 		}
 	}

 	uc_err = U_ZERO_ERROR;

 	if (end != nsym) {
 		/* No normalisation needed, but we may still have invisible spaces */
 		/* We copy sub(src, 0, end) to dest and normalise the rest */
 		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
 		dest = g_malloc (nsym * sizeof (*dest));
 		memcpy (dest, src, end * sizeof (*dest));
 		nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
 				src + end, nsym - end, &uc_err);

 		if (!U_SUCCESS (uc_err)) {
 			if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
 				msg_warn_pool_check ("cannot normalise URL: %s",
 						u_errorName (uc_err));
 				ret |= RSPAMD_UNICODE_NORM_ERROR;
 			}

 			goto out;
 		}
 	}
 	else if (!has_invisible) {
 		goto out;
 	}
 	else {
 		dest = src;
 		src = NULL;
 	}

 	/* We copy sub(src, 0, end) to dest and normalise the rest */
 	ret = TRUE;
 	dest = g_malloc (nsym * sizeof (*dest));
 	memcpy (dest, src, end * sizeof (*dest));
 	nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
 			src + end, nsym - end, &uc_err);
 	if (has_invisible) {
 		/* Also filter zero width spaces */
 		gint32 new_len = 0;
 		UChar *t = dest, *h = dest;

 	if (!U_SUCCESS (uc_err)) {
 		if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
 			msg_warn_pool_check ("cannot normalise URL: %s",
 					u_errorName (uc_err));
 		ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;

 		for (gint32 i = 0; i < nsym; i ++) {
 			if (!IS_ZERO_WIDTH_SPACE (*h)) {
 				*t++ = *h++;
 				new_len ++;
 			}
 			else {
 				h ++;
 			}
 		}

 		goto out;
 		nsym = new_len;
 	}

 	/* We now convert it back to utf */
 	nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);

 	if (!U_SUCCESS (uc_err)) {
 		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
 				u_errorName (uc_err));
 		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
 					   " input length: %d chars, unicode length: %d utf16 symbols",
 				u_errorName (uc_err), (gint)*len, (gint)nsym);

 		if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
 			ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
 		}
 		else {
 			ret |= RSPAMD_UNICODE_NORM_ERROR;
 		}

 		goto out;
 	}

--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void);
 struct UNormalizer2;
 const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);

 enum rspamd_normalise_result {
 	RSPAMD_UNICODE_NORM_NORMAL = 0,
 	RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
 	RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
 	RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
 	RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
 };

 /**
 * Gets a string in UTF8 and normalises it to NFKC_Casefold form
 * @param pool optional memory pool used for logging purposes
@@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
 * @param len
 * @return TRUE if a string has been normalised
 */
 gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
 enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
 		gchar *start, guint *len);

 enum rspamd_regexp_escape_flags {