source.dussan.org Git - rspamd.git/commitdiff
Rework symbols classes in url parser.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 8 Sep 2015 13:44:40 +0000 (14:44 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 8 Sep 2015 13:44:40 +0000 (14:44 +0100)
src/libserver/url.c

index fb3e961e680cf1366289646087e27485d276ebbd..23b052b50aad08bf1657a4809d20fb1451a3c99b 100644 (file)
@@ -152,47 +152,167 @@ struct url_match_scanner {
 
 struct url_match_scanner *url_scanner = NULL;
 
-static guchar url_scanner_table[256] = {
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               24, 128, 160, 128, 128, 128, 128, 128, 160, 160, 128, 128, 160, 192,
-               160, 160,
-               68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 160, 160, 32, 128, 32, 128,
-               160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
-               66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 160, 160, 160, 128, 192,
-               128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
-               66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 128, 128, 128, 128, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+enum { /* bit flags OR-ed together in the entries of url_scanner_table */
+       IS_LWSP = (1 << 0), /* set for bytes 9..13 and space; tested by is_lwsp() */
+       IS_DOMAIN = (1 << 1), /* tested by is_domain() */
+       IS_URLSAFE = (1 << 2), /* tested by is_urlsafe() */
+       IS_MAILSAFE = (1 << 3), /* tested by is_mailsafe() */
+       IS_DOMAIN_END = (1 << 4) /* set for '/', ':' and '?' in url_scanner_table */
 };
 
-enum {
-       IS_CTRL = (1 << 0),
-       IS_ALPHA = (1 << 1),
-       IS_DIGIT = (1 << 2),
-       IS_LWSP = (1 << 3),
-       IS_SPACE = (1 << 4),
-       IS_SPECIAL = (1 << 5),
-       IS_DOMAIN = (1 << 6),
-       IS_URLSAFE = (1 << 7)
+/*
+ * Character class lookup table indexed by byte value (0..255); each entry is
+ * a bitwise OR of the IS_* flags.
+ *
+ * Layout: bytes 0..31 carry no flags except 9..13 (IS_LWSP); byte 32 (space)
+ * is IS_LWSP; bytes 33..126 are listed one by one with the character in a
+ * trailing comment; byte 127 carries no flags; every byte in 128..255 is
+ * IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE.
+ *
+ * The index must be in 0..255, so callers have to convert a plain (possibly
+ * signed) char to an unsigned byte before indexing.
+ */
+static const unsigned int url_scanner_table[256] = {
+               0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0,
+               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /*   */,
+               IS_MAILSAFE /* ! */, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* " */,
+               IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */,
+               IS_MAILSAFE /* ( */, IS_MAILSAFE /* ) */, IS_MAILSAFE /* * */,
+               IS_MAILSAFE /* + */, IS_MAILSAFE /* , */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* - */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* . */, IS_DOMAIN_END|IS_MAILSAFE /* / */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 0 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 1 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 2 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 3 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 4 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 5 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 6 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 7 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 8 */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 9 */, IS_DOMAIN_END|IS_MAILSAFE /* : */,
+               IS_MAILSAFE /* ; */, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* < */, 0 /* = */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* > */, IS_DOMAIN_END /* ? */, 0 /* @ */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* A */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* B */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* C */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* D */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* E */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* F */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* G */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* H */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* I */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* J */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* K */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* L */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* M */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* N */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* O */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* P */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Q */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* R */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* S */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* T */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* U */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* V */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* W */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* X */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Y */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Z */, IS_MAILSAFE /* [ */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* \ */, IS_MAILSAFE /* ] */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ^ */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* _ */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ` */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* a */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* b */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* c */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* d */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* e */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* f */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* g */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* h */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* i */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* j */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* k */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* l */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* m */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* n */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* o */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* p */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* q */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* r */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* s */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* t */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* u */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* v */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* w */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* x */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* y */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* z */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* { */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* | */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* } */,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ~ */,
+               0, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
+               IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE
 };
 
-#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
-#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
-#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL | IS_SPACE | \
-    IS_CTRL)) == 0)
-#define is_usersafe(x) ((url_scanner_table[(guchar)(x)] & (IS_CTRL | IS_SPACE)) == 0)
-#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
-#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
+/*
+ * Character class predicates over url_scanner_table. The argument is cast to
+ * guchar so that a plain (possibly signed) char holding a byte >= 0x80 maps
+ * to index 128..255; a cast to guint would sign-extend such a value and read
+ * far outside the 256-entry table.
+ */
+#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
+#define is_mailsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_MAILSAFE)) != 0)
 #define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
-#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA | IS_DIGIT | \
-    IS_URLSAFE)) != 0)
+#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_URLSAFE)) != 0)
 
 const gchar *
 rspamd_url_strerror (enum uri_errno err)
@@ -425,7 +545,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        SET_U (u, UF_USERINFO);
                                        st = parse_at;
                                }
-                               else if (!is_usersafe (t)) {
+                               else if (!is_mailsafe (t)) {
                                        goto out;
                                }
                                p++;
@@ -449,7 +569,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                st = parse_query;
                                break;
                        case parse_query:
-                               if (!is_atom (t)) {
+                               if (!is_mailsafe (t)) {
                                        goto out;
                                }
                                p++;
@@ -1509,7 +1629,7 @@ url_email_end (struct url_callback_data *cb,
                }
 
                c = pos - 1;
-               while (c > cb->begin && is_usersafe (*c)) {
+               while (c > cb->begin && is_mailsafe (*c)) {
                        c --;
                }
                /* Rewind to the first alphanumeric character */
@@ -1523,7 +1643,7 @@ url_email_end (struct url_callback_data *cb,
                        p ++;
                }
                /* Rewind it again to avoid bad emails to be detected */
-               while (p > pos && !g_ascii_isalnum (*p)) {
+               while (p > pos && p < cb->end && !g_ascii_isalnum (*p)) {
                        p --;
                }