Browse Source

Rework symbol classes in the URL parser.

tags/1.0.0
Vsevolod Stakhov 8 years ago
parent
commit
c7b0eed616
1 changed file with 160 additions and 40 deletions
  1. 160
    40
      src/libserver/url.c

+ 160
- 40
src/libserver/url.c View File

@@ -152,47 +152,167 @@ struct url_match_scanner {

struct url_match_scanner *url_scanner = NULL;

static guchar url_scanner_table[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24, 128, 160, 128, 128, 128, 128, 128, 160, 160, 128, 128, 160, 192,
160, 160,
68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 160, 160, 32, 128, 32, 128,
160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 160, 160, 160, 128, 192,
128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 128, 128, 128, 128, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
enum {
IS_LWSP = (1 << 0),
IS_DOMAIN = (1 << 1),
IS_URLSAFE = (1 << 2),
IS_MAILSAFE = (1 << 3),
IS_DOMAIN_END = (1 << 4)
};

enum {
IS_CTRL = (1 << 0),
IS_ALPHA = (1 << 1),
IS_DIGIT = (1 << 2),
IS_LWSP = (1 << 3),
IS_SPACE = (1 << 4),
IS_SPECIAL = (1 << 5),
IS_DOMAIN = (1 << 6),
IS_URLSAFE = (1 << 7)
/*
 * Character class lookup table, indexed by byte value (0..255).
 * Every entry is a bitwise OR of the IS_LWSP / IS_DOMAIN / IS_URLSAFE /
 * IS_MAILSAFE / IS_DOMAIN_END flags; 0 means the byte belongs to no class.
 * Printable ASCII entries carry a trailing comment naming the character.
 * Index with an unsigned char value only: the table has exactly 256 entries.
 */
static const unsigned int url_scanner_table[256] = {
/* 0x00-0x0f: only bytes 9-13 (tab .. carriage return) are whitespace */
0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0,
/* 0x10-0x1f carry no class; the last entry on this line is the space character */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /* space */,
IS_MAILSAFE /* ! */, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* " */,
IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */,
IS_MAILSAFE /* ( */, IS_MAILSAFE /* ) */, IS_MAILSAFE /* * */,
IS_MAILSAFE /* + */, IS_MAILSAFE /* , */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* - */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* . */, IS_DOMAIN_END|IS_MAILSAFE /* / */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 0 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 1 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 2 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 3 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 4 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 5 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 6 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 7 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 8 */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* 9 */, IS_DOMAIN_END|IS_MAILSAFE /* : */,
IS_MAILSAFE /* ; */, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* < */, 0 /* = */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* > */, IS_DOMAIN_END /* ? */, 0 /* @ */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* A */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* B */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* C */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* D */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* E */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* F */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* G */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* H */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* I */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* J */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* K */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* L */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* M */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* N */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* O */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* P */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Q */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* R */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* S */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* T */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* U */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* V */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* W */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* X */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Y */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* Z */, IS_MAILSAFE /* [ */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* \ */, IS_MAILSAFE /* ] */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ^ */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* _ */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ` */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* a */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* b */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* c */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* d */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* e */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* f */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* g */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* h */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* i */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* j */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* k */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* l */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* m */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* n */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* o */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* p */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* q */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* r */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* s */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* t */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* u */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* v */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* w */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* x */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* y */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* z */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* { */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* | */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* } */,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE /* ~ */,
/* 0x7f (DEL) has no class; every byte 0x80-0xff below gets all three class bits */
0 /* DEL */, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE, IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE,
IS_URLSAFE|IS_DOMAIN|IS_MAILSAFE
};

/*
 * Character-class predicates backed by url_scanner_table. Each one casts its
 * argument to guchar before indexing and evaluates to 1 or 0.
 */
/* True when the IS_CTRL bit is set for x */
#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
/* True when the IS_LWSP bit is set for x */
#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
/* True when x is none of: special, space, control */
#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL | IS_SPACE | \
IS_CTRL)) == 0)
/* True when x is neither a control character nor a space */
#define is_usersafe(x) ((url_scanner_table[(guchar)(x)] & (IS_CTRL | IS_SPACE)) == 0)
/* True when the IS_ALPHA bit is set for x */
#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
/* True when the IS_DIGIT bit is set for x */
#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
/*
 * Whitespace / mail-safe predicates backed by url_scanner_table.
 *
 * The argument is cast to guchar (not guint) before indexing: when x is a
 * signed char holding a byte >= 0x80, converting it straight to an unsigned
 * int produces a huge index and reads far outside the 256-entry table.
 * Narrowing to guchar always yields an index in 0..255.
 * Each macro evaluates to 1 when the corresponding flag is set, else 0.
 */
#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
#define is_mailsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_MAILSAFE)) != 0)
/* True when the IS_DOMAIN bit is set for x in url_scanner_table */
#define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
/*
 * NOTE(review): is_urlsafe appears twice below with different flag sets; this
 * looks like a removed and an added diff line shown together - confirm which
 * definition belongs to the final file.
 */
/* Variant that accepts any of IS_ALPHA, IS_DIGIT or IS_URLSAFE */
#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA | IS_DIGIT | \
IS_URLSAFE)) != 0)
/* Variant that tests the IS_URLSAFE bit only */
#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_URLSAFE)) != 0)

const gchar *
rspamd_url_strerror (enum uri_errno err)
@@ -425,7 +545,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
SET_U (u, UF_USERINFO);
st = parse_at;
}
else if (!is_usersafe (t)) {
else if (!is_mailsafe (t)) {
goto out;
}
p++;
@@ -449,7 +569,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
st = parse_query;
break;
case parse_query:
if (!is_atom (t)) {
if (!is_mailsafe (t)) {
goto out;
}
p++;
@@ -1509,7 +1629,7 @@ url_email_end (struct url_callback_data *cb,
}

c = pos - 1;
while (c > cb->begin && is_usersafe (*c)) {
while (c > cb->begin && is_mailsafe (*c)) {
c --;
}
/* Rewind to the first alphanumeric character */
@@ -1523,7 +1643,7 @@ url_email_end (struct url_callback_data *cb,
p ++;
}
/* Rewind it again to avoid bad emails to be detected */
while (p > pos && !g_ascii_isalnum (*p)) {
while (p > pos && p < cb->end && !g_ascii_isalnum (*p)) {
p --;
}


Loading…
Cancel
Save