}
}
text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
+ rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK) {
disp_tok.len = text_url->hostlen;
state = ignore_bad_tag;
}
else {
+ const guchar *attr_name_end = in;
+
if (*in == '=') {
state = parse_equal;
}
+ else if (*in == '"') {
+ /* No equals sign or other sane separator, but we have a quote character */
+ state = parse_start_dquote;
+ attr_name_end = in - 1;
+
+ while (attr_name_end > *savep) {
+ if (!g_ascii_isalnum (*attr_name_end)) {
+ attr_name_end --;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* One character forward to obtain length */
+ attr_name_end ++;
+ }
else if (g_ascii_isspace (*in)) {
state = spaces_before_eq;
}
else if (*in == '/') {
tag->flags |= FL_CLOSED;
}
+ else if (!g_ascii_isgraph (*in)) {
+ state = parse_value;
+ attr_name_end = in - 1;
+
+ while (attr_name_end > *savep) {
+ if (!g_ascii_isalnum (*attr_name_end)) {
+ attr_name_end --;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* One character forward to obtain length */
+ attr_name_end ++;
+ }
else {
return;
}
- if (!rspamd_html_parse_tag_component (pool, *savep, in, tag)) {
+ if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
/* Ignore unknown params */
*savep = NULL;
}
+ else if (state == parse_value) {
+ *savep = in + 1;
+ }
}
break;
tag->flags |= FL_CLOSED;
store = TRUE;
}
- else if (g_ascii_isspace (*in) || *in == '>') {
+ else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
store = TRUE;
state = spaces_after_param;
}
struct html_tag_component *comp)
{
struct rspamd_url *url;
+ guint saved_flags = 0;
gchar *decoded;
gint rc;
gsize decoded_len;
url = rspamd_mempool_alloc0 (pool, sizeof (*url));
- if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
- url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ enum rspamd_normalise_result norm_res;
+
+ norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
+
+ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
+ saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
- rc = rspamd_url_parse (url, decoded, dlen, pool);
+ if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
+ saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
+ }
+
+ rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
if (rc == URI_ERRNO_OK) {
+ url->flags |= saved_flags;
+
if (has_bad_chars) {
url->flags |= RSPAMD_URL_FLAG_OBSCURED;
}
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- pool);
+ pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
query_url->hostlen > 0) {
}
static gint
-rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict, guint *flags)
+rspamd_mailto_parse (struct http_parser_url *u,
+ const gchar *str, gsize len,
+ gchar const **end,
+ enum rspamd_url_parse_flags parse_flags, guint *flags)
{
const gchar *p = str, *c = str, *last = str + len;
gchar t;
*end = p;
}
- if (!strict) {
+ if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
return 0;
}
static gint
rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict, guint *flags)
+ gchar const **end,
+ enum rspamd_url_parse_flags parse_flags,
+ guint *flags)
{
const gchar *p = str, *c = str, *last = str + len, *slash = NULL,
*password_start = NULL, *user_start = NULL;
SET_U (u, UF_SCHEMA);
}
else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
- if (!strict && p > c) {
+ if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
/* We might have some domain, but no protocol */
st = parse_domain;
p = c;
}
else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
if (*p & 0x80) {
- *flags |= RSPAMD_URL_FLAG_IDN;
+ (*flags) |= RSPAMD_URL_FLAG_IDN;
guint i = 0;
U8_NEXT (p, i, last - p, uc);
if (!u_isalnum (uc)) {
/* Bad symbol */
- if (strict) {
- goto out;
+ if (IS_ZERO_WIDTH_SPACE (uc)) {
+ (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
}
else {
- goto set;
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
}
}
p ++;
}
else {
- if (strict) {
- goto out;
+ if (parse_flags & RSPAMD_URL_PARSE_HREF) {
+ /* In href mode we have to accept whatever we are given, however malformed */
+ p ++;
+ (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
}
else {
- goto set;
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
}
}
}
goto set;
}
else if (!g_ascii_isdigit (t)) {
- if (strict || !g_ascii_isspace (t)) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) ||
+ !g_ascii_isspace (t)) {
goto out;
}
else {
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
}
enum uri_errno
-rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
- rspamd_mempool_t *pool)
+rspamd_url_parse (struct rspamd_url *uri,
+ gchar *uristring, gsize len,
+ rspamd_mempool_t *pool,
+ enum rspamd_url_parse_flags parse_flags)
{
struct http_parser_url u;
gchar *p, *comp;
if (len > sizeof ("mailto:") - 1) {
/* For mailto: urls we also need to add slashes to make it a valid URL */
if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
- ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags,
+ &flags);
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags,
+ &flags);
}
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, &flags);
}
if (ret != 0) {
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+
if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
+
rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->datalen) {
rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
rspamd_url_shift (uri, unquoted_len, UF_PATH);
}
+
if (uri->querylen) {
unquoted_len = rspamd_url_decode (uri->query,
uri->query,
}
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
+
if (uri->fragmentlen) {
unquoted_len = rspamd_url_decode (uri->fragment,
uri->fragment,
uri->host, uri->hostlen,
rspamd_tld_trie_callback, uri, NULL);
- if (uri->tldlen == 0) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && uri->tldlen == 0) {
/* Ignore URL's without TLD if it is not a numeric URL */
if (!rspamd_url_is_ip (uri, pool)) {
return URI_ERRNO_TLD_MISSING;
}
if (uri->protocol == PROTOCOL_UNKNOWN) {
- return URI_ERRNO_INVALID_PROTOCOL;
+ if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+ return URI_ERRNO_INVALID_PROTOCOL;
+ }
+ else {
+ /* Hack: in href mode treat an unknown protocol as HTTP instead of rejecting the URL */
+ uri->protocol = PROTOCOL_HTTP;
+ }
}
return URI_ERRNO_OK;
len = MIN (len, match->newline_pos - pos);
}
- if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) {
+ if (rspamd_web_parse (NULL, pos, len, &last,
+ RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
return FALSE;
}
if (!match->prefix || match->prefix[0] == '\0') {
/* We have mailto:// at the beginning */
- if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) {
+ if (rspamd_mailto_parse (&u, pos, len, &last,
+ RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
return FALSE;
}
cb->fin = m.m_begin + m.m_len;
url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
g_strstrip (cb->url_str);
- rc = rspamd_url_parse (url, cb->url_str, strlen (cb->url_str), pool);
+ rc = rspamd_url_parse (url, cb->url_str,
+ strlen (cb->url_str), pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK && url->hostlen > 0) {
if (cb->prefix_added) {
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- task->task_pool);
+ task->task_pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
query_url->hostlen > 0) {
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- task->task_pool);
+ task->task_pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
url->hostlen > 0) {