rspamd_normalize_text_part (task, text_part);
if (!IS_PART_HTML (text_part)) {
- rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
+ rspamd_url_text_extract (task->task_pool, task, text_part,
+ RSPAMD_URL_FIND_ALL);
+ }
+ else {
+ rspamd_url_text_extract (task->task_pool, task, text_part,
+ RSPAMD_URL_FIND_STRICT);
}
if (text_part->exceptions) {
p = task->subject;
len = strlen (p);
rspamd_cryptobox_hash_update (&st, p, len);
- rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL,
+ rspamd_url_find_multiple (task->task_pool, p, len,
+ RSPAMD_URL_FIND_STRICT, NULL,
rspamd_url_task_subject_callback, task);
}
}
if (end > url_text + 4 &&
- rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
+ rspamd_url_find (pool, url_text, end - url_text, &url_str,
+ RSPAMD_URL_FIND_ALL,
&url_pos, NULL) &&
url_str != NULL) {
if (url_pos > 0) {
if (url->querylen > 0) {
- if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
+ if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
+ RSPAMD_URL_FIND_ALL,
NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (pool,
sizeof (struct rspamd_url));
{"sip:", "", url_web_start, url_web_end,
0, 0},
{"www.", "http://", url_web_start, url_web_end,
- 0, 0},
+ URL_FLAG_NOHTML, 0},
{"ftp.", "ftp://", url_web_start, url_web_end,
URL_FLAG_NOHTML, 0},
/* Likely emails */
gchar *url_str;
rspamd_mempool_t *pool;
gint len;
- gboolean is_html;
+ enum rspamd_url_find_type how;
gboolean prefix_added;
guint newline_idx;
GPtrArray *newlines;
static gint
rspamd_url_trie_callback (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
struct url_matcher *matcher;
url_match_t m;
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);
- if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+ if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Strict mode (HTML parts, subject): skip matchers flagged URL_FLAG_NOHTML */
return 0;
}
}
gboolean
-rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
- gchar **url_str, gboolean is_html, goffset *url_pos,
- gboolean *prefix_added)
+rspamd_url_find (rspamd_mempool_t *pool,
+ const gchar *begin, gsize len,
+ gchar **url_str,
+ enum rspamd_url_find_type how,
+ goffset *url_pos,
+ gboolean *prefix_added)
{
struct url_callback_data cb;
gint ret;
memset (&cb, 0, sizeof (cb));
cb.begin = begin;
cb.end = begin + len;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
static gint
rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context,
- gboolean multiple)
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context,
+ gboolean multiple)
{
struct rspamd_url *url;
struct url_matcher *matcher;
strnum);
pool = cb->pool;
- if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+ if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Strict mode (HTML parts, subject): skip matchers flagged URL_FLAG_NOHTML */
return 0;
}
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
- &url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) {
+ &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
void
rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- gboolean is_html)
+ struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_url_find_type how)
{
struct rspamd_url_mimepart_cbdata mcbd;
mcbd.part = part;
rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
- part->utf_stripped_content->len, is_html, part->newlines,
+ part->utf_stripped_content->len, how, part->newlines,
rspamd_url_text_part_callback, &mcbd);
}
void
-rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html, GPtrArray *nlines,
- url_insert_function func, gpointer ud)
+rspamd_url_find_multiple (rspamd_mempool_t *pool,
+ const gchar *in,
+ gsize inlen,
+ enum rspamd_url_find_type how,
+ GPtrArray *nlines,
+ url_insert_function func,
+ gpointer ud)
{
struct url_callback_data cb;
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
cb.funcd = ud;
}
void
-rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html,
- url_insert_function func, gpointer ud)
+rspamd_url_find_single (rspamd_mempool_t *pool,
+ const gchar *in,
+ gsize inlen,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud)
{
struct url_callback_data cb;
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
cb.funcd = ud;
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
- &url_str, FALSE, NULL, &prefix_added)) {
+ &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
PROTOCOL_UNKNOWN = 1u << 31,
};
+enum rspamd_url_parse_flags { /* bit flags; moved above the prototypes in this header */
+ RSPAMD_URL_PARSE_TEXT = 0, /* zero value: no flag bits set */
+ RSPAMD_URL_PARSE_HREF = (1u << 0), /* bit 0 */
+ RSPAMD_URL_PARSE_CHECK = (1 << 1), /* bit 1 */
+};
+
+enum rspamd_url_find_type { /* replaces the former gboolean is_html argument of the URL finders */
+ RSPAMD_URL_FIND_ALL = 0, /* no matcher is skipped on account of URL_FLAG_NOHTML */
+ RSPAMD_URL_FIND_STRICT, /* trie callbacks return early for matchers flagged URL_FLAG_NOHTML */
+};
+
/**
* Initialize url library
* @param cfg
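* @param how search mode: RSPAMD_URL_FIND_STRICT skips matchers flagged URL_FLAG_NOHTML (the former html heuristic), RSPAMD_URL_FIND_ALL tries every matcher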
*/
void rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- gboolean is_html);
-
-enum rspamd_url_parse_flags {
- RSPAMD_URL_PARSE_TEXT = 0,
- RSPAMD_URL_PARSE_HREF = (1u << 0),
- RSPAMD_URL_PARSE_CHECK = (1 << 1),
-};
+ struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_url_find_type how);
/*
* Parse a single url into an uri structure
* @param url_str storage for url string(or NULL)
* @return TRUE if url is found in specified text
*/
-gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
- gchar **url_str, gboolean is_html, goffset *url_pos,
- gboolean *prefix_added);
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
+ const gchar *begin, gsize len,
+ gchar **url_str,
+ enum rspamd_url_find_type how,
+ goffset *url_pos,
+ gboolean *prefix_added);
/*
* Return text representation of url parsing error
*/
* @param func
* @param ud
*/
-void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html, GPtrArray *nlines,
- url_insert_function func, gpointer ud);
+void rspamd_url_find_multiple (rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ GPtrArray *nlines,
+ url_insert_function func,
+ gpointer ud);
/**
* Search for a single url in text and call `func` for each url found
* @param pool
* @param func
* @param ud
*/
-void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html,
- url_insert_function func, gpointer ud);
+void rspamd_url_find_single (rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud);
/**
* Generic callback to insert URLs into rspamd_task
* @param ud
*/
void rspamd_url_task_subject_callback (struct rspamd_url *url,
- gsize start_offset,
- gsize end_offset, gpointer ud);
+ gsize start_offset,
+ gsize end_offset, gpointer ud);
/**
* Adds a tag for url
* @param pool
*/
void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
- const gchar *value,
- rspamd_mempool_t *pool);
+ const gchar *value,
+ rspamd_mempool_t *pool);
guint rspamd_url_hash (gconstpointer u);
guint rspamd_email_hash (gconstpointer u);
* @return
*/
const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
- rspamd_mempool_t *pool);
+ rspamd_mempool_t *pool);
/**
return luaL_error (L, "invalid arguments");
}
else {
- rspamd_url_find_single (pool, text, length, FALSE,
+ rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL,
lua_url_single_inserter, L);
if (lua_type (L, -1) != LUA_TUSERDATA) {
if (text != NULL) {
lua_newtable (L);
- rspamd_url_find_multiple (pool, text, length, FALSE, NULL,
+ rspamd_url_find_multiple (pool, text, length,
+ RSPAMD_URL_FIND_ALL, NULL,
lua_url_table_inserter, L);
}