aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/url.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/url.c')
-rw-r--r--src/libserver/url.c1620
1 files changed, 1620 insertions, 0 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c
new file mode 100644
index 000000000..c4313e8a9
--- /dev/null
+++ b/src/libserver/url.c
@@ -0,0 +1,1620 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "url.h"
+#include "util.h"
+#include "fstring.h"
+#include "main.h"
+#include "message.h"
+#include "trie.h"
+
+#define POST_CHAR 1
+#define POST_CHAR_S "\001"
+
+/* Tcp port range */
+#define LOWEST_PORT 0
+#define HIGHEST_PORT 65535
+
+#define uri_port_is_valid(port) \
+ (LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT)
+
+struct _proto {
+ guchar *name;
+ gint port;
+ uintptr_t *unused;
+ guint need_slashes:1;
+ guint need_slash_after_host:1;
+ guint free_syntax:1;
+ guint need_ssl:1;
+};
+
+typedef struct url_match_s {
+ const gchar *m_begin;
+ gsize m_len;
+ const gchar *pattern;
+ const gchar *prefix;
+ gboolean add_prefix;
+} url_match_t;
+
+#define URL_FLAG_NOHTML 0x1
+#define URL_FLAG_STRICT_MATCH 0x2
+
+struct url_matcher {
+ const gchar *pattern;
+ const gchar *prefix;
+ gboolean (*start)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+ gboolean (*end)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+ gint flags;
+};
+
+static gboolean url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+static gboolean url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+static gboolean url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_tld_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+static gboolean url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+struct url_matcher matchers[] = {
+ /* Common prefixes */
+ { "file://", "", url_file_start, url_file_end, 0 },
+ { "ftp://", "", url_web_start, url_web_end, 0 },
+ { "sftp://", "", url_web_start, url_web_end, 0 },
+ { "http://", "", url_web_start, url_web_end, 0 },
+ { "https://", "", url_web_start, url_web_end, 0 },
+ { "news://", "", url_web_start, url_web_end, 0 },
+ { "nntp://", "", url_web_start, url_web_end, 0 },
+ { "telnet://", "", url_web_start, url_web_end, 0 },
+ { "webcal://", "", url_web_start, url_web_end, 0 },
+ { "mailto://", "", url_email_start, url_email_end, 0 },
+ { "callto://", "", url_web_start, url_web_end, 0 },
+ { "h323:", "", url_web_start, url_web_end, 0 },
+ { "sip:", "", url_web_start, url_web_end, 0 },
+ { "www.", "http://", url_web_start, url_web_end, 0 },
+ { "ftp.", "ftp://", url_web_start, url_web_end, URL_FLAG_NOHTML },
+ /* TLD domains parts */
+ { ".ac", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ad", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ae", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".aero", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".af", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ag", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ai", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".al", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".am", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".an", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ao", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".aq", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ar", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".arpa", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".as", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".asia", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".at", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".au", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".aw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ax", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".az", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ba", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bb", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bd", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".be", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bh", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bi", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".biz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bj", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bo", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".br", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bs", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".by", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".bz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ca", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cat", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cd", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ch", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ci", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ck", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".co", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".com", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".coop", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cx", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cy", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".cz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".de", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".dj", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".dk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".dm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".do", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".dz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ec", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".edu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ee", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".eg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".er", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".es", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".et", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".eu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fi", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fj", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fo", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".fr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ga", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gb", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gd", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ge", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gh", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gi", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gov", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gp", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gq", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gs", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".gy", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".hk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".hm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".hn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".hr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ht", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".hu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".id", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ie", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".il", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".im", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".in", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".info", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".int", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".io", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".iq", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ir", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".is", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".it", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".je", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".jm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".jo", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".jobs", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".jp", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ke", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kh", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ki", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".km", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kp", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ky", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".kz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".la", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lb", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".li", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ls", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".lv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ly", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ma", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".md", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".me", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mh", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mil", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ml", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mo", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mobi", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mp", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mq", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ms", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".museum", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mx", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".my", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".mz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".na", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".name", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ne", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".net", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ng", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ni", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".no", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".np", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".nz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".om", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".org", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pa", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pe", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ph", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pro", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ps", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".pw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".py", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".qa", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".re", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ro", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".rs", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ru", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".rw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sa", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sb", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sd", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".se", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sh", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".si", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sj", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".so", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".st", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".su", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sx", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sy", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".sz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".td", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tel", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".th", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tj", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tl", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".to", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tp", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tr", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".travel", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tv", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".tz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ua", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ug", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".uk", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".us", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".uy", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".uz", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".va", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".vc", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ve", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".vg", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".vi", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".vn", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".vu", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".wf", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ws", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".xxx", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".ye", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".yt", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".za", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".zm", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ { ".zw", "http://", url_tld_start, url_tld_end, URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
+ /* Likely emails */
+ { "@", "mailto://",url_email_start, url_email_end, URL_FLAG_NOHTML }
+};
+
+struct url_match_scanner {
+ struct url_matcher *matchers;
+ gsize matchers_count;
+ rspamd_trie_t *patterns;
+};
+
+struct url_match_scanner *url_scanner = NULL;
+
+static const struct _proto protocol_backends[] = {
+ {"file", 0, NULL, 1, 0, 0, 0},
+ {"ftp", 21, NULL, 1, 0, 0, 0},
+ {"http", 80, NULL, 1, 0, 0, 0},
+ {"https", 443, NULL, 1, 0, 0, 1},
+ {"mailto", 25, NULL, 1, 0, 0, 0},
+ /* Keep these last! */
+ {NULL, 0, NULL, 0, 0, 1, 0}
+};
+
+/* Convert an ASCII hex digit to the corresponding number between 0
+ and 15. H should be a hexadecimal digit that satisfies isxdigit;
+ otherwise, the result is undefined. */
+#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + 10)
+#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2))
+/* The reverse of the above: convert a number in the [0, 16) range to
+ the ASCII representation of the corresponding hexadecimal digit.
+ `+ 0' is there so you can't accidentally use it as an lvalue. */
+#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
+#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
+
+static guchar url_scanner_table[256] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
+ 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,192,
+ 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+enum {
+ IS_CTRL = (1 << 0),
+ IS_ALPHA = (1 << 1),
+ IS_DIGIT = (1 << 2),
+ IS_LWSP = (1 << 3),
+ IS_SPACE = (1 << 4),
+ IS_SPECIAL = (1 << 5),
+ IS_DOMAIN = (1 << 6),
+ IS_URLSAFE = (1 << 7)
+};
+
+#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
+#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
+#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
+#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
+#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
+#define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
+#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
+
+
+const gchar *
+url_strerror (enum uri_errno err)
+{
+ switch (err) {
+ case URI_ERRNO_OK:
+ return "Parsing went well";
+ case URI_ERRNO_EMPTY:
+ return "The URI string was empty";
+ case URI_ERRNO_INVALID_PROTOCOL:
+ return "No protocol was found";
+ case URI_ERRNO_NO_SLASHES:
+ return "Slashes after protocol missing";
+ case URI_ERRNO_TOO_MANY_SLASHES:
+ return "Too many slashes after protocol";
+ case URI_ERRNO_TRAILING_DOTS:
+ return "'.' after host";
+ case URI_ERRNO_NO_HOST:
+ return "Host part is missing";
+ case URI_ERRNO_NO_PORT_COLON:
+ return "':' after host without port";
+ case URI_ERRNO_NO_HOST_SLASH:
+ return "Slash after host missing";
+ case URI_ERRNO_IPV6_SECURITY:
+ return "IPv6 security bug detected";
+ case URI_ERRNO_INVALID_PORT:
+ return "Port number is bad";
+ case URI_ERRNO_INVALID_PORT_RANGE:
+ return "Port number is not within 0-65535";
+ }
+ return NULL;
+}
+
+static gint
+check_uri_file (gchar *name)
+{
+ static const gchar chars[] = POST_CHAR_S "#?";
+
+ return strcspn (name, chars);
+}
+
+static gint
+url_init (void)
+{
+ guint i;
+ gchar patbuf[128];
+
+ if (url_scanner == NULL) {
+ url_scanner = g_malloc (sizeof (struct url_match_scanner));
+ url_scanner->matchers = matchers;
+ url_scanner->matchers_count = G_N_ELEMENTS (matchers);
+ url_scanner->patterns = rspamd_trie_create (TRUE);
+ for (i = 0; i < url_scanner->matchers_count; i ++) {
+ if (matchers[i].flags & URL_FLAG_STRICT_MATCH) {
+ /* Insert more specific patterns */
+
+ /* some.tld/ */
+ rspamd_snprintf (patbuf, sizeof (patbuf), "%s/", matchers[i].pattern);
+ rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ /* some.tld */
+ rspamd_snprintf (patbuf, sizeof (patbuf), "%s ", matchers[i].pattern);
+ rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ /* some.tld: */
+ rspamd_snprintf (patbuf, sizeof (patbuf), "%s:", matchers[i].pattern);
+ rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ }
+ else {
+ rspamd_trie_insert (url_scanner->patterns, matchers[i].pattern, i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+enum protocol
+get_protocol (gchar *name, gint namelen)
+{
+ /* These are really enum protocol values but can take on negative
+ * values and since 0 <= -1 for enum values it's better to use clean
+ * integer type. */
+ gint start, end;
+ enum protocol protocol;
+ guchar *pname;
+ gint pnamelen, minlen, compare;
+
+ /* Almost dichotomic search is used here */
+ /* Starting at the HTTP entry which is the most common that will make
+ * file and NNTP the next entries checked and amongst the third checks
+ * are proxy and FTP. */
+ start = 0;
+ end = PROTOCOL_UNKNOWN - 1;
+ protocol = PROTOCOL_HTTP;
+
+ while (start <= end) {
+ pname = protocol_backends[protocol].name;
+ pnamelen = strlen (pname);
+ minlen = MIN (pnamelen, namelen);
+ compare = g_ascii_strncasecmp (pname, name, minlen);
+
+ if (compare == 0) {
+ if (pnamelen == namelen)
+ return protocol;
+
+ /* If the current protocol name is longer than the
+ * protocol name being searched for move @end else move
+ * @start. */
+ compare = pnamelen > namelen ? 1 : -1;
+ }
+
+ if (compare > 0)
+ end = protocol - 1;
+ else
+ start = protocol + 1;
+
+ protocol = (start + end) / 2;
+ }
+
+ return PROTOCOL_UNKNOWN;
+}
+
+
+gint
+get_protocol_port (enum protocol protocol)
+{
+ return protocol_backends[protocol].port;
+}
+
+gint
+get_protocol_need_slashes (enum protocol protocol)
+{
+ return protocol_backends[protocol].need_slashes;
+}
+
+gint
+get_protocol_need_slash_after_host (enum protocol protocol)
+{
+ return protocol_backends[protocol].need_slash_after_host;
+}
+
+gint
+get_protocol_free_syntax (enum protocol protocol)
+{
+ return protocol_backends[protocol].free_syntax;
+}
+
+static gint
+get_protocol_length (const gchar *url)
+{
+ gchar *end = (gchar *)url;
+
+ /* Seek the end of the protocol name if any. */
+ /* RFC1738:
+ * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
+ * (but per its recommendations we accept "upalpha" too) */
+ while (g_ascii_isalnum (*end) || *end == '+' || *end == '-' || *end == '.')
+ end++;
+
+ /* Also return 0 if there's no protocol name (@end == @url). */
+ return (*end == ':') ? end - url : 0;
+}
+
+
+/*
+ * Calcualte new length of unescaped hostlen
+ */
+static guint
+url_calculate_escaped_hostlen (gchar *host, guint hostlen)
+{
+ guint i, result = hostlen;
+ gchar *p = host, c;
+
+ for (i = 0; i < hostlen; i++, p++) {
+ if (*p == '%' && g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) {
+ c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
+ if (c != '\0') {
+ result -= 2;
+ }
+ }
+ }
+
+ return result;
+}
+
+/* URL-unescape the string S.
+
+ This is done by transforming the sequences "%HH" to the character
+ represented by the hexadecimal digits HH. If % is not followed by
+ two hexadecimal digits, it is inserted literally.
+
+ The transformation is done in place. If you need the original
+ string intact, make a copy before calling this function. */
+
+static void
+url_unescape (gchar *s)
+{
+ gchar *t = s; /* t - tortoise */
+ gchar *h = s; /* h - hare */
+
+ for (; *h; h++, t++) {
+ if (*h != '%') {
+ copychar:
+ *t = *h;
+ }
+ else {
+ gchar c;
+ /* Do nothing if '%' is not followed by two hex digits. */
+ if (!h[1] || !h[2] || !(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2])))
+ goto copychar;
+ c = X2DIGITS_TO_NUM (h[1], h[2]);
+ /* Don't unescape %00 because there is no way to insert it
+ * into a C string without effectively truncating it. */
+ if (c == '\0')
+ goto copychar;
+ *t = c;
+ h += 2;
+ }
+ }
+ *t = '\0';
+}
+
+static void
+url_strip (gchar *s)
+{
+ gchar *t = s; /* t - tortoise */
+ gchar *h = s; /* h - hare */
+
+ while (*h) {
+ if (g_ascii_isgraph (*h)) {
+ *t = *h;
+ t++;
+ }
+ h++;
+ }
+ *t = '\0';
+}
+
+static gchar *
+url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool)
+{
+ const gchar *p1;
+ gchar *p2, *newstr;
+ gint newlen;
+ gint addition = 0;
+
+ for (p1 = s; *p1; p1++)
+ if (!is_urlsafe (*p1)) {
+ addition += 2; /* Two more characters (hex digits) */
+ }
+
+ if (!addition) {
+ if (allow_passthrough) {
+ return (gchar *)s;
+ }
+ else {
+ return rspamd_mempool_strdup (pool, s);
+ }
+ }
+
+ newlen = (p1 - s) + addition;
+ newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1);
+
+ p1 = s;
+ p2 = newstr;
+ while (*p1) {
+ /* Quote the characters that match the test mask. */
+ if (!is_urlsafe (*p1)) {
+ guchar c = *p1++;
+ *p2++ = '%';
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
+ }
+ else
+ *p2++ = *p1++;
+ }
+ *p2 = '\0';
+
+ return newstr;
+}
+
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+ string, returning a freshly allocated string. */
+
+gchar *
+url_escape (const gchar *s, rspamd_mempool_t * pool)
+{
+ return url_escape_1 (s, 0, pool);
+}
+
+/* Decide whether the gchar at position P needs to be encoded. (It is
+ not enough to pass a single gchar *P because the function may need
+ to inspect the surrounding context.)
+
+ Return 1 if the gchar should be escaped as %XX, 0 otherwise. */
+
+static inline gboolean
+char_needs_escaping (const gchar *p)
+{
+ if (*p == '%') {
+ if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) {
+ return FALSE;
+ }
+ else {
+ return TRUE;
+ }
+ }
+ else if (! is_urlsafe (*p)) {
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/* Translate a %-escaped (but possibly non-conformant) input string S
+ into a %-escaped (and conformant) output string.
+*/
+
+static gchar *
+reencode_escapes (gchar *s, rspamd_mempool_t * pool)
+{
+ const gchar *p1;
+ gchar *newstr, *p2;
+ gint oldlen, newlen;
+
+ gint encode_count = 0;
+
+ /* First pass: inspect the string to see if there's anything to do,
+ and to calculate the new length. */
+ for (p1 = s; *p1; p1++)
+ if (char_needs_escaping (p1))
+ ++encode_count;
+
+ if (!encode_count) {
+ /* The string is good as it is. */
+ return s;
+ }
+
+ oldlen = p1 - s;
+ /* Each encoding adds two characters (hex digits). */
+ newlen = oldlen + 2 * encode_count;
+ newstr = rspamd_mempool_alloc (pool, newlen + 1);
+
+ /* Second pass: copy the string to the destination address, encoding
+ chars when needed. */
+ p1 = s;
+ p2 = newstr;
+
+ while (*p1)
+ if (char_needs_escaping (p1)) {
+ guchar c = *p1++;
+ *p2++ = '%';
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
+ }
+ else {
+ *p2++ = *p1++;
+ }
+
+ *p2 = '\0';
+ return newstr;
+}
+
+/* Unescape CHR in an otherwise escaped STR. Used to selectively
+ escaping of certain characters, such as "/" and ":". Returns a
+ count of unescaped chars. */
+
+static void
+unescape_single_char (gchar *str, gchar chr)
+{
+ const gchar c1 = XNUM_TO_DIGIT (chr >> 4);
+ const gchar c2 = XNUM_TO_DIGIT (chr & 0xf);
+ gchar *h = str; /* hare */
+ gchar *t = str; /* tortoise */
+
+ for (; *h; h++, t++) {
+ if (h[0] == '%' && h[1] == c1 && h[2] == c2) {
+ *t = chr;
+ h += 2;
+ }
+ else {
+ *t = *h;
+ }
+ }
+ *t = '\0';
+}
+
+
+/*
+ * Resolve "." and ".." elements of PATH by destructively modifying
+ * PATH and return non-zero if PATH has been modified, zero otherwise.
+ */
+
+static gboolean
+path_simplify (gchar *path)
+{
+ gchar *h = path; /* hare */
+ gchar *t = path; /* tortoise */
+ gchar *beg = path; /* boundary for backing the tortoise */
+ gchar *end = path + strlen (path);
+
+ while (h < end) {
+ /* Hare should be at the beginning of a path element. */
+ if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) {
+ /* Ignore "./". */
+ h += 2;
+ }
+ else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) {
+ /* Handle "../" by retreating the tortoise by one path
+ element -- but not past beginning. */
+ if (t > beg) {
+ /* Move backwards until T hits the beginning of the
+ previous path element or the beginning of path. */
+ for (--t; t > beg && t[-1] != '/'; t--);
+ }
+ else {
+ /* If we're at the beginning, copy the "../" literally
+ move the beginning so a later ".." doesn't remove
+ it. */
+ beg = t + 3;
+ goto regular;
+ }
+ h += 3;
+ }
+ else {
+ regular:
+ /* A regular path element. If H hasn't advanced past T,
+ simply skip to the next path element. Otherwise, copy
+ the path element until the next slash. */
+ if (t == h) {
+ /* Skip the path element, including the slash. */
+ while (h < end && *h != '/')
+ t++, h++;
+ if (h < end)
+ t++, h++;
+ }
+ else {
+ /* Copy the path element, including the final slash. */
+ while (h < end && *h != '/')
+ *t++ = *h++;
+ if (h < end)
+ *t++ = *h++;
+ }
+ }
+ }
+
+ if (t != h)
+ *t = '\0';
+
+ return t != h;
+}
+
+enum uri_errno
+parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool)
+{
+ guchar *prefix_end, *host_end, *p;
+ guchar *lbracket, *rbracket;
+ gint datalen, n, addrlen;
+ guchar *frag_or_post, *user_end, *port_end;
+
+ memset (uri, 0, sizeof (*uri));
+
+ /* Nothing to do for an empty url. */
+ if (!*uristring)
+ return URI_ERRNO_EMPTY;
+
+ uri->string = reencode_escapes (uristring, pool);
+ msg_debug ("reencoding escapes in original url: '%s'", struri (uri));
+ uri->protocollen = get_protocol_length (struri (uri));
+
+ /* Assume http as default protocol */
+ if (!uri->protocollen || (uri->protocol = get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
+ /* Make exception for numeric urls */
+ p = uri->string;
+ while (*p && (g_ascii_isalnum (*p) || *p == ':')) {
+ p ++;
+ }
+ if (*p == '\0') {
+ return URI_ERRNO_INVALID_PROTOCOL;
+ }
+ p = g_strconcat ("http://", uri->string, NULL);
+ uri->string = rspamd_mempool_strdup (pool, p);
+ g_free (p);
+ uri->protocol = PROTOCOL_HTTP;
+ prefix_end = struri (uri) + 7;
+ }
+ else {
+ /* Figure out whether the protocol is known */
+ msg_debug ("getting protocol from url: %d", uri->protocol);
+
+ prefix_end = struri (uri) + uri->protocollen; /* ':' */
+
+ /* Check if there's a digit after the protocol name. */
+ if (g_ascii_isdigit (*prefix_end)) {
+ p = struri (uri);
+ uri->ip_family = p[uri->protocollen] - '0';
+ prefix_end++;
+ }
+ if (*prefix_end != ':') {
+ msg_debug ("invalid protocol in uri");
+ return URI_ERRNO_INVALID_PROTOCOL;
+ }
+ prefix_end++;
+
+ /* Skip slashes */
+
+ if (prefix_end[0] == '/' && prefix_end[1] == '/') {
+ if (prefix_end[2] == '/') {
+ msg_debug ("too many '/' in uri");
+ return URI_ERRNO_TOO_MANY_SLASHES;
+ }
+
+ prefix_end += 2;
+
+ }
+ else {
+ msg_debug ("no '/' in uri");
+ return URI_ERRNO_NO_SLASHES;
+ }
+ }
+
+ if (get_protocol_free_syntax (uri->protocol)) {
+ uri->data = prefix_end;
+ uri->datalen = strlen (prefix_end);
+ return URI_ERRNO_OK;
+
+ }
+ else if (uri->protocol == PROTOCOL_FILE) {
+ datalen = check_uri_file (prefix_end);
+ frag_or_post = prefix_end + datalen;
+
+ /* Extract the fragment part. */
+ if (datalen >= 0) {
+ if (*frag_or_post == '#') {
+ uri->fragment = frag_or_post + 1;
+ uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
+ frag_or_post = uri->fragment + uri->fragmentlen;
+ }
+ if (*frag_or_post == POST_CHAR) {
+ uri->post = frag_or_post + 1;
+ }
+ }
+ else {
+ datalen = strlen (prefix_end);
+ }
+
+ uri->data = prefix_end;
+ uri->datalen = datalen;
+
+ return URI_ERRNO_OK;
+ }
+
+ /* Isolate host */
+
+ /* Get brackets enclosing IPv6 address */
+ lbracket = strchr (prefix_end, '[');
+ if (lbracket) {
+ rbracket = strchr (lbracket, ']');
+ /* [address] is handled only inside of hostname part (surprisingly). */
+ if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
+ uri->ipv6 = 1;
+ else
+ lbracket = rbracket = NULL;
+ }
+ else {
+ rbracket = NULL;
+ }
+
+ /* Possibly skip auth part */
+ host_end = prefix_end + strcspn (prefix_end, "@");
+
+ if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) { /* we have auth info here */
+
+ /* Allow '@' in the password component */
+ while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
+ host_end = host_end + 1 + strcspn (host_end + 1, "@");
+
+ user_end = strchr (prefix_end, ':');
+
+ if (!user_end || user_end > host_end) {
+ uri->user = prefix_end;
+ uri->userlen = host_end - prefix_end;
+ }
+ else {
+ uri->user = prefix_end;
+ uri->userlen = user_end - prefix_end;
+ uri->password = user_end + 1;
+ uri->passwordlen = host_end - user_end - 1;
+ }
+ prefix_end = host_end + 1;
+ }
+
+ if (uri->ipv6 && rbracket != NULL) {
+ host_end = rbracket + strcspn (rbracket, ":/?");
+ }
+ else {
+ host_end = prefix_end + strcspn (prefix_end, ":/?");
+ }
+
+ if (uri->ipv6) {
+ addrlen = rbracket - lbracket - 1;
+
+
+ uri->host = lbracket + 1;
+ uri->hostlen = addrlen;
+ }
+ else {
+ uri->host = prefix_end;
+ uri->hostlen = host_end - prefix_end;
+
+ /* Trim trailing '.'s */
+ if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
+ return URI_ERRNO_TRAILING_DOTS;
+ }
+
+ if (*host_end == ':') { /* we have port here */
+ port_end = host_end + 1 + strcspn (host_end + 1, "/");
+
+ host_end++;
+
+ uri->port = host_end;
+ uri->portlen = port_end - host_end;
+
+ if (uri->portlen == 0)
+ return URI_ERRNO_NO_PORT_COLON;
+
+ /* We only use 8 bits for portlen so better check */
+ if ((gint)uri->portlen != port_end - host_end)
+ return URI_ERRNO_INVALID_PORT;
+
+ /* test if port is number */
+ for (; host_end < port_end; host_end++)
+ if (!g_ascii_isdigit (*host_end))
+ return URI_ERRNO_INVALID_PORT;
+
+ /* Check valid port value, and let show an error message
+ * about invalid url syntax. */
+ if (uri->port && uri->portlen) {
+
+ errno = 0;
+ n = strtol (uri->port, NULL, 10);
+ if (errno || !uri_port_is_valid (n))
+ return URI_ERRNO_INVALID_PORT;
+ }
+ }
+
+ if (*host_end == '/') {
+ host_end++;
+
+ }
+ else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != '?') {
+ /* The need for slash after the host component depends on the
+ * need for a host component. -- The dangerous mind of Jonah */
+ if (!uri->hostlen)
+ return URI_ERRNO_NO_HOST;
+
+ return URI_ERRNO_NO_HOST_SLASH;
+ }
+
+ /* Look for #fragment or POST_CHAR */
+ prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
+ uri->data = host_end;
+ uri->datalen = prefix_end - host_end;
+
+ if (*prefix_end == '#') {
+ uri->fragment = prefix_end + 1;
+ uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
+ prefix_end = uri->fragment + uri->fragmentlen;
+ }
+
+ if (*prefix_end == POST_CHAR) {
+ uri->post = prefix_end + 1;
+ }
+
+ convert_to_lowercase (uri->string, uri->protocollen);
+ convert_to_lowercase (uri->host, uri->hostlen);
+ /* Decode %HH sequences in host name. This is important not so much
+ to support %HH sequences in host names (which other browser
+ don't), but to support binary characters (which will have been
+ converted to %HH by reencode_escapes). */
+ if (strchr (uri->host, '%')) {
+ uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen);
+ }
+
+ url_strip (struri (uri));
+ url_unescape (uri->host);
+
+ path_simplify (uri->data);
+
+ return URI_ERRNO_OK;
+}
+
+static const gchar url_braces[] = {
+ '(', ')' ,
+ '{', '}' ,
+ '[', ']' ,
+ '<', '>' ,
+ '|', '|' ,
+ '\'', '\''
+};
+
+static gboolean
+is_open_brace (gchar c)
+{
+ if (c == '(' ||
+ c == '{' ||
+ c == '[' ||
+ c == '<' ||
+ c == '|' ||
+ c == '\'') {
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+static gboolean
+url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ match->m_begin = pos;
+ return TRUE;
+}
+static gboolean
+url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p;
+ gchar stop;
+ guint i;
+
+ p = pos + strlen (match->pattern);
+ stop = *p;
+ if (*p == '/') {
+ p ++;
+ }
+
+ for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) {
+ if (*p == url_braces[i]) {
+ stop = url_braces[i + 1];
+ break;
+ }
+ }
+
+ while (p < end && *p != stop && is_urlsafe (*p)) {
+ p ++;
+ }
+
+ if (p == begin) {
+ return FALSE;
+ }
+ match->m_len = p - match->m_begin;
+
+ return TRUE;
+
+}
+
+static gboolean
+url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p = pos;
+
+ /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */
+ while (p >= begin) {
+ if ((!is_domain (*p) && *p != '.' && *p != '/') || g_ascii_isspace (*p)) {
+ p ++;
+ if (!g_ascii_isalnum (*p)) {
+ /* Urls cannot start with strange symbols */
+ return FALSE;
+ }
+ match->m_begin = p;
+ return TRUE;
+ }
+ else if (p == begin && p != pos) {
+ match->m_begin = p;
+ return TRUE;
+ }
+ else if (*p == '.') {
+ if (p == begin) {
+ /* Urls cannot start with a dot */
+ return FALSE;
+ }
+ if (!g_ascii_isalnum (p[1])) {
+ /* Wrong we have an invalid character after dot */
+ return FALSE;
+ }
+ }
+ else if (*p == '/') {
+ /* Urls cannot contain '/' in their body */
+ return FALSE;
+ }
+ p --;
+ }
+
+ return FALSE;
+}
+
+static gboolean
+url_tld_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p;
+
+ /* A url must be finished by tld, so it must be followed by space character */
+ p = pos + strlen (match->pattern);
+ if (p == end || g_ascii_isspace (*p) || *p == ',') {
+ match->m_len = p - match->m_begin;
+ return TRUE;
+ }
+ else if (*p == '/' || *p == ':') {
+ /* Parse arguments, ports by normal way by url default function */
+ p = match->m_begin;
+ /* Check common prefix */
+ if (g_ascii_strncasecmp (p, "http://", sizeof ("http://") - 1) == 0) {
+ return url_web_end (begin, end, match->m_begin + sizeof ("http://") - 1, match);
+ }
+ else {
+ return url_web_end (begin, end, match->m_begin, match);
+ }
+
+ }
+ return FALSE;
+}
+
+static gboolean
+url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ /* Check what we have found */
+ if (pos > begin && (g_ascii_strncasecmp (pos, "www", 3) == 0 || g_ascii_strncasecmp (pos, "ftp", 3) == 0)) {
+ if (!is_open_brace (*(pos - 1)) && !g_ascii_isspace (*(pos - 1))) {
+ return FALSE;
+ }
+ }
+ if (*pos == '.') {
+ /* Urls cannot start with . */
+ return FALSE;
+ }
+ match->m_begin = pos;
+
+ return TRUE;
+}
+
+static gboolean
+url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p, *c;
+ gchar open_brace = '\0', close_brace = '\0';
+ gint brace_stack = 0;
+ gboolean passwd = FALSE;
+ guint port, i;
+
+ p = pos + strlen (match->pattern);
+ for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) {
+ if (*p == url_braces[i]) {
+ close_brace = url_braces[i + 1];
+ open_brace = *p;
+ break;
+ }
+ }
+
+ /* find the end of the domain */
+ if (is_atom (*p)) {
+ /* might be a domain or user@domain */
+ c = p;
+ while (p < end) {
+ if (!is_atom (*p)) {
+ break;
+ }
+
+ p++;
+
+ while (p < end && is_atom (*p)) {
+ p++;
+ }
+
+ if ((p + 1) < end && *p == '.' && (is_atom (*(p + 1)) || *(p + 1) == '/')) {
+ p++;
+ }
+ }
+
+ if (*p != '@') {
+ p = c;
+ }
+ else {
+ p++;
+ }
+
+ goto domain;
+ }
+ else if (is_domain (*p) || (*p & 0x80)) {
+domain:
+ while (p < end) {
+ if (!is_domain (*p) && !(*p & 0x80)) {
+ break;
+ }
+
+ p++;
+
+ while (p < end && (is_domain (*p) || (*p & 0x80))) {
+ p++;
+ }
+
+ if ((p + 1) < end && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/' || (*(p + 1) & 0x80))) {
+ p++;
+ }
+ }
+ }
+ else {
+ return FALSE;
+ }
+
+ if (p < end) {
+ switch (*p) {
+ case ':': /* we either have a port or a password */
+ p++;
+
+ if (is_digit (*p) || passwd) {
+ port = (*p++ - '0');
+
+ while (p < end && is_digit (*p) && port < 65536) {
+ port = (port * 10) + (*p++ - '0');
+ }
+
+ if (!passwd && (port >= 65536 || *p == '@')) {
+ if (p < end && *p == '@') {
+ /* this must be a password? */
+ goto passwd;
+ }
+ else if (p < end) {
+ return FALSE;
+ }
+
+ p--;
+ }
+ }
+ else {
+ passwd:
+ passwd = TRUE;
+ c = p;
+
+ while (p < end && is_atom (*p)) {
+ p++;
+ }
+
+ if ((p + 2) < end) {
+ if (*p == '@') {
+ p++;
+ if (is_domain (*p)) {
+ goto domain;
+ }
+ }
+
+ return FALSE;
+ }
+ }
+
+ if (p >= end || *p != '/') {
+ break;
+ }
+
+ /* we have a '/' so there could be a path - fall through */
+ case '/': /* we've detected a path component to our url */
+ p++;
+ case '?':
+ while (p < end && is_urlsafe (*p)) {
+ if (*p == open_brace) {
+ brace_stack++;
+ }
+ else if (*p == close_brace) {
+ brace_stack--;
+ if (brace_stack == -1) {
+ break;
+ }
+ }
+ p++;
+ }
+
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* urls are extremely unlikely to end with any
+ * punctuation, so strip any trailing
+ * punctuation off. Also strip off any closing
+ * double-quotes. */
+ while (p > pos && strchr (",.:;?!-|}])\"", p[-1])) {
+ p--;
+ }
+
+ match->m_len = (p - pos);
+
+ return TRUE;
+}
+
+
+static gboolean
+url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p;
+ /* Check what we have found */
+ if (pos > begin && *pos == '@') {
+ /* Try to extract it with username */
+ p = pos - 1;
+ while (p > begin && (is_domain (*p) || *p == '.' || *p == '_')) {
+ p --;
+ }
+ if (!is_domain (*p) && p != pos - 1) {
+ match->m_begin = p + 1;
+ return TRUE;
+ }
+ else if (p == begin) {
+ match->m_begin = p;
+ return TRUE;
+ }
+ }
+ else {
+ p = pos + strlen (match->pattern);
+ if (is_domain (*p)) {
+ match->m_begin = pos;
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
+static gboolean
+url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+ const gchar *p;
+ gboolean got_at = FALSE;
+
+ p = pos + strlen (match->pattern);
+ if (*pos == '@') {
+ got_at = TRUE;
+ }
+
+ while (p < end && (is_domain (*p) || *p == '_'
+ || (*p == '@' && !got_at) ||
+ (*p == '.' && p + 1 < end && is_domain (*(p + 1))))) {
+ if (*p == '@') {
+ got_at = TRUE;
+ }
+ p ++;
+ }
+ match->m_len = p - match->m_begin;
+ match->add_prefix = TRUE;
+ return got_at;
+}
+
+void
+url_parse_text (rspamd_mempool_t * pool, struct rspamd_task *task, struct mime_text_part *part, gboolean is_html)
+{
+ gint rc;
+ gchar *url_str = NULL, *url_start, *url_end;
+ struct uri *new;
+ struct process_exception *ex;
+ gchar *p, *end, *begin;
+
+
+ if (!part->orig->data || part->orig->len == 0) {
+ msg_warn ("got empty text part");
+ return;
+ }
+
+ if (url_init () == 0) {
+ if (is_html) {
+ begin = part->orig->data;
+ end = begin + part->orig->len;
+ p = begin;
+ }
+ else {
+ begin = part->content->data;
+ end = begin + part->content->len;
+ p = begin;
+ }
+ while (p < end) {
+ if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str, is_html)) {
+ if (url_str != NULL) {
+ new = rspamd_mempool_alloc0 (pool, sizeof (struct uri));
+ ex = rspamd_mempool_alloc0 (pool, sizeof (struct process_exception));
+ if (new != NULL) {
+ g_strstrip (url_str);
+ rc = parse_uri (new, url_str, pool);
+ if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
+ new->hostlen > 0) {
+ ex->pos = url_start - begin;
+ ex->len = url_end - url_start;
+ if (new->protocol == PROTOCOL_MAILTO) {
+ if (new->userlen > 0) {
+ if (!g_tree_lookup (task->emails, new)) {
+ g_tree_insert (task->emails, new, new);
+ }
+ }
+ }
+ else {
+ if (!g_tree_lookup (task->urls, new)) {
+ g_tree_insert (task->urls, new, new);
+ }
+ }
+ part->urls_offset = g_list_prepend (part->urls_offset, ex);
+ }
+ else if (rc != URI_ERRNO_OK) {
+ msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
+ }
+ }
+ }
+ }
+ else {
+ break;
+ }
+ p = url_end + 1;
+ }
+ }
+ /* Handle offsets of this part */
+ if (part->urls_offset != NULL) {
+ part->urls_offset = g_list_reverse (part->urls_offset);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, part->urls_offset);
+ }
+}
+
+gboolean
+url_try_text (rspamd_mempool_t *pool, const gchar *begin, gsize len, gchar **start, gchar **fin, gchar **url_str, gboolean is_html)
+{
+ const gchar *end, *pos;
+ gint idx, l;
+ struct url_matcher *matcher;
+ url_match_t m;
+
+ end = begin + len;
+ if (url_init () == 0) {
+ if ((pos = rspamd_trie_lookup (url_scanner->patterns, begin, len, &idx)) == NULL) {
+ return FALSE;
+ }
+ else {
+ matcher = &matchers[idx];
+ if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
+ /* Do not try to match non-html like urls in html texts */
+ return FALSE;
+ }
+ m.pattern = matcher->pattern;
+ m.prefix = matcher->prefix;
+ m.add_prefix = FALSE;
+ if (matcher->start (begin, end, pos, &m) && matcher->end (begin, end, pos, &m)) {
+ if (m.add_prefix) {
+ l = m.m_len + 1 + strlen (m.prefix);
+ *url_str = rspamd_mempool_alloc (pool, l);
+ rspamd_snprintf (*url_str, l, "%s%*s", m.prefix, m.m_len, m.m_begin);
+ }
+ else {
+ *url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
+ memcpy (*url_str, m.m_begin, m.m_len);
+ (*url_str)[m.m_len] = '\0';
+ }
+ if (start != NULL) {
+ *start = (gchar *)m.m_begin;
+ }
+ if (fin != NULL) {
+ *fin = (gchar *)m.m_begin + m.m_len;
+ }
+ }
+ else {
+ *url_str = NULL;
+ if (start != NULL) {
+ *start = (gchar *)pos;
+ }
+ if (fin != NULL) {
+ *fin = (gchar *)pos + strlen (m.prefix);
+ }
+ }
+
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+/*
+ * vi: ts=4
+ */