From cbfa72954e882cfa8ad233d2b2d791526bc85f73 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 26 Nov 2013 16:10:37 +0000 Subject: [PATCH] Reduce false positive rate in urls detection. --- src/url.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/url.c b/src/url.c index e858a20d5..927b61889 100644 --- a/src/url.c +++ b/src/url.c @@ -1201,7 +1201,7 @@ url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */ while (p >= begin) { - if ((!is_domain (*p) && *p != '.') || g_ascii_isspace (*p)) { + if ((!is_domain (*p) && *p != '.' && *p != '/') || g_ascii_isspace (*p)) { p ++; if (!g_ascii_isalnum (*p)) { /* Urls cannot start with strange symbols */ @@ -1224,6 +1224,10 @@ url_tld_start (const gchar *begin, const gchar *end, const gchar *pos, url_match return FALSE; } } + else if (*p == '/') { + /* Urls cannot contain '/' in their body */ + return FALSE; + } p --; } @@ -1235,9 +1239,9 @@ url_tld_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t { const gchar *p; - /* A url must be finished by tld, so it must be followed by punctuation or by space character */ + /* A url must be finished by tld, so the tld must be followed by a whitespace character, a comma, or the end of the input */ p = pos + strlen (match->pattern); - if (p == end || g_ascii_isspace (*p) || g_ascii_ispunct (*p)) { + if (p == end || g_ascii_isspace (*p) || *p == ',') { match->m_len = p - match->m_begin; return TRUE; } @@ -1356,10 +1360,13 @@ domain: } if (!passwd && (port >= 65536 || *p == '@')) { - if (p < end) { + if (p < end && *p == '@') { /* this must be a password? */ goto passwd; } + else if (p < end) { + return FALSE; + } p--; } -- 2.39.5