Browse Source

* Add initial version of URLs parser (still need to make PCRE parse all pattern matches)

* Link with PCRE
tags/0.2.7
Vsevolod Stakhov 16 years ago
parent
commit
7cd13c464f
5 changed files with 644 additions and 24 deletions
  1. 8
    2
      configure
  2. 5
    0
      main.h
  3. 494
    0
      url.c
  4. 86
    0
      url.h
  5. 51
    22
      worker.c

+ 8
- 2
configure View File

@@ -20,7 +20,7 @@ LEX_SRC="cfg_file.l"
YACC_OUTPUT="cfg_yacc.c"
LEX_OUTPUT="cfg_lex.c"

SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c worker.c fstring.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c worker.c fstring.c url.c ${LEX_OUTPUT} ${YACC_OUTPUT}"

CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
CFLAGS="$CFLAGS -Wno-unused-function -Wunused-variable -Wno-sign-compare"
@@ -28,7 +28,7 @@ CFLAGS="$CFLAGS -Wunused-value -ggdb -I${LOCALBASE}/include"
CFLAGS="$CFLAGS "
LDFLAGS="$LDFLAGS -L/usr/lib -L${LOCALBASE}/lib"
OPT_FLAGS="-O -pipe -fno-omit-frame-pointer"
DEPS="cfg_file.h memcached.h util.h main.h upstream.h fstring.h ${LEX_OUTPUT} ${YACC_OUTPUT}"
DEPS="cfg_file.h memcached.h util.h main.h upstream.h fstring.h url.h ${LEX_OUTPUT} ${YACC_OUTPUT}"
EXEC=rspamd
USER=postfix
GROUP=postfix
@@ -520,6 +520,12 @@ if [ $? -eq 1 ] ; then
exit 1
fi

check_lib "pcre" "pcre.h"
if [ $? -eq 1 ] ; then
echo "PCRE not found, check config.log for details"
exit 1
fi

check_lib "m"
check_lib "pcre"
check_lib "md"

+ 5
- 0
main.h View File

@@ -18,6 +18,7 @@
#include <event.h>

#include "fstring.h"
#include "url.h"

/* Default values */
#define FIXED_CONFIG_FILE "./rspamd.conf"
@@ -75,6 +76,10 @@ struct worker_task {
size_t content_length;
f_str_buf_t *msg;
struct bufferevent *bev;
/* Number of mime parts */
int parts_count;
/* URLs extracted from message */
TAILQ_HEAD (uriq, uri) urls;
};

void start_worker (struct rspamd_worker *worker, int listen_sock);

+ 494
- 0
url.c View File

@@ -0,0 +1,494 @@
#include <sys/types.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <pcre.h>
#include <syslog.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>

#include "url.h"
#include "fstring.h"
#include "main.h"

#define POST_CHAR 1
#define POST_CHAR_S "\001"

/*
 * Static description of one URL scheme.  Rows of this type make up
 * protocol_backends[], which is indexed by enum protocol.
 */
struct _proto {
/* Scheme name without the trailing ':'; NULL in the final catch-all row. */
unsigned char *name;
/* Value handed back by get_protocol_port(); 0 for rows that list no port. */
int port;
/* NULL in every row of protocol_backends[]; not read anywhere in this file. */
uintptr_t *unused;
/* Value handed back by get_protocol_need_slashes(). */
unsigned int need_slashes:1;
/* When set, parse_uri() fails if no '/' follows the host (and port). */
unsigned int need_slash_after_host:1;
/* When set, parse_uri() keeps everything after "//" as opaque data. */
unsigned int free_syntax:1;
/* Set only for "https"; not read anywhere in this file. */
unsigned int need_ssl:1;
};

/*
 * Pattern for URLs carried in HTML attributes.  Capture group 1 is the
 * attribute name including its '=', capture group 2 is the attribute value
 * up to the next '>', '"' or '<'; the surrounding double quotes are optional.
 *
 * Every alternative must be a non-capturing "(?:...)" group so that the
 * value stays in group 2.
 */
static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
"|(?:background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:usemap=))"
"\\\"?([^>\"<]+)\\\"?";
/*
 * Pattern for URLs in plain text: "mailto:" or a news/http/https/ftp/ftps
 * scheme followed by "://", then everything up to the next '>', '"' or '<'.
 * Capture group 1 is the whole URL.
 */
static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";

/* Set to 1 by url_init() once both patterns have been compiled and studied. */
static short url_initialized = 0;
/* Compiled text_url pattern and the pcre_study() result for it. */
static pcre_extra *text_re_extra;
static pcre *text_re;
/* Compiled html_url pattern and the pcre_study() result for it. */
static pcre_extra *html_re_extra;
static pcre *html_re;

/*
 * Scheme table, indexed by enum protocol.  get_protocol() runs a binary
 * search over the named rows, so they have to stay sorted by name and in
 * the same order as the enum.
 */
static const struct _proto protocol_backends[] = {
	{ .name = "file",  .port = 0,   .unused = NULL,
	  .need_slashes = 1, .need_slash_after_host = 0, .free_syntax = 0, .need_ssl = 0 },
	{ .name = "ftp",   .port = 21,  .unused = NULL,
	  .need_slashes = 1, .need_slash_after_host = 1, .free_syntax = 0, .need_ssl = 0 },
	{ .name = "http",  .port = 80,  .unused = NULL,
	  .need_slashes = 1, .need_slash_after_host = 1, .free_syntax = 0, .need_ssl = 0 },
	{ .name = "https", .port = 443, .unused = NULL,
	  .need_slashes = 1, .need_slash_after_host = 1, .free_syntax = 0, .need_ssl = 1 },

	/* Catch-all row selected by PROTOCOL_UNKNOWN; must remain the last one. */
	{ .name = NULL,    .port = 0,   .unused = NULL,
	  .need_slashes = 0, .need_slash_after_host = 0, .free_syntax = 1, .need_ssl = 0 },
};

/*
 * Tell whether @c terminates the path part of a URI, i.e. whether it is
 * POST_CHAR, '#', ';' or '?'.  Returns 1 if so, 0 otherwise.
 */
static inline int
end_of_dir(unsigned char c)
{
	switch (c) {
	case POST_CHAR:
	case '#':
	case ';':
	case '?':
		return 1;
	default:
		return 0;
	}
}

/*
 * Tell whether the character @pos is a directory separator.  Only '/' counts;
 * @uri is accepted for the sake of the signature and is not inspected.
 * Returns 1 for a separator, 0 otherwise.
 */
static inline int
is_uri_dir_sep(struct uri *uri, unsigned char pos)
{
	if (pos == '/') {
		return 1;
	}

	return 0;
}

static int
url_init (void)
{
if (url_initialized == 0) {
text_re = pcre_compile (text_url, PCRE_CASELESS, NULL, 0, NULL);
if (text_re == NULL) {
msg_info ("url_init: cannot init url parsing regexp");
return -1;
}
text_re_extra = pcre_study (text_re, 0, NULL);
html_re = pcre_compile (html_url, PCRE_CASELESS, NULL, 0, NULL);
if (html_re == NULL) {
msg_info ("url_init: cannot init url parsing regexp");
return -1;
}
html_re_extra = pcre_study (html_re, 0, NULL);
url_initialized = 1;
}

return 0;
}

/*
 * Map a scheme name to its enum protocol value.
 *
 * @name need not be NUL-terminated; only the first @namelen bytes are
 * compared, ignoring case.  Returns PROTOCOL_UNKNOWN when no row of
 * protocol_backends[] carries that name.
 */
enum protocol
get_protocol(unsigned char *name, int namelen)
{
	/* Plain ints rather than enum protocol: the upper bound has to be able
	 * to drop below zero for the loop to terminate. */
	int lo = 0;
	int hi = PROTOCOL_UNKNOWN - 1;
	/* Binary search over the sorted table, except that the first probe is
	 * the HTTP row instead of the arithmetic middle. */
	int mid = PROTOCOL_HTTP;

	while (lo <= hi) {
		unsigned char *candidate = protocol_backends[mid].name;
		int candidate_len = strlen (candidate);
		int order = strncasecmp (candidate, name, MIN (candidate_len, namelen));

		if (order == 0) {
			if (candidate_len == namelen)
				return mid;

			/* One name is a prefix of the other: the longer one sorts
			 * after the shorter one. */
			order = (candidate_len > namelen) ? 1 : -1;
		}

		if (order > 0) {
			hi = mid - 1;
		} else {
			lo = mid + 1;
		}

		mid = (lo + hi) / 2;
	}

	return PROTOCOL_UNKNOWN;
}


int
get_protocol_port(enum protocol protocol)
{
return protocol_backends[protocol].port;
}

int
get_protocol_need_slashes(enum protocol protocol)
{
return protocol_backends[protocol].need_slashes;
}

int
get_protocol_need_slash_after_host(enum protocol protocol)
{
return protocol_backends[protocol].need_slash_after_host;
}

int
get_protocol_free_syntax(enum protocol protocol)
{
return protocol_backends[protocol].free_syntax;
}

/*
 * Measure the scheme name at the start of @url.
 *
 * The name is the longest run of alphanumerics, '+', '-' and '.'; if that
 * run ends in a digit, the digit is left out of the length (it is treated
 * as an "IP family" suffix by the caller).  Returns the length, or 0 when
 * the run is followed by neither ':' nor that trailing digit.
 */
static int
get_protocol_length(const unsigned char *url)
{
	const unsigned char *cursor = url;

	/* RFC1738: scheme = 1*[ lowalpha | digit | "+" | "-" | "." ],
	 * with uppercase letters tolerated as well. */
	for (;;) {
		unsigned char ch = *cursor;

		if (!isalnum (ch) && ch != '+' && ch != '-' && ch != '.')
			break;
		cursor++;
	}

	/* Step back over one trailing digit so it is not counted as part of
	 * the scheme name. */
	if (cursor > url && isdigit (cursor[-1]))
		cursor--;

	if (*cursor != ':' && !isdigit (*cursor))
		return 0;

	/* An empty name (@cursor == @url) naturally yields 0 here too. */
	return cursor - url;
}

/*
 * Split @uristring into the fields of @uri without copying anything: every
 * pointer stored in @uri points into @uristring, and the matching *len field
 * gives the extent of that component.  @uri is zeroed before parsing starts.
 *
 * Returns URI_ERRNO_OK on success, otherwise the uri_errno value describing
 * the first problem that was found; @uri may be partially filled in then.
 *
 * NOTE(review): check_uri_file() and uri_port_is_valid() are neither defined
 * nor declared in this file - confirm that they are provided elsewhere.
 */
static enum uri_errno
parse_uri(struct uri *uri, unsigned char *uristring)
{
unsigned char *prefix_end, *host_end;
unsigned char *lbracket, *rbracket;
int datalen, n, addrlen;
unsigned char *frag_or_post, *user_end, *port_end;

memset (uri, 0, sizeof (*uri));

/* Nothing to do for an empty url. */
if (!*uristring) return URI_ERRNO_EMPTY;

uri->string = uristring;
uri->protocollen = get_protocol_length (uristring);

/* Invalid */
if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;

/* Figure out whether the protocol is known; PROTOCOL_UNKNOWN is stored
 * as-is when it is not. */
uri->protocol = get_protocol (struri(uri), uri->protocollen);

prefix_end = uristring + uri->protocollen; /* ':' */

/* Check if there's a digit after the protocol name. */
if (isdigit (*prefix_end)) {
uri->ip_family = uristring[uri->protocollen] - '0';
prefix_end++;
}
if (*prefix_end != ':')
return URI_ERRNO_INVALID_PROTOCOL;
prefix_end++;

/* Skip slashes */

/* Exactly two slashes are required for every scheme here, so a slash-less
 * form such as "mailto:user@host" is rejected with URI_ERRNO_NO_SLASHES. */
if (prefix_end[0] == '/' && prefix_end[1] == '/') {
if (prefix_end[2] == '/')
return URI_ERRNO_TOO_MANY_SLASHES;

prefix_end += 2;

} else {
return URI_ERRNO_NO_SLASHES;
}

if (get_protocol_free_syntax (uri->protocol)) {
/* Unknown scheme: keep the remainder as opaque data. */
uri->data = prefix_end;
uri->datalen = strlen (prefix_end);
return URI_ERRNO_OK;

} else if (uri->protocol == PROTOCOL_FILE) {
/* NOTE(review): a negative result from check_uri_file() is treated
 * below as "take the whole remainder" - confirm against its definition. */
datalen = check_uri_file (prefix_end);
frag_or_post = prefix_end + datalen;

/* Extract the fragment part. */
if (datalen >= 0) {
if (*frag_or_post == '#') {
uri->fragment = frag_or_post + 1;
uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
frag_or_post = uri->fragment + uri->fragmentlen;
}
if (*frag_or_post == POST_CHAR) {
uri->post = frag_or_post + 1;
}
} else {
datalen = strlen(prefix_end);
}

uri->data = prefix_end;
uri->datalen = datalen;

return URI_ERRNO_OK;
}

/* Isolate host */

/* Get brackets enclosing IPv6 address */
lbracket = strchr (prefix_end, '[');
if (lbracket) {
rbracket = strchr (lbracket, ']');
/* [address] is handled only inside of hostname part (surprisingly). */
if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
uri->ipv6 = 1;
else
lbracket = rbracket = NULL;
} else {
rbracket = NULL;
}

/* Possibly skip auth part */
host_end = prefix_end + strcspn (prefix_end, "@");

if (prefix_end + strcspn (prefix_end, "/") > host_end
&& *host_end) { /* we have auth info here */

/* Allow '@' in the password component */
while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
host_end = host_end + 1 + strcspn (host_end + 1, "@");

user_end = strchr (prefix_end, ':');

if (!user_end || user_end > host_end) {
/* No ':' before the '@': user name only */
uri->user = prefix_end;
uri->userlen = host_end - prefix_end;
} else {
uri->user = prefix_end;
uri->userlen = user_end - prefix_end;
uri->password = user_end + 1;
uri->passwordlen = host_end - user_end - 1;
}
/* The host starts right after the '@' */
prefix_end = host_end + 1;
}

if (uri->ipv6)
host_end = rbracket + strcspn (rbracket, ":/?");
else
host_end = prefix_end + strcspn (prefix_end, ":/?");

if (uri->ipv6) {
/* The host is the text between the brackets, brackets excluded */
addrlen = rbracket - lbracket - 1;


uri->host = lbracket + 1;
uri->hostlen = addrlen;
} else {
uri->host = prefix_end;
uri->hostlen = host_end - prefix_end;

/* A host ending in '.' is rejected rather than trimmed */
if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
return URI_ERRNO_TRAILING_DOTS;
}

if (*host_end == ':') { /* we have port here */
port_end = host_end + 1 + strcspn (host_end + 1, "/");

host_end++;

uri->port = host_end;
uri->portlen = port_end - host_end;

if (uri->portlen == 0)
return URI_ERRNO_NO_PORT_COLON;

/* portlen is an 8-bit field, so a mismatch here means the length was
 * truncated when it was stored */
if (uri->portlen != port_end - host_end)
return URI_ERRNO_INVALID_PORT;

/* test if port is number; the loop leaves host_end at port_end */
for (; host_end < port_end; host_end++)
if (!isdigit (*host_end))
return URI_ERRNO_INVALID_PORT;

/* Check valid port value, and let show an error message
 * about invalid url syntax. */
if (uri->port && uri->portlen) {

errno = 0;
n = strtol (uri->port, NULL, 10);
if (errno || !uri_port_is_valid (n))
return URI_ERRNO_INVALID_PORT;
}
}

if (*host_end == '/') {
host_end++;

} else if (get_protocol_need_slash_after_host (uri->protocol)) {
/* The need for slash after the host component depends on the
 * need for a host component. -- The dangerous mind of Jonah */
if (!uri->hostlen)
return URI_ERRNO_NO_HOST;

return URI_ERRNO_NO_HOST_SLASH;
}

/* Look for #fragment or POST_CHAR; everything before it is the data */
prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
uri->data = host_end;
uri->datalen = prefix_end - host_end;

if (*prefix_end == '#') {
uri->fragment = prefix_end + 1;
uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
prefix_end = uri->fragment + uri->fragmentlen;
}

if (*prefix_end == POST_CHAR) {
uri->post = prefix_end + 1;
}

return URI_ERRNO_OK;
}

/*
 * Normalize a URI string in place: the scheme and host are passed to
 * convert_to_lowercase() and the path is rewritten so that "/./", "/../"
 * and "//" sequences are collapsed.
 *
 * When @uri is NULL, @uristring is parsed into a local struct uri first and
 * returned untouched if that parse fails.  When @uri is non-NULL it is used
 * as given and no parsing is done - presumably it must already describe
 * @uristring; verify against callers.
 *
 * Always returns @uristring.
 *
 * NOTE(review): convert_to_lowercase() is not defined in this file.
 */
static unsigned char *
normalize_uri(struct uri *uri, unsigned char *uristring)
{
unsigned char *parse_string = uristring;
unsigned char *src, *dest, *path;
int need_slash = 0;
int parse = (uri == NULL);
struct uri uri_struct;

if (!uri) uri = &uri_struct;

/*
 * We need to get the real (proxied) URI but lowercase relevant URI
 * parts along the way.
 */
if (parse && parse_uri (uri, parse_string) != URI_ERRNO_OK)
return uristring;


/* This is maybe not the right place but both join_urls() and
 * get_translated_uri() through translate_url() calls this
 * function and then it already works on and modifies an
 * allocated copy. */
convert_to_lowercase (uri->string, uri->protocollen);
if (uri->hostlen) convert_to_lowercase (uri->host, uri->hostlen);

/* Neither of these two values is read again below. */
parse = 1;
parse_string = uri->data;

if (get_protocol_free_syntax (uri->protocol))
return uristring;

if (uri->protocol != PROTOCOL_UNKNOWN)
need_slash = get_protocol_need_slash_after_host (uri->protocol);

/* We want to start at the first slash to also reduce URIs like
 * http://host//index.html to http://host/index.html */
path = uri->data - need_slash;
/* @src reads ahead while @dest writes the compacted result behind it */
dest = src = path;

/* This loop mangles the URI string by removing directory elevators and
 * other cruft. Example: /.././etc////..//usr/ -> /usr/ */
while (*dest) {
/* If the following pieces are the LAST parts of URL, we remove
 * them as well. See RFC 1808 for details. */

if (end_of_dir (src[0])) {
/* URL data contains no more path: move the tail (with its NUL)
 * down unchanged. */
memmove (dest, src, strlen(src) + 1);
break;
}

if (!is_uri_dir_sep (uri, src[0])) {
/* This is to reduce indentation */

} else if (src[1] == '.') {
if (!src[2]) {
/* /. - skip the dot */
*dest++ = *src;
*dest = 0;
break;

} else if (is_uri_dir_sep (uri, src[2])) {
/* /./ - strip that.. */
src += 2;
continue;

} else if (src[2] == '.'
&& (is_uri_dir_sep (uri, src[3]) || !src[3])) {
/* /../ or /.. - skip it and preceding element. */

/* First back out the last incrementation of
 * @dest (dest++) to get the position that was
 * last assigned to. */
if (dest > path) dest--;

/* @dest might be pointing to a dir separator
 * so we decrement before any testing. */
while (dest > path) {
dest--;
if (is_uri_dir_sep (uri, *dest)) break;
}

if (!src[3]) {
/* /.. - add ending slash and stop */
*dest++ = *src;
*dest = 0;
break;
}

src += 3;
continue;
}

} else if (is_uri_dir_sep (uri, src[1])) {
/* // - ignore first '/'. */
src += 1;
continue;
}

/* We don't want to access memory past the NUL char. */
*dest = *src++;
if (*dest) dest++;
}

return uristring;
}


/*
 * Extract URLs from a text/plain part.  For now this only makes sure the
 * URL patterns are initialised; the extraction itself is not written yet.
 */
void
url_parse_text (struct worker_task *task, GByteArray *content)
{
	if (url_init () != 0) {
		return;
	}

	/* TODO: */
}

/*
 * Extract URLs from a text/html part.  For now this only makes sure the
 * URL patterns are initialised; the extraction itself is not written yet.
 */
void
url_parse_html (struct worker_task *task, GByteArray *content)
{
	if (url_init () != 0) {
		return;
	}

	/* TODO: */
}

+ 86
- 0
url.h View File

@@ -0,0 +1,86 @@
/* URL check functions */
#ifndef URL_H
#define URL_H

#include <sys/types.h>
#include <sys/socket.h>
#ifndef OWN_QUEUE_H
#include <sys/queue.h>
#else
#include "queue.h"
#endif

#include <glib.h>

struct worker_task;

/*
 * A parsed URI.  The component pointers do not own separate storage: they
 * point into the buffer referenced by @string, and each has a matching
 * *len field because the components are not NUL-terminated individually.
 */
struct uri {
/* The start of the uri (and thus start of the protocol string). */
unsigned char *string;

/* The internal type of protocol, as returned by get_protocol(); this is
 * PROTOCOL_UNKNOWN for schemes that are not in the protocol table. */
int protocol; /* enum protocol */

/* Single digit found between the scheme name and ':' (0 when absent). */
int ip_family;

unsigned char *user;
unsigned char *password;
/* For a bracketed IPv6 literal this points just past the '['. */
unsigned char *host;
unsigned char *port;
/* @data can contain both the path and query uri fields.
 * It can have zero length. */
unsigned char *data;
unsigned char *fragment;
/* @post can contain some special encoded form data, used internally
 * to make form data handling more efficient. The data is marked by
 * POST_CHAR in the uri string. */
unsigned char *post;

/* @protocollen is the length of the scheme name at the start of @string. */
unsigned int protocollen:16;
unsigned int userlen:16;
unsigned int passwordlen:16;
unsigned int hostlen:16;
/* Only 8 bits wide: port strings of 256 or more characters do not fit. */
unsigned int portlen:8;
unsigned int datalen:16;
unsigned int fragmentlen:16;

/* Flags */
unsigned int ipv6:1; /* URI contains IPv6 host */
unsigned int form:1; /* URI originated from form */
/* Link */
TAILQ_ENTRY(uri) next;
};

/*
 * Result codes of URI parsing.  URI_ERRNO_OK is 0; every other value names
 * the first problem found in the string.
 *
 * NOTE(review): URI_ERRNO_IPV6_SECURITY and URI_ERRNO_INVALID_PORT_RANGE are
 * not returned anywhere in url.c at present.
 */
enum uri_errno {
URI_ERRNO_OK, /* Parsing went well */
URI_ERRNO_EMPTY, /* The URI string was empty */
URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
URI_ERRNO_NO_SLASHES, /* Slashes after protocol missing */
URI_ERRNO_TOO_MANY_SLASHES, /* Too many slashes after protocol */
URI_ERRNO_TRAILING_DOTS, /* '.' after host */
URI_ERRNO_NO_HOST, /* Host part is missing */
URI_ERRNO_NO_PORT_COLON, /* ':' after host without port */
URI_ERRNO_NO_HOST_SLASH, /* Slash after host missing */
URI_ERRNO_IPV6_SECURITY, /* IPv6 security bug detected */
URI_ERRNO_INVALID_PORT, /* Port number is bad */
URI_ERRNO_INVALID_PORT_RANGE, /* Port number is not within 0-65535 */
};

/*
 * Known URL schemes.  The values are used directly as indexes into the
 * protocol_backends[] table in url.c, and get_protocol() binary-searches
 * that table by name, so the enumerators must stay in the same
 * (alphabetical) order as the table rows.
 */
enum protocol {
PROTOCOL_FILE,
PROTOCOL_FTP,
PROTOCOL_HTTP,
PROTOCOL_HTTPS,

/* Must stay last: it is the search bound and selects the catch-all row. */
PROTOCOL_UNKNOWN,
};

/* Start of the URI string that a struct uri was parsed from. */
#define struri(uri) ((uri)->string)

/*
 * Entry points called by the worker for each text/html and text/plain MIME
 * part respectively; @part holds the decoded content of that part.
 */
void url_parse_html (struct worker_task *task, GByteArray *part);
void url_parse_text (struct worker_task *task, GByteArray *part);

#endif

+ 51
- 22
worker.c View File

@@ -22,6 +22,7 @@
#include "main.h"
#include "upstream.h"
#include "cfg_file.h"
#include "url.h"

#define CONTENT_LENGTH_HEADER "Content-Length:"

@@ -57,12 +58,35 @@ sigusr_handler (int fd, short what, void *arg)
return;
}

/*
 * Release a task and everything it owns: the message buffer (when one has
 * been allocated) and every URL queued on task->urls together with its
 * string.  A NULL @task is ignored.
 */
static void
free_task (struct worker_task *task)
{
	struct uri *cur;

	if (task == NULL) {
		return;
	}

	if (task->msg) {
		fstrfree (task->msg->buf);
		free (task->msg);
	}

	while (!TAILQ_EMPTY (&task->urls)) {
		cur = TAILQ_FIRST (&task->urls);
		/* Unlink first: TAILQ_REMOVE reads and writes the link fields
		 * inside *cur, so it must run before the node is freed. */
		TAILQ_REMOVE (&task->urls, cur, next);
		free (cur->string);
		free (cur);
	}

	free (task);
}

static void
mime_foreach_callback (GMimeObject *part, gpointer user_data)
{
int *count = user_data;
struct worker_task *task = (struct worker_task *)user_data;
const GMimeContentType *type;
GMimeDataWrapper *wrapper;
GMimeStream *part_stream;
GByteArray *part_content;
(*count)++;
task->parts_count ++;
/* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
@@ -77,7 +101,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
g_mime_message_foreach_part() again here. */
message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
g_mime_message_foreach_part (message, mime_foreach_callback, count);
g_mime_message_foreach_part (message, mime_foreach_callback, task);
g_object_unref (message);
} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
/* message/partial */
@@ -94,6 +118,20 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
/* we'll get to finding out if this is a signed/encrypted multipart later... */
} else if (GMIME_IS_PART (part)) {
/* a normal leaf part, could be text/plain or image/jpeg etc */
wrapper = g_mime_part_get_content_object (GMIME_PART (part));
if (wrapper != NULL) {
part_stream = g_mime_stream_mem_new ();
if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
type = g_mime_part_get_content_type (GMIME_PART (part));
if (g_mime_content_type_is_type (type, "text", "html")) {
url_parse_html (task, part_content);
}
else if (g_mime_content_type_is_type (type, "text", "plain")) {
url_parse_text (task, part_content);
}
}
}
} else {
g_assert_not_reached ();
}
@@ -101,14 +139,13 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)


static void
process_message (f_str_t *msg)
process_message (struct worker_task *task)
{
int count = 0;
GMimeMessage *message;
GMimeParser *parser;
GMimeStream *stream;

stream = g_mime_stream_mem_new_with_buffer (msg->begin, msg->len);
stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
/* create a new parser object to parse the stream */
parser = g_mime_parser_new_with_stream (stream);
/* create a new parser object to parse the stream */
@@ -123,9 +160,9 @@ process_message (f_str_t *msg)
/* free the parser (and the stream) */
g_object_unref (parser);

g_mime_message_foreach_part (message, mime_foreach_callback, &count);
g_mime_message_foreach_part (message, mime_foreach_callback, task);
msg_info ("process_message: found %d parts in message", count);
msg_info ("process_message: found %d parts in message", task->parts_count);
}

static void
@@ -186,7 +223,7 @@ read_socket (struct bufferevent *bev, void *arg)
task->msg->pos += r;
update_buf_size (task->msg);
if (task->msg->free == 0) {
process_message (task->msg->buf);
process_message (task);
task->state = WRITE_REPLY;
}
}
@@ -194,9 +231,7 @@ read_socket (struct bufferevent *bev, void *arg)
msg_err ("read_socket: cannot read data to buffer: %ld", (long int)r);
bufferevent_disable (bev, EV_READ);
bufferevent_free (bev);
fstrfree (task->msg->buf);
free (task->msg);
free (task);
free_task (task);
}
break;
case WRITE_REPLY:
@@ -220,12 +255,9 @@ write_socket (struct bufferevent *bev, void *arg)
if (task->state > READ_MESSAGE) {
msg_info ("closing connection");
/* Free buffers */
fstrfree (task->msg->buf);
free (task->msg);
free_task (task);
bufferevent_disable (bev, EV_WRITE);
bufferevent_free (bev);

free (task);
}
}

@@ -235,14 +267,9 @@ err_socket (struct bufferevent *bev, short what, void *arg)
struct worker_task *task = (struct worker_task *)arg;
msg_info ("closing connection");
/* Free buffers */
if (task->state > READ_HEADER) {
fstrfree (task->msg->buf);
free (task->msg);
}
free_task (task);
bufferevent_disable (bev, EV_READ);
bufferevent_free (bev);

free (task);
}

static void
@@ -269,6 +296,8 @@ accept_socket (int fd, short what, void *arg)
new_task->worker = worker;
new_task->state = READ_COMMAND;
new_task->content_length = 0;
new_task->parts_count = 0;
TAILQ_INIT (&new_task->urls);

/* Read event */
new_task->bev = bufferevent_new (nfd, read_socket, write_socket, err_socket, (void *)new_task);

Loading…
Cancel
Save