* Add initial version of URLs parser (still need to make PCRE parse all pattern matches)

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)
diff --git a/configure b/configure

index e9471cd787db2e421dd962374709e633b961b756..472b4c4bd813d632e480dc3a1e4473318a752862 100755 (executable)
--- a/configure
+++ b/configure
@@ -20,7 +20,7 @@ LEX_SRC="cfg_file.l"
  YACC_OUTPUT="cfg_yacc.c"
  LEX_OUTPUT="cfg_lex.c"
  
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c worker.c fstring.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c worker.c fstring.c url.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
  
  CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
  CFLAGS="$CFLAGS -Wno-unused-function -Wunused-variable -Wno-sign-compare"
@@ -28,7 +28,7 @@ CFLAGS="$CFLAGS -Wunused-value -ggdb -I${LOCALBASE}/include"
  CFLAGS="$CFLAGS "
  LDFLAGS="$LDFLAGS -L/usr/lib -L${LOCALBASE}/lib"
  OPT_FLAGS="-O -pipe -fno-omit-frame-pointer"
-DEPS="cfg_file.h memcached.h util.h main.h upstream.h fstring.h ${LEX_OUTPUT} ${YACC_OUTPUT}"
+DEPS="cfg_file.h memcached.h util.h main.h upstream.h fstring.h url.h ${LEX_OUTPUT} ${YACC_OUTPUT}"
  EXEC=rspamd
  USER=postfix
  GROUP=postfix
@@ -520,6 +520,12 @@ if [ $? -eq 1 ] ; then
         exit 1
  fi
  
+check_lib "pcre" "pcre.h"
+if [ $? -eq 1 ] ; then
+       echo "PCRE not found, check config.log for details"
+       exit 1
+fi
+
  check_lib "m"
  check_lib "pcre"
  check_lib "md"
diff --git a/main.h b/main.h

index 4c4a8aaac9474a036da3a4e697f766c3c0ee3d23..425d219a42cd5b4e25b528036f9a237f36098e78 100644 (file)
--- a/main.h
+++ b/main.h
@@ -18,6 +18,7 @@
  #include <event.h>
  
  #include "fstring.h"
+#include "url.h"
  
  /* Default values */
  #define FIXED_CONFIG_FILE "./rspamd.conf"
@@ -75,6 +76,10 @@ struct worker_task {
         size_t content_length;
         f_str_buf_t *msg;
         struct bufferevent *bev;
+       /* Number of mime parts */
+       int parts_count;
+       /* URLs extracted from message */
+       TAILQ_HEAD (uriq, uri) urls;
  };
  
  void start_worker (struct rspamd_worker *worker, int listen_sock);
diff --git a/url.c b/url.c

new file mode 100644 (file)

index 0000000..dc8e805
--- /dev/null
+++ b/url.c
@@ -0,0 +1,494 @@
+#include <sys/types.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <pcre.h>
+#include <syslog.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+#include "url.h"
+#include "fstring.h"
+#include "main.h"
+
+#define POST_CHAR 1
+#define POST_CHAR_S "\001"
+
+/*
+ * Static description of one URL scheme. Instances live in the
+ * protocol_backends[] table and are read through the get_protocol_*()
+ * accessors.
+ */
+struct _proto {
+       /* Scheme name without the trailing ':' (e.g. "http"); NULL in the
+        * terminating catch-all entry. */
+       unsigned char *name;
+       /* Port number listed for the scheme (0 where the table gives none). */
+       int port;
+       /* Always NULL in the table below. */
+       uintptr_t *unused;
+       /* NOTE(review): exposed via get_protocol_need_slashes() but not
+        * consulted by parse_uri() in this file - confirm intended use. */
+       unsigned int need_slashes:1;
+       /* When set, parse_uri() rejects a URI that has no '/' after the host. */
+       unsigned int need_slash_after_host:1;
+       /* When set, parse_uri() stores everything after "scheme://" as opaque
+        * data and does not look for user, host or port. */
+       unsigned int free_syntax:1;
+       /* Set only for "https" in the table; no reader is visible in this file. */
+       unsigned int need_ssl:1;
+};
+
+/*
+ * Pattern for URL-carrying HTML attributes.
+ * Capture group 1 is the attribute name including '=', capture group 2 is
+ * the attribute value up to the next '>', '"' or '<'. Every alternative
+ * inside group 1 must be non-capturing ("(?:...)") so that the value stays
+ * in group 2.
+ */
+static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
+"|(?:background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:usemap=))"
+"\\\"?([^>\"<]+)\\\"?";
+/*
+ * Pattern for URLs in plain text: "mailto:", or "news://", "http://",
+ * "https://", "ftp://" or "ftps://", followed by one or more characters
+ * other than '>', '"' and '<'.
+ */
+static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";
+
+/* Set to 1 by url_init() once both patterns have been compiled. */
+static short url_initialized = 0;
+/* Compiled form and pcre_study() data of text_url. */
+static pcre_extra *text_re_extra;
+static pcre *text_re;
+/* Compiled form and pcre_study() data of html_url. */
+static pcre_extra *html_re_extra;
+static pcre *html_re;
+
+/*
+ * Scheme table indexed by enum protocol (see url.h): entry N describes the
+ * N-th enumerator, so both lists must keep the same order. get_protocol()
+ * searches the named entries assuming they are sorted by name.
+ * Column order: name, port, unused, need_slashes, need_slash_after_host,
+ * free_syntax, need_ssl.
+ */
+static const struct _proto protocol_backends[] = {
+       { "file",          0, NULL,             1, 0, 0, 0 },
+       { "ftp",          21, NULL,             1, 1, 0, 0 },
+       { "http",         80, NULL,             1, 1, 0, 0 },
+       { "https",       443, NULL,             1, 1, 0, 1 },
+
+       /* Keep these last! This nameless entry is the one selected by
+        * PROTOCOL_UNKNOWN; it is the only one with free_syntax set. */
+       { NULL,            0, NULL,                     0, 0, 1, 0 },
+};
+
+/*
+ * Return non-zero when @c terminates the path part of a URL, i.e. it is
+ * POST_CHAR, '#', ';' or '?'.
+ */
+static inline int
+end_of_dir(unsigned char c)
+{
+       switch (c) {
+       case POST_CHAR:
+       case '#':
+       case ';':
+       case '?':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Return non-zero when @pos is the directory separator '/'.
+ * @uri is accepted for the callers' convenience but is not inspected.
+ */
+static inline int
+is_uri_dir_sep(struct uri *uri, unsigned char pos)
+{
+       (void) uri;
+
+       return pos == '/' ? 1 : 0;
+}
+
+/*
+ * Compile and study the text and HTML URL patterns on first use.
+ *
+ * Returns 0 when the patterns are ready (including on every call after a
+ * successful one) and -1 when either pattern fails to compile. On failure
+ * nothing stays allocated, so a later call can retry from scratch.
+ */
+static int
+url_init (void)
+{
+       const char *errptr = NULL;
+       int erroffset = 0;
+
+       if (url_initialized != 0) {
+               return 0;
+       }
+
+       /* pcre_compile () gives up immediately when errptr or erroffset is
+        * NULL, so real variables have to be passed in. */
+       text_re = pcre_compile (text_url, PCRE_CASELESS, &errptr, &erroffset, NULL);
+       if (text_re == NULL) {
+               msg_info ("url_init: cannot init url parsing regexp: %s at offset %d", errptr, erroffset);
+               return -1;
+       }
+       /* Studying is only an optimisation: a NULL result is fine to use and
+        * a study error is reported but not treated as fatal. */
+       errptr = NULL;
+       text_re_extra = pcre_study (text_re, 0, &errptr);
+       if (errptr != NULL) {
+               msg_info ("url_init: cannot study url parsing regexp: %s", errptr);
+       }
+
+       html_re = pcre_compile (html_url, PCRE_CASELESS, &errptr, &erroffset, NULL);
+       if (html_re == NULL) {
+               msg_info ("url_init: cannot init url parsing regexp: %s at offset %d", errptr, erroffset);
+               /* Drop the already compiled text pattern so that a retry
+                * does not leak it. */
+               if (text_re_extra != NULL) {
+                       pcre_free (text_re_extra);
+                       text_re_extra = NULL;
+               }
+               pcre_free (text_re);
+               text_re = NULL;
+               return -1;
+       }
+       errptr = NULL;
+       html_re_extra = pcre_study (html_re, 0, &errptr);
+       if (errptr != NULL) {
+               msg_info ("url_init: cannot study url parsing regexp: %s", errptr);
+       }
+
+       url_initialized = 1;
+
+       return 0;
+}
+
+/*
+ * Map a scheme name to its enum protocol value.
+ *
+ * @name need not be NUL-terminated; only the first @namelen bytes are
+ * compared, ignoring case. Returns PROTOCOL_UNKNOWN when no named entry of
+ * protocol_backends[] matches.
+ */
+enum protocol
+get_protocol(unsigned char *name, int namelen)
+{
+       /* The bounds are plain ints rather than enum protocol because the
+        * upper bound has to be able to drop below zero. */
+       int lo = 0;
+       int hi = PROTOCOL_UNKNOWN - 1;
+       /* The first probe is the HTTP entry, the most common scheme; after
+        * that the search proceeds by bisection over the sorted table. */
+       enum protocol probe = PROTOCOL_HTTP;
+
+       while (lo <= hi) {
+               unsigned char *candidate = protocol_backends[probe].name;
+               int candidate_len = strlen (candidate);
+               int shortest = MIN (candidate_len, namelen);
+               int order = strncasecmp (candidate, name, shortest);
+
+               if (order == 0) {
+                       if (candidate_len == namelen)
+                               return probe;
+
+                       /* Common prefix but different lengths: the longer
+                        * name sorts after the shorter one. */
+                       order = (candidate_len > namelen) ? 1 : -1;
+               }
+
+               if (order > 0)
+                       hi = probe - 1;
+               else
+                       lo = probe + 1;
+
+               probe = (lo + hi) / 2;
+       }
+
+       return PROTOCOL_UNKNOWN;
+}
+
+
+/*
+ * Return the port column of the protocol_backends[] entry for @protocol.
+ * @protocol is used as an index without a range check.
+ */
+int
+get_protocol_port(enum protocol protocol)
+{
+       const struct _proto *backend = &protocol_backends[protocol];
+
+       return backend->port;
+}
+
+/*
+ * Return the need_slashes flag of the protocol_backends[] entry for
+ * @protocol. @protocol is used as an index without a range check.
+ */
+int
+get_protocol_need_slashes(enum protocol protocol)
+{
+       const struct _proto *backend = &protocol_backends[protocol];
+
+       return backend->need_slashes;
+}
+
+/*
+ * Return the need_slash_after_host flag of the protocol_backends[] entry
+ * for @protocol. @protocol is used as an index without a range check.
+ */
+int
+get_protocol_need_slash_after_host(enum protocol protocol)
+{
+       const struct _proto *backend = &protocol_backends[protocol];
+
+       return backend->need_slash_after_host;
+}
+
+/*
+ * Return the free_syntax flag of the protocol_backends[] entry for
+ * @protocol. @protocol is used as an index without a range check.
+ */
+int
+get_protocol_free_syntax(enum protocol protocol)
+{
+       const struct _proto *backend = &protocol_backends[protocol];
+
+       return backend->free_syntax;
+}
+
+/*
+ * Return the length of the scheme name at the start of @url, or 0 when
+ * there is none.
+ *
+ * Scheme characters follow RFC1738 (letters of either case, digits, '+',
+ * '-' and '.'). A single trailing digit is not counted as part of the
+ * name: it is the "IP version in protocol scheme name" extension that
+ * parse_uri() reads separately.
+ */
+static int
+get_protocol_length(const unsigned char *url)
+{
+       unsigned char *cursor = (unsigned char *) url;
+
+       /* Walk over every character that may appear in a scheme name. */
+       for (;;) {
+               unsigned char ch = *cursor;
+
+               if (!isalnum(ch) && ch != '+' && ch != '-' && ch != '.')
+                       break;
+               cursor++;
+       }
+
+       /* Step back over one trailing digit, if any. */
+       if (cursor != url && isdigit(cursor[-1]))
+               cursor--;
+
+       /* The name has to be followed by ':' or by the IP version digit;
+        * an empty name yields 0 here as well. */
+       if (*cursor == ':' || isdigit(*cursor))
+               return cursor - url;
+
+       return 0;
+}
+
+/*
+ * Split @uristring into its components and record them in @uri.
+ *
+ * @uri is zeroed first. Nothing is copied: every pointer stored in @uri
+ * points into @uristring, and the matching *len field gives the length of
+ * the component, so @uristring must stay valid while @uri is in use.
+ *
+ * The accepted shape is scheme[digit]://[user[:password]@]host[:port]/data
+ * with optional "#fragment" and POST_CHAR-introduced post data at the end.
+ * "//" after the scheme is required for every scheme.
+ *
+ * Returns URI_ERRNO_OK on success, otherwise the uri_errno value that
+ * describes the first problem found.
+ *
+ * check_uri_file() and uri_port_is_valid() are not defined in this file.
+ */
+static enum uri_errno
+parse_uri(struct uri *uri, unsigned char *uristring)
+{
+       unsigned char *prefix_end, *host_end;
+       unsigned char *lbracket, *rbracket;
+       int datalen, n, addrlen;
+       unsigned char *frag_or_post, *user_end, *port_end;
+
+       memset (uri, 0, sizeof (*uri));
+
+       /* Nothing to do for an empty url. */
+       if (!*uristring) return URI_ERRNO_EMPTY;
+
+       uri->string = uristring;
+       uri->protocollen = get_protocol_length (uristring);
+
+       /* Invalid */
+       if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
+
+       /* Figure out whether the protocol is known; an unrecognised scheme
+        * is stored as PROTOCOL_UNKNOWN and handled as free syntax below. */
+       uri->protocol = get_protocol (struri(uri), uri->protocollen);
+
+       prefix_end = uristring + uri->protocollen; /* ':' */
+
+       /* Check if there's a digit after the protocol name. */
+       if (isdigit (*prefix_end)) {
+               uri->ip_family = uristring[uri->protocollen] - '0';
+               prefix_end++;
+       }
+       if (*prefix_end != ':')
+               return URI_ERRNO_INVALID_PROTOCOL;
+       prefix_end++;
+
+       /* Skip slashes: exactly two are required. */
+
+       if (prefix_end[0] == '/' && prefix_end[1] == '/') {
+               if (prefix_end[2] == '/')
+                       return URI_ERRNO_TOO_MANY_SLASHES;
+
+               prefix_end += 2;
+
+       } else {
+               return URI_ERRNO_NO_SLASHES;
+       }
+
+       if (get_protocol_free_syntax (uri->protocol)) {
+               /* Free syntax: everything after "//" is opaque data. */
+               uri->data = prefix_end;
+               uri->datalen = strlen (prefix_end);
+               return URI_ERRNO_OK;
+
+       } else if (uri->protocol == PROTOCOL_FILE) {
+               /* NOTE(review): a negative check_uri_file() result is treated
+                * as "use the whole rest of the string" - confirm against its
+                * definition. */
+               datalen = check_uri_file (prefix_end);
+               frag_or_post = prefix_end + datalen;
+
+               /* Extract the fragment part. */
+               if (datalen >= 0) {
+                       if (*frag_or_post == '#') {
+                               uri->fragment = frag_or_post + 1;
+                               uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
+                               frag_or_post = uri->fragment + uri->fragmentlen;
+                       }
+                       if (*frag_or_post == POST_CHAR) {
+                               uri->post = frag_or_post + 1;
+                       }
+               } else {
+                       datalen = strlen(prefix_end);
+               }
+
+               uri->data = prefix_end;
+               uri->datalen = datalen;
+
+               return URI_ERRNO_OK;
+       }
+
+       /* Isolate host */
+
+       /* Get brackets enclosing IPv6 address */
+       lbracket = strchr (prefix_end, '[');
+       if (lbracket) {
+               rbracket = strchr (lbracket, ']');
+               /* [address] is handled only inside of hostname part (surprisingly). */
+               if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
+                       uri->ipv6 = 1;
+               else
+                       lbracket = rbracket = NULL;
+       } else {
+               rbracket = NULL;
+       }
+
+       /* Possibly skip auth part */
+       host_end = prefix_end + strcspn (prefix_end, "@");
+
+       if (prefix_end + strcspn (prefix_end, "/") > host_end
+           && *host_end) { /* we have auth info here */
+
+               /* Allow '@' in the password component */
+               while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
+                       host_end = host_end + 1 + strcspn (host_end + 1, "@");
+
+               user_end = strchr (prefix_end, ':');
+
+               if (!user_end || user_end > host_end) {
+                       /* No ':' before the '@': user name only. */
+                       uri->user = prefix_end;
+                       uri->userlen = host_end - prefix_end;
+               } else {
+                       uri->user = prefix_end;
+                       uri->userlen = user_end - prefix_end;
+                       uri->password = user_end + 1;
+                       uri->passwordlen = host_end - user_end - 1;
+               }
+               prefix_end = host_end + 1;
+       }
+
+       /* From here on prefix_end is the start of the host and host_end the
+        * first character after it. */
+       if (uri->ipv6)
+               host_end = rbracket + strcspn (rbracket, ":/?");
+       else
+               host_end = prefix_end + strcspn (prefix_end, ":/?");
+
+       if (uri->ipv6) {
+               addrlen = rbracket - lbracket - 1;
+
+
+               /* The stored host excludes the enclosing brackets. */
+               uri->host = lbracket + 1;
+               uri->hostlen = addrlen;
+       } else {
+               uri->host = prefix_end;
+               uri->hostlen = host_end - prefix_end;
+
+               /* A host ending in '.' is rejected rather than trimmed. */
+               if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
+                       return URI_ERRNO_TRAILING_DOTS;
+       }
+
+       if (*host_end == ':') { /* we have port here */
+               /* The port runs up to the next '/' (or the end of string). */
+               port_end = host_end + 1 + strcspn (host_end + 1, "/");
+
+               host_end++;
+
+               uri->port = host_end;
+               uri->portlen = port_end - host_end;
+
+               if (uri->portlen == 0)
+                       return URI_ERRNO_NO_PORT_COLON;
+
+               /* We only use 8 bits for portlen so better check */
+               if (uri->portlen != port_end - host_end)
+                       return URI_ERRNO_INVALID_PORT;
+
+               /* test if port is number; this also advances host_end to
+                * port_end */
+               for (; host_end < port_end; host_end++)
+                       if (!isdigit (*host_end))
+                               return URI_ERRNO_INVALID_PORT;
+
+               /* Check valid port value, and let show an error message
+                * about invalid url syntax. */
+               if (uri->port && uri->portlen) {
+
+                       errno = 0;
+                       n = strtol (uri->port, NULL, 10);
+                       if (errno || !uri_port_is_valid (n))
+                               return URI_ERRNO_INVALID_PORT;
+               }
+       }
+
+       if (*host_end == '/') {
+               host_end++;
+
+       } else if (get_protocol_need_slash_after_host (uri->protocol)) {
+               /* The need for slash after the host component depends on the
+                * need for a host component. -- The dangerous mind of Jonah */
+               if (!uri->hostlen)
+                       return URI_ERRNO_NO_HOST;
+
+               return URI_ERRNO_NO_HOST_SLASH;
+       }
+
+       /* Look for #fragment or POST_CHAR; the data is everything before. */
+       prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
+       uri->data = host_end;
+       uri->datalen = prefix_end - host_end;
+
+       if (*prefix_end == '#') {
+               uri->fragment = prefix_end + 1;
+               uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
+               prefix_end = uri->fragment + uri->fragmentlen;
+       }
+
+       if (*prefix_end == POST_CHAR) {
+               uri->post = prefix_end + 1;
+       }
+
+       return URI_ERRNO_OK;
+}
+
+/*
+ * Normalize @uristring in place and return it.
+ *
+ * When @uri is NULL the string is first parsed into a local struct uri; if
+ * that parse fails the string is returned untouched. A non-NULL @uri is
+ * used as given (presumably it already describes @uristring - confirm
+ * against callers).
+ *
+ * The scheme and host are lowercased through convert_to_lowercase() (not
+ * defined in this file). For schemes without free syntax the path is then
+ * rewritten to drop "/./", "/../" (together with the preceding element)
+ * and repeated slashes, stopping at the first POST_CHAR, '#', ';' or '?'.
+ */
+static unsigned char *
+normalize_uri(struct uri *uri, unsigned char *uristring)
+{
+       unsigned char *parse_string = uristring;
+       unsigned char *src, *dest, *path;
+       int need_slash = 0;
+       int parse = (uri == NULL);
+       struct uri uri_struct;
+
+       if (!uri) uri = &uri_struct;
+
+       /*
+        * We need to get the real (proxied) URI but lowercase relevant URI
+        * parts along the way.
+        */
+       if (parse && parse_uri (uri, parse_string) != URI_ERRNO_OK)
+               return uristring;
+
+
+       /* This is maybe not the right place, but the callers are expected
+        * to pass an allocated copy that may be modified here. */
+       convert_to_lowercase (uri->string, uri->protocollen);
+       if (uri->hostlen) convert_to_lowercase (uri->host, uri->hostlen);
+
+       /* Neither of these two values is read again below. */
+       parse = 1;
+       parse_string = uri->data;
+
+       if (get_protocol_free_syntax (uri->protocol))
+               return uristring;
+
+       if (uri->protocol != PROTOCOL_UNKNOWN)
+               need_slash = get_protocol_need_slash_after_host (uri->protocol);
+
+       /* We want to start at the first slash to also reduce URIs like
+        * http://host//index.html to http://host/index.html */
+       path = uri->data - need_slash;
+       dest = src = path;
+
+       /* This loop mangles the URI string by removing directory elevators and
+        * other cruft. Example: /.././etc////..//usr/ -> /usr/
+        * @src is the read position and @dest the write position; @dest
+        * never runs ahead of @src. */
+       while (*dest) {
+               /* If the following pieces are the LAST parts of URL, we remove
+                * them as well. See RFC 1808 for details. */
+
+               if (end_of_dir (src[0])) {
+                       /* URL data contains no more path: move the rest
+                        * (including the NUL) down unchanged. */
+                       memmove (dest, src, strlen(src) + 1);
+                       break;
+               }
+
+               if (!is_uri_dir_sep (uri, src[0])) {
+                       /* This is to reduce indentation */
+
+               } else if (src[1] == '.') {
+                       if (!src[2]) {
+                               /* /. - skip the dot */
+                               *dest++ = *src;
+                               *dest = 0;
+                               break;
+
+                       } else if (is_uri_dir_sep (uri, src[2])) {
+                               /* /./ - strip that.. */
+                               src += 2;
+                               continue;
+
+                       } else if (src[2] == '.'
+                                  && (is_uri_dir_sep (uri, src[3]) || !src[3])) {
+                               /* /../ or /.. - skip it and preceding element. */
+
+                               /* First back out the last incrementation of
+                                * @dest (dest++) to get the position that was
+                                * last assigned to. */
+                               if (dest > path) dest--;
+
+                               /* @dest might be pointing to a dir separator
+                                * so we decrement before any testing. */
+                               while (dest > path) {
+                                       dest--;
+                                       if (is_uri_dir_sep (uri, *dest)) break;
+                               }
+
+                               if (!src[3]) {
+                                       /* /.. - add ending slash and stop */
+                                       *dest++ = *src;
+                                       *dest = 0;
+                                       break;
+                               }
+
+                               src += 3;
+                               continue;
+                       }
+
+               } else if (is_uri_dir_sep (uri, src[1])) {
+                       /* // - ignore first '/'. */
+                       src += 1;
+                       continue;
+               }
+
+               /* We don't want to access memory past the NUL char. */
+               *dest = *src++;
+               if (*dest) dest++;
+       }
+
+       return uristring;
+}
+
+
+/*
+ * Entry point for URL extraction from a text/plain part.
+ * Currently a stub: it only makes sure the patterns are initialized.
+ */
+void
+url_parse_text (struct worker_task *task, GByteArray *content)
+{
+       if (url_init () != 0) {
+               return;
+       }
+
+       /* TODO: extraction is not implemented yet */
+}
+
+/*
+ * Entry point for URL extraction from a text/html part.
+ * Currently a stub: it only makes sure the patterns are initialized.
+ */
+void
+url_parse_html (struct worker_task *task, GByteArray *content)
+{
+       if (url_init () != 0) {
+               return;
+       }
+
+       /* TODO: extraction is not implemented yet */
+}
diff --git a/url.h b/url.h

new file mode 100644 (file)

index 0000000..7d9d87d
--- /dev/null
+++ b/url.h
@@ -0,0 +1,86 @@
+/* URL check functions */
+#ifndef URL_H
+#define URL_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#ifndef OWN_QUEUE_H
+#include <sys/queue.h>
+#else
+#include "queue.h"
+#endif
+
+#include <glib.h>
+
+struct worker_task;
+
+/*
+ * A parsed URI. The component pointers refer to positions inside @string
+ * rather than to separate copies; each has a matching *len bitfield, so
+ * the components are not NUL-terminated individually.
+ */
+struct uri {
+       /* The start of the uri (and thus start of the protocol string). */
+       unsigned char *string;
+
+       /* The internal type of protocol (enum protocol). parse_uri() in
+        * url.c stores PROTOCOL_UNKNOWN here for unrecognised schemes. */
+       int protocol; /* enum protocol */
+
+       /* Value of the single digit that may follow the scheme name,
+        * 0 when there is none. */
+       int ip_family;
+
+       unsigned char *user;
+       unsigned char *password;
+       /* For a bracketed IPv6 host this points after the '['. */
+       unsigned char *host;
+       unsigned char *port;
+       /* @data can contain both the path and query uri fields.
+        * It can have zero length. */
+       unsigned char *data;
+       unsigned char *fragment;
+       /* @post can contain some special encoded form data, used internally
+        * to make form data handling more efficient. The data is marked by
+        * POST_CHAR in the uri string. */
+       unsigned char *post;
+
+       /* Length of the scheme name at the start of @string. */
+       unsigned int protocollen:16;
+       unsigned int userlen:16;
+       unsigned int passwordlen:16;
+       unsigned int hostlen:16;
+       /* Only 8 bits wide; parse_uri() rejects ports that do not fit. */
+       unsigned int portlen:8;
+       unsigned int datalen:16;
+       unsigned int fragmentlen:16;
+
+       /* Flags */
+       unsigned int ipv6:1;    /* URI contains IPv6 host */
+       unsigned int form:1;    /* URI originated from form */
+
+       /* Link */
+       TAILQ_ENTRY(uri) next;
+};
+
+/*
+ * Result codes of URI parsing (returned by parse_uri() in url.c).
+ * NOTE(review): URI_ERRNO_IPV6_SECURITY and URI_ERRNO_INVALID_PORT_RANGE
+ * are not returned by parse_uri() as it stands.
+ */
+enum uri_errno {
+       URI_ERRNO_OK,                   /* Parsing went well */
+       URI_ERRNO_EMPTY,                /* The URI string was empty */
+       URI_ERRNO_INVALID_PROTOCOL,     /* No protocol was found */
+       URI_ERRNO_NO_SLASHES,           /* Slashes after protocol missing */
+       URI_ERRNO_TOO_MANY_SLASHES,     /* Too many slashes after protocol */
+       URI_ERRNO_TRAILING_DOTS,        /* '.' after host */
+       URI_ERRNO_NO_HOST,              /* Host part is missing */
+       URI_ERRNO_NO_PORT_COLON,        /* ':' after host without port */
+       URI_ERRNO_NO_HOST_SLASH,        /* Slash after host missing */
+       URI_ERRNO_IPV6_SECURITY,        /* IPv6 security bug detected */
+       URI_ERRNO_INVALID_PORT,         /* Port number is bad */
+       URI_ERRNO_INVALID_PORT_RANGE,   /* Port number is not within 0-65535 */
+};
+
+/*
+ * Known URL schemes. The values are used as indexes into the
+ * protocol_backends[] table in url.c, so both must list the schemes in the
+ * same order. PROTOCOL_UNKNOWN has to stay last: get_protocol() uses
+ * PROTOCOL_UNKNOWN - 1 as the upper bound of its search.
+ */
+enum protocol {
+       PROTOCOL_FILE,
+       PROTOCOL_FTP,
+       PROTOCOL_HTTP,
+       PROTOCOL_HTTPS,
+
+       PROTOCOL_UNKNOWN,
+};
+
+/* Start of the URI string a struct uri refers to. */
+#define struri(uri) ((uri)->string)
+
+/*
+ * Entry points for URL extraction from a text/html and a text/plain
+ * message part. The implementations in url.c are currently stubs.
+ */
+void url_parse_html (struct worker_task *task, GByteArray *part);
+void url_parse_text (struct worker_task *task, GByteArray *part);
+
+#endif
diff --git a/worker.c b/worker.c

index 1b0682fb0638891b15c60c2d9da9087778fb84f5..e847ff183d79ff7a2aa608586a4d231b69a7f85b 100644 (file)
--- a/worker.c
+++ b/worker.c
@@ -22,6 +22,7 @@
  #include "main.h"
  #include "upstream.h"
  #include "cfg_file.h"
+#include "url.h"
  
  #define CONTENT_LENGTH_HEADER "Content-Length:"
  
@@ -57,12 +58,35 @@ sigusr_handler (int fd, short what, void *arg)
         return;
  }
  
+/*
+ * Release @task together with its message buffer and every entry queued
+ * on task->urls. A NULL @task is ignored.
+ */
+static void
+free_task (struct worker_task *task)
+{
+       struct uri *cur;
+
+       if (task == NULL) {
+               return;
+       }
+
+       if (task->msg) {
+               fstrfree (task->msg->buf);
+               free (task->msg);
+       }
+
+       while ((cur = TAILQ_FIRST (&task->urls)) != NULL) {
+               /* Unlink first: TAILQ_REMOVE reads and updates the link
+                * fields stored inside the element, so it must not run on
+                * memory that has already been freed. */
+               TAILQ_REMOVE (&task->urls, cur, next);
+               free (cur->string);
+               free (cur);
+       }
+
+       free (task);
+}
+
  static void
  mime_foreach_callback (GMimeObject *part, gpointer user_data)
  {
-       int *count = user_data;
+       struct worker_task *task = (struct worker_task *)user_data;
+       const GMimeContentType *type;
+       GMimeDataWrapper *wrapper;
+       GMimeStream *part_stream;
+       GByteArray *part_content;
         
-       (*count)++;
+       task->parts_count ++;
         
         /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
         
@@ -77,7 +101,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
                     g_mime_message_foreach_part() again here. */
                 
                 message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-               g_mime_message_foreach_part (message, mime_foreach_callback, count);
+               g_mime_message_foreach_part (message, mime_foreach_callback, task);
                 g_object_unref (message);
         } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
                 /* message/partial */
@@ -94,6 +118,20 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
                 /* we'll get to finding out if this is a signed/encrypted multipart later... */
         } else if (GMIME_IS_PART (part)) {
                 /* a normal leaf part, could be text/plain or image/jpeg etc */
+               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+               if (wrapper != NULL) {
+                       part_stream = g_mime_stream_mem_new ();
+                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+                               type = g_mime_part_get_content_type (GMIME_PART (part));
+                               if (g_mime_content_type_is_type (type, "text", "html")) {
+                                       url_parse_html (task, part_content);
+                               } 
+                               else if (g_mime_content_type_is_type (type, "text", "plain")) {
+                                       url_parse_text (task, part_content);
+                               }
+                       }
+               }
         } else {
                 g_assert_not_reached ();
         }
@@ -101,14 +139,13 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
  
  
  static void
-process_message (f_str_t *msg)
+process_message (struct worker_task *task)
  {
-       int count = 0;
         GMimeMessage *message;
         GMimeParser *parser;
         GMimeStream *stream;
  
-       stream = g_mime_stream_mem_new_with_buffer (msg->begin, msg->len);
+       stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
         /* create a new parser object to parse the stream */
         parser = g_mime_parser_new_with_stream (stream);
         /* create a new parser object to parse the stream */
@@ -123,9 +160,9 @@ process_message (f_str_t *msg)
         /* free the parser (and the stream) */
         g_object_unref (parser);
  
-       g_mime_message_foreach_part (message, mime_foreach_callback, &count);
+       g_mime_message_foreach_part (message, mime_foreach_callback, task);
         
-       msg_info ("process_message: found %d parts in message", count);
+       msg_info ("process_message: found %d parts in message", task->parts_count);
  }
  
  static void
@@ -186,7 +223,7 @@ read_socket (struct bufferevent *bev, void *arg)
                                 task->msg->pos += r;
                                 update_buf_size (task->msg);
                                 if (task->msg->free == 0) {
-                                       process_message (task->msg->buf);
+                                       process_message (task);
                                         task->state = WRITE_REPLY;
                                 }
                         }
@@ -194,9 +231,7 @@ read_socket (struct bufferevent *bev, void *arg)
                                 msg_err ("read_socket: cannot read data to buffer: %ld", (long int)r);
                                 bufferevent_disable (bev, EV_READ);
                                 bufferevent_free (bev);
-                               fstrfree (task->msg->buf);
-                               free (task->msg);
-                               free (task);
+                               free_task (task);
                         }
                         break;
                 case WRITE_REPLY:
@@ -220,12 +255,9 @@ write_socket (struct bufferevent *bev, void *arg)
         if (task->state > READ_MESSAGE) {
                 msg_info ("closing connection");
                 /* Free buffers */
-               fstrfree (task->msg->buf);
-               free (task->msg);
+               free_task (task);
                 bufferevent_disable (bev, EV_WRITE);
                 bufferevent_free (bev);
-
-               free (task);
         }
  }
  
@@ -235,14 +267,9 @@ err_socket (struct bufferevent *bev, short what, void *arg)
         struct worker_task *task = (struct worker_task *)arg;
         msg_info ("closing connection");
         /* Free buffers */
-       if (task->state > READ_HEADER) {
-               fstrfree (task->msg->buf);
-               free (task->msg);
-       }
+       free_task (task);
         bufferevent_disable (bev, EV_READ);
         bufferevent_free (bev);
-
-       free (task);
  }
  
  static void
@@ -269,6 +296,8 @@ accept_socket (int fd, short what, void *arg)
         new_task->worker = worker;
         new_task->state = READ_COMMAND;
         new_task->content_length = 0;
+       new_task->parts_count = 0;
+       TAILQ_INIT (&new_task->urls);
  
         /* Read event */
         new_task->bev = bufferevent_new (nfd, read_socket, write_socket, err_socket, (void *)new_task);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Wed, 11 Jun 2008 14:34:33 +0000 (18:34 +0400)
configure		patch \| blob \| history
main.h		patch \| blob \| history
url.c	[new file with mode: 0644]	patch \| blob
url.h	[new file with mode: 0644]	patch \| blob
worker.c		patch \| blob \| history