Browse Source

* Add extracting urls from messages

tags/0.2.7
Vsevolod Stakhov 16 years ago
parent
commit
e92bfae6a1
3 changed files with 67 additions and 5 deletions
  1. 58
    5
      url.c
  2. 7
    0
      util.c
  3. 2
    0
      util.h

+ 58
- 5
url.c View File

@@ -16,6 +16,13 @@
#define POST_CHAR 1
#define POST_CHAR_S "\001"

/* Inclusive bounds of the TCP port number space */
#define LOWEST_PORT 0
#define HIGHEST_PORT 65535

/*
 * Evaluates to non-zero when port lies within [LOWEST_PORT, HIGHEST_PORT].
 * The argument is expanded twice, so do not pass an expression with
 * side effects.
 */
#define uri_port_is_valid(port) \
	((port) >= LOWEST_PORT && (port) <= HIGHEST_PORT)

struct _proto {
unsigned char *name;
int port;
@@ -26,10 +33,10 @@ struct _proto {
unsigned int need_ssl:1;
};

static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
"|(:?background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:src=)|(?:usemap=))"
/*
 * Pattern for HTML attributes whose value is a URL (href, src, action, ...),
 * allowing whitespace around '=' and an optional double quote around the value.
 *
 * Capture groups:
 *   1 - the attribute name together with its '='
 *   2 - "(:?background\s*=\s*)"; written with ":?" rather than "?:", so it is
 *       a capturing group (NOTE(review): looks like a typo for "(?:")
 *   3 - the attribute value, i.e. the URL itself
 *
 * url_parse_html () asks for substring 3, so turning group 2 into a
 * non-capturing group requires changing that index as well.
 */
static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
"\\\"?([^>\"<]+)\\\"?";
static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";
static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)";

static short url_initialized = 0;
static pcre_extra *text_re_extra;
@@ -59,6 +66,14 @@ is_uri_dir_sep(struct uri *uri, unsigned char pos)
return (pos == '/');
}

/*
 * Return the number of leading bytes of name that precede the first
 * POST_CHAR, '#' or '?' byte; when none of them occurs this is the
 * length of the whole string.
 */
static int
check_uri_file(unsigned char *name)
{
	static const unsigned char stop_set[] = POST_CHAR_S "#?";
	size_t prefix_len;

	prefix_len = strcspn((const char *)name, (const char *)stop_set);

	return (int)prefix_len;
}

static int
url_init (void)
{
@@ -480,15 +495,53 @@ normalize_uri(struct uri *uri, unsigned char *uristring)
/*
 * Find every URL in a plain-text message part and append one parsed
 * struct uri per match to task->urls.
 *
 * task:    worker task whose urls tail queue receives the new entries
 * content: bytes of the part; content->data and content->len are handed
 *          to pcre_exec as subject and subject length
 *
 * Does nothing when url_init () returns non-zero.
 */
void
url_parse_text (struct worker_task *task, GByteArray *content)
{
	int ovec[30];
	int pos = 0, rc;
	char *url_str;
	struct uri *new;

	if (url_init () != 0) {
		return;
	}

	/*
	 * The ">= 0" test must stay outside the assignment: rc has to hold the
	 * real pcre_exec result, because pcre_get_substring needs it as the
	 * number of captured substrings. A negative result (no match or error)
	 * ends the scan.
	 */
	while (pos <= (int)content->len &&
			(rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0,
			ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
		if (rc == 0) {
			/* ovec was too small; only its first third holds valid offset pairs */
			rc = sizeof (ovec) / sizeof (ovec[0]) / 3;
		}
		/* Always move forward, even after an empty match, so the loop terminates */
		pos = ovec[1] > pos ? ovec[1] : pos + 1;

		/*
		 * Substring 1 is the outermost group of the text_url pattern, i.e. the
		 * whole URL (assuming text_re is compiled from text_url).
		 */
		url_str = NULL;
		if (pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str) < 0 ||
				url_str == NULL) {
			continue;
		}

		/*
		 * NOTE(review): url_str is allocated by pcre_get_substring and is not
		 * released here; presumably the uri keeps pointing into it - confirm
		 * before adding pcre_free_substring.
		 */
		new = g_malloc (sizeof (struct uri));
		if (new != NULL) {
			parse_uri (new, url_str);
			normalize_uri (new, url_str);
			TAILQ_INSERT_TAIL (&task->urls, new, next);
		}
	}
}

/*
 * Find every URL-carrying attribute in an HTML message part and append one
 * parsed struct uri per match to task->urls.
 *
 * task:    worker task whose urls tail queue receives the new entries
 * content: bytes of the part; content->data and content->len are handed
 *          to pcre_exec as subject and subject length
 *
 * Does nothing when url_init () returns non-zero.
 */
void
url_parse_html (struct worker_task *task, GByteArray *content)
{
	int ovec[30];
	int pos = 0, rc;
	char *url_str;
	struct uri *new;

	if (url_init () != 0) {
		return;
	}

	/*
	 * The ">= 0" test must stay outside the assignment: rc has to hold the
	 * real pcre_exec result, because pcre_get_substring needs it as the
	 * number of captured substrings. A negative result (no match or error)
	 * ends the scan.
	 */
	while (pos <= (int)content->len &&
			(rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0,
			ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
		if (rc == 0) {
			/* ovec was too small; only its first third holds valid offset pairs */
			rc = sizeof (ovec) / sizeof (ovec[0]) / 3;
		}
		/* Always move forward, even after an empty match, so the loop terminates */
		pos = ovec[1] > pos ? ovec[1] : pos + 1;

		/*
		 * Substring 3 is the attribute value in the html_url pattern (assuming
		 * html_re is compiled from html_url): group 1 is the attribute name and
		 * "(:?background...)" counts as capturing group 2.
		 */
		url_str = NULL;
		if (pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str) < 0 ||
				url_str == NULL) {
			continue;
		}

		/*
		 * NOTE(review): url_str is allocated by pcre_get_substring and is not
		 * released here; presumably the uri keeps pointing into it - confirm
		 * before adding pcre_free_substring.
		 */
		new = g_malloc (sizeof (struct uri));
		if (new != NULL) {
			parse_uri (new, url_str);
			normalize_uri (new, url_str);
			TAILQ_INSERT_TAIL (&task->urls, new, next);
		}
	}
}

+ 7
- 0
util.c View File

@@ -194,6 +194,13 @@ pass_signal_worker (struct workq *workers, int signo)
}
}

/*
 * Convert the first size characters of str to lowercase, in place.
 *
 * str:  buffer to modify; it does not have to be NUL-terminated
 * size: number of characters to convert; 0 leaves the buffer untouched
 */
void convert_to_lowercase (char *str, unsigned int size)
{
	while (size --) {
		/*
		 * tolower () requires a value representable as unsigned char, and the
		 * increment is a separate statement so that str is not modified and
		 * read without sequencing.
		 */
		*str = (char)tolower ((unsigned char)*str);
		str ++;
	}
}

#ifndef HAVE_SETPROCTITLE

static char *title_buffer = 0;

+ 2
- 0
util.h View File

@@ -31,6 +31,8 @@ int event_make_socket_nonblocking(int);
void init_signals (struct sigaction *, sig_t);
/* Send specified signal to each worker */
void pass_signal_worker (struct workq *, int );
/* Convert the first size characters of str to lowercase, in place */
void convert_to_lowercase (char *str, unsigned int size);

#ifndef HAVE_SETPROCTITLE
int init_title(int argc, char *argv[], char *envp[]);

Loading…
Cancel
Save