123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418 |
- /*-
- * Copyright 2020 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "lua_common.h"
- #include "tokenizers/tokenizers.h"
- #include "contrib/uthash/utlist.h"
- #include "libserver/html.h"
- #include "libmime/email_addr.h"
- #include "libmime/content_type.h"
- #include "libmime/mime_headers.h"
- #include "libmime/smtp_parsers.h"
- #include "lua_parsers.h"
-
- /***
- * @module rspamd_parsers
- * This module contains Lua-C interfaces to Rspamd parsers of different kinds.
- */
-
- /***
- * @function parsers.tokenize_text(input[, exceptions])
- * Create tokens from a text using optional exceptions list
- * @param {text/string} input input data
- * @param {table} exceptions optional table of `{start_pos, length}` pairs describing exceptions in the input
- * @return {table/strings} list of strings representing words in the text
- */
-
-
- /***
- * @function parsers.parse_html(input)
- * Parses HTML and returns the corresponding text
- * @param {string|text} input input HTML
- * @return {rspamd_text} processed text with no HTML tags
- */
-
- /***
- * @function parsers.parse_mail_address(str, [pool])
- * Parses a string with email addresses and returns a list of tables, each with the following fields:
- *
- * - `raw` - the original value without any processing
- * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
- * - `addr` - address part of the address
- * - `user` - user part (if present) of the address, e.g. `blah`
- * - `domain` - domain part (if present), e.g. `foo.com`
- * - `flags` - table with following keys set to true if given condition fulfilled:
- * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
- * - [ip] - domain is IPv4/IPv6 address
- * - [braced] - angled `<blah@foo.com>` address
- * - [quoted] - quoted user part
- * - [empty] - empty address
- * - [backslash] - user part contains backslash
- * - [8bit] - contains 8bit characters
- *
- * @param {string} str input string
- * @param {rspamd_mempool} pool optional memory pool to use; a temporary pool is created when omitted
- * @return {table/tables} parsed list of mail addresses
- */
-
- /***
- * @function parsers.parse_content_type(ct_string, mempool)
- * Parses content-type string to a table:
- * - `type`
- * - `subtype`
- * - `charset`
- * - `boundary`
- * - other attributes
- *
- * @param {string} ct_string content type as string
- * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
- * @return table or nil if cannot parse content type
- */
-
- /***
- * @function parsers.parse_smtp_date(str[, local_tz])
- * Converts an SMTP date string to unix timestamp
- * @param {string} str input string
- * @param {boolean} local_tz convert to local tz if `true`
- * @return {number} time as unix timestamp (converted to float), or `nil` and an error message if the date cannot be parsed
- */
-
- static const struct luaL_reg parserslib_f[] = {
- LUA_INTERFACE_DEF (parsers, tokenize_text),
- LUA_INTERFACE_DEF (parsers, parse_html),
- LUA_INTERFACE_DEF (parsers, parse_mail_address),
- LUA_INTERFACE_DEF (parsers, parse_content_type),
- LUA_INTERFACE_DEF (parsers, parse_smtp_date),
-
- {NULL, NULL}
- };
-
- gint
- lua_parsers_tokenize_text (lua_State *L)
- {
- LUA_TRACE_POINT;
- const gchar *in = NULL;
- gsize len = 0, pos, ex_len, i;
- GList *exceptions = NULL, *cur;
- struct rspamd_lua_text *t;
- struct rspamd_process_exception *ex;
- UText utxt = UTEXT_INITIALIZER;
- GArray *res;
- rspamd_stat_token_t *w;
-
- if (lua_type (L, 1) == LUA_TSTRING) {
- in = luaL_checklstring (L, 1, &len);
- }
- else if (lua_type (L, 1) == LUA_TUSERDATA) {
- t = lua_check_text (L, 1);
-
- if (t) {
- in = t->start;
- len = t->len;
- }
- }
-
- if (in == NULL) {
- lua_pushnil (L);
- return 1;
- }
-
- if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
- lua_pushvalue (L, 2);
- lua_pushnil (L);
-
- while (lua_next (L, -2) != 0) {
- if (lua_type (L, -1) == LUA_TTABLE) {
- lua_rawgeti (L, -1, 1);
- pos = luaL_checknumber (L, -1);
- lua_pop (L, 1);
- lua_rawgeti (L, -1, 2);
- ex_len = luaL_checknumber (L, -1);
- lua_pop (L, 1);
-
- if (ex_len > 0) {
- ex = g_malloc0 (sizeof (*ex));
- ex->pos = pos;
- ex->len = ex_len;
- ex->type = RSPAMD_EXCEPTION_GENERIC;
- exceptions = g_list_prepend (exceptions, ex);
- }
- }
- lua_pop (L, 1);
- }
-
- lua_pop (L, 1);
- }
-
- if (exceptions) {
- exceptions = g_list_reverse (exceptions);
- }
-
- UErrorCode uc_err = U_ZERO_ERROR;
- utext_openUTF8 (&utxt,
- in,
- len,
- &uc_err);
-
- res = rspamd_tokenize_text ((gchar *)in, len,
- &utxt,
- RSPAMD_TOKENIZE_UTF, NULL,
- exceptions,
- NULL, NULL, NULL);
-
- if (res == NULL) {
- lua_pushnil (L);
- }
- else {
- lua_createtable (L, res->len, 0);
-
- for (i = 0; i < res->len; i ++) {
- w = &g_array_index (res, rspamd_stat_token_t, i);
- lua_pushlstring (L, w->original.begin, w->original.len);
- lua_rawseti (L, -2, i + 1);
- }
- }
-
- cur = exceptions;
- while (cur) {
- ex = cur->data;
- g_free (ex);
- cur = g_list_next (cur);
- }
-
- g_list_free (exceptions);
- utext_close (&utxt);
-
- return 1;
- }
-
- gint
- lua_parsers_parse_html (lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_text *t;
- const gchar *start = NULL;
- gsize len;
- GByteArray *res, *in;
- rspamd_mempool_t *pool;
- struct html_content *hc;
-
- if (lua_type (L, 1) == LUA_TUSERDATA) {
- t = lua_check_text (L, 1);
-
- if (t != NULL) {
- start = t->start;
- len = t->len;
- }
- }
- else if (lua_type (L, 1) == LUA_TSTRING) {
- start = luaL_checklstring (L, 1, &len);
- }
-
- if (start != NULL) {
- pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
- hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
- in = g_byte_array_sized_new (len);
- g_byte_array_append (in, start, len);
-
- res = rspamd_html_process_part (pool, hc, in);
-
- t = lua_newuserdata (L, sizeof (*t));
- rspamd_lua_setclass (L, "rspamd{text}", -1);
- t->start = res->data;
- t->len = res->len;
- t->flags = RSPAMD_TEXT_FLAG_OWN;
-
- g_byte_array_free (res, FALSE);
- g_byte_array_free (in, TRUE);
- rspamd_mempool_delete (pool);
- }
- else {
- lua_pushnil (L);
- }
-
- return 1;
- }
-
- gint
- lua_parsers_parse_mail_address (lua_State *L)
- {
- LUA_TRACE_POINT;
- GPtrArray *addrs;
- gsize len;
- const gchar *str = luaL_checklstring (L, 1, &len);
- rspamd_mempool_t *pool;
- gboolean own_pool = FALSE;
-
- if (str) {
-
- if (lua_type (L, 2) == LUA_TUSERDATA) {
- pool = rspamd_lua_check_mempool (L, 2);
-
- if (pool == NULL) {
- return luaL_error (L, "invalid arguments");
- }
- }
- else {
- pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
- "lua parsers", 0);
- own_pool = TRUE;
- }
-
- addrs = rspamd_email_address_from_mime (pool, str, len, NULL, -1);
-
- if (addrs == NULL) {
- lua_pushnil (L);
- }
- else {
- lua_push_emails_address_list (L, addrs, 0);
- }
-
- if (own_pool) {
- rspamd_mempool_delete (pool);
- }
- }
- else {
- lua_pushnil (L);
- }
-
- return 1;
- }
-
- gint
- lua_parsers_parse_content_type (lua_State *L)
- {
- LUA_TRACE_POINT;
- gsize len;
- const gchar *ct_str = luaL_checklstring (L, 1, &len);
- rspamd_mempool_t *pool = rspamd_lua_check_mempool (L, 2);
- struct rspamd_content_type *ct;
-
- if (!ct_str || !pool) {
- return luaL_error (L, "invalid arguments");
- }
-
- ct = rspamd_content_type_parse (ct_str, len, pool);
-
- if (ct == NULL) {
- lua_pushnil (L);
- }
- else {
- GHashTableIter it;
- gpointer k, v;
-
- lua_createtable (L, 0, 4 + (ct->attrs ? g_hash_table_size (ct->attrs) : 0));
-
- if (ct->type.len > 0) {
- lua_pushstring (L, "type");
- lua_pushlstring (L, ct->type.begin, ct->type.len);
- lua_settable (L, -3);
- }
-
- if (ct->subtype.len > 0) {
- lua_pushstring (L, "subtype");
- lua_pushlstring (L, ct->subtype.begin, ct->subtype.len);
- lua_settable (L, -3);
- }
-
- if (ct->charset.len > 0) {
- lua_pushstring (L, "charset");
- lua_pushlstring (L, ct->charset.begin, ct->charset.len);
- lua_settable (L, -3);
- }
-
- if (ct->orig_boundary.len > 0) {
- lua_pushstring (L, "boundary");
- lua_pushlstring (L, ct->orig_boundary.begin, ct->orig_boundary.len);
- lua_settable (L, -3);
- }
-
- if (ct->attrs) {
- g_hash_table_iter_init (&it, ct->attrs);
-
- while (g_hash_table_iter_next (&it, &k, &v)) {
- struct rspamd_content_type_param *param =
- (struct rspamd_content_type_param *)v, *cur;
- guint i = 1;
-
- lua_pushlstring (L, param->name.begin, param->name.len);
- lua_createtable (L, 1, 0);
-
- DL_FOREACH (param, cur) {
- lua_pushlstring (L, cur->value.begin, cur->value.len);
- lua_rawseti (L, -2, i++);
- }
-
- lua_settable (L, -3);
- }
- }
- }
-
- return 1;
- }
-
- int
- lua_parsers_parse_smtp_date (lua_State *L)
- {
- gsize slen;
- const gchar *str = lua_tolstring (L, 1, &slen);
- GError *err = NULL;
-
- if (str == NULL) {
- return luaL_argerror (L, 1, "invalid argument");
- }
-
- time_t tt = rspamd_parse_smtp_date (str, slen, &err);
-
- if (err == NULL) {
- if (lua_isboolean (L, 2) && !!lua_toboolean (L, 2)) {
- struct tm t;
-
- rspamd_localtime (tt, &t);
- #if !defined(__sun)
- t.tm_gmtoff = 0;
- #endif
- t.tm_isdst = 0;
- tt = mktime (&t);
- }
-
- lua_pushnumber (L, tt);
- }
- else {
- lua_pushnil (L);
- lua_pushstring (L, err->message);
- g_error_free (err);
-
- return 2;
- }
-
- return 1;
- }
-
- static gint
- lua_load_parsers (lua_State * L)
- {
- lua_newtable (L);
- luaL_register (L, NULL, parserslib_f);
-
- return 1;
- }
-
- void
- luaopen_parsers (lua_State * L)
- {
- rspamd_lua_add_preload (L, "rspamd_parsers", lua_load_parsers);
- }
|