about | summary | refs | log | tree | commit | diff | stats
path: root/src/lua/lua_parsers.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lua/lua_parsers.c')
-rw-r--r-- src/lua/lua_parsers.c 85
1 files changed, 76 insertions, 9 deletions
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index f77b36952..eb7fa6bf5 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2020 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -46,6 +46,14 @@
*/
/***
+ * @function parsers.parse_html_content(input, mempool)
+ * Parses HTML and returns the HTML content object for structure analysis
+ * @param {string|text} input HTML source to parse
+ * @param {rspamd_mempool} mempool memory pool for HTML content management
+ * @return {html_content} HTML content object with tag structure
+ */
+LUA_FUNCTION_DEF(parsers, parse_html_content);
+/***
* @function parsers.parse_mail_address(str, [pool])
* Parses email address and returns a table of tables in the following format:
*
@@ -93,6 +101,7 @@
static const struct luaL_reg parserslib_f[] = {
LUA_INTERFACE_DEF(parsers, tokenize_text),
LUA_INTERFACE_DEF(parsers, parse_html),
+ LUA_INTERFACE_DEF(parsers, parse_html_content),
LUA_INTERFACE_DEF(parsers, parse_mail_address),
LUA_INTERFACE_DEF(parsers, parse_content_type),
LUA_INTERFACE_DEF(parsers, parse_smtp_date),
@@ -108,8 +117,8 @@ int lua_parsers_tokenize_text(lua_State *L)
struct rspamd_lua_text *t;
struct rspamd_process_exception *ex;
UText utxt = UTEXT_INITIALIZER;
- GArray *res;
- rspamd_stat_token_t *w;
+ rspamd_words_t *res;
+ rspamd_word_t *w;
if (lua_type(L, 1) == LUA_TSTRING) {
in = luaL_checklstring(L, 1, &len);
@@ -175,13 +184,15 @@ int lua_parsers_tokenize_text(lua_State *L)
lua_pushnil(L);
}
else {
- lua_createtable(L, res->len, 0);
+ lua_createtable(L, kv_size(*res), 0);
- for (i = 0; i < res->len; i++) {
- w = &g_array_index(res, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*res); i++) {
+ w = &kv_A(*res, i);
lua_pushlstring(L, w->original.begin, w->original.len);
lua_rawseti(L, -2, i + 1);
}
+ kv_destroy(*res);
+ g_free(res);
}
cur = exceptions;
@@ -240,6 +251,62 @@ int lua_parsers_parse_html(lua_State *L)
return 1;
}
+/*
+ * Lua binding for parsers.parse_html_content(input, mempool).
+ *
+ * Stack arguments:
+ *   1 - HTML source, either a text userdata or a plain Lua string
+ *   2 - mempool userdata, forwarded to rspamd_html_process_part()
+ *
+ * Pushes exactly one value: a userdata of class rspamd_html_classname that
+ * wraps the pointer returned by rspamd_html_process_part(), or nil when no
+ * input could be extracted from argument 1 or the processing call returned
+ * NULL.
+ *
+ * Raises a Lua error when argument 2 is not a userdata or when
+ * rspamd_lua_check_mempool() yields NULL for it.
+ */
+static int lua_parsers_parse_html_content(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const char *html = NULL;
+	gsize html_len;
+	rspamd_mempool_t *pool;
+
+	/* Argument 1 is inspected first so error precedence stays unchanged */
+	switch (lua_type(L, 1)) {
+	case LUA_TUSERDATA: {
+		struct rspamd_lua_text *txt = lua_check_text(L, 1);
+
+		if (txt != NULL) {
+			html = txt->start;
+			html_len = txt->len;
+		}
+		break;
+	}
+	case LUA_TSTRING:
+		html = luaL_checklstring(L, 1, &html_len);
+		break;
+	default:
+		/* Any other type leaves html == NULL, which yields nil below */
+		break;
+	}
+
+	if (lua_type(L, 2) != LUA_TUSERDATA) {
+		return luaL_error(L, "invalid arguments: mempool expected as second argument");
+	}
+
+	pool = rspamd_lua_check_mempool(L, 2);
+	if (pool == NULL) {
+		return luaL_error(L, "invalid mempool argument");
+	}
+
+	if (html == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* The processing call takes a GByteArray, so copy the input into one */
+	GByteArray *buf = g_byte_array_sized_new(html_len);
+	g_byte_array_append(buf, html, html_len);
+
+	void *content = rspamd_html_process_part(pool, buf);
+
+	if (content != NULL) {
+		/* The userdata stores only the pointer to the parsed content */
+		void **slot = lua_newuserdata(L, sizeof(void *));
+		*slot = content;
+		rspamd_lua_setclass(L, rspamd_html_classname, -1);
+	}
+	else {
+		lua_pushnil(L);
+	}
+
+	/* Release the temporary copy together with its data segment */
+	g_byte_array_free(buf, TRUE);
+
+	return 1;
+}
+
int lua_parsers_parse_mail_address(lua_State *L)
{
LUA_TRACE_POINT;
@@ -407,4 +474,4 @@ lua_load_parsers(lua_State *L)
void luaopen_parsers(lua_State *L)
{
rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers);
-} \ No newline at end of file
+}