diff options
Diffstat (limited to 'src/lua/lua_parsers.c')
-rw-r--r-- | src/lua/lua_parsers.c | 85 |
1 files changed, 76 insertions, 9 deletions
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c index f77b36952..eb7fa6bf5 100644 --- a/src/lua/lua_parsers.c +++ b/src/lua/lua_parsers.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2020 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -46,6 +46,14 @@ */ /*** + * @function parsers.parse_html_content(input, mempool) + * Parses HTML and returns the HTML content object for structure analysis + * @param {string|text} in input HTML + * @param {rspamd_mempool} mempool memory pool for HTML content management + * @return {html_content} HTML content object with tag structure + */ +LUA_FUNCTION_DEF(parsers, parse_html_content); +/*** * @function parsers.parse_mail_address(str, [pool]) * Parses email address and returns a table of tables in the following format: * @@ -93,6 +101,7 @@ static const struct luaL_reg parserslib_f[] = { LUA_INTERFACE_DEF(parsers, tokenize_text), LUA_INTERFACE_DEF(parsers, parse_html), + LUA_INTERFACE_DEF(parsers, parse_html_content), LUA_INTERFACE_DEF(parsers, parse_mail_address), LUA_INTERFACE_DEF(parsers, parse_content_type), LUA_INTERFACE_DEF(parsers, parse_smtp_date), @@ -108,8 +117,8 @@ int lua_parsers_tokenize_text(lua_State *L) struct rspamd_lua_text *t; struct rspamd_process_exception *ex; UText utxt = UTEXT_INITIALIZER; - GArray *res; - rspamd_stat_token_t *w; + rspamd_words_t *res; + rspamd_word_t *w; if (lua_type(L, 1) == LUA_TSTRING) { in = luaL_checklstring(L, 1, &len); @@ -175,13 +184,15 @@ int lua_parsers_tokenize_text(lua_State *L) lua_pushnil(L); } else { - lua_createtable(L, res->len, 0); + lua_createtable(L, kv_size(*res), 0); - for (i = 0; i < res->len; i++) { - w = &g_array_index(res, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*res); i++) { + w = &kv_A(*res, i); lua_pushlstring(L, w->original.begin, w->original.len); lua_rawseti(L, -2, i + 1); } + kv_destroy(*res); + g_free(res); } cur = exceptions; @@ -240,6 +251,62 @@ int lua_parsers_parse_html(lua_State *L) return 1; } +static int lua_parsers_parse_html_content(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t; + const char *start = NULL; + gsize len; + GByteArray *in; + rspamd_mempool_t *pool; + void *hc; + void **phc; + + if (lua_type(L, 1) == LUA_TUSERDATA) { + t = lua_check_text(L, 1); + + if (t != NULL) { + start = t->start; + len = t->len; + } + } + else if (lua_type(L, 1) == LUA_TSTRING) { + start = luaL_checklstring(L, 1, &len); + } + + if (lua_type(L, 2) != LUA_TUSERDATA) { + return luaL_error(L, "invalid arguments: mempool expected as second argument"); + } + + pool = rspamd_lua_check_mempool(L, 2); + if (!pool) { + return luaL_error(L, "invalid mempool argument"); + } + + if (start != NULL) { + in = g_byte_array_sized_new(len); + g_byte_array_append(in, start, len); + + hc = rspamd_html_process_part(pool, in); + + if (hc) { + phc = lua_newuserdata(L, sizeof(void *)); + *phc = hc; + rspamd_lua_setclass(L, rspamd_html_classname, -1); + } + else { + lua_pushnil(L); + } + + g_byte_array_free(in, TRUE); + } + else { + lua_pushnil(L); + } + + return 1; +} + int lua_parsers_parse_mail_address(lua_State *L) { LUA_TRACE_POINT; @@ -407,4 +474,4 @@ lua_load_parsers(lua_State *L) void luaopen_parsers(lua_State *L) { rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers); -}
\ No newline at end of file +} |