Browse Source

Add HTML tags interface for LUA

tags/1.2.0
Vsevolod Stakhov 8 years ago
parent
commit
2fcc3bbac0
4 changed files with 364 additions and 208 deletions
  1. 3
    189
      src/libserver/html.c
  2. 1
    0
      src/libserver/html.h
  3. 208
    0
      src/libserver/html_tags.h
  4. 152
    19
      src/lua/lua_html.c

+ 3
- 189
src/libserver/html.c View File

@@ -18,199 +18,11 @@
#include "rspamd.h"
#include "message.h"
#include "html.h"
#include "html_tags.h"
#include "url.h"

static sig_atomic_t tags_sorted = 0;

/*
 * Known HTML tags.
 *
 * Enumerators take consecutive values starting from Tag_UNKNOWN (0), so the
 * order of entries defines the numeric id of every tag. N_TAGS is a sentinel
 * equal to the number of entries above it and must stay last.
 */
typedef enum
{
Tag_UNKNOWN, /**< Unknown tag! */
Tag_A, /**< A */
Tag_ABBR, /**< ABBR */
Tag_ACRONYM, /**< ACRONYM */
Tag_ADDRESS, /**< ADDRESS */
Tag_ALIGN, /**< ALIGN */
Tag_APPLET, /**< APPLET */
Tag_AREA, /**< AREA */
Tag_B, /**< B */
Tag_BASE, /**< BASE */
Tag_BASEFONT, /**< BASEFONT */
Tag_BDO, /**< BDO */
Tag_BGSOUND, /**< BGSOUND */
Tag_BIG, /**< BIG */
Tag_BLINK, /**< BLINK */
Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
Tag_BODY, /**< BODY */
Tag_BR, /**< BR */
Tag_BUTTON, /**< BUTTON */
Tag_CAPTION, /**< CAPTION */
Tag_CENTER, /**< CENTER */
Tag_CITE, /**< CITE */
Tag_CODE, /**< CODE */
Tag_COL, /**< COL */
Tag_COLGROUP, /**< COLGROUP */
Tag_COMMENT, /**< COMMENT */
Tag_DD, /**< DD */
Tag_DEL, /**< DEL */
Tag_DFN, /**< DFN */
Tag_DIR, /**< DIR */
Tag_DIV, /**< DIV */
Tag_DL, /**< DL */
Tag_DT, /**< DT */
Tag_EM, /**< EM */
Tag_EMBED, /**< EMBED */
Tag_FIELDSET, /**< FIELDSET */
Tag_FONT, /**< FONT */
Tag_FORM, /**< FORM */
Tag_FRAME, /**< FRAME */
Tag_FRAMESET, /**< FRAMESET */
Tag_H1, /**< H1 */
Tag_H2, /**< H2 */
Tag_H3, /**< H3 */
Tag_H4, /**< H4 */
Tag_H5, /**< H5 */
Tag_H6, /**< H6 */
Tag_HEAD, /**< HEAD */
Tag_HR, /**< HR */
Tag_HTML, /**< HTML */
Tag_I, /**< I */
Tag_IFRAME, /**< IFRAME */
Tag_ILAYER, /**< ILAYER */
Tag_IMG, /**< IMG */
Tag_INPUT, /**< INPUT */
Tag_INS, /**< INS */
Tag_ISINDEX, /**< ISINDEX */
Tag_KBD, /**< KBD */
Tag_KEYGEN, /**< KEYGEN */
Tag_LABEL, /**< LABEL */
Tag_LAYER, /**< LAYER */
Tag_LEGEND, /**< LEGEND */
Tag_LI, /**< LI */
Tag_LINK, /**< LINK */
Tag_LISTING, /**< LISTING */
Tag_MAP, /**< MAP */
Tag_MARQUEE, /**< MARQUEE */
Tag_MENU, /**< MENU */
Tag_META, /**< META */
Tag_MULTICOL, /**< MULTICOL */
Tag_NOBR, /**< NOBR */
Tag_NOEMBED, /**< NOEMBED */
Tag_NOFRAMES, /**< NOFRAMES */
Tag_NOLAYER, /**< NOLAYER */
Tag_NOSAVE, /**< NOSAVE */
Tag_NOSCRIPT, /**< NOSCRIPT */
Tag_OBJECT, /**< OBJECT */
Tag_OL, /**< OL */
Tag_OPTGROUP, /**< OPTGROUP */
Tag_OPTION, /**< OPTION */
Tag_P, /**< P */
Tag_PARAM, /**< PARAM */
Tag_PLAINTEXT, /**< PLAINTEXT */
Tag_PRE, /**< PRE */
Tag_Q, /**< Q */
Tag_RB, /**< RB */
Tag_RBC, /**< RBC */
Tag_RP, /**< RP */
Tag_RT, /**< RT */
Tag_RTC, /**< RTC */
Tag_RUBY, /**< RUBY */
Tag_S, /**< S */
Tag_SAMP, /**< SAMP */
Tag_SCRIPT, /**< SCRIPT */
Tag_SELECT, /**< SELECT */
Tag_SERVER, /**< SERVER */
Tag_SERVLET, /**< SERVLET */
Tag_SMALL, /**< SMALL */
Tag_SPACER, /**< SPACER */
Tag_SPAN, /**< SPAN */
Tag_STRIKE, /**< STRIKE */
Tag_STRONG, /**< STRONG */
Tag_STYLE, /**< STYLE */
Tag_SUB, /**< SUB */
Tag_SUP, /**< SUP */
Tag_TABLE, /**< TABLE */
Tag_TBODY, /**< TBODY */
Tag_TD, /**< TD */
Tag_TEXTAREA, /**< TEXTAREA */
Tag_TFOOT, /**< TFOOT */
Tag_TH, /**< TH */
Tag_THEAD, /**< THEAD */
Tag_TITLE, /**< TITLE */
Tag_TR, /**< TR */
Tag_TT, /**< TT */
Tag_U, /**< U */
Tag_UL, /**< UL */
Tag_VAR, /**< VAR */
Tag_WBR, /**< WBR */
Tag_XMP, /**< XMP */
Tag_XML, /**< XML */
Tag_NEXTID, /**< NEXTID */

N_TAGS /**< Must be last */
} tag_id_t;

/*
 * Tag flags. Every CM_* (content model) and FL_* (per-tag state) constant
 * below occupies its own bit, so any combination can be OR-ed into a single
 * integer flags word and tested with a bitwise AND.
 */
#define CM_UNKNOWN 0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST (1 << 5)
/* Elements that mark definition list item ("DL", "DT"). */
#define CM_DEFLIST (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW (1 << 9)
/* Elements whose content must be protected against white space movement.
Includes some elements that can be found in forms. */
#define CM_FIELD (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
such as OBJECT or APPLET. */
#define CM_OBJECT (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)
/* Elements whose content needs to be indented only if containing one
CM_BLOCK element. */
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
/* User defined elements. Used to determine how attributes without value
should be printed. */
#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)
/* Unique elements */
#define CM_UNIQUE (1 << 22)
/* XML tag */
#define FL_XML (1 << 23)
/* Closing tag */
#define FL_CLOSING (1 << 24)
/* Fully closed tag (e.g. <a attrs />) */
#define FL_CLOSED (1 << 25)
/* NOTE(review): the next three flags are undocumented here; going by their
names they presumably mark a malformed tag, a tag to be skipped and a tag
that opens a block - confirm against the parser before relying on this. */
#define FL_BROKEN (1 << 26)
#define FL_IGNORE (1 << 27)
#define FL_BLOCK (1 << 28)

struct html_tag_def {
gint id;
const gchar *name;
@@ -918,6 +730,8 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
nnode);
}

tag->parent = *cur_level;

if (!(tag->flags & CM_INLINE)) {
/* Block tag */
nnode = g_node_new (tag);

+ 1
- 0
src/libserver/html.h View File

@@ -83,6 +83,7 @@ struct html_tag {
struct html_tag_component name;
GQueue *params;
gpointer extra; /** Additional data associated with tag (e.g. image) */
GNode *parent;
};

/* Forwarded declaration */

+ 208
- 0
src/libserver/html_tags.h View File

@@ -0,0 +1,208 @@
/*-
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SRC_LIBSERVER_HTML_TAGS_H_
#define SRC_LIBSERVER_HTML_TAGS_H_

/*
 * Known HTML tags.
 *
 * Enumerators take consecutive values starting from Tag_UNKNOWN (0), so the
 * order of entries defines the numeric id of every tag. N_TAGS is a sentinel
 * equal to the number of entries above it and must stay last.
 */
typedef enum
{
Tag_UNKNOWN, /**< Unknown tag! */
Tag_A, /**< A */
Tag_ABBR, /**< ABBR */
Tag_ACRONYM, /**< ACRONYM */
Tag_ADDRESS, /**< ADDRESS */
Tag_ALIGN, /**< ALIGN */
Tag_APPLET, /**< APPLET */
Tag_AREA, /**< AREA */
Tag_B, /**< B */
Tag_BASE, /**< BASE */
Tag_BASEFONT, /**< BASEFONT */
Tag_BDO, /**< BDO */
Tag_BGSOUND, /**< BGSOUND */
Tag_BIG, /**< BIG */
Tag_BLINK, /**< BLINK */
Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
Tag_BODY, /**< BODY */
Tag_BR, /**< BR */
Tag_BUTTON, /**< BUTTON */
Tag_CAPTION, /**< CAPTION */
Tag_CENTER, /**< CENTER */
Tag_CITE, /**< CITE */
Tag_CODE, /**< CODE */
Tag_COL, /**< COL */
Tag_COLGROUP, /**< COLGROUP */
Tag_COMMENT, /**< COMMENT */
Tag_DD, /**< DD */
Tag_DEL, /**< DEL */
Tag_DFN, /**< DFN */
Tag_DIR, /**< DIR */
Tag_DIV, /**< DIV */
Tag_DL, /**< DL */
Tag_DT, /**< DT */
Tag_EM, /**< EM */
Tag_EMBED, /**< EMBED */
Tag_FIELDSET, /**< FIELDSET */
Tag_FONT, /**< FONT */
Tag_FORM, /**< FORM */
Tag_FRAME, /**< FRAME */
Tag_FRAMESET, /**< FRAMESET */
Tag_H1, /**< H1 */
Tag_H2, /**< H2 */
Tag_H3, /**< H3 */
Tag_H4, /**< H4 */
Tag_H5, /**< H5 */
Tag_H6, /**< H6 */
Tag_HEAD, /**< HEAD */
Tag_HR, /**< HR */
Tag_HTML, /**< HTML */
Tag_I, /**< I */
Tag_IFRAME, /**< IFRAME */
Tag_ILAYER, /**< ILAYER */
Tag_IMG, /**< IMG */
Tag_INPUT, /**< INPUT */
Tag_INS, /**< INS */
Tag_ISINDEX, /**< ISINDEX */
Tag_KBD, /**< KBD */
Tag_KEYGEN, /**< KEYGEN */
Tag_LABEL, /**< LABEL */
Tag_LAYER, /**< LAYER */
Tag_LEGEND, /**< LEGEND */
Tag_LI, /**< LI */
Tag_LINK, /**< LINK */
Tag_LISTING, /**< LISTING */
Tag_MAP, /**< MAP */
Tag_MARQUEE, /**< MARQUEE */
Tag_MENU, /**< MENU */
Tag_META, /**< META */
Tag_MULTICOL, /**< MULTICOL */
Tag_NOBR, /**< NOBR */
Tag_NOEMBED, /**< NOEMBED */
Tag_NOFRAMES, /**< NOFRAMES */
Tag_NOLAYER, /**< NOLAYER */
Tag_NOSAVE, /**< NOSAVE */
Tag_NOSCRIPT, /**< NOSCRIPT */
Tag_OBJECT, /**< OBJECT */
Tag_OL, /**< OL */
Tag_OPTGROUP, /**< OPTGROUP */
Tag_OPTION, /**< OPTION */
Tag_P, /**< P */
Tag_PARAM, /**< PARAM */
Tag_PLAINTEXT, /**< PLAINTEXT */
Tag_PRE, /**< PRE */
Tag_Q, /**< Q */
Tag_RB, /**< RB */
Tag_RBC, /**< RBC */
Tag_RP, /**< RP */
Tag_RT, /**< RT */
Tag_RTC, /**< RTC */
Tag_RUBY, /**< RUBY */
Tag_S, /**< S */
Tag_SAMP, /**< SAMP */
Tag_SCRIPT, /**< SCRIPT */
Tag_SELECT, /**< SELECT */
Tag_SERVER, /**< SERVER */
Tag_SERVLET, /**< SERVLET */
Tag_SMALL, /**< SMALL */
Tag_SPACER, /**< SPACER */
Tag_SPAN, /**< SPAN */
Tag_STRIKE, /**< STRIKE */
Tag_STRONG, /**< STRONG */
Tag_STYLE, /**< STYLE */
Tag_SUB, /**< SUB */
Tag_SUP, /**< SUP */
Tag_TABLE, /**< TABLE */
Tag_TBODY, /**< TBODY */
Tag_TD, /**< TD */
Tag_TEXTAREA, /**< TEXTAREA */
Tag_TFOOT, /**< TFOOT */
Tag_TH, /**< TH */
Tag_THEAD, /**< THEAD */
Tag_TITLE, /**< TITLE */
Tag_TR, /**< TR */
Tag_TT, /**< TT */
Tag_U, /**< U */
Tag_UL, /**< UL */
Tag_VAR, /**< VAR */
Tag_WBR, /**< WBR */
Tag_XMP, /**< XMP */
Tag_XML, /**< XML */
Tag_NEXTID, /**< NEXTID */

N_TAGS /**< Must be last */
} tag_id_t;

/*
 * Tag flags. Every CM_* (content model) and FL_* (per-tag state) constant
 * below occupies its own bit, so any combination can be OR-ed into a single
 * integer flags word and tested with a bitwise AND.
 */
#define CM_UNKNOWN 0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST (1 << 5)
/* Elements that mark definition list item ("DL", "DT"). */
#define CM_DEFLIST (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW (1 << 9)
/* Elements whose content must be protected against white space movement.
Includes some elements that can be found in forms. */
#define CM_FIELD (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
such as OBJECT or APPLET. */
#define CM_OBJECT (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)
/* Elements whose content needs to be indented only if containing one
CM_BLOCK element. */
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
/* User defined elements. Used to determine how attributes without value
should be printed. */
#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)
/* Unique elements */
#define CM_UNIQUE (1 << 22)
/* XML tag */
#define FL_XML (1 << 23)
/* Closing tag */
#define FL_CLOSING (1 << 24)
/* Fully closed tag (e.g. <a attrs />) */
#define FL_CLOSED (1 << 25)
/* NOTE(review): the next three flags are undocumented here; going by their
names they presumably mark a malformed tag, a tag to be skipped and a tag
that opens a block - confirm against the parser before relying on this. */
#define FL_BROKEN (1 << 26)
#define FL_IGNORE (1 << 27)
#define FL_BLOCK (1 << 28)

#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */

+ 152
- 19
src/lua/lua_html.c View File

@@ -16,6 +16,7 @@
#include "lua_common.h"
#include "message.h"
#include "html.h"
#include "html_tags.h"
#include "images.h"

/***
@@ -103,6 +104,33 @@ static const struct luaL_reg htmllib_m[] = {
{NULL, NULL}
};

/***
* @method html_tag:get_type()
* Returns the string representation of the HTML type of a tag
* @return {string} type of tag
*/
LUA_FUNCTION_DEF (html_tag, get_type);
/***
* @method html_tag:get_extra()
* Returns extra data associated with the tag
* @return {url|image|nil} extra data associated with the tag
*/
LUA_FUNCTION_DEF (html_tag, get_extra);
/***
* @method html_tag:get_parent()
* Returns the parent node of the specified tag
* @return {html_tag} parent object for the specified tag
*/
LUA_FUNCTION_DEF (html_tag, get_parent);

/*
 * Method table for html_tag objects; luaopen_html registers it under the
 * "rspamd{html_tag}" class name. The {NULL, NULL} entry terminates the list.
 */
static const struct luaL_reg taglib_m[] = {
LUA_INTERFACE_DEF (html_tag, get_type),
LUA_INTERFACE_DEF (html_tag, get_extra),
LUA_INTERFACE_DEF (html_tag, get_parent),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}
};

static struct html_content *
lua_check_html (lua_State * L, gint pos)
{
@@ -111,6 +139,14 @@ lua_check_html (lua_State * L, gint pos)
return ud ? *((struct html_content **)ud) : NULL;
}

/*
 * Fetches the html_tag pointer stored in the "rspamd{html_tag}" userdata at
 * stack index `pos`. Type validation is delegated to luaL_checkudata() and
 * luaL_argcheck(); NULL is returned only if no userdata was obtained.
 */
static struct html_tag *
lua_check_html_tag (lua_State * L, gint pos)
{
	struct html_tag **holder;

	holder = luaL_checkudata (L, pos, "rspamd{html_tag}");
	luaL_argcheck (L, holder != NULL, pos, "'html_tag' expected");

	if (holder == NULL) {
		return NULL;
	}

	/* The userdata is a single pointer slot holding the tag address */
	return *holder;
}

static gint
lua_html_has_tag (lua_State *L)
{
@@ -170,11 +206,44 @@ lua_html_has_property (lua_State *L)
return 1;
}

/*
 * Pushes a new Lua table describing `img` onto the stack.
 *
 * Fields set on the table:
 *   src      - image source string (only when img->src is set)
 *   tag      - "rspamd{html_tag}" userdata wrapping img->tag (only when set)
 *   height   - img->height as a number
 *   width    - img->width as a number
 *   embedded - boolean, true when RSPAMD_HTML_FLAG_IMAGE_EMBEDDED is set
 */
static void
lua_html_push_image (lua_State *L, struct html_image *img)
{
	struct html_tag **tag_slot;

	lua_newtable (L);

	if (img->src != NULL) {
		lua_pushstring (L, "src");
		lua_pushstring (L, img->src);
		lua_settable (L, -3);
	}

	if (img->tag != NULL) {
		lua_pushstring (L, "tag");
		/* The userdata is a single pointer slot referring to the tag */
		tag_slot = lua_newuserdata (L, sizeof (*tag_slot));
		*tag_slot = img->tag;
		rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
		lua_settable (L, -3);
	}

	lua_pushstring (L, "height");
	lua_pushnumber (L, img->height);
	lua_settable (L, -3);

	lua_pushstring (L, "width");
	lua_pushnumber (L, img->width);
	lua_settable (L, -3);

	lua_pushstring (L, "embedded");
	lua_pushboolean (L, img->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED);
	lua_settable (L, -3);
}

static gint
lua_html_get_images (lua_State *L)
{
struct html_content *hc = lua_check_html (L, 1);
struct html_image *img;

guint i;

if (hc != NULL) {
@@ -183,25 +252,7 @@ lua_html_get_images (lua_State *L)
if (hc->images) {
for (i = 0; i < hc->images->len; i ++) {
img = g_ptr_array_index (hc->images, i);

lua_newtable (L);

if (img->src) {
lua_pushstring (L, "src");
lua_pushstring (L, img->src);
lua_settable (L, -3);
}

lua_pushstring (L, "height");
lua_pushnumber (L, img->height);
lua_settable (L, -3);
lua_pushstring (L, "width");
lua_pushnumber (L, img->width);
lua_settable (L, -3);
lua_pushstring (L, "embedded");
lua_pushboolean (L, img->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED);
lua_settable (L, -3);

lua_html_push_image (L, img);
lua_rawseti (L, -2, i + 1);
}
}
@@ -279,9 +330,91 @@ lua_html_get_blocks (lua_State *L)
return 1;
}

/*
 * Lua binding for html_tag:get_type().
 *
 * Pushes the name returned by rspamd_html_tag_by_id() for the tag's id, or
 * nil when that lookup returns NULL. Raises a Lua error with an explicit
 * message when the userdata at index 1 yields no tag pointer.
 *
 * Returns the number of Lua results (1).
 */
static gint
lua_html_tag_get_type (lua_State *L)
{
	struct html_tag *tag = lua_check_html_tag (L, 1);
	const gchar *tagname;

	if (tag == NULL) {
		/*
		 * lua_error() raises whatever value is on the stack top; nothing
		 * was pushed here, so the error object would be the argument
		 * itself. luaL_error() pushes a proper message instead.
		 */
		return luaL_error (L, "invalid arguments");
	}

	tagname = rspamd_html_tag_by_id (tag->id);

	if (tagname != NULL) {
		lua_pushstring (L, tagname);
	}
	else {
		lua_pushnil (L);
	}

	return 1;
}

/*
 * Lua binding for html_tag:get_parent().
 *
 * Pushes an "rspamd{html_tag}" userdata wrapping the tag stored in the
 * parent GNode, or nil when the tag has no parent node (or the node carries
 * no data). Raises a Lua error with an explicit message when the userdata at
 * index 1 yields no tag pointer.
 *
 * Returns the number of Lua results (1).
 */
static gint
lua_html_tag_get_parent (lua_State *L)
{
	struct html_tag *tag = lua_check_html_tag (L, 1), **ptag;
	GNode *node;

	if (tag == NULL) {
		/*
		 * lua_error() raises whatever value is on the stack top; nothing
		 * was pushed here, so use luaL_error() to supply a real message.
		 */
		return luaL_error (L, "invalid arguments");
	}

	node = tag->parent;

	if (node != NULL && node->data != NULL) {
		ptag = lua_newuserdata (L, sizeof (*ptag));
		*ptag = node->data;
		rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
	}
	else {
		/*
		 * A result must always be pushed: returning 1 with nothing pushed
		 * would hand the caller the value already on the stack top, i.e.
		 * the tag itself would be reported as its own parent.
		 */
		lua_pushnil (L);
	}

	return 1;
}

/*
 * Lua binding for html_tag:get_extra().
 *
 * Pushes the extra data attached to the tag:
 *   - A and IFRAME tags: tag->extra wrapped as "rspamd{url}" userdata
 *   - IMG tags: an image table built by lua_html_push_image()
 *   - any other tag, or no extra data at all: nil
 * Raises a Lua error with an explicit message when the userdata at index 1
 * yields no tag pointer.
 *
 * Returns the number of Lua results (1).
 */
static gint
lua_html_tag_get_extra (lua_State *L)
{
	struct html_tag *tag = lua_check_html_tag (L, 1);
	struct html_image *img;
	struct rspamd_url **purl;

	if (tag == NULL) {
		/*
		 * lua_error() raises whatever value is on the stack top; nothing
		 * was pushed here, so use luaL_error() to supply a real message.
		 */
		return luaL_error (L, "invalid arguments");
	}

	if (tag->extra == NULL) {
		lua_pushnil (L);
	}
	else if (tag->id == Tag_A || tag->id == Tag_IFRAME) {
		/* For these tags the extra data is treated as a URL */
		purl = lua_newuserdata (L, sizeof (*purl));
		*purl = tag->extra;
		rspamd_lua_setclass (L, "rspamd{url}", -1);
	}
	else if (tag->id == Tag_IMG) {
		img = tag->extra;
		lua_html_push_image (L, img);
	}
	else {
		/* Extra data of a kind that has no Lua representation */
		lua_pushnil (L);
	}

	return 1;
}

/*
 * Registers the Lua classes implemented in this file: "rspamd{html}" with
 * the htmllib_m method table, then "rspamd{html_tag}" with taglib_m. One
 * stack slot is popped after each registration.
 */
void
luaopen_html (lua_State * L)
{
	static const struct {
		const gchar *classname;
		const struct luaL_reg *methods;
	} classes[] = {
		{"rspamd{html}", htmllib_m},
		{"rspamd{html_tag}", taglib_m}
	};
	guint idx;

	for (idx = 0; idx < sizeof (classes) / sizeof (classes[0]); idx ++) {
		rspamd_lua_new_class (L, classes[idx].classname,
				classes[idx].methods);
		lua_pop (L, 1);
	}
}

Loading…
Cancel
Save