#include "rspamd.h"
#include "message.h"
#include "html.h"
+#include "html_tags.h"
#include "url.h"
static sig_atomic_t tags_sorted = 0;
-/* Known HTML tags */
-typedef enum
-{
- Tag_UNKNOWN, /**< Unknown tag! */
- Tag_A, /**< A */
- Tag_ABBR, /**< ABBR */
- Tag_ACRONYM, /**< ACRONYM */
- Tag_ADDRESS, /**< ADDRESS */
- Tag_ALIGN, /**< ALIGN */
- Tag_APPLET, /**< APPLET */
- Tag_AREA, /**< AREA */
- Tag_B, /**< B */
- Tag_BASE, /**< BASE */
- Tag_BASEFONT, /**< BASEFONT */
- Tag_BDO, /**< BDO */
- Tag_BGSOUND, /**< BGSOUND */
- Tag_BIG, /**< BIG */
- Tag_BLINK, /**< BLINK */
- Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
- Tag_BODY, /**< BODY */
- Tag_BR, /**< BR */
- Tag_BUTTON, /**< BUTTON */
- Tag_CAPTION, /**< CAPTION */
- Tag_CENTER, /**< CENTER */
- Tag_CITE, /**< CITE */
- Tag_CODE, /**< CODE */
- Tag_COL, /**< COL */
- Tag_COLGROUP, /**< COLGROUP */
- Tag_COMMENT, /**< COMMENT */
- Tag_DD, /**< DD */
- Tag_DEL, /**< DEL */
- Tag_DFN, /**< DFN */
- Tag_DIR, /**< DIR */
- Tag_DIV, /**< DIF */
- Tag_DL, /**< DL */
- Tag_DT, /**< DT */
- Tag_EM, /**< EM */
- Tag_EMBED, /**< EMBED */
- Tag_FIELDSET, /**< FIELDSET */
- Tag_FONT, /**< FONT */
- Tag_FORM, /**< FORM */
- Tag_FRAME, /**< FRAME */
- Tag_FRAMESET, /**< FRAMESET */
- Tag_H1, /**< H1 */
- Tag_H2, /**< H2 */
- Tag_H3, /**< H3 */
- Tag_H4, /**< H4 */
- Tag_H5, /**< H5 */
- Tag_H6, /**< H6 */
- Tag_HEAD, /**< HEAD */
- Tag_HR, /**< HR */
- Tag_HTML, /**< HTML */
- Tag_I, /**< I */
- Tag_IFRAME, /**< IFRAME */
- Tag_ILAYER, /**< ILAYER */
- Tag_IMG, /**< IMG */
- Tag_INPUT, /**< INPUT */
- Tag_INS, /**< INS */
- Tag_ISINDEX, /**< ISINDEX */
- Tag_KBD, /**< KBD */
- Tag_KEYGEN, /**< KEYGEN */
- Tag_LABEL, /**< LABEL */
- Tag_LAYER, /**< LAYER */
- Tag_LEGEND, /**< LEGEND */
- Tag_LI, /**< LI */
- Tag_LINK, /**< LINK */
- Tag_LISTING, /**< LISTING */
- Tag_MAP, /**< MAP */
- Tag_MARQUEE, /**< MARQUEE */
- Tag_MENU, /**< MENU */
- Tag_META, /**< META */
- Tag_MULTICOL, /**< MULTICOL */
- Tag_NOBR, /**< NOBR */
- Tag_NOEMBED, /**< NOEMBED */
- Tag_NOFRAMES, /**< NOFRAMES */
- Tag_NOLAYER, /**< NOLAYER */
- Tag_NOSAVE, /**< NOSAVE */
- Tag_NOSCRIPT, /**< NOSCRIPT */
- Tag_OBJECT, /**< OBJECT */
- Tag_OL, /**< OL */
- Tag_OPTGROUP, /**< OPTGROUP */
- Tag_OPTION, /**< OPTION */
- Tag_P, /**< P */
- Tag_PARAM, /**< PARAM */
- Tag_PLAINTEXT, /**< PLAINTEXT */
- Tag_PRE, /**< PRE */
- Tag_Q, /**< Q */
- Tag_RB, /**< RB */
- Tag_RBC, /**< RBC */
- Tag_RP, /**< RP */
- Tag_RT, /**< RT */
- Tag_RTC, /**< RTC */
- Tag_RUBY, /**< RUBY */
- Tag_S, /**< S */
- Tag_SAMP, /**< SAMP */
- Tag_SCRIPT, /**< SCRIPT */
- Tag_SELECT, /**< SELECT */
- Tag_SERVER, /**< SERVER */
- Tag_SERVLET, /**< SERVLET */
- Tag_SMALL, /**< SMALL */
- Tag_SPACER, /**< SPACER */
- Tag_SPAN, /**< SPAN */
- Tag_STRIKE, /**< STRIKE */
- Tag_STRONG, /**< STRONG */
- Tag_STYLE, /**< STYLE */
- Tag_SUB, /**< SUB */
- Tag_SUP, /**< SUP */
- Tag_TABLE, /**< TABLE */
- Tag_TBODY, /**< TBODY */
- Tag_TD, /**< TD */
- Tag_TEXTAREA, /**< TEXTAREA */
- Tag_TFOOT, /**< TFOOT */
- Tag_TH, /**< TH */
- Tag_THEAD, /**< THEAD */
- Tag_TITLE, /**< TITLE */
- Tag_TR, /**< TR */
- Tag_TT, /**< TT */
- Tag_U, /**< U */
- Tag_UL, /**< UL */
- Tag_VAR, /**< VAR */
- Tag_WBR, /**< WBR */
- Tag_XMP, /**< XMP */
- Tag_XML, /**< XML */
- Tag_NEXTID, /**< NEXTID */
-
- N_TAGS /**< Must be last */
-} tag_id_t;
-
-#define CM_UNKNOWN 0
-/* Elements with no content. Map to HTML specification. */
-#define CM_EMPTY (1 << 0)
-/* Elements that appear outside of "BODY". */
-#define CM_HTML (1 << 1)
-/* Elements that can appear within HEAD. */
-#define CM_HEAD (1 << 2)
-/* HTML "block" elements. */
-#define CM_BLOCK (1 << 3)
-/* HTML "inline" elements. */
-#define CM_INLINE (1 << 4)
-/* Elements that mark list item ("LI"). */
-#define CM_LIST (1 << 5)
-/* Elements that mark definition list item ("DL", "DT"). */
-#define CM_DEFLIST (1 << 6)
-/* Elements that can appear inside TABLE. */
-#define CM_TABLE (1 << 7)
-/* Used for "THEAD", "TFOOT" or "TBODY". */
-#define CM_ROWGRP (1 << 8)
-/* Used for "TD", "TH" */
-#define CM_ROW (1 << 9)
-/* Elements whose content must be protected against white space movement.
- Includes some elements that can found in forms. */
-#define CM_FIELD (1 << 10)
-/* Used to avoid propagating inline emphasis inside some elements
- such as OBJECT or APPLET. */
-#define CM_OBJECT (1 << 11)
-/* Elements that allows "PARAM". */
-#define CM_PARAM (1 << 12)
-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
-#define CM_FRAMES (1 << 13)
-/* Heading elements (h1, h2, ...). */
-#define CM_HEADING (1 << 14)
-/* Elements with an optional end tag. */
-#define CM_OPT (1 << 15)
-/* Elements that use "align" attribute for vertical position. */
-#define CM_IMG (1 << 16)
-/* Elements with inline and block model. Used to avoid calling InlineDup. */
-#define CM_MIXED (1 << 17)
-/* Elements whose content needs to be indented only if containing one
- CM_BLOCK element. */
-#define CM_NO_INDENT (1 << 18)
-/* Elements that are obsolete (such as "dir", "menu"). */
-#define CM_OBSOLETE (1 << 19)
-/* User defined elements. Used to determine how attributes wihout value
- should be printed. */
-#define CM_NEW (1 << 20)
-/* Elements that cannot be omitted. */
-#define CM_OMITST (1 << 21)
-/* Unique elements */
-#define CM_UNIQUE (1 << 22)
-/* XML tag */
-#define FL_XML (1 << 23)
-/* Closing tag */
-#define FL_CLOSING (1 << 24)
-/* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED (1 << 25)
-#define FL_BROKEN (1 << 26)
-#define FL_IGNORE (1 << 27)
-#define FL_BLOCK (1 << 28)
-
struct html_tag_def {
gint id;
const gchar *name;
nnode);
}
+ tag->parent = *cur_level;
+
if (!(tag->flags & CM_INLINE)) {
/* Block tag */
nnode = g_node_new (tag);
struct html_tag_component name;
GQueue *params;
gpointer extra; /** Additional data associated with tag (e.g. image) */
+ GNode *parent;
};
/* Forwarded declaration */
--- /dev/null
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBSERVER_HTML_TAGS_H_
+#define SRC_LIBSERVER_HTML_TAGS_H_
+
+/* Known HTML tags */
+typedef enum
+{
+ Tag_UNKNOWN, /**< Unknown tag! */
+ Tag_A, /**< A */
+ Tag_ABBR, /**< ABBR */
+ Tag_ACRONYM, /**< ACRONYM */
+ Tag_ADDRESS, /**< ADDRESS */
+ Tag_ALIGN, /**< ALIGN */
+ Tag_APPLET, /**< APPLET */
+ Tag_AREA, /**< AREA */
+ Tag_B, /**< B */
+ Tag_BASE, /**< BASE */
+ Tag_BASEFONT, /**< BASEFONT */
+ Tag_BDO, /**< BDO */
+ Tag_BGSOUND, /**< BGSOUND */
+ Tag_BIG, /**< BIG */
+ Tag_BLINK, /**< BLINK */
+ Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
+ Tag_BODY, /**< BODY */
+ Tag_BR, /**< BR */
+ Tag_BUTTON, /**< BUTTON */
+ Tag_CAPTION, /**< CAPTION */
+ Tag_CENTER, /**< CENTER */
+ Tag_CITE, /**< CITE */
+ Tag_CODE, /**< CODE */
+ Tag_COL, /**< COL */
+ Tag_COLGROUP, /**< COLGROUP */
+ Tag_COMMENT, /**< COMMENT */
+ Tag_DD, /**< DD */
+ Tag_DEL, /**< DEL */
+ Tag_DFN, /**< DFN */
+ Tag_DIR, /**< DIR */
+ Tag_DIV, /**< DIV */
+ Tag_DL, /**< DL */
+ Tag_DT, /**< DT */
+ Tag_EM, /**< EM */
+ Tag_EMBED, /**< EMBED */
+ Tag_FIELDSET, /**< FIELDSET */
+ Tag_FONT, /**< FONT */
+ Tag_FORM, /**< FORM */
+ Tag_FRAME, /**< FRAME */
+ Tag_FRAMESET, /**< FRAMESET */
+ Tag_H1, /**< H1 */
+ Tag_H2, /**< H2 */
+ Tag_H3, /**< H3 */
+ Tag_H4, /**< H4 */
+ Tag_H5, /**< H5 */
+ Tag_H6, /**< H6 */
+ Tag_HEAD, /**< HEAD */
+ Tag_HR, /**< HR */
+ Tag_HTML, /**< HTML */
+ Tag_I, /**< I */
+ Tag_IFRAME, /**< IFRAME */
+ Tag_ILAYER, /**< ILAYER */
+ Tag_IMG, /**< IMG */
+ Tag_INPUT, /**< INPUT */
+ Tag_INS, /**< INS */
+ Tag_ISINDEX, /**< ISINDEX */
+ Tag_KBD, /**< KBD */
+ Tag_KEYGEN, /**< KEYGEN */
+ Tag_LABEL, /**< LABEL */
+ Tag_LAYER, /**< LAYER */
+ Tag_LEGEND, /**< LEGEND */
+ Tag_LI, /**< LI */
+ Tag_LINK, /**< LINK */
+ Tag_LISTING, /**< LISTING */
+ Tag_MAP, /**< MAP */
+ Tag_MARQUEE, /**< MARQUEE */
+ Tag_MENU, /**< MENU */
+ Tag_META, /**< META */
+ Tag_MULTICOL, /**< MULTICOL */
+ Tag_NOBR, /**< NOBR */
+ Tag_NOEMBED, /**< NOEMBED */
+ Tag_NOFRAMES, /**< NOFRAMES */
+ Tag_NOLAYER, /**< NOLAYER */
+ Tag_NOSAVE, /**< NOSAVE */
+ Tag_NOSCRIPT, /**< NOSCRIPT */
+ Tag_OBJECT, /**< OBJECT */
+ Tag_OL, /**< OL */
+ Tag_OPTGROUP, /**< OPTGROUP */
+ Tag_OPTION, /**< OPTION */
+ Tag_P, /**< P */
+ Tag_PARAM, /**< PARAM */
+ Tag_PLAINTEXT, /**< PLAINTEXT */
+ Tag_PRE, /**< PRE */
+ Tag_Q, /**< Q */
+ Tag_RB, /**< RB */
+ Tag_RBC, /**< RBC */
+ Tag_RP, /**< RP */
+ Tag_RT, /**< RT */
+ Tag_RTC, /**< RTC */
+ Tag_RUBY, /**< RUBY */
+ Tag_S, /**< S */
+ Tag_SAMP, /**< SAMP */
+ Tag_SCRIPT, /**< SCRIPT */
+ Tag_SELECT, /**< SELECT */
+ Tag_SERVER, /**< SERVER */
+ Tag_SERVLET, /**< SERVLET */
+ Tag_SMALL, /**< SMALL */
+ Tag_SPACER, /**< SPACER */
+ Tag_SPAN, /**< SPAN */
+ Tag_STRIKE, /**< STRIKE */
+ Tag_STRONG, /**< STRONG */
+ Tag_STYLE, /**< STYLE */
+ Tag_SUB, /**< SUB */
+ Tag_SUP, /**< SUP */
+ Tag_TABLE, /**< TABLE */
+ Tag_TBODY, /**< TBODY */
+ Tag_TD, /**< TD */
+ Tag_TEXTAREA, /**< TEXTAREA */
+ Tag_TFOOT, /**< TFOOT */
+ Tag_TH, /**< TH */
+ Tag_THEAD, /**< THEAD */
+ Tag_TITLE, /**< TITLE */
+ Tag_TR, /**< TR */
+ Tag_TT, /**< TT */
+ Tag_U, /**< U */
+ Tag_UL, /**< UL */
+ Tag_VAR, /**< VAR */
+ Tag_WBR, /**< WBR */
+ Tag_XMP, /**< XMP */
+ Tag_XML, /**< XML */
+ Tag_NEXTID, /**< NEXTID */
+
+ N_TAGS /**< Must be last */
+} tag_id_t;
+
+#define CM_UNKNOWN 0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST (1 << 5)
+/* Elements that mark definition list item ("DL", "DT"). */
+#define CM_DEFLIST (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW (1 << 9)
+/* Elements whose content must be protected against white space movement.
+ Includes some elements that can be found in forms. */
+#define CM_FIELD (1 << 10)
+/* Used to avoid propagating inline emphasis inside some elements
+ such as OBJECT or APPLET. */
+#define CM_OBJECT (1 << 11)
+/* Elements that allows "PARAM". */
+#define CM_PARAM (1 << 12)
+/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_FRAMES (1 << 13)
+/* Heading elements (h1, h2, ...). */
+#define CM_HEADING (1 << 14)
+/* Elements with an optional end tag. */
+#define CM_OPT (1 << 15)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG (1 << 16)
+/* Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_MIXED (1 << 17)
+/* Elements whose content needs to be indented only if containing one
+ CM_BLOCK element. */
+#define CM_NO_INDENT (1 << 18)
+/* Elements that are obsolete (such as "dir", "menu"). */
+#define CM_OBSOLETE (1 << 19)
+/* User defined elements. Used to determine how attributes without value
+ should be printed. */
+#define CM_NEW (1 << 20)
+/* Elements that cannot be omitted. */
+#define CM_OMITST (1 << 21)
+/* Unique elements */
+#define CM_UNIQUE (1 << 22)
+/* XML tag */
+#define FL_XML (1 << 23)
+/* Closing tag */
+#define FL_CLOSING (1 << 24)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED (1 << 25)
+#define FL_BROKEN (1 << 26)
+#define FL_IGNORE (1 << 27)
+#define FL_BLOCK (1 << 28)
+
+#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */
#include "lua_common.h"
#include "message.h"
#include "html.h"
+#include "html_tags.h"
#include "images.h"
/***
{NULL, NULL}
};
+/***
+ * @method html_tag:get_type()
+ * Returns string representation of HTML type for a tag
+ * @return {string} type of tag
+ */
+LUA_FUNCTION_DEF (html_tag, get_type);
+/***
+ * @method html_tag:get_extra()
+ * Returns extra data associated with the tag
+ * @return {url|image|nil} extra data associated with the tag
+ */
+LUA_FUNCTION_DEF (html_tag, get_extra);
+/***
+ * @method html_tag:get_parent()
+ * Returns parent node for a specified tag
+ * @return {html_tag} parent object for a specified tag
+ */
+LUA_FUNCTION_DEF (html_tag, get_parent);
+
+static const struct luaL_reg taglib_m[] = { /* methods registered for the "rspamd{html_tag}" Lua class */
+ LUA_INTERFACE_DEF (html_tag, get_type),
+ LUA_INTERFACE_DEF (html_tag, get_extra),
+ LUA_INTERFACE_DEF (html_tag, get_parent),
+ {"__tostring", rspamd_lua_class_tostring},
+ {NULL, NULL} /* sentinel entry terminating the luaL_reg array */
+};
+
static struct html_content *
lua_check_html (lua_State * L, gint pos)
{
return ud ? *((struct html_content **)ud) : NULL;
}
+/*
+ * Fetches the html_tag pointer stored in the "rspamd{html_tag}" userdata at
+ * stack position `pos`. Returns NULL only if no userdata block is available.
+ */
+static struct html_tag *
+lua_check_html_tag (lua_State * L, gint pos)
+{
+	struct html_tag **ptag;
+
+	ptag = luaL_checkudata (L, pos, "rspamd{html_tag}");
+	luaL_argcheck (L, ptag != NULL, pos, "'html_tag' expected");
+
+	if (ptag == NULL) {
+		return NULL;
+	}
+
+	return *ptag;
+}
+
static gint
lua_html_has_tag (lua_State *L)
{
return 1;
}
+/*
+ * Pushes a new Lua table describing `img` onto the stack.
+ * Fields: "src" (only when img->src is set), "tag" (an "rspamd{html_tag}"
+ * userdata, only when img->tag is set), "height", "width" and "embedded".
+ */
+static void
+lua_html_push_image (lua_State *L, struct html_image *img)
+{
+	struct html_tag **tag_ud;
+
+	lua_newtable (L);
+
+	if (img->src != NULL) {
+		lua_pushstring (L, img->src);
+		lua_setfield (L, -2, "src");
+	}
+
+	if (img->tag != NULL) {
+		/* The userdata only carries a pointer to the tag */
+		tag_ud = lua_newuserdata (L, sizeof (gpointer));
+		*tag_ud = img->tag;
+		rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
+		lua_setfield (L, -2, "tag");
+	}
+
+	lua_pushnumber (L, img->height);
+	lua_setfield (L, -2, "height");
+
+	lua_pushnumber (L, img->width);
+	lua_setfield (L, -2, "width");
+
+	lua_pushboolean (L, img->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED);
+	lua_setfield (L, -2, "embedded");
+}
+
static gint
lua_html_get_images (lua_State *L)
{
struct html_content *hc = lua_check_html (L, 1);
struct html_image *img;
+
guint i;
if (hc != NULL) {
if (hc->images) {
for (i = 0; i < hc->images->len; i ++) {
img = g_ptr_array_index (hc->images, i);
-
- lua_newtable (L);
-
- if (img->src) {
- lua_pushstring (L, "src");
- lua_pushstring (L, img->src);
- lua_settable (L, -3);
- }
-
- lua_pushstring (L, "height");
- lua_pushnumber (L, img->height);
- lua_settable (L, -3);
- lua_pushstring (L, "width");
- lua_pushnumber (L, img->width);
- lua_settable (L, -3);
- lua_pushstring (L, "embedded");
- lua_pushboolean (L, img->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED);
- lua_settable (L, -3);
-
+ lua_html_push_image (L, img);
lua_rawseti (L, -2, i + 1);
}
}
return 1;
}
+/*
+ * Lua method html_tag:get_type().
+ * Pushes the name that rspamd_html_tag_by_id() returns for the tag's id, or
+ * nil when no name is returned. Raises a Lua error with an explicit message
+ * when the receiver holds no tag pointer.
+ */
+static gint
+lua_html_tag_get_type (lua_State *L)
+{
+	struct html_tag *tag = lua_check_html_tag (L, 1);
+	const gchar *tagname;
+
+	if (tag == NULL) {
+		/*
+		 * A bare lua_error() would raise whatever value is on top of the
+		 * stack (the receiver itself); luaL_error supplies a real message.
+		 */
+		return luaL_error (L, "invalid arguments");
+	}
+
+	tagname = rspamd_html_tag_by_id (tag->id);
+
+	if (tagname) {
+		lua_pushstring (L, tagname);
+	}
+	else {
+		lua_pushnil (L);
+	}
+
+	return 1;
+}
+
+/*
+ * Lua method html_tag:get_parent().
+ * Pushes the parent tag as an "rspamd{html_tag}" userdata, or nil when the
+ * tag has no parent node or the parent node carries no data. Raises a Lua
+ * error with an explicit message when the receiver holds no tag pointer.
+ */
+static gint
+lua_html_tag_get_parent (lua_State *L)
+{
+	struct html_tag *tag = lua_check_html_tag (L, 1), **ptag;
+	GNode *node;
+
+	if (tag == NULL) {
+		/*
+		 * A bare lua_error() would raise whatever value is on top of the
+		 * stack (the receiver itself); luaL_error supplies a real message.
+		 */
+		return luaL_error (L, "invalid arguments");
+	}
+
+	node = tag->parent;
+
+	if (node && node->data) {
+		ptag = lua_newuserdata (L, sizeof (gpointer));
+		*ptag = node->data;
+		rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
+	}
+	else {
+		/*
+		 * Exactly one result must be pushed: returning 1 with nothing
+		 * pushed would hand the caller the value on top of the stack
+		 * (the receiver) as if it were the parent.
+		 */
+		lua_pushnil (L);
+	}
+
+	return 1;
+}
+
+/*
+ * Lua method html_tag:get_extra().
+ * Pushes the extra data attached to the tag:
+ *  - for A and IFRAME tags, an "rspamd{url}" userdata pointing at tag->extra;
+ *  - for IMG tags, the image table built by lua_html_push_image();
+ *  - nil when there is no extra data or the tag kind is not handled here.
+ * Raises a Lua error with an explicit message when the receiver holds no
+ * tag pointer.
+ */
+static gint
+lua_html_tag_get_extra (lua_State *L)
+{
+	struct html_tag *tag = lua_check_html_tag (L, 1);
+	struct html_image *img;
+	struct rspamd_url **purl;
+
+	if (tag == NULL) {
+		/*
+		 * A bare lua_error() would raise whatever value is on top of the
+		 * stack (the receiver itself); luaL_error supplies a real message.
+		 */
+		return luaL_error (L, "invalid arguments");
+	}
+
+	if (tag->extra == NULL) {
+		lua_pushnil (L);
+
+		return 1;
+	}
+
+	switch (tag->id) {
+	case Tag_A:
+	case Tag_IFRAME:
+		/* For these tags the extra data is a URL */
+		purl = lua_newuserdata (L, sizeof (gpointer));
+		*purl = tag->extra;
+		rspamd_lua_setclass (L, "rspamd{url}", -1);
+		break;
+	case Tag_IMG:
+		img = tag->extra;
+		lua_html_push_image (L, img);
+		break;
+	default:
+		/* Unknown kind of extra data */
+		lua_pushnil (L);
+		break;
+	}
+
+	return 1;
+}
+
void
luaopen_html (lua_State * L)
{
rspamd_lua_new_class (L, "rspamd{html}", htmllib_m);
lua_pop (L, 1);
+ rspamd_lua_new_class (L, "rspamd{html_tag}", taglib_m); /* register the html_tag class with the taglib_m methods */
+ lua_pop (L, 1); /* NOTE(review): presumably drops the value rspamd_lua_new_class leaves on the stack - confirm */
}