/*
 * Functions for simple html parsing
 */

#ifndef RSPAMD_HTML_H
#define RSPAMD_HTML_H

#include "config.h"
#include "mem_pool.h"

/*
 * Identifiers of the HTML tags known to the parser.
 *
 * Enumerators are numbered consecutively starting from Tag_UNKNOWN (0).
 * N_TAGS is not a tag: it must remain the final enumerator so that its
 * value equals the number of tag identifiers declared before it.
 */
typedef enum
{
    Tag_UNKNOWN = 0, /**< Unknown tag */
    Tag_A,           /**< A */
    Tag_ABBR,        /**< ABBR */
    Tag_ACRONYM,     /**< ACRONYM */
    Tag_ADDRESS,     /**< ADDRESS */
    Tag_ALIGN,       /**< ALIGN */
    Tag_APPLET,      /**< APPLET */
    Tag_AREA,        /**< AREA */
    Tag_B,           /**< B */
    Tag_BASE,        /**< BASE */
    Tag_BASEFONT,    /**< BASEFONT */
    Tag_BDO,         /**< BDO */
    Tag_BGSOUND,     /**< BGSOUND */
    Tag_BIG,         /**< BIG */
    Tag_BLINK,       /**< BLINK */
    Tag_BLOCKQUOTE,  /**< BLOCKQUOTE */
    Tag_BODY,        /**< BODY */
    Tag_BR,          /**< BR */
    Tag_BUTTON,      /**< BUTTON */
    Tag_CAPTION,     /**< CAPTION */
    Tag_CENTER,      /**< CENTER */
    Tag_CITE,        /**< CITE */
    Tag_CODE,        /**< CODE */
    Tag_COL,         /**< COL */
    Tag_COLGROUP,    /**< COLGROUP */
    Tag_COMMENT,     /**< COMMENT */
    Tag_DD,          /**< DD */
    Tag_DEL,         /**< DEL */
    Tag_DFN,         /**< DFN */
    Tag_DIR,         /**< DIR */
    Tag_DIV,         /**< DIV */
    Tag_DL,          /**< DL */
    Tag_DT,          /**< DT */
    Tag_EM,          /**< EM */
    Tag_EMBED,       /**< EMBED */
    Tag_FIELDSET,    /**< FIELDSET */
    Tag_FONT,        /**< FONT */
    Tag_FORM,        /**< FORM */
    Tag_FRAME,       /**< FRAME */
    Tag_FRAMESET,    /**< FRAMESET */
    Tag_H1,          /**< H1 */
    Tag_H2,          /**< H2 */
    Tag_H3,          /**< H3 */
    Tag_H4,          /**< H4 */
    Tag_H5,          /**< H5 */
    Tag_H6,          /**< H6 */
    Tag_HEAD,        /**< HEAD */
    Tag_HR,          /**< HR */
    Tag_HTML,        /**< HTML */
    Tag_I,           /**< I */
    Tag_IFRAME,      /**< IFRAME */
    Tag_ILAYER,      /**< ILAYER */
    Tag_IMG,         /**< IMG */
    Tag_INPUT,       /**< INPUT */
    Tag_INS,         /**< INS */
    Tag_ISINDEX,     /**< ISINDEX */
    Tag_KBD,         /**< KBD */
    Tag_KEYGEN,      /**< KEYGEN */
    Tag_LABEL,       /**< LABEL */
    Tag_LAYER,       /**< LAYER */
    Tag_LEGEND,      /**< LEGEND */
    Tag_LI,          /**< LI */
    Tag_LINK,        /**< LINK */
    Tag_LISTING,     /**< LISTING */
    Tag_MAP,         /**< MAP */
    Tag_MARQUEE,     /**< MARQUEE */
    Tag_MENU,        /**< MENU */
    Tag_META,        /**< META */
    Tag_MULTICOL,    /**< MULTICOL */
    Tag_NOBR,        /**< NOBR */
    Tag_NOEMBED,     /**< NOEMBED */
    Tag_NOFRAMES,    /**< NOFRAMES */
    Tag_NOLAYER,     /**< NOLAYER */
    Tag_NOSAVE,      /**< NOSAVE */
    Tag_NOSCRIPT,    /**< NOSCRIPT */
    Tag_OBJECT,      /**< OBJECT */
    Tag_OL,          /**< OL */
    Tag_OPTGROUP,    /**< OPTGROUP */
    Tag_OPTION,      /**< OPTION */
    Tag_P,           /**< P */
    Tag_PARAM,       /**< PARAM */
    Tag_PLAINTEXT,   /**< PLAINTEXT */
    Tag_PRE,         /**< PRE */
    Tag_Q,           /**< Q */
    Tag_RB,          /**< RB */
    Tag_RBC,         /**< RBC */
    Tag_RP,          /**< RP */
    Tag_RT,          /**< RT */
    Tag_RTC,         /**< RTC */
    Tag_RUBY,        /**< RUBY */
    Tag_S,           /**< S */
    Tag_SAMP,        /**< SAMP */
    Tag_SCRIPT,      /**< SCRIPT */
    Tag_SELECT,      /**< SELECT */
    Tag_SERVER,      /**< SERVER */
    Tag_SERVLET,     /**< SERVLET */
    Tag_SMALL,       /**< SMALL */
    Tag_SPACER,      /**< SPACER */
    Tag_SPAN,        /**< SPAN */
    Tag_STRIKE,      /**< STRIKE */
    Tag_STRONG,      /**< STRONG */
    Tag_STYLE,       /**< STYLE */
    Tag_SUB,         /**< SUB */
    Tag_SUP,         /**< SUP */
    Tag_TABLE,       /**< TABLE */
    Tag_TBODY,       /**< TBODY */
    Tag_TD,          /**< TD */
    Tag_TEXTAREA,    /**< TEXTAREA */
    Tag_TFOOT,       /**< TFOOT */
    Tag_TH,          /**< TH */
    Tag_THEAD,       /**< THEAD */
    Tag_TITLE,       /**< TITLE */
    Tag_TR,          /**< TR */
    Tag_TT,          /**< TT */
    Tag_U,           /**< U */
    Tag_UL,          /**< UL */
    Tag_VAR,         /**< VAR */
    Tag_WBR,         /**< WBR */
    Tag_XMP,         /**< XMP */
    Tag_XML,         /**< XML */
    Tag_NEXTID,      /**< NEXTID */

    N_TAGS           /**< Number of tag ids; must be last */
} tag_id_t;

/*
 * Content-model bits. CM_UNKNOWN is zero; every other CM_* constant is a
 * distinct single bit, so several of them can be OR-ed into one mask.
 */

/** No content-model bit set. */
#define CM_UNKNOWN 0

/** Elements with no content. Maps to the HTML specification. */
#define CM_EMPTY (1 << 0)

/** Elements that appear outside of "BODY". */
#define CM_HTML (1 << 1)

/** Elements that can appear within "HEAD". */
#define CM_HEAD (1 << 2)

/** HTML "block" elements. */
#define CM_BLOCK (1 << 3)

/** HTML "inline" elements. */
#define CM_INLINE (1 << 4)

/** Elements that mark a list item ("LI"). */
#define CM_LIST (1 << 5)

/** Elements that mark a definition list item ("DL", "DT"). */
#define CM_DEFLIST (1 << 6)

/** Elements that can appear inside "TABLE". */
#define CM_TABLE (1 << 7)

/** Row groups: used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP (1 << 8)

/** Row cells: used for "TD" and "TH". */
#define CM_ROW (1 << 9)

/**
 * Elements whose content must be protected against white space movement.
 * Includes some elements that can be found in forms.
 */
#define CM_FIELD (1 << 10)

/**
 * Used to avoid propagating inline emphasis inside some elements
 * such as "OBJECT" or "APPLET".
 */
#define CM_OBJECT (1 << 11)

/** Elements that allow "PARAM". */
#define CM_PARAM (1 << 12)

/** "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)

/** Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)

/** Elements with an optional end tag. */
#define CM_OPT (1 << 15)

/** Elements that use the "align" attribute for vertical position. */
#define CM_IMG (1 << 16)

/** Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)

/**
 * Elements whose content needs to be indented only if containing one
 * CM_BLOCK element.
 */
#define CM_NO_INDENT (1 << 18)

/** Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)

/**
 * User defined elements. Used to determine how attributes without a value
 * should be printed.
 */
#define CM_NEW (1 << 20)

/** Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)

/*
 * Tag flag bits. Each FL_* constant is a distinct single bit.
 */

/** The tag is an XML tag. */
#define FL_XML (1 << 0)

/** The tag is a closing tag. */
#define FL_CLOSING (1 << 1)

/** The tag is fully closed (e.g. <a attrs />). */
#define FL_CLOSED (1 << 2)

/*
 * Description of a single known HTML tag: its numeric id, its name and
 * a flags word.
 */
struct html_tag {
	/* Numeric identifier of the tag (one of the tag_id_t enumerators) */
	tag_id_t id;
	/* Tag name string; not modified through this structure */
	const gchar *name;
	/* NOTE(review): presumably a mask of CM_* content-model bits - confirm against the tag table */
	gint flags;
};

/*
 * A single tag occurrence: a pointer to the tag description plus a
 * flags word for this occurrence.
 */
struct html_node {
	/* Description of the tag this node refers to */
	struct html_tag *tag;
	/* NOTE(review): presumably a mask of FL_* bits (XML / closing / closed) - confirm against the parser */
	gint flags;
};

/*
 * Forward declarations. Only pointers to these structures are used in the
 * prototype that follows, so the full definitions are not needed here.
 * struct mime_text_part must be declared at file scope before it is used as
 * a parameter type: a struct first named inside a parameter list has
 * prototype scope only and would not be compatible with the real type.
 */
struct worker_task;
struct mime_text_part;

/*
 * Add a single node to the tags tree
 *
 * task      - task being processed
 * pool      - memory pool passed to the implementation
 * part      - text part the tag belongs to
 * tag_text  - text of the tag; tag_len is presumably its length (confirm
 *             against the implementation)
 * remain    - NOTE(review): presumably the number of bytes remaining in the
 *             input after the tag - confirm against the implementation
 * cur_level - address of a GNode pointer; presumably the current level of
 *             the tags tree, updated as tags open and close (confirm)
 *
 * Returns a gboolean; the exact success/failure conditions are defined by
 * the implementation.
 */
gboolean add_html_node (struct worker_task *task, memory_pool_t *pool,
		struct mime_text_part *part, gchar *tag_text, gsize tag_len,
		gsize remain, GNode **cur_level);

/*
 * Get the tag structure for a tag by its name (binary search is used).
 *
 * name - name of the tag to look up
 *
 * Returns a pointer to the matching struct html_tag.
 * NOTE(review): presumably NULL is returned when the name is not a known
 * tag - confirm against the implementation.
 */
struct html_tag * get_tag_by_name (const gchar *name);

/*
 * Decode HTML entities in text. Text is modified in place.
 *
 * s   - text buffer to decode; it is rewritten in place
 * len - pointer to the text length
 *       NOTE(review): presumably read as the input length and updated to
 *       the decoded length - confirm against the implementation.
 */
void decode_entitles (gchar *s, guint *len);

#endif