summaryrefslogtreecommitdiffstats
path: root/src/html.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2010-07-16 18:40:56 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2010-07-16 18:40:56 +0400
commit698a6c0e767605924386382fff0488c5de1cd3d7 (patch)
treeb13daf3fa21bb6ae6d79b4b3c7fe33e14f7bb3e6 /src/html.c
parent561d5f4c2424f2ff4b1066d1efb308ddcc8aa06b (diff)
downloadrspamd-698a6c0e767605924386382fff0488c5de1cd3d7.tar.gz
rspamd-698a6c0e767605924386382fff0488c5de1cd3d7.zip
* Improve the HTML entities decoder: it now replaces entities with common characters and
removes unknown entities. This behaviour is closer to standard HTML-to-text conversion * Add -d option to force debug output
Diffstat (limited to 'src/html.c')
-rw-r--r--src/html.c546
1 files changed, 283 insertions, 263 deletions
diff --git a/src/html.c b/src/html.c
index 243759fa7..be1f1ef16 100644
--- a/src/html.c
+++ b/src/html.c
@@ -163,6 +163,7 @@ typedef struct _entity entity;
struct _entity {
char *name;	/* entity name, e.g. "amp"; compared case-insensitively by entity_cmp */
uint code;	/* numeric character code, e.g. 38; the sort/search key of entity_cmp_num */
+ char *replacement;	/* text copied in place of the entity by the decoder; NULL means nothing is copied */
};
@@ -170,273 +171,274 @@ static entity entities_defs[] = {
/*
** Markup pre-defined character entities
*/
- {"quot", 34},
- {"amp", 38},
- {"apos", 39},
- {"lt", 60},
- {"gt", 62},
+ {"quot", 34, "\""},
+ {"amp", 38, "&"},
+ {"apos", 39, "'"},
+ {"lt", 60, "<"},
+ {"gt", 62, ">"},
/*
** Latin-1 character entities
*/
- {"nbsp", 160},
- {"iexcl", 161},
- {"cent", 162},
- {"pound", 163},
- {"curren", 164},
- {"yen", 165},
- {"brvbar", 166},
- {"sect", 167},
- {"uml", 168},
- {"copy", 169},
- {"ordf", 170},
- {"laquo", 171},
- {"not", 172},
- {"shy", 173},
- {"reg", 174},
- {"macr", 175},
- {"deg", 176},
- {"plusmn", 177},
- {"sup2", 178},
- {"sup3", 179},
- {"acute", 180},
- {"micro", 181},
- {"para", 182},
- {"middot", 183},
- {"cedil", 184},
- {"sup1", 185},
- {"ordm", 186},
- {"raquo", 187},
- {"frac14", 188},
- {"frac12", 189},
- {"frac34", 190},
- {"iquest", 191},
- {"Agrave", 192},
- {"Aacute", 193},
- {"Acirc", 194},
- {"Atilde", 195},
- {"Auml", 196},
- {"Aring", 197},
- {"AElig", 198},
- {"Ccedil", 199},
- {"Egrave", 200},
- {"Eacute", 201},
- {"Ecirc", 202},
- {"Euml", 203},
- {"Igrave", 204},
- {"Iacute", 205},
- {"Icirc", 206},
- {"Iuml", 207},
- {"ETH", 208},
- {"Ntilde", 209},
- {"Ograve", 210},
- {"Oacute", 211},
- {"Ocirc", 212},
- {"Otilde", 213},
- {"Ouml", 214},
- {"times", 215},
- {"Oslash", 216},
- {"Ugrave", 217},
- {"Uacute", 218},
- {"Ucirc", 219},
- {"Uuml", 220},
- {"Yacute", 221},
- {"THORN", 222},
- {"szlig", 223},
- {"agrave", 224},
- {"aacute", 225},
- {"acirc", 226},
- {"atilde", 227},
- {"auml", 228},
- {"aring", 229},
- {"aelig", 230},
- {"ccedil", 231},
- {"egrave", 232},
- {"eacute", 233},
- {"ecirc", 234},
- {"euml", 235},
- {"igrave", 236},
- {"iacute", 237},
- {"icirc", 238},
- {"iuml", 239},
- {"eth", 240},
- {"ntilde", 241},
- {"ograve", 242},
- {"oacute", 243},
- {"ocirc", 244},
- {"otilde", 245},
- {"ouml", 246},
- {"divide", 247},
- {"oslash", 248},
- {"ugrave", 249},
- {"uacute", 250},
- {"ucirc", 251},
- {"uuml", 252},
- {"yacute", 253},
- {"thorn", 254},
- {"yuml", 255},
+ {"nbsp", 160, " "},
+ {"iexcl", 161, "!"},
+ {"cent", 162, "cent"},
+ {"pound", 163, "pound"},
+ {"curren", 164, "current"},
+ {"yen", 165, "yen"},
+ {"brvbar", 166, NULL},
+ {"sect", 167, NULL},
+ {"uml", 168, "uml"},
+ {"copy", 169, "c"},
+ {"ordf", 170, NULL},
+ {"laquo", 171, "\""},
+ {"not", 172, "!"},
+ {"shy", 173, NULL},
+ {"reg", 174, "r"},
+ {"macr", 175, NULL},
+ {"deg", 176, "deg"},
+ {"plusmn", 177, "+-"},
+ {"sup2", 178, "2"},
+ {"sup3", 179, "3"},
+ {"acute", 180, NULL},
+ {"micro", 181, NULL},
+ {"para", 182, NULL},
+ {"middot", 183, "."},
+ {"cedil", 184, NULL},
+ {"sup1", 185, "1"},
+ {"ordm", 186, NULL},
+ {"raquo", 187, "\""},
+ {"frac14", 188, "1/4"},
+ {"frac12", 189, "1/2"},
+ {"frac34", 190, "3/4"},
+ {"iquest", 191, "i"},
+ {"Agrave", 192, "a"},
+ {"Aacute", 193, "a"},
+ {"Acirc", 194, "a"},
+ {"Atilde", 195, "a"},
+ {"Auml", 196, "a"},
+ {"Aring", 197, "a"},
+ {"AElig", 198, "a"},
+ {"Ccedil", 199, "c"},
+ {"Egrave", 200, "e"},
+ {"Eacute", 201, "e"},
+ {"Ecirc", 202, "e"},
+ {"Euml", 203, "e"},
+ {"Igrave", 204, "i"},
+ {"Iacute", 205, "i"},
+ {"Icirc", 206, "i"},
+ {"Iuml", 207, "i"},
+ {"ETH", 208, "e"},
+ {"Ntilde", 209, "n"},
+ {"Ograve", 210, "o"},
+ {"Oacute", 211, "o"},
+ {"Ocirc", 212, "o"},
+ {"Otilde", 213, "o"},
+ {"Ouml", 214, "o"},
+ {"times", 215, "t"},
+ {"Oslash", 216, "o"},
+ {"Ugrave", 217, "u"},
+ {"Uacute", 218, "u"},
+ {"Ucirc", 219, "u"},
+ {"Uuml", 220, "u"},
+ {"Yacute", 221, "y"},
+ {"THORN", 222, "t"},
+ {"szlig", 223, "s"},
+ {"agrave", 224, "a"},
+ {"aacute", 225, "a"},
+ {"acirc", 226, "a"},
+ {"atilde", 227, "a"},
+ {"auml", 228, "a"},
+ {"aring", 229, "a"},
+ {"aelig", 230, "a"},
+ {"ccedil", 231, "c"},
+ {"egrave", 232, "e"},
+ {"eacute", 233, "e"},
+ {"ecirc", 234, "e"},
+ {"euml", 235, "e"},
+ {"igrave", 236, "e"},
+ {"iacute", 237, "e"},
+ {"icirc", 238, "e"},
+ {"iuml", 239, "e"},
+ {"eth", 240, "e"},
+ {"ntilde", 241, "n"},
+ {"ograve", 242, "o"},
+ {"oacute", 243, "o"},
+ {"ocirc", 244, "o"},
+ {"otilde", 245, "o"},
+ {"ouml", 246, "o"},
+ {"divide", 247, "/"},
+ {"oslash", 248, "/"},
+ {"ugrave", 249, "u"},
+ {"uacute", 250, "u"},
+ {"ucirc", 251, "u"},
+ {"uuml", 252, "u"},
+ {"yacute", 253, "y"},
+ {"thorn", 254, "t"},
+ {"yuml", 255, "y"},
/*
** Extended Entities defined in HTML 4: Symbols
*/
- {"fnof", 402},
- {"Alpha", 913},
- {"Beta", 914},
- {"Gamma", 915},
- {"Delta", 916},
- {"Epsilon", 917},
- {"Zeta", 918},
- {"Eta", 919},
- {"Theta", 920},
- {"Iota", 921},
- {"Kappa", 922},
- {"Lambda", 923},
- {"Mu", 924},
- {"Nu", 925},
- {"Xi", 926},
- {"Omicron", 927},
- {"Pi", 928},
- {"Rho", 929},
- {"Sigma", 931},
- {"Tau", 932},
- {"Upsilon", 933},
- {"Phi", 934},
- {"Chi", 935},
- {"Psi", 936},
- {"Omega", 937},
- {"alpha", 945},
- {"beta", 946},
- {"gamma", 947},
- {"delta", 948},
- {"epsilon", 949},
- {"zeta", 950},
- {"eta", 951},
- {"theta", 952},
- {"iota", 953},
- {"kappa", 954},
- {"lambda", 955},
- {"mu", 956},
- {"nu", 957},
- {"xi", 958},
- {"omicron", 959},
- {"pi", 960},
- {"rho", 961},
- {"sigmaf", 962},
- {"sigma", 963},
- {"tau", 964},
- {"upsilon", 965},
- {"phi", 966},
- {"chi", 967},
- {"psi", 968},
- {"omega", 969},
- {"thetasym", 977},
- {"upsih", 978},
- {"piv", 982},
- {"bull", 8226},
- {"hellip", 8230},
- {"prime", 8242},
- {"Prime", 8243},
- {"oline", 8254},
- {"frasl", 8260},
- {"weierp", 8472},
- {"image", 8465},
- {"real", 8476},
- {"trade", 8482},
- {"alefsym", 8501},
- {"larr", 8592},
- {"uarr", 8593},
- {"rarr", 8594},
- {"darr", 8595},
- {"harr", 8596},
- {"crarr", 8629},
- {"lArr", 8656},
- {"uArr", 8657},
- {"rArr", 8658},
- {"dArr", 8659},
- {"hArr", 8660},
- {"forall", 8704},
- {"part", 8706},
- {"exist", 8707},
- {"empty", 8709},
- {"nabla", 8711},
- {"isin", 8712},
- {"notin", 8713},
- {"ni", 8715},
- {"prod", 8719},
- {"sum", 8721},
- {"minus", 8722},
- {"lowast", 8727},
- {"radic", 8730},
- {"prop", 8733},
- {"infin", 8734},
- {"ang", 8736},
- {"and", 8743},
- {"or", 8744},
- {"cap", 8745},
- {"cup", 8746},
- {"int", 8747},
- {"there4", 8756},
- {"sim", 8764},
- {"cong", 8773},
- {"asymp", 8776},
- {"ne", 8800},
- {"equiv", 8801},
- {"le", 8804},
- {"ge", 8805},
- {"sub", 8834},
- {"sup", 8835},
- {"nsub", 8836},
- {"sube", 8838},
- {"supe", 8839},
- {"oplus", 8853},
- {"otimes", 8855},
- {"perp", 8869},
- {"sdot", 8901},
- {"lceil", 8968},
- {"rceil", 8969},
- {"lfloor", 8970},
- {"rfloor", 8971},
- {"lang", 9001},
- {"rang", 9002},
- {"loz", 9674},
- {"spades", 9824},
- {"clubs", 9827},
- {"hearts", 9829},
- {"diams", 9830},
+ {"fnof", 402, "f"},
+ {"Alpha", 913, "alpha"},
+ {"Beta", 914, "beta"},
+ {"Gamma", 915, "gamma"},
+ {"Delta", 916, "delta"},
+ {"Epsilon", 917, "epsilon"},
+ {"Zeta", 918, "zeta"},
+ {"Eta", 919, "eta"},
+ {"Theta", 920, "theta"},
+ {"Iota", 921, "iota"},
+ {"Kappa", 922, "kappa"},
+ {"Lambda", 923, "lambda"},
+ {"Mu", 924, "mu"},
+ {"Nu", 925, "nu"},
+ {"Xi", 926, "xi"},
+ {"Omicron", 927, "omicron"},
+ {"Pi", 928, "pi"},
+ {"Rho", 929, "rho"},
+ {"Sigma", 931, "sigma"},
+ {"Tau", 932, "tau"},
+ {"Upsilon", 933, "upsilon"},
+ {"Phi", 934, "phi"},
+ {"Chi", 935, "chi"},
+ {"Psi", 936, "psi"},
+ {"Omega", 937, "omega"},
+ {"alpha", 945, "alpha"},
+ {"beta", 946, "beta"},
+ {"gamma", 947, "gamma"},
+ {"delta", 948, "delta"},
+ {"epsilon", 949, "epsilon"},
+ {"zeta", 950, "zeta"},
+ {"eta", 951, "eta"},
+ {"theta", 952, "theta"},
+ {"iota", 953, "iota"},
+ {"kappa", 954, "kappa"},
+ {"lambda", 955, "lambda"},
+ {"mu", 956, "mu"},
+ {"nu", 957, "nu"},
+ {"xi", 958, "xi"},
+ {"omicron", 959, "omicron"},
+ {"pi", 960, "pi"},
+ {"rho", 961, "rho"},
+ {"sigmaf", 962, "sigmaf"},
+ {"sigma", 963, "sigma"},
+ {"tau", 964, "tau"},
+ {"upsilon", 965, "upsilon"},
+ {"phi", 966, "phi"},
+ {"chi", 967, "chi"},
+ {"psi", 968, "psi"},
+ {"omega", 969, "omega"},
+ {"thetasym", 977, "thetasym"},
+ {"upsih", 978, "upsih"},
+ {"piv", 982, "piv"},
+ {"bull", 8226, "bull"},
+ {"hellip", 8230, "hellip"},
+ {"prime", 8242, "'"},
+ {"Prime", 8243, "'"},
+ {"oline", 8254, "-"},
+ {"frasl", 8260, NULL},
+ {"weierp", 8472, NULL},
+ {"image", 8465, NULL},
+ {"real", 8476, NULL},
+ {"trade", 8482, NULL},
+ {"alefsym", 8501, "a"},
+ {"larr", 8592, NULL},
+ {"uarr", 8593, NULL},
+ {"rarr", 8594, NULL},
+ {"darr", 8595, NULL},
+ {"harr", 8596, NULL},
+ {"crarr", 8629, NULL},
+ {"lArr", 8656, NULL},
+ {"uArr", 8657, NULL},
+ {"rArr", 8658, NULL},
+ {"dArr", 8659, NULL},
+ {"hArr", 8660, NULL},
+ {"forall", 8704, NULL},
+ {"part", 8706, NULL},
+ {"exist", 8707, NULL},
+ {"empty", 8709, NULL},
+ {"nabla", 8711, NULL},
+ {"isin", 8712, NULL},
+ {"notin", 8713, NULL},
+ {"ni", 8715, NULL},
+ {"prod", 8719, NULL},
+ {"sum", 8721, "E"},
+ {"minus", 8722, "-"},
+ {"lowast", 8727, NULL},
+ {"radic", 8730, NULL},
+ {"prop", 8733, NULL},
+ {"infin", 8734, NULL},
+ {"ang", 8736, "'"},
+ {"and", 8743, "&"},
+ {"or", 8744, "|"},
+ {"cap", 8745, NULL},
+ {"cup", 8746, NULL},
+ {"int", 8747, NULL},
+ {"there4", 8756, NULL},
+ {"sim", 8764, NULL},
+ {"cong", 8773, NULL},
+ {"asymp", 8776, NULL},
+ {"ne", 8800, "!="},
+ {"equiv", 8801, "=="},
+ {"le", 8804, "<="},
+ {"ge", 8805, ">="},
+ {"sub", 8834, NULL},
+ {"sup", 8835, NULL},
+ {"nsub", 8836, NULL},
+ {"sube", 8838, NULL},
+ {"supe", 8839, NULL},
+ {"oplus", 8853, NULL},
+ {"otimes", 8855, NULL},
+ {"perp", 8869, NULL},
+ {"sdot", 8901, NULL},
+ {"lceil", 8968, NULL},
+ {"rceil", 8969, NULL},
+ {"lfloor", 8970, NULL},
+ {"rfloor", 8971, NULL},
+ {"lang", 9001, NULL},
+ {"rang", 9002, NULL},
+ {"loz", 9674, NULL},
+ {"spades", 9824, NULL},
+ {"clubs", 9827, NULL},
+ {"hearts", 9829, NULL},
+ {"diams", 9830, NULL},
/*
** Extended Entities defined in HTML 4: Special (less Markup at top)
*/
- {"OElig", 338},
- {"oelig", 339},
- {"Scaron", 352},
- {"scaron", 353},
- {"Yuml", 376},
- {"circ", 710},
- {"tilde", 732},
- {"ensp", 8194},
- {"emsp", 8195},
- {"thinsp", 8201},
- {"zwnj", 8204},
- {"zwj", 8205},
- {"lrm", 8206},
- {"rlm", 8207},
- {"ndash", 8211},
- {"mdash", 8212},
- {"lsquo", 8216},
- {"rsquo", 8217},
- {"sbquo", 8218},
- {"ldquo", 8220},
- {"rdquo", 8221},
- {"bdquo", 8222},
- {"dagger", 8224},
- {"Dagger", 8225},
- {"permil", 8240},
- {"lsaquo", 8249},
- {"rsaquo", 8250},
- {"euro", 8364},
+ {"OElig", 338, NULL},
+ {"oelig", 339, NULL},
+ {"Scaron", 352, NULL},
+ {"scaron", 353, NULL},
+ {"Yuml", 376, NULL},
+ {"circ", 710, NULL},
+ {"tilde", 732, NULL},
+ {"ensp", 8194, NULL},
+ {"emsp", 8195, NULL},
+ {"thinsp", 8201, NULL},
+ {"zwnj", 8204, NULL},
+ {"zwj", 8205, NULL},
+ {"lrm", 8206, NULL},
+ {"rlm", 8207, NULL},
+ {"ndash", 8211, "-"},
+ {"mdash", 8212, "-"},
+ {"lsquo", 8216, "'"},
+ {"rsquo", 8217, "'"},
+ {"sbquo", 8218, "\""},
+ {"ldquo", 8220, "\""},
+ {"rdquo", 8221, "\""},
+ {"bdquo", 8222, "\""},
+ {"dagger", 8224, "T"},
+ {"Dagger", 8225, "T"},
+ {"permil", 8240, NULL},
+ {"lsaquo", 8249, "\""},
+ {"rsaquo", 8250, "\""},
+ {"euro", 8364, "E"},
};
+static entity *entities_defs_num = NULL;
static int
tag_cmp (const void *m1, const void *m2)
@@ -456,6 +458,15 @@ entity_cmp (const void *m1, const void *m2)
return g_ascii_strcasecmp (p1->name, p2->name);
}
+/*
+ * Comparator for qsort()/bsearch() that orders entity records by their
+ * numeric code.
+ *
+ * m1, m2 - pointers to the two entity records being compared.
+ * Returns -1, 0 or 1 when the code of m1 is less than, equal to or
+ * greater than the code of m2.
+ *
+ * The codes are unsigned, so they are compared explicitly rather than
+ * subtracted: the search key is filled from strtoul() and may be large
+ * enough that a difference would not fit into int and would report the
+ * wrong order.
+ */
+static int
+entity_cmp_num (const void *m1, const void *m2)
+{
+	const entity *p1 = m1;
+	const entity *p2 = m2;
+
+	if (p1->code < p2->code) {
+		return -1;
+	}
+	return p1->code > p2->code;
+}
+
static GNode *
construct_html_node (memory_pool_t * pool, char *text)
{
@@ -553,7 +564,7 @@ get_tag_by_name (const char *name)
void
decode_entitles (char *s, guint * len)
{
- guint l;
+ guint l, rep_len;
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
char *e = s;
@@ -592,12 +603,10 @@ decode_entitles (char *s, guint * len)
key.name = e + 1;
*h = '\0';
if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp)) != NULL) {
- if (found->code > 0 || found->code < 127) {
- *t = (char)found->code;
- }
- else {
- /* Skip undecoded */
- t = h;
+ if (found->replacement) {
+ rep_len = strlen (found->replacement);
+ memcpy (t, found->replacement, rep_len);
+ t += rep_len;
}
}
else {
@@ -616,17 +625,25 @@ decode_entitles (char *s, guint * len)
else {
val = strtoul ((e + 3), &end_ptr, base);
}
- if ((end_ptr != NULL && *end_ptr != '\0') || (val == 0 || val > 127)) {
+ if (end_ptr != NULL && *end_ptr != '\0') {
/* Skip undecoded */
t = h;
}
else {
- *t = (char)val;
+ /* Search for a replacement */
+ key.code = val;
+ found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+ if (found) {
+ if (found->replacement) {
+ rep_len = strlen (found->replacement);
+ memcpy (t, found->replacement, rep_len);
+ t += rep_len;
+ }
+ }
}
}
*h = ';';
state = 0;
- t++;
}
h++;
break;
@@ -764,6 +781,9 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
}
if (!entities_sorted) {
qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+ entities_defs_num = g_new (entity, G_N_ELEMENTS (entities_defs));
+ memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
+ qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
entities_sorted = 1;
}