diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-07-16 18:40:56 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-07-16 18:40:56 +0400 |
commit | 698a6c0e767605924386382fff0488c5de1cd3d7 (patch) | |
tree | b13daf3fa21bb6ae6d79b4b3c7fe33e14f7bb3e6 /src/html.c | |
parent | 561d5f4c2424f2ff4b1066d1efb308ddcc8aa06b (diff) | |
download | rspamd-698a6c0e767605924386382fff0488c5de1cd3d7.tar.gz rspamd-698a6c0e767605924386382fff0488c5de1cd3d7.zip |
* Make improvements to HTML entities decoder: now it replaces entities with common characters and
removes unknown entities. This behaviour is more like that of standard HTML to text conversion
* Add -d option to force debug output
Diffstat (limited to 'src/html.c')
-rw-r--r-- | src/html.c | 546 |
1 files changed, 283 insertions, 263 deletions
diff --git a/src/html.c b/src/html.c index 243759fa7..be1f1ef16 100644 --- a/src/html.c +++ b/src/html.c @@ -163,6 +163,7 @@ typedef struct _entity entity; struct _entity { char *name; uint code; + char *replacement; }; @@ -170,273 +171,274 @@ static entity entities_defs[] = { /* ** Markup pre-defined character entities */ - {"quot", 34}, - {"amp", 38}, - {"apos", 39}, - {"lt", 60}, - {"gt", 62}, + {"quot", 34, "\""}, + {"amp", 38, "&"}, + {"apos", 39, "'"}, + {"lt", 60, "<"}, + {"gt", 62, ">"}, /* ** Latin-1 character entities */ - {"nbsp", 160}, - {"iexcl", 161}, - {"cent", 162}, - {"pound", 163}, - {"curren", 164}, - {"yen", 165}, - {"brvbar", 166}, - {"sect", 167}, - {"uml", 168}, - {"copy", 169}, - {"ordf", 170}, - {"laquo", 171}, - {"not", 172}, - {"shy", 173}, - {"reg", 174}, - {"macr", 175}, - {"deg", 176}, - {"plusmn", 177}, - {"sup2", 178}, - {"sup3", 179}, - {"acute", 180}, - {"micro", 181}, - {"para", 182}, - {"middot", 183}, - {"cedil", 184}, - {"sup1", 185}, - {"ordm", 186}, - {"raquo", 187}, - {"frac14", 188}, - {"frac12", 189}, - {"frac34", 190}, - {"iquest", 191}, - {"Agrave", 192}, - {"Aacute", 193}, - {"Acirc", 194}, - {"Atilde", 195}, - {"Auml", 196}, - {"Aring", 197}, - {"AElig", 198}, - {"Ccedil", 199}, - {"Egrave", 200}, - {"Eacute", 201}, - {"Ecirc", 202}, - {"Euml", 203}, - {"Igrave", 204}, - {"Iacute", 205}, - {"Icirc", 206}, - {"Iuml", 207}, - {"ETH", 208}, - {"Ntilde", 209}, - {"Ograve", 210}, - {"Oacute", 211}, - {"Ocirc", 212}, - {"Otilde", 213}, - {"Ouml", 214}, - {"times", 215}, - {"Oslash", 216}, - {"Ugrave", 217}, - {"Uacute", 218}, - {"Ucirc", 219}, - {"Uuml", 220}, - {"Yacute", 221}, - {"THORN", 222}, - {"szlig", 223}, - {"agrave", 224}, - {"aacute", 225}, - {"acirc", 226}, - {"atilde", 227}, - {"auml", 228}, - {"aring", 229}, - {"aelig", 230}, - {"ccedil", 231}, - {"egrave", 232}, - {"eacute", 233}, - {"ecirc", 234}, - {"euml", 235}, - {"igrave", 236}, - {"iacute", 237}, - {"icirc", 238}, - {"iuml", 239}, - {"eth", 240}, - 
{"ntilde", 241}, - {"ograve", 242}, - {"oacute", 243}, - {"ocirc", 244}, - {"otilde", 245}, - {"ouml", 246}, - {"divide", 247}, - {"oslash", 248}, - {"ugrave", 249}, - {"uacute", 250}, - {"ucirc", 251}, - {"uuml", 252}, - {"yacute", 253}, - {"thorn", 254}, - {"yuml", 255}, + {"nbsp", 160, " "}, + {"iexcl", 161, "!"}, + {"cent", 162, "cent"}, + {"pound", 163, "pound"}, + {"curren", 164, "current"}, + {"yen", 165, "yen"}, + {"brvbar", 166, NULL}, + {"sect", 167, NULL}, + {"uml", 168, "uml"}, + {"copy", 169, "c"}, + {"ordf", 170, NULL}, + {"laquo", 171, "\""}, + {"not", 172, "!"}, + {"shy", 173, NULL}, + {"reg", 174, "r"}, + {"macr", 175, NULL}, + {"deg", 176, "deg"}, + {"plusmn", 177, "+-"}, + {"sup2", 178, "2"}, + {"sup3", 179, "3"}, + {"acute", 180, NULL}, + {"micro", 181, NULL}, + {"para", 182, NULL}, + {"middot", 183, "."}, + {"cedil", 184, NULL}, + {"sup1", 185, "1"}, + {"ordm", 186, NULL}, + {"raquo", 187, "\""}, + {"frac14", 188, "1/4"}, + {"frac12", 189, "1/2"}, + {"frac34", 190, "3/4"}, + {"iquest", 191, "i"}, + {"Agrave", 192, "a"}, + {"Aacute", 193, "a"}, + {"Acirc", 194, "a"}, + {"Atilde", 195, "a"}, + {"Auml", 196, "a"}, + {"Aring", 197, "a"}, + {"AElig", 198, "a"}, + {"Ccedil", 199, "c"}, + {"Egrave", 200, "e"}, + {"Eacute", 201, "e"}, + {"Ecirc", 202, "e"}, + {"Euml", 203, "e"}, + {"Igrave", 204, "i"}, + {"Iacute", 205, "i"}, + {"Icirc", 206, "i"}, + {"Iuml", 207, "i"}, + {"ETH", 208, "e"}, + {"Ntilde", 209, "n"}, + {"Ograve", 210, "o"}, + {"Oacute", 211, "o"}, + {"Ocirc", 212, "o"}, + {"Otilde", 213, "o"}, + {"Ouml", 214, "o"}, + {"times", 215, "t"}, + {"Oslash", 216, "o"}, + {"Ugrave", 217, "u"}, + {"Uacute", 218, "u"}, + {"Ucirc", 219, "u"}, + {"Uuml", 220, "u"}, + {"Yacute", 221, "y"}, + {"THORN", 222, "t"}, + {"szlig", 223, "s"}, + {"agrave", 224, "a"}, + {"aacute", 225, "a"}, + {"acirc", 226, "a"}, + {"atilde", 227, "a"}, + {"auml", 228, "a"}, + {"aring", 229, "a"}, + {"aelig", 230, "a"}, + {"ccedil", 231, "c"}, + {"egrave", 232, "e"}, + 
{"eacute", 233, "e"}, + {"ecirc", 234, "e"}, + {"euml", 235, "e"}, + {"igrave", 236, "e"}, + {"iacute", 237, "e"}, + {"icirc", 238, "e"}, + {"iuml", 239, "e"}, + {"eth", 240, "e"}, + {"ntilde", 241, "n"}, + {"ograve", 242, "o"}, + {"oacute", 243, "o"}, + {"ocirc", 244, "o"}, + {"otilde", 245, "o"}, + {"ouml", 246, "o"}, + {"divide", 247, "/"}, + {"oslash", 248, "/"}, + {"ugrave", 249, "u"}, + {"uacute", 250, "u"}, + {"ucirc", 251, "u"}, + {"uuml", 252, "u"}, + {"yacute", 253, "y"}, + {"thorn", 254, "t"}, + {"yuml", 255, "y"}, /* ** Extended Entities defined in HTML 4: Symbols */ - {"fnof", 402}, - {"Alpha", 913}, - {"Beta", 914}, - {"Gamma", 915}, - {"Delta", 916}, - {"Epsilon", 917}, - {"Zeta", 918}, - {"Eta", 919}, - {"Theta", 920}, - {"Iota", 921}, - {"Kappa", 922}, - {"Lambda", 923}, - {"Mu", 924}, - {"Nu", 925}, - {"Xi", 926}, - {"Omicron", 927}, - {"Pi", 928}, - {"Rho", 929}, - {"Sigma", 931}, - {"Tau", 932}, - {"Upsilon", 933}, - {"Phi", 934}, - {"Chi", 935}, - {"Psi", 936}, - {"Omega", 937}, - {"alpha", 945}, - {"beta", 946}, - {"gamma", 947}, - {"delta", 948}, - {"epsilon", 949}, - {"zeta", 950}, - {"eta", 951}, - {"theta", 952}, - {"iota", 953}, - {"kappa", 954}, - {"lambda", 955}, - {"mu", 956}, - {"nu", 957}, - {"xi", 958}, - {"omicron", 959}, - {"pi", 960}, - {"rho", 961}, - {"sigmaf", 962}, - {"sigma", 963}, - {"tau", 964}, - {"upsilon", 965}, - {"phi", 966}, - {"chi", 967}, - {"psi", 968}, - {"omega", 969}, - {"thetasym", 977}, - {"upsih", 978}, - {"piv", 982}, - {"bull", 8226}, - {"hellip", 8230}, - {"prime", 8242}, - {"Prime", 8243}, - {"oline", 8254}, - {"frasl", 8260}, - {"weierp", 8472}, - {"image", 8465}, - {"real", 8476}, - {"trade", 8482}, - {"alefsym", 8501}, - {"larr", 8592}, - {"uarr", 8593}, - {"rarr", 8594}, - {"darr", 8595}, - {"harr", 8596}, - {"crarr", 8629}, - {"lArr", 8656}, - {"uArr", 8657}, - {"rArr", 8658}, - {"dArr", 8659}, - {"hArr", 8660}, - {"forall", 8704}, - {"part", 8706}, - {"exist", 8707}, - {"empty", 8709}, - {"nabla", 
8711}, - {"isin", 8712}, - {"notin", 8713}, - {"ni", 8715}, - {"prod", 8719}, - {"sum", 8721}, - {"minus", 8722}, - {"lowast", 8727}, - {"radic", 8730}, - {"prop", 8733}, - {"infin", 8734}, - {"ang", 8736}, - {"and", 8743}, - {"or", 8744}, - {"cap", 8745}, - {"cup", 8746}, - {"int", 8747}, - {"there4", 8756}, - {"sim", 8764}, - {"cong", 8773}, - {"asymp", 8776}, - {"ne", 8800}, - {"equiv", 8801}, - {"le", 8804}, - {"ge", 8805}, - {"sub", 8834}, - {"sup", 8835}, - {"nsub", 8836}, - {"sube", 8838}, - {"supe", 8839}, - {"oplus", 8853}, - {"otimes", 8855}, - {"perp", 8869}, - {"sdot", 8901}, - {"lceil", 8968}, - {"rceil", 8969}, - {"lfloor", 8970}, - {"rfloor", 8971}, - {"lang", 9001}, - {"rang", 9002}, - {"loz", 9674}, - {"spades", 9824}, - {"clubs", 9827}, - {"hearts", 9829}, - {"diams", 9830}, + {"fnof", 402, "f"}, + {"Alpha", 913, "alpha"}, + {"Beta", 914, "beta"}, + {"Gamma", 915, "gamma"}, + {"Delta", 916, "delta"}, + {"Epsilon", 917, "epsilon"}, + {"Zeta", 918, "zeta"}, + {"Eta", 919, "eta"}, + {"Theta", 920, "theta"}, + {"Iota", 921, "iota"}, + {"Kappa", 922, "kappa"}, + {"Lambda", 923, "lambda"}, + {"Mu", 924, "mu"}, + {"Nu", 925, "nu"}, + {"Xi", 926, "xi"}, + {"Omicron", 927, "omicron"}, + {"Pi", 928, "pi"}, + {"Rho", 929, "rho"}, + {"Sigma", 931, "sigma"}, + {"Tau", 932, "tau"}, + {"Upsilon", 933, "upsilon"}, + {"Phi", 934, "phi"}, + {"Chi", 935, "chi"}, + {"Psi", 936, "psi"}, + {"Omega", 937, "omega"}, + {"alpha", 945, "alpha"}, + {"beta", 946, "beta"}, + {"gamma", 947, "gamma"}, + {"delta", 948, "delta"}, + {"epsilon", 949, "epsilon"}, + {"zeta", 950, "zeta"}, + {"eta", 951, "eta"}, + {"theta", 952, "theta"}, + {"iota", 953, "iota"}, + {"kappa", 954, "kappa"}, + {"lambda", 955, "lambda"}, + {"mu", 956, "mu"}, + {"nu", 957, "nu"}, + {"xi", 958, "xi"}, + {"omicron", 959, "omicron"}, + {"pi", 960, "pi"}, + {"rho", 961, "rho"}, + {"sigmaf", 962, "sigmaf"}, + {"sigma", 963, "sigma"}, + {"tau", 964, "tau"}, + {"upsilon", 965, "upsilon"}, + {"phi", 966, "phi"}, + 
{"chi", 967, "chi"}, + {"psi", 968, "psi"}, + {"omega", 969, "omega"}, + {"thetasym", 977, "thetasym"}, + {"upsih", 978, "upsih"}, + {"piv", 982, "piv"}, + {"bull", 8226, "bull"}, + {"hellip", 8230, "hellip"}, + {"prime", 8242, "'"}, + {"Prime", 8243, "'"}, + {"oline", 8254, "-"}, + {"frasl", 8260, NULL}, + {"weierp", 8472, NULL}, + {"image", 8465, NULL}, + {"real", 8476, NULL}, + {"trade", 8482, NULL}, + {"alefsym", 8501, "a"}, + {"larr", 8592, NULL}, + {"uarr", 8593, NULL}, + {"rarr", 8594, NULL}, + {"darr", 8595, NULL}, + {"harr", 8596, NULL}, + {"crarr", 8629, NULL}, + {"lArr", 8656, NULL}, + {"uArr", 8657, NULL}, + {"rArr", 8658, NULL}, + {"dArr", 8659, NULL}, + {"hArr", 8660, NULL}, + {"forall", 8704, NULL}, + {"part", 8706, NULL}, + {"exist", 8707, NULL}, + {"empty", 8709, NULL}, + {"nabla", 8711, NULL}, + {"isin", 8712, NULL}, + {"notin", 8713, NULL}, + {"ni", 8715, NULL}, + {"prod", 8719, NULL}, + {"sum", 8721, "E"}, + {"minus", 8722, "-"}, + {"lowast", 8727, NULL}, + {"radic", 8730, NULL}, + {"prop", 8733, NULL}, + {"infin", 8734, NULL}, + {"ang", 8736, "'"}, + {"and", 8743, "&"}, + {"or", 8744, "|"}, + {"cap", 8745, NULL}, + {"cup", 8746, NULL}, + {"int", 8747, NULL}, + {"there4", 8756, NULL}, + {"sim", 8764, NULL}, + {"cong", 8773, NULL}, + {"asymp", 8776, NULL}, + {"ne", 8800, "!="}, + {"equiv", 8801, "=="}, + {"le", 8804, "<="}, + {"ge", 8805, ">="}, + {"sub", 8834, NULL}, + {"sup", 8835, NULL}, + {"nsub", 8836, NULL}, + {"sube", 8838, NULL}, + {"supe", 8839, NULL}, + {"oplus", 8853, NULL}, + {"otimes", 8855, NULL}, + {"perp", 8869, NULL}, + {"sdot", 8901, NULL}, + {"lceil", 8968, NULL}, + {"rceil", 8969, NULL}, + {"lfloor", 8970, NULL}, + {"rfloor", 8971, NULL}, + {"lang", 9001, NULL}, + {"rang", 9002, NULL}, + {"loz", 9674, NULL}, + {"spades", 9824, NULL}, + {"clubs", 9827, NULL}, + {"hearts", 9829, NULL}, + {"diams", 9830, NULL}, /* ** Extended Entities defined in HTML 4: Special (less Markup at top) */ - {"OElig", 338}, - {"oelig", 339}, - 
{"Scaron", 352}, - {"scaron", 353}, - {"Yuml", 376}, - {"circ", 710}, - {"tilde", 732}, - {"ensp", 8194}, - {"emsp", 8195}, - {"thinsp", 8201}, - {"zwnj", 8204}, - {"zwj", 8205}, - {"lrm", 8206}, - {"rlm", 8207}, - {"ndash", 8211}, - {"mdash", 8212}, - {"lsquo", 8216}, - {"rsquo", 8217}, - {"sbquo", 8218}, - {"ldquo", 8220}, - {"rdquo", 8221}, - {"bdquo", 8222}, - {"dagger", 8224}, - {"Dagger", 8225}, - {"permil", 8240}, - {"lsaquo", 8249}, - {"rsaquo", 8250}, - {"euro", 8364}, + {"OElig", 338, NULL}, + {"oelig", 339, NULL}, + {"Scaron", 352, NULL}, + {"scaron", 353, NULL}, + {"Yuml", 376, NULL}, + {"circ", 710, NULL}, + {"tilde", 732, NULL}, + {"ensp", 8194, NULL}, + {"emsp", 8195, NULL}, + {"thinsp", 8201, NULL}, + {"zwnj", 8204, NULL}, + {"zwj", 8205, NULL}, + {"lrm", 8206, NULL}, + {"rlm", 8207, NULL}, + {"ndash", 8211, "-"}, + {"mdash", 8212, "-"}, + {"lsquo", 8216, "'"}, + {"rsquo", 8217, "'"}, + {"sbquo", 8218, "\""}, + {"ldquo", 8220, "\""}, + {"rdquo", 8221, "\""}, + {"bdquo", 8222, "\""}, + {"dagger", 8224, "T"}, + {"Dagger", 8225, "T"}, + {"permil", 8240, NULL}, + {"lsaquo", 8249, "\""}, + {"rsaquo", 8250, "\""}, + {"euro", 8364, "E"}, }; +static entity *entities_defs_num = NULL; static int tag_cmp (const void *m1, const void *m2) @@ -456,6 +458,15 @@ entity_cmp (const void *m1, const void *m2) return g_ascii_strcasecmp (p1->name, p2->name); } +static int +entity_cmp_num (const void *m1, const void *m2) +{ + const entity *p1 = m1; + const entity *p2 = m2; + + return p1->code - p2->code; +} + static GNode * construct_html_node (memory_pool_t * pool, char *text) { @@ -553,7 +564,7 @@ get_tag_by_name (const char *name) void decode_entitles (char *s, guint * len) { - guint l; + guint l, rep_len; char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ char *e = s; @@ -592,12 +603,10 @@ decode_entitles (char *s, guint * len) key.name = e + 1; *h = '\0'; if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof 
(entity), entity_cmp)) != NULL) { - if (found->code > 0 || found->code < 127) { - *t = (char)found->code; - } - else { - /* Skip undecoded */ - t = h; + if (found->replacement) { + rep_len = strlen (found->replacement); + memcpy (t, found->replacement, rep_len); + t += rep_len; } } else { @@ -616,17 +625,25 @@ decode_entitles (char *s, guint * len) else { val = strtoul ((e + 3), &end_ptr, base); } - if ((end_ptr != NULL && *end_ptr != '\0') || (val == 0 || val > 127)) { + if (end_ptr != NULL && *end_ptr != '\0') { /* Skip undecoded */ t = h; } else { - *t = (char)val; + /* Search for a replacement */ + key.code = val; + found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); + if (found) { + if (found->replacement) { + rep_len = strlen (found->replacement); + memcpy (t, found->replacement, rep_len); + t += rep_len; + } + } } } *h = ';'; state = 0; - t++; } h++; break; @@ -764,6 +781,9 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ } if (!entities_sorted) { qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp); + entities_defs_num = g_new (entity, G_N_ELEMENTS (entities_defs)); + memcpy (entities_defs_num, entities_defs, sizeof (entities_defs)); + qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); entities_sorted = 1; } |