]> source.dussan.org Git - rspamd.git/commitdiff
* Make improvements to HTML entities decoder: now it replaces entities with common...
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 16 Jul 2010 14:40:56 +0000 (18:40 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 16 Jul 2010 14:40:56 +0000 (18:40 +0400)
 removes unknown entities. This behaviour is closer to standard HTML-to-text conversion
* Add -d option to force debug output

src/html.c
src/main.c

index 243759fa707892d22532fb5c083303c199fc61e5..be1f1ef16afedf391618d07dc9ddd6c663784133 100644 (file)
@@ -163,6 +163,7 @@ typedef struct _entity          entity;
 struct _entity {
        char                           *name;
        uint                            code;
+       char                           *replacement;
 };
 
 
@@ -170,273 +171,274 @@ static entity                   entities_defs[] = {
        /*
         ** Markup pre-defined character entities
         */
-       {"quot", 34},
-       {"amp", 38},
-       {"apos", 39},
-       {"lt", 60},
-       {"gt", 62},
+       {"quot", 34, "\""},
+       {"amp", 38, "&"},
+       {"apos", 39, "'"},
+       {"lt", 60, "<"},
+       {"gt", 62, ">"},
 
        /*
         ** Latin-1 character entities
         */
-       {"nbsp", 160},
-       {"iexcl", 161},
-       {"cent", 162},
-       {"pound", 163},
-       {"curren", 164},
-       {"yen", 165},
-       {"brvbar", 166},
-       {"sect", 167},
-       {"uml", 168},
-       {"copy", 169},
-       {"ordf", 170},
-       {"laquo", 171},
-       {"not", 172},
-       {"shy", 173},
-       {"reg", 174},
-       {"macr", 175},
-       {"deg", 176},
-       {"plusmn", 177},
-       {"sup2", 178},
-       {"sup3", 179},
-       {"acute", 180},
-       {"micro", 181},
-       {"para", 182},
-       {"middot", 183},
-       {"cedil", 184},
-       {"sup1", 185},
-       {"ordm", 186},
-       {"raquo", 187},
-       {"frac14", 188},
-       {"frac12", 189},
-       {"frac34", 190},
-       {"iquest", 191},
-       {"Agrave", 192},
-       {"Aacute", 193},
-       {"Acirc", 194},
-       {"Atilde", 195},
-       {"Auml", 196},
-       {"Aring", 197},
-       {"AElig", 198},
-       {"Ccedil", 199},
-       {"Egrave", 200},
-       {"Eacute", 201},
-       {"Ecirc", 202},
-       {"Euml", 203},
-       {"Igrave", 204},
-       {"Iacute", 205},
-       {"Icirc", 206},
-       {"Iuml", 207},
-       {"ETH", 208},
-       {"Ntilde", 209},
-       {"Ograve", 210},
-       {"Oacute", 211},
-       {"Ocirc", 212},
-       {"Otilde", 213},
-       {"Ouml", 214},
-       {"times", 215},
-       {"Oslash", 216},
-       {"Ugrave", 217},
-       {"Uacute", 218},
-       {"Ucirc", 219},
-       {"Uuml", 220},
-       {"Yacute", 221},
-       {"THORN", 222},
-       {"szlig", 223},
-       {"agrave", 224},
-       {"aacute", 225},
-       {"acirc", 226},
-       {"atilde", 227},
-       {"auml", 228},
-       {"aring", 229},
-       {"aelig", 230},
-       {"ccedil", 231},
-       {"egrave", 232},
-       {"eacute", 233},
-       {"ecirc", 234},
-       {"euml", 235},
-       {"igrave", 236},
-       {"iacute", 237},
-       {"icirc", 238},
-       {"iuml", 239},
-       {"eth", 240},
-       {"ntilde", 241},
-       {"ograve", 242},
-       {"oacute", 243},
-       {"ocirc", 244},
-       {"otilde", 245},
-       {"ouml", 246},
-       {"divide", 247},
-       {"oslash", 248},
-       {"ugrave", 249},
-       {"uacute", 250},
-       {"ucirc", 251},
-       {"uuml", 252},
-       {"yacute", 253},
-       {"thorn", 254},
-       {"yuml", 255},
+       {"nbsp", 160, " "},
+       {"iexcl", 161, "!"},
+       {"cent", 162, "cent"},
+       {"pound", 163, "pound"},
+       {"curren", 164, "current"},
+       {"yen", 165, "yen"},
+       {"brvbar", 166, NULL},
+       {"sect", 167, NULL},
+       {"uml", 168, "uml"},
+       {"copy", 169, "c"},
+       {"ordf", 170, NULL},
+       {"laquo", 171, "\""},
+       {"not", 172, "!"},
+       {"shy", 173, NULL},
+       {"reg", 174, "r"},
+       {"macr", 175, NULL},
+       {"deg", 176, "deg"},
+       {"plusmn", 177, "+-"},
+       {"sup2", 178, "2"},
+       {"sup3", 179, "3"},
+       {"acute", 180, NULL},
+       {"micro", 181, NULL},
+       {"para", 182, NULL},
+       {"middot", 183, "."},
+       {"cedil", 184, NULL},
+       {"sup1", 185, "1"},
+       {"ordm", 186, NULL},
+       {"raquo", 187, "\""},
+       {"frac14", 188, "1/4"},
+       {"frac12", 189, "1/2"},
+       {"frac34", 190, "3/4"},
+       {"iquest", 191, "i"},
+       {"Agrave", 192, "a"},
+       {"Aacute", 193, "a"},
+       {"Acirc", 194, "a"},
+       {"Atilde", 195, "a"},
+       {"Auml", 196, "a"},
+       {"Aring", 197, "a"},
+       {"AElig", 198, "a"},
+       {"Ccedil", 199, "c"},
+       {"Egrave", 200, "e"},
+       {"Eacute", 201, "e"},
+       {"Ecirc", 202, "e"},
+       {"Euml", 203, "e"},
+       {"Igrave", 204, "i"},
+       {"Iacute", 205, "i"},
+       {"Icirc", 206, "i"},
+       {"Iuml", 207, "i"},
+       {"ETH", 208, "e"},
+       {"Ntilde", 209, "n"},
+       {"Ograve", 210, "o"},
+       {"Oacute", 211, "o"},
+       {"Ocirc", 212, "o"},
+       {"Otilde", 213, "o"},
+       {"Ouml", 214, "o"},
+       {"times", 215, "t"},
+       {"Oslash", 216, "o"},
+       {"Ugrave", 217, "u"},
+       {"Uacute", 218, "u"},
+       {"Ucirc", 219, "u"},
+       {"Uuml", 220, "u"},
+       {"Yacute", 221, "y"},
+       {"THORN", 222, "t"},
+       {"szlig", 223, "s"},
+       {"agrave", 224, "a"},
+       {"aacute", 225, "a"},
+       {"acirc", 226, "a"},
+       {"atilde", 227, "a"},
+       {"auml", 228, "a"},
+       {"aring", 229, "a"},
+       {"aelig", 230, "a"},
+       {"ccedil", 231, "c"},
+       {"egrave", 232, "e"},
+       {"eacute", 233, "e"},
+       {"ecirc", 234, "e"},
+       {"euml", 235, "e"},
+       {"igrave", 236, "e"},
+       {"iacute", 237, "e"},
+       {"icirc", 238, "e"},
+       {"iuml", 239, "e"},
+       {"eth", 240, "e"},
+       {"ntilde", 241, "n"},
+       {"ograve", 242, "o"},
+       {"oacute", 243, "o"},
+       {"ocirc", 244, "o"},
+       {"otilde", 245, "o"},
+       {"ouml", 246, "o"},
+       {"divide", 247, "/"},
+       {"oslash", 248, "/"},
+       {"ugrave", 249, "u"},
+       {"uacute", 250, "u"},
+       {"ucirc", 251, "u"},
+       {"uuml", 252, "u"},
+       {"yacute", 253, "y"},
+       {"thorn", 254, "t"},
+       {"yuml", 255, "y"},
 
        /*
         ** Extended Entities defined in HTML 4: Symbols 
         */
-       {"fnof", 402},
-       {"Alpha", 913},
-       {"Beta", 914},
-       {"Gamma", 915},
-       {"Delta", 916},
-       {"Epsilon", 917},
-       {"Zeta", 918},
-       {"Eta", 919},
-       {"Theta", 920},
-       {"Iota", 921},
-       {"Kappa", 922},
-       {"Lambda", 923},
-       {"Mu", 924},
-       {"Nu", 925},
-       {"Xi", 926},
-       {"Omicron", 927},
-       {"Pi", 928},
-       {"Rho", 929},
-       {"Sigma", 931},
-       {"Tau", 932},
-       {"Upsilon", 933},
-       {"Phi", 934},
-       {"Chi", 935},
-       {"Psi", 936},
-       {"Omega", 937},
-       {"alpha", 945},
-       {"beta", 946},
-       {"gamma", 947},
-       {"delta", 948},
-       {"epsilon", 949},
-       {"zeta", 950},
-       {"eta", 951},
-       {"theta", 952},
-       {"iota", 953},
-       {"kappa", 954},
-       {"lambda", 955},
-       {"mu", 956},
-       {"nu", 957},
-       {"xi", 958},
-       {"omicron", 959},
-       {"pi", 960},
-       {"rho", 961},
-       {"sigmaf", 962},
-       {"sigma", 963},
-       {"tau", 964},
-       {"upsilon", 965},
-       {"phi", 966},
-       {"chi", 967},
-       {"psi", 968},
-       {"omega", 969},
-       {"thetasym", 977},
-       {"upsih", 978},
-       {"piv", 982},
-       {"bull", 8226},
-       {"hellip", 8230},
-       {"prime", 8242},
-       {"Prime", 8243},
-       {"oline", 8254},
-       {"frasl", 8260},
-       {"weierp", 8472},
-       {"image", 8465},
-       {"real", 8476},
-       {"trade", 8482},
-       {"alefsym", 8501},
-       {"larr", 8592},
-       {"uarr", 8593},
-       {"rarr", 8594},
-       {"darr", 8595},
-       {"harr", 8596},
-       {"crarr", 8629},
-       {"lArr", 8656},
-       {"uArr", 8657},
-       {"rArr", 8658},
-       {"dArr", 8659},
-       {"hArr", 8660},
-       {"forall", 8704},
-       {"part", 8706},
-       {"exist", 8707},
-       {"empty", 8709},
-       {"nabla", 8711},
-       {"isin", 8712},
-       {"notin", 8713},
-       {"ni", 8715},
-       {"prod", 8719},
-       {"sum", 8721},
-       {"minus", 8722},
-       {"lowast", 8727},
-       {"radic", 8730},
-       {"prop", 8733},
-       {"infin", 8734},
-       {"ang", 8736},
-       {"and", 8743},
-       {"or", 8744},
-       {"cap", 8745},
-       {"cup", 8746},
-       {"int", 8747},
-       {"there4", 8756},
-       {"sim", 8764},
-       {"cong", 8773},
-       {"asymp", 8776},
-       {"ne", 8800},
-       {"equiv", 8801},
-       {"le", 8804},
-       {"ge", 8805},
-       {"sub", 8834},
-       {"sup", 8835},
-       {"nsub", 8836},
-       {"sube", 8838},
-       {"supe", 8839},
-       {"oplus", 8853},
-       {"otimes", 8855},
-       {"perp", 8869},
-       {"sdot", 8901},
-       {"lceil", 8968},
-       {"rceil", 8969},
-       {"lfloor", 8970},
-       {"rfloor", 8971},
-       {"lang", 9001},
-       {"rang", 9002},
-       {"loz", 9674},
-       {"spades", 9824},
-       {"clubs", 9827},
-       {"hearts", 9829},
-       {"diams", 9830},
+       {"fnof", 402, "f"},
+       {"Alpha", 913, "alpha"},
+       {"Beta", 914, "beta"},
+       {"Gamma", 915, "gamma"},
+       {"Delta", 916, "delta"},
+       {"Epsilon", 917, "epsilon"},
+       {"Zeta", 918, "zeta"},
+       {"Eta", 919, "eta"},
+       {"Theta", 920, "theta"},
+       {"Iota", 921, "iota"},
+       {"Kappa", 922, "kappa"},
+       {"Lambda", 923, "lambda"},
+       {"Mu", 924, "mu"},
+       {"Nu", 925, "nu"},
+       {"Xi", 926, "xi"},
+       {"Omicron", 927, "omicron"},
+       {"Pi", 928, "pi"},
+       {"Rho", 929, "rho"},
+       {"Sigma", 931, "sigma"},
+       {"Tau", 932, "tau"},
+       {"Upsilon", 933, "upsilon"},
+       {"Phi", 934, "phi"},
+       {"Chi", 935, "chi"},
+       {"Psi", 936, "psi"},
+       {"Omega", 937, "omega"},
+       {"alpha", 945, "alpha"},
+       {"beta", 946, "beta"},
+       {"gamma", 947, "gamma"},
+       {"delta", 948, "delta"},
+       {"epsilon", 949, "epsilon"},
+       {"zeta", 950, "zeta"},
+       {"eta", 951, "eta"},
+       {"theta", 952, "theta"},
+       {"iota", 953, "iota"},
+       {"kappa", 954, "kappa"},
+       {"lambda", 955, "lambda"},
+       {"mu", 956, "mu"},
+       {"nu", 957, "nu"},
+       {"xi", 958, "xi"},
+       {"omicron", 959, "omicron"},
+       {"pi", 960, "pi"},
+       {"rho", 961, "rho"},
+       {"sigmaf", 962, "sigmaf"},
+       {"sigma", 963, "sigma"},
+       {"tau", 964, "tau"},
+       {"upsilon", 965, "upsilon"},
+       {"phi", 966, "phi"},
+       {"chi", 967, "chi"},
+       {"psi", 968, "psi"},
+       {"omega", 969, "omega"},
+       {"thetasym", 977, "thetasym"},
+       {"upsih", 978, "upsih"},
+       {"piv", 982, "piv"},
+       {"bull", 8226, "bull"},
+       {"hellip", 8230, "hellip"},
+       {"prime", 8242, "'"},
+       {"Prime", 8243, "'"},
+       {"oline", 8254, "-"},
+       {"frasl", 8260, NULL},
+       {"weierp", 8472, NULL},
+       {"image", 8465, NULL},
+       {"real", 8476, NULL},
+       {"trade", 8482, NULL},
+       {"alefsym", 8501, "a"},
+       {"larr", 8592, NULL},
+       {"uarr", 8593, NULL},
+       {"rarr", 8594, NULL},
+       {"darr", 8595, NULL},
+       {"harr", 8596, NULL},
+       {"crarr", 8629, NULL},
+       {"lArr", 8656, NULL},
+       {"uArr", 8657, NULL},
+       {"rArr", 8658, NULL},
+       {"dArr", 8659, NULL},
+       {"hArr", 8660, NULL},
+       {"forall", 8704, NULL},
+       {"part", 8706, NULL},
+       {"exist", 8707, NULL},
+       {"empty", 8709, NULL},
+       {"nabla", 8711, NULL},
+       {"isin", 8712, NULL},
+       {"notin", 8713, NULL},
+       {"ni", 8715, NULL},
+       {"prod", 8719, NULL},
+       {"sum", 8721, "E"},
+       {"minus", 8722, "-"},
+       {"lowast", 8727, NULL},
+       {"radic", 8730, NULL},
+       {"prop", 8733, NULL},
+       {"infin", 8734, NULL},
+       {"ang", 8736, "'"},
+       {"and", 8743, "&"},
+       {"or", 8744, "|"},
+       {"cap", 8745, NULL},
+       {"cup", 8746, NULL},
+       {"int", 8747, NULL},
+       {"there4", 8756, NULL},
+       {"sim", 8764, NULL},
+       {"cong", 8773, NULL},
+       {"asymp", 8776, NULL},
+       {"ne", 8800, "!="},
+       {"equiv", 8801, "=="},
+       {"le", 8804, "<="},
+       {"ge", 8805, ">="},
+       {"sub", 8834, NULL},
+       {"sup", 8835, NULL},
+       {"nsub", 8836, NULL},
+       {"sube", 8838, NULL},
+       {"supe", 8839, NULL},
+       {"oplus", 8853, NULL},
+       {"otimes", 8855, NULL},
+       {"perp", 8869, NULL},
+       {"sdot", 8901, NULL},
+       {"lceil", 8968, NULL},
+       {"rceil", 8969, NULL},
+       {"lfloor", 8970, NULL},
+       {"rfloor", 8971, NULL},
+       {"lang", 9001, NULL},
+       {"rang", 9002, NULL},
+       {"loz", 9674, NULL},
+       {"spades", 9824, NULL},
+       {"clubs", 9827, NULL},
+       {"hearts", 9829, NULL},
+       {"diams", 9830, NULL},
 
        /*
         ** Extended Entities defined in HTML 4: Special (less Markup at top)
         */
-       {"OElig", 338},
-       {"oelig", 339},
-       {"Scaron", 352},
-       {"scaron", 353},
-       {"Yuml", 376},
-       {"circ", 710},
-       {"tilde", 732},
-       {"ensp", 8194},
-       {"emsp", 8195},
-       {"thinsp", 8201},
-       {"zwnj", 8204},
-       {"zwj", 8205},
-       {"lrm", 8206},
-       {"rlm", 8207},
-       {"ndash", 8211},
-       {"mdash", 8212},
-       {"lsquo", 8216},
-       {"rsquo", 8217},
-       {"sbquo", 8218},
-       {"ldquo", 8220},
-       {"rdquo", 8221},
-       {"bdquo", 8222},
-       {"dagger", 8224},
-       {"Dagger", 8225},
-       {"permil", 8240},
-       {"lsaquo", 8249},
-       {"rsaquo", 8250},
-       {"euro", 8364},
+       {"OElig", 338, NULL},
+       {"oelig", 339, NULL},
+       {"Scaron", 352, NULL},
+       {"scaron", 353, NULL},
+       {"Yuml", 376, NULL},
+       {"circ", 710, NULL},
+       {"tilde", 732, NULL},
+       {"ensp", 8194, NULL},
+       {"emsp", 8195, NULL},
+       {"thinsp", 8201, NULL},
+       {"zwnj", 8204, NULL},
+       {"zwj", 8205, NULL},
+       {"lrm", 8206, NULL},
+       {"rlm", 8207, NULL},
+       {"ndash", 8211, "-"},
+       {"mdash", 8212, "-"},
+       {"lsquo", 8216, "'"},
+       {"rsquo", 8217, "'"},
+       {"sbquo", 8218, "\""},
+       {"ldquo", 8220, "\""},
+       {"rdquo", 8221, "\""},
+       {"bdquo", 8222, "\""},
+       {"dagger", 8224, "T"},
+       {"Dagger", 8225, "T"},
+       {"permil", 8240, NULL},
+       {"lsaquo", 8249, "\""},
+       {"rsaquo", 8250, "\""},
+       {"euro", 8364, "E"},
 };
 
+static entity                  *entities_defs_num = NULL;
 
 static int
 tag_cmp (const void *m1, const void *m2)
@@ -456,6 +458,15 @@ entity_cmp (const void *m1, const void *m2)
        return g_ascii_strcasecmp (p1->name, p2->name);
 }
 
+static int
+entity_cmp_num (const void *m1, const void *m2)
+{
+       const entity                   *p1 = m1;
+       const entity                   *p2 = m2;
+
+       return p1->code - p2->code;
+}
+
 static GNode                   *
 construct_html_node (memory_pool_t * pool, char *text)
 {
@@ -553,7 +564,7 @@ get_tag_by_name (const char *name)
 void
 decode_entitles (char *s, guint * len)
 {
-       guint                           l;
+       guint                           l, rep_len;
        char                           *t = s;  /* t - tortoise */
        char                           *h = s;  /* h - hare     */
        char                           *e = s;
@@ -592,12 +603,10 @@ decode_entitles (char *s, guint * len)
                                key.name = e + 1;
                                *h = '\0';
                                if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp)) != NULL) {
-                                       if (found->code > 0 || found->code < 127) {
-                                               *t = (char)found->code;
-                                       }
-                                       else {
-                                               /* Skip undecoded */
-                                               t = h;
+                                       if (found->replacement) {
+                                               rep_len = strlen (found->replacement);
+                                               memcpy (t, found->replacement, rep_len);
+                                               t += rep_len;
                                        }
                                }
                                else {
@@ -616,17 +625,25 @@ decode_entitles (char *s, guint * len)
                                        else {
                                                val = strtoul ((e + 3), &end_ptr, base);
                                        }
-                                       if ((end_ptr != NULL && *end_ptr != '\0') || (val == 0 || val > 127)) {
+                                       if (end_ptr != NULL && *end_ptr != '\0') {
                                                /* Skip undecoded */
                                                t = h;
                                        }
                                        else {
-                                               *t = (char)val;
+                                               /* Search for a replacement */
+                                               key.code = val;
+                                               found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
+                                               if (found) {
+                                                       if (found->replacement) {
+                                                               rep_len = strlen (found->replacement);
+                                                               memcpy (t, found->replacement, rep_len);
+                                                               t += rep_len;
+                                                       }
+                                               }
                                        }
                                }
                                *h = ';';
                                state = 0;
-                               t++;
                        }
                        h++;
                        break;
@@ -764,6 +781,9 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
        }
        if (!entities_sorted) {
                qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+               entities_defs_num = g_new (entity, G_N_ELEMENTS (entities_defs));
+               memcpy (entities_defs_num, entities_defs, sizeof (entities_defs));
+               qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num);
                entities_sorted = 1;
        }
 
index da8b87e64fea37be45f9aebdadbaba6729de0a21..3c292f2b4e04645103f7112cc3841d4fc8be7446 100644 (file)
@@ -74,6 +74,7 @@ static gchar                   *rspamd_group;
 static gchar                   *rspamd_pidfile;
 static gboolean                 dump_vars;
 static gboolean                 dump_cache;
+static gboolean                 is_debug;
 
 /* List of workers that are pending to start */
 static GList                   *workers_pending = NULL;
@@ -92,6 +93,7 @@ static GOptionEntry entries[] =
   { "pid", 'p', 0, G_OPTION_ARG_STRING, &rspamd_pidfile, "Path to pidfile", NULL },
   { "dump-vars", 'V', 0, G_OPTION_ARG_NONE, &dump_vars, "Print all rspamd variables and exit", NULL },
   { "dump-cache", 'C', 0, G_OPTION_ARG_NONE, &dump_cache, "Dump symbols cache stats and exit", NULL },
+  { "debug", 'd', 0, G_OPTION_ARG_NONE, &is_debug, "Force debug output", NULL },
   { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL }
 };
 
@@ -271,6 +273,10 @@ reread_config (struct rspamd_main *rspamd)
                        close_log ();
                        g_free (rspamd->cfg);
                        rspamd->cfg = tmp_cfg;
+                       /* Force debug log */
+                       if (is_debug) {
+                               rspamd->cfg->log_level = G_LOG_LEVEL_DEBUG;
+                       }
                        config_logger (rspamd, FALSE);
                        /* Perform modules configuring */
                        l = g_list_first (rspamd->cfg->filters);
@@ -780,7 +786,7 @@ main (int argc, char **argv, char **env)
                rspamd->cfg->cfg_name = FIXED_CONFIG_FILE;
        }
 
-       if (rspamd->cfg->config_test) {
+       if (rspamd->cfg->config_test || is_debug) {
                rspamd->cfg->log_level = G_LOG_LEVEL_DEBUG;
        }
        else {
@@ -811,6 +817,10 @@ main (int argc, char **argv, char **env)
                exit (EXIT_FAILURE);
        }
        
+       /* Force debug log */
+       if (is_debug) {
+               rspamd->cfg->log_level = G_LOG_LEVEL_DEBUG;
+       }
        /* Pre-init of cache */
        rspamd->cfg->cache = g_new0 (struct symbols_cache, 1);
        rspamd->cfg->cache->static_pool = memory_pool_new (memory_pool_get_size ());