From: Vsevolod Stakhov Date: Sat, 19 Jun 2021 10:41:17 +0000 (+0100) Subject: [Minor] Html entities seem like a big anecdote, enable bug-to-bug compat with WebKit X-Git-Tag: 3.0~271 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=0e753fe22aa4e92cec5710dced0bb5e572180783;p=rspamd.git [Minor] Html entities seem like a big anecdote, enable bug-to-bug compat with WebKit --- diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx index 144de5d99..50bf34f18 100644 --- a/src/libserver/html/html_entities.cxx +++ b/src/libserver/html/html_entities.cxx @@ -34,12 +34,14 @@ struct html_entity_def { std::string name; std::string replacement; unsigned code; + bool allow_heuristic; }; -#define ENTITY_DEF(name, code, replacement) html_entity_def{(name), (replacement), (code)} +#define ENTITY_DEF(name, code, replacement) html_entity_def{(name), (replacement), (code), false} +#define ENTITY_DEF_HEUR(name, code, replacement) html_entity_def{(name), (replacement), (code), true} static const auto html_entities_array = rspamd::array_of( - ENTITY_DEF("szlig", 223, "\xc3\x9f"), + ENTITY_DEF_HEUR("szlig", 223, "\xc3\x9f"), ENTITY_DEF("prime", 8242, "\xe2\x80\xb2"), ENTITY_DEF("lnsim", 8934, "\xe2\x8b\xa6"), ENTITY_DEF("nvDash", 8877, "\xe2\x8a\xad"), @@ -53,7 +55,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Dstrok", 272, "\xc4\x90"), ENTITY_DEF("rrarr", 8649, "\xe2\x87\x89"), ENTITY_DEF("rArr", 8658, "\xe2\x87\x92"), - ENTITY_DEF("Aacute", 193, "\xc3\x81"), + ENTITY_DEF_HEUR("Aacute", 193, "\xc3\x81"), ENTITY_DEF("kappa", 954, "\xce\xba"), ENTITY_DEF("Iopf", 120128, "\xf0\x9d\x95\x80"), ENTITY_DEF("hyphen", 8208, "\xe2\x80\x90"), @@ -75,16 +77,16 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Vcy", 1042, "\xd0\x92"), ENTITY_DEF("erDot", 8787, "\xe2\x89\x93"), ENTITY_DEF("nsubE", 10949, "\xe2\xab\x85\xcc\xb8"), - ENTITY_DEF("egrave", 232, "\xc3\xa8"), + ENTITY_DEF_HEUR("egrave", 232, "\xc3\xa8"), ENTITY_DEF("Lcedil", 315, "\xc4\xbb"), ENTITY_DEF("lharul", 10602, "\xe2\xa5\xaa"), - ENTITY_DEF("middot", 183, "\xc2\xb7"), + ENTITY_DEF_HEUR("middot", 183, "\xc2\xb7"), ENTITY_DEF("ggg", 8921, "\xe2\x8b\x99"), ENTITY_DEF("NestedLessLess", 8810, "\xe2\x89\xaa"), ENTITY_DEF("tau", 964, "\xcf\x84"), ENTITY_DEF("setmn", 8726, "\xe2\x88\x96"), ENTITY_DEF("frac78", 8542, "\xe2\x85\x9e"), - ENTITY_DEF("para", 182, "\xc2\xb6"), + ENTITY_DEF_HEUR("para", 182, "\xc2\xb6"), ENTITY_DEF("Rcedil", 342, "\xc5\x96"), ENTITY_DEF("propto", 8733, "\xe2\x88\x9d"), ENTITY_DEF("sqsubset", 8847, "\xe2\x8a\x8f"), @@ -112,12 +114,12 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("popf", 120161, "\xf0\x9d\x95\xa1"), ENTITY_DEF("dbkarow", 10511, "\xe2\xa4\x8f"), ENTITY_DEF("roang", 10221, "\xe2\x9f\xad"), - ENTITY_DEF("brvbar", 166, "\xc2\xa6"), + ENTITY_DEF_HEUR("brvbar", 166, "\xc2\xa6"), ENTITY_DEF("CenterDot", 183, "\xc2\xb7"), ENTITY_DEF("notindot", 8949, "\xe2\x8b\xb5\xcc\xb8"), ENTITY_DEF("supmult", 10946, "\xe2\xab\x82"), ENTITY_DEF("multimap", 8888, "\xe2\x8a\xb8"), - ENTITY_DEF("frac34", 190, "\xc2\xbe"), + ENTITY_DEF_HEUR("frac34", 190, "\xc2\xbe"), ENTITY_DEF("mapsto", 8614, "\xe2\x86\xa6"), ENTITY_DEF("flat", 9837, "\xe2\x99\xad"), ENTITY_DEF("updownarrow", 8597, "\xe2\x86\x95"), @@ -133,7 +135,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("dot", 729, "\xcb\x99"), ENTITY_DEF("tbrk", 9140, "\xe2\x8e\xb4"), ENTITY_DEF("LeftUpDownVector", 10577, "\xe2\xa5\x91"), - ENTITY_DEF("uml", 168, "\xc2\xa8"), + ENTITY_DEF_HEUR("uml", 168, "\xc2\xa8"), ENTITY_DEF("bbrk", 9141, "\xe2\x8e\xb5"), ENTITY_DEF("nearrow", 8599, "\xe2\x86\x97"), ENTITY_DEF("backsimeq", 8909, "\xe2\x8b\x8d"), @@ -142,7 +144,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("ldsh", 8626, "\xe2\x86\xb2"), ENTITY_DEF("sce", 10928, "\xe2\xaa\xb0"), ENTITY_DEF("angst", 197, "\xc3\x85"), - ENTITY_DEF("yen", 165, "\xc2\xa5"), + ENTITY_DEF_HEUR("yen", 165, "\xc2\xa5"), ENTITY_DEF("nsupE", 10950, "\xe2\xab\x86\xcc\xb8"), ENTITY_DEF("Uscr", 119984, "\xf0\x9d\x92\xb0"), ENTITY_DEF("subplus", 10943, "\xe2\xaa\xbf"), @@ -180,7 +182,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("smid", 8739, "\xe2\x88\xa3"), ENTITY_DEF("cularr", 8630, "\xe2\x86\xb6"), ENTITY_DEF("olcross", 10683, "\xe2\xa6\xbb"), - ENTITY_DEF("GT", 62, "\x3e"), + ENTITY_DEF_HEUR("GT", 62, "\x3e"), ENTITY_DEF("scap", 10936, "\xe2\xaa\xb8"), ENTITY_DEF("capcup", 10823, "\xe2\xa9\x87"), ENTITY_DEF("NotSquareSubsetEqual", 8930, "\xe2\x8b\xa2"), @@ -239,8 +241,8 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("gfr", 120100, "\xf0\x9d\x94\xa4"), ENTITY_DEF("notnivb", 8958, "\xe2\x8b\xbe"), ENTITY_DEF("Afr", 120068, "\xf0\x9d\x94\x84"), - ENTITY_DEF("ge", 8805, "\xe2\x89\xa5"), - ENTITY_DEF("iexcl", 161, "\xc2\xa1"), + ENTITY_DEF_HEUR("ge", 8805, "\xe2\x89\xa5"), + ENTITY_DEF_HEUR("iexcl", 161, "\xc2\xa1"), ENTITY_DEF("dfr", 120097, "\xf0\x9d\x94\xa1"), ENTITY_DEF("rsaquo", 8250, "\xe2\x80\xba"), ENTITY_DEF("xcap", 8898, "\xe2\x8b\x82"), @@ -257,7 +259,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Prime", 8243, "\xe2\x80\xb3"), ENTITY_DEF("prec", 8826, "\xe2\x89\xba"), ENTITY_DEF("swnwar", 10538, "\xe2\xa4\xaa"), - ENTITY_DEF("COPY", 169, "\xc2\xa9"), + ENTITY_DEF_HEUR("COPY", 169, "\xc2\xa9"), ENTITY_DEF("cong", 8773, "\xe2\x89\x85"), ENTITY_DEF("sacute", 347, "\xc5\x9b"), ENTITY_DEF("Nopf", 8469, "\xe2\x84\x95"), @@ -267,9 +269,9 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("iota", 953, "\xce\xb9"), ENTITY_DEF("notinE", 8953, "\xe2\x8b\xb9\xcc\xb8"), ENTITY_DEF("jfr", 120103, "\xf0\x9d\x94\xa7"), - ENTITY_DEF("QUOT", 34, "\x22"), + ENTITY_DEF_HEUR("QUOT", 34, "\x22"), ENTITY_DEF("vsupnE", 10956, "\xe2\xab\x8c\xef\xb8\x80"), - ENTITY_DEF("igrave", 236, "\xc3\xac"), + ENTITY_DEF_HEUR("igrave", 236, "\xc3\xac"), ENTITY_DEF("bsim", 8765, "\xe2\x88\xbd"), ENTITY_DEF("npreceq", 10927, "\xe2\xaa\xaf\xcc\xb8"), ENTITY_DEF("zcaron", 382, "\xc5\xbe"), @@ -320,7 +322,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("xrArr", 10233, "\xe2\x9f\xb9"), ENTITY_DEF("NotTildeEqual", 8772, "\xe2\x89\x84"), ENTITY_DEF("Bfr", 120069, "\xf0\x9d\x94\x85"), - ENTITY_DEF("Iuml", 207, "\xc3\x8f"), + ENTITY_DEF_HEUR("Iuml", 207, "\xc3\x8f"), ENTITY_DEF("leg", 8922, "\xe2\x8b\x9a"), ENTITY_DEF("boxhU", 9576, "\xe2\x95\xa8"), ENTITY_DEF("Gopf", 120126, "\xf0\x9d\x94\xbe"), @@ -329,7 +331,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("precapprox", 10935, "\xe2\xaa\xb7"), ENTITY_DEF("lcedil", 316, "\xc4\xbc"), ENTITY_DEF("between", 8812, "\xe2\x89\xac"), - ENTITY_DEF("Oslash", 216, "\xc3\x98"), + ENTITY_DEF_HEUR("Oslash", 216, "\xc3\x98"), ENTITY_DEF("breve", 728, "\xcb\x98"), ENTITY_DEF("caps", 8745, "\xe2\x88\xa9\xef\xb8\x80"), ENTITY_DEF("vangrt", 10652, "\xe2\xa6\x9c"), @@ -349,7 +351,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("plussim", 10790, "\xe2\xa8\xa6"), ENTITY_DEF("Darr", 8609, "\xe2\x86\xa1"), ENTITY_DEF("nexist", 8708, "\xe2\x88\x84"), - ENTITY_DEF("cent", 162, "\xc2\xa2"), + ENTITY_DEF_HEUR("cent", 162, "\xc2\xa2"), ENTITY_DEF("khcy", 1093, "\xd1\x85"), ENTITY_DEF("smallsetminus", 8726, "\xe2\x88\x96"), ENTITY_DEF("ycirc", 375, "\xc5\xb7"), @@ -362,7 +364,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("dwangle", 10662, "\xe2\xa6\xa6"), ENTITY_DEF("bowtie", 8904, "\xe2\x8b\x88"), ENTITY_DEF("Dfr", 120071, "\xf0\x9d\x94\x87"), - ENTITY_DEF("iacute", 237, "\xc3\xad"), + ENTITY_DEF_HEUR("iacute", 237, "\xc3\xad"), ENTITY_DEF("njcy", 1114, "\xd1\x9a"), ENTITY_DEF("cfr", 120096, "\xf0\x9d\x94\xa0"), ENTITY_DEF("TripleDot", 8411, "\xe2\x83\x9b"), @@ -373,7 +375,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Rang", 10219, "\xe2\x9f\xab"), ENTITY_DEF("Wopf", 120142, "\xf0\x9d\x95\x8e"), ENTITY_DEF("boxUl", 9564, "\xe2\x95\x9c"), - ENTITY_DEF("frac12", 189, "\xc2\xbd"), + ENTITY_DEF_HEUR("frac12", 189, "\xc2\xbd"), ENTITY_DEF("clubs", 9827, "\xe2\x99\xa3"), ENTITY_DEF("amalg", 10815, "\xe2\xa8\xbf"), ENTITY_DEF("Lang", 10218, "\xe2\x9f\xaa"), @@ -397,7 +399,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("rscr", 120007, "\xf0\x9d\x93\x87"), ENTITY_DEF("Rrightarrow", 8667, "\xe2\x87\x9b"), ENTITY_DEF("equest", 8799, "\xe2\x89\x9f"), - ENTITY_DEF("ntilde", 241, "\xc3\xb1"), + ENTITY_DEF_HEUR("ntilde", 241, "\xc3\xb1"), ENTITY_DEF("Escr", 8496, "\xe2\x84\xb0"), ENTITY_DEF("Lopf", 120131, "\xf0\x9d\x95\x83"), ENTITY_DEF("GreaterGreater", 10914, "\xe2\xaa\xa2"), @@ -431,7 +433,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("varpropto", 8733, "\xe2\x88\x9d"), ENTITY_DEF("Lcaron", 317, "\xc4\xbd"), ENTITY_DEF("lbrkslu", 10637, "\xe2\xa6\x8d"), - ENTITY_DEF("AElig", 198, "\xc3\x86"), + ENTITY_DEF_HEUR("AElig", 198, "\xc3\x86"), ENTITY_DEF("varr", 8597, "\xe2\x86\x95"), ENTITY_DEF("nvinfin", 10718, "\xe2\xa7\x9e"), ENTITY_DEF("leq", 8804, "\xe2\x89\xa4"), @@ -455,7 +457,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("leftharpoondown", 8637, "\xe2\x86\xbd"), ENTITY_DEF("vfr", 120115, "\xf0\x9d\x94\xb3"), ENTITY_DEF("gvertneqq", 8809, "\xe2\x89\xa9\xef\xb8\x80"), - ENTITY_DEF("ouml", 246, "\xc3\xb6"), + ENTITY_DEF_HEUR("ouml", 246, "\xc3\xb6"), ENTITY_DEF("raemptyv", 10675, "\xe2\xa6\xb3"), ENTITY_DEF("Zcaron", 381, "\xc5\xbd"), ENTITY_DEF("scE", 10932, "\xe2\xaa\xb4"), @@ -506,14 +508,14 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("imagline", 8464, "\xe2\x84\x90"), ENTITY_DEF("ncy", 1085, "\xd0\xbd"), ENTITY_DEF("bigstar", 9733, "\xe2\x98\x85"), - ENTITY_DEF("REG", 174, "\xc2\xae"), + ENTITY_DEF_HEUR("REG", 174, "\xc2\xae"), ENTITY_DEF("triangleq", 8796, "\xe2\x89\x9c"), ENTITY_DEF("rsqb", 93, "\x5d"), ENTITY_DEF("ddarr", 8650, "\xe2\x87\x8a"), ENTITY_DEF("csub", 10959, "\xe2\xab\x8f"), ENTITY_DEF("quest", 63, "\x3f"), ENTITY_DEF("Star", 8902, "\xe2\x8b\x86"), - ENTITY_DEF("LT", 60, "\x3c"), + ENTITY_DEF_HEUR("LT", 60, "\x3c"), ENTITY_DEF("ncong", 8775, "\xe2\x89\x87"), ENTITY_DEF("prnE", 10933, "\xe2\xaa\xb5"), ENTITY_DEF("bigtriangleup", 9651, "\xe2\x96\xb3"), @@ -555,10 +557,10 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("boxDl", 9558, "\xe2\x95\x96"), ENTITY_DEF("kappav", 1008, "\xcf\xb0"), ENTITY_DEF("profsurf", 8979, "\xe2\x8c\x93"), - ENTITY_DEF("auml", 228, "\xc3\xa4"), + ENTITY_DEF_HEUR("auml", 228, "\xc3\xa4"), ENTITY_DEF("heartsuit", 9829, "\xe2\x99\xa5"), - ENTITY_DEF("eacute", 233, "\xc3\xa9"), - ENTITY_DEF("gt", 62, "\x3e"), + ENTITY_DEF_HEUR("eacute", 233, "\xc3\xa9"), + ENTITY_DEF_HEUR("gt", 62, "\x3e"), ENTITY_DEF("Gcedil", 290, "\xc4\xa2"), ENTITY_DEF("easter", 10862, "\xe2\xa9\xae"), ENTITY_DEF("Tcy", 1058, "\xd0\xa2"), @@ -580,7 +582,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Dot", 168, "\xc2\xa8"), ENTITY_DEF("SquareIntersection", 8851, "\xe2\x8a\x93"), ENTITY_DEF("map", 8614, "\xe2\x86\xa6"), - ENTITY_DEF("aelig", 230, "\xc3\xa6"), + ENTITY_DEF_HEUR("aelig", 230, "\xc3\xa6"), ENTITY_DEF("RightArrow", 8594, "\xe2\x86\x92"), ENTITY_DEF("rightharpoondown", 8641, "\xe2\x87\x81"), ENTITY_DEF("bNot", 10989, "\xe2\xab\xad"), @@ -591,7 +593,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("RightVectorBar", 10579, "\xe2\xa5\x93"), ENTITY_DEF("nrarrw", 8605, "\xe2\x86\x9d\xcc\xb8"), ENTITY_DEF("nbump", 8782, "\xe2\x89\x8e\xcc\xb8"), - ENTITY_DEF("iquest", 191, "\xc2\xbf"), + ENTITY_DEF_HEUR("iquest", 191, "\xc2\xbf"), ENTITY_DEF("wr", 8768, "\xe2\x89\x80"), ENTITY_DEF("UpArrow", 8593, "\xe2\x86\x91"), ENTITY_DEF("notinva", 8713, "\xe2\x88\x89"), @@ -615,9 +617,9 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Gt", 8811, "\xe2\x89\xab"), ENTITY_DEF("exist", 8707, "\xe2\x88\x83"), ENTITY_DEF("gtrapprox", 10886, "\xe2\xaa\x86"), - ENTITY_DEF("euml", 235, "\xc3\xab"), + ENTITY_DEF_HEUR("euml", 235, "\xc3\xab"), ENTITY_DEF("Equilibrium", 8652, "\xe2\x87\x8c"), - ENTITY_DEF("aacute", 225, "\xc3\xa1"), + ENTITY_DEF_HEUR("aacute", 225, "\xc3\xa1"), ENTITY_DEF("omid", 10678, "\xe2\xa6\xb6"), ENTITY_DEF("loarr", 8701, "\xe2\x87\xbd"), ENTITY_DEF("SucceedsSlantEqual", 8829, "\xe2\x89\xbd"), @@ -631,7 +633,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("UnderParenthesis", 9181, "\xe2\x8f\x9d"), ENTITY_DEF("nparsl", 11005, "\xe2\xab\xbd\xe2\x83\xa5"), ENTITY_DEF("Lacute", 313, "\xc4\xb9"), - ENTITY_DEF("deg", 176, "\xc2\xb0"), + ENTITY_DEF_HEUR("deg", 176, "\xc2\xb0"), ENTITY_DEF("Racute", 340, "\xc5\x94"), ENTITY_DEF("Verbar", 8214, "\xe2\x80\x96"), ENTITY_DEF("sqcups", 8852, "\xe2\x8a\x94\xef\xb8\x80"), @@ -647,7 +649,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("gE", 8807, "\xe2\x89\xa7"), ENTITY_DEF("SmallCircle", 8728, "\xe2\x88\x98"), ENTITY_DEF("diamondsuit", 9830, "\xe2\x99\xa6"), - ENTITY_DEF("Otilde", 213, "\xc3\x95"), + ENTITY_DEF_HEUR("Otilde", 213, "\xc3\x95"), ENTITY_DEF("lneq", 10887, "\xe2\xaa\x87"), ENTITY_DEF("lesdoto", 10881, "\xe2\xaa\x81"), ENTITY_DEF("ltquest", 10875, "\xe2\xa9\xbb"), @@ -697,7 +699,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("lfr", 120105, "\xf0\x9d\x94\xa9"), ENTITY_DEF("emsp13", 8196, "\xe2\x80\x84"), ENTITY_DEF("parsl", 11005, "\xe2\xab\xbd"), - ENTITY_DEF("ucirc", 251, "\xc3\xbb"), + ENTITY_DEF_HEUR("ucirc", 251, "\xc3\xbb"), ENTITY_DEF("gsiml", 10896, "\xe2\xaa\x90"), ENTITY_DEF("xsqcup", 10758, "\xe2\xa8\x86"), ENTITY_DEF("Omicron", 927, "\xce\x9f"), @@ -803,7 +805,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Lsh", 8624, "\xe2\x86\xb0"), ENTITY_DEF("boxvr", 9500, "\xe2\x94\x9c"), ENTITY_DEF("scedil", 351, "\xc5\x9f"), - ENTITY_DEF("iuml", 239, "\xc3\xaf"), + ENTITY_DEF_HEUR("iuml", 239, "\xc3\xaf"), ENTITY_DEF("NJcy", 1034, "\xd0\x8a"), ENTITY_DEF("Dagger", 8225, "\xe2\x80\xa1"), ENTITY_DEF("rarrap", 10613, "\xe2\xa5\xb5"), @@ -812,7 +814,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("scnsim", 8937, "\xe2\x8b\xa9"), ENTITY_DEF("hbar", 8463, "\xe2\x84\x8f"), ENTITY_DEF("frac15", 8533, "\xe2\x85\x95"), - ENTITY_DEF("sup3", 179, "\xc2\xb3"), + ENTITY_DEF_HEUR("sup3", 179, "\xc2\xb3"), ENTITY_DEF("NegativeThickSpace", 8203, "\xe2\x80\x8b"), ENTITY_DEF("npr", 8832, "\xe2\x8a\x80"), ENTITY_DEF("doteq", 8784, "\xe2\x89\x90"), @@ -887,7 +889,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("NotPrecedesSlantEqual", 8928, "\xe2\x8b\xa0"), ENTITY_DEF("phone", 9742, "\xe2\x98\x8e"), ENTITY_DEF("Ecirc", 202, "\xc3\x8a"), - ENTITY_DEF("lt", 60, "\x3c"), + ENTITY_DEF_HEUR("lt", 60, "\x3c"), ENTITY_DEF("intcal", 8890, "\xe2\x8a\xba"), ENTITY_DEF("xdtri", 9661, "\xe2\x96\xbd"), ENTITY_DEF("Abreve", 258, "\xc4\x82"), @@ -973,7 +975,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("LongLeftRightArrow", 10231, "\xe2\x9f\xb7"), ENTITY_DEF("Gfr", 120074, "\xf0\x9d\x94\x8a"), ENTITY_DEF("sqsubseteq", 8849, "\xe2\x8a\x91"), - ENTITY_DEF("ograve", 242, "\xc3\xb2"), + ENTITY_DEF_HEUR("ograve", 242, "\xc3\xb2"), ENTITY_DEF("larrhk", 8617, "\xe2\x86\xa9"), ENTITY_DEF("sigma", 963, "\xcf\x83"), ENTITY_DEF("NotSquareSupersetEqual", 8931, "\xe2\x8b\xa3"), @@ -1005,10 +1007,10 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("NotExists", 8708, "\xe2\x88\x84"), ENTITY_DEF("geq", 8805, "\xe2\x89\xa5"), ENTITY_DEF("Ffr", 120073, "\xf0\x9d\x94\x89"), - ENTITY_DEF("divide", 247, "\xc3\xb7"), + ENTITY_DEF_HEUR("divide", 247, "\xc3\xb7"), ENTITY_DEF("blank", 9251, "\xe2\x90\xa3"), ENTITY_DEF("IEcy", 1045, "\xd0\x95"), - ENTITY_DEF("ordm", 186, "\xc2\xba"), + ENTITY_DEF_HEUR("ordm", 186, "\xc2\xba"), ENTITY_DEF("fopf", 120151, "\xf0\x9d\x95\x97"), ENTITY_DEF("ecir", 8790, "\xe2\x89\x96"), ENTITY_DEF("complement", 8705, "\xe2\x88\x81"), @@ -1032,7 +1034,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("supset", 8835, "\xe2\x8a\x83"), ENTITY_DEF("gneqq", 8809, "\xe2\x89\xa9"), ENTITY_DEF("Lstrok", 321, "\xc5\x81"), - ENTITY_DEF("AMP", 38, "\x26"), + ENTITY_DEF_HEUR("AMP", 38, "\x26"), ENTITY_DEF("acE", 8766, "\xe2\x88\xbe\xcc\xb3"), ENTITY_DEF("sqsupseteq", 8850, "\xe2\x8a\x92"), ENTITY_DEF("nle", 8816, "\xe2\x89\xb0"), @@ -1045,7 +1047,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("demptyv", 10673, "\xe2\xa6\xb1"), ENTITY_DEF("eta", 951, "\xce\xb7"), ENTITY_DEF("GreaterSlantEqual", 10878, "\xe2\xa9\xbe"), - ENTITY_DEF("ccedil", 231, "\xc3\xa7"), + ENTITY_DEF_HEUR("ccedil", 231, "\xc3\xa7"), ENTITY_DEF("pfr", 120109, "\xf0\x9d\x94\xad"), ENTITY_DEF("bbrktbrk", 9142, "\xe2\x8e\xb6"), ENTITY_DEF("mcy", 1084, "\xd0\xbc"), @@ -1058,10 +1060,10 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("apos", 39, "\x27"), ENTITY_DEF("lrm", 8206, "\xe2\x80\x8e"), ENTITY_DEF("male", 9794, "\xe2\x99\x82"), - ENTITY_DEF("agrave", 224, "\xc3\xa0"), + ENTITY_DEF_HEUR("agrave", 224, "\xc3\xa0"), ENTITY_DEF("Lt", 8810, "\xe2\x89\xaa"), ENTITY_DEF("capand", 10820, "\xe2\xa9\x84"), - ENTITY_DEF("aring", 229, "\xc3\xa5"), + ENTITY_DEF_HEUR("aring", 229, "\xc3\xa5"), ENTITY_DEF("Jukcy", 1028, "\xd0\x84"), ENTITY_DEF("bumpe", 8783, "\xe2\x89\x8f"), ENTITY_DEF("dd", 8518, "\xe2\x85\x86"), @@ -1076,7 +1078,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("hfr", 120101, "\xf0\x9d\x94\xa5"), ENTITY_DEF("preceq", 10927, "\xe2\xaa\xaf"), ENTITY_DEF("rationals", 8474, "\xe2\x84\x9a"), - ENTITY_DEF("Auml", 196, "\xc3\x84"), + ENTITY_DEF_HEUR("Auml", 196, "\xc3\x84"), ENTITY_DEF("LeftRightArrow", 8596, "\xe2\x86\x94"), ENTITY_DEF("blacktriangleright", 9656, "\xe2\x96\xb8"), ENTITY_DEF("dharr", 8642, "\xe2\x87\x82"), @@ -1111,7 +1113,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("circlearrowright", 8635, "\xe2\x86\xbb"), ENTITY_DEF("NotCongruent", 8802, "\xe2\x89\xa2"), ENTITY_DEF("Scedil", 350, "\xc5\x9e"), - ENTITY_DEF("raquo", 187, "\xc2\xbb"), + ENTITY_DEF_HEUR("raquo", 187, "\xc2\xbb"), ENTITY_DEF("ycy", 1099, "\xd1\x8b"), ENTITY_DEF("notinvb", 8951, "\xe2\x8b\xb7"), ENTITY_DEF("andv", 10842, "\xe2\xa9\x9a"), @@ -1121,7 +1123,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("downarrow", 8595, "\xe2\x86\x93"), ENTITY_DEF("gesdotol", 10884, "\xe2\xaa\x84"), ENTITY_DEF("Congruent", 8801, "\xe2\x89\xa1"), - ENTITY_DEF("pound", 163, "\xc2\xa3"), + ENTITY_DEF_HEUR("pound", 163, "\xc2\xa3"), ENTITY_DEF("ZeroWidthSpace", 8203, "\xe2\x80\x8b"), ENTITY_DEF("rdca", 10551, "\xe2\xa4\xb7"), ENTITY_DEF("rmoust", 9137, "\xe2\x8e\xb1"), @@ -1148,8 +1150,8 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("NotSquareSuperset", 8848, "\xe2\x8a\x90\xcc\xb8"), ENTITY_DEF("Amacr", 256, "\xc4\x80"), ENTITY_DEF("OpenCurlyDoubleQuote", 8220, "\xe2\x80\x9c"), - ENTITY_DEF("thorn", 254, "\xc3\xbe"), - ENTITY_DEF("ordf", 170, "\xc2\xaa"), + ENTITY_DEF_HEUR("thorn", 254, "\xc3\xbe"), + ENTITY_DEF_HEUR("ordf", 170, "\xc2\xaa"), ENTITY_DEF("natur", 9838, "\xe2\x99\xae"), ENTITY_DEF("xi", 958, "\xce\xbe"), ENTITY_DEF("infin", 8734, "\xe2\x88\x9e"), @@ -1166,10 +1168,10 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("LessEqualGreater", 8922, "\xe2\x8b\x9a"), ENTITY_DEF("Implies", 8658, "\xe2\x87\x92"), ENTITY_DEF("ETH", 208, "\xc3\x90"), - ENTITY_DEF("Yacute", 221, "\xc3\x9d"), - ENTITY_DEF("shy", 173, "\xc2\xad"), + ENTITY_DEF_HEUR("Yacute", 221, "\xc3\x9d"), + ENTITY_DEF_HEUR("shy", 173, "\xc2\xad"), ENTITY_DEF("Rarrtl", 10518, "\xe2\xa4\x96"), - ENTITY_DEF("sup1", 185, "\xc2\xb9"), + ENTITY_DEF_HEUR("sup1", 185, "\xc2\xb9"), ENTITY_DEF("reals", 8477, "\xe2\x84\x9d"), ENTITY_DEF("blacklozenge", 10731, "\xe2\xa7\xab"), ENTITY_DEF("ncedil", 326, "\xc5\x86"), @@ -1201,7 +1203,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Fscr", 8497, "\xe2\x84\xb1"), ENTITY_DEF("veebar", 8891, "\xe2\x8a\xbb"), ENTITY_DEF("Longleftrightarrow", 10234, "\xe2\x9f\xba"), - ENTITY_DEF("reg", 174, "\xc2\xae"), + ENTITY_DEF_HEUR("reg", 174, "\xc2\xae"), ENTITY_DEF("NegativeMediumSpace", 8203, "\xe2\x80\x8b"), ENTITY_DEF("Upsi", 978, "\xcf\x92"), ENTITY_DEF("Mellintrf", 8499, "\xe2\x84\xb3"), @@ -1270,7 +1272,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("strns", 175, "\xc2\xaf"), ENTITY_DEF("intlarhk", 10775, "\xe2\xa8\x97"), ENTITY_DEF("downharpoonright", 8642, "\xe2\x87\x82"), - ENTITY_DEF("yacute", 253, "\xc3\xbd"), + ENTITY_DEF_HEUR("yacute", 253, "\xc3\xbd"), ENTITY_DEF("boxUr", 9561, "\xe2\x95\x99"), ENTITY_DEF("triangleleft", 9667, "\xe2\x97\x83"), ENTITY_DEF("DiacriticalDot", 729, "\xcb\x99"), @@ -1304,7 +1306,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("tridot", 9708, "\xe2\x97\xac"), ENTITY_DEF("ldquor", 8222, "\xe2\x80\x9e"), ENTITY_DEF("sol", 47, "\x2f"), - ENTITY_DEF("ecirc", 234, "\xc3\xaa"), + ENTITY_DEF_HEUR("ecirc", 234, "\xc3\xaa"), ENTITY_DEF("DoubleLeftArrow", 8656, "\xe2\x87\x90"), ENTITY_DEF("Gscr", 119970, "\xf0\x9d\x92\xa2"), ENTITY_DEF("ap", 8776, "\xe2\x89\x88"), @@ -1317,7 +1319,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("backprime", 8245, "\xe2\x80\xb5"), ENTITY_DEF("longleftrightarrow", 10231, "\xe2\x9f\xb7"), ENTITY_DEF("ntriangleleft", 8938, "\xe2\x8b\xaa"), - ENTITY_DEF("copy", 169, "\xc2\xa9"), + ENTITY_DEF_HEUR("copy", 169, "\xc2\xa9"), ENTITY_DEF("mapstodown", 8615, "\xe2\x86\xa7"), ENTITY_DEF("seArr", 8664, "\xe2\x87\x98"), ENTITY_DEF("ENG", 330, "\xc5\x8a"), @@ -1351,7 +1353,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("nless", 8814, "\xe2\x89\xae"), ENTITY_DEF("uharr", 8638, "\xe2\x86\xbe"), ENTITY_DEF("lambda", 955, "\xce\xbb"), - ENTITY_DEF("uuml", 252, "\xc3\xbc"), + ENTITY_DEF_HEUR("uuml", 252, "\xc3\xbc"), ENTITY_DEF("horbar", 8213, "\xe2\x80\x95"), ENTITY_DEF("ccirc", 265, "\xc4\x89"), ENTITY_DEF("sqcup", 8852, "\xe2\x8a\x94"), @@ -1385,9 +1387,9 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("nearr", 8599, "\xe2\x86\x97"), ENTITY_DEF("NotSubsetEqual", 8840, "\xe2\x8a\x88"), ENTITY_DEF("planck", 8463, "\xe2\x84\x8f"), - ENTITY_DEF("Uuml", 220, "\xc3\x9c"), + ENTITY_DEF_HEUR("Uuml", 220, "\xc3\x9c"), ENTITY_DEF("spadesuit", 9824, "\xe2\x99\xa0"), - ENTITY_DEF("sect", 167, "\xc2\xa7"), + ENTITY_DEF_HEUR("sect", 167, "\xc2\xa7"), ENTITY_DEF("cdot", 267, "\xc4\x8b"), ENTITY_DEF("boxVh", 9579, "\xe2\x95\xab"), ENTITY_DEF("zscr", 120015, "\xf0\x9d\x93\x8f"), @@ -1407,7 +1409,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("vrtri", 8883, "\xe2\x8a\xb3"), ENTITY_DEF("part", 8706, "\xe2\x88\x82"), ENTITY_DEF("esim", 8770, "\xe2\x89\x82"), - ENTITY_DEF("atilde", 227, "\xc3\xa3"), + ENTITY_DEF_HEUR("atilde", 227, "\xc3\xa3"), ENTITY_DEF("DownRightTeeVector", 10591, "\xe2\xa5\x9f"), ENTITY_DEF("jcirc", 309, "\xc4\xb5"), ENTITY_DEF("Ecaron", 282, "\xc4\x9a"), @@ -1464,7 +1466,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("nexists", 8708, "\xe2\x88\x84"), ENTITY_DEF("theta", 952, "\xce\xb8"), ENTITY_DEF("plankv", 8463, "\xe2\x84\x8f"), - ENTITY_DEF("sup2", 178, "\xc2\xb2"), + ENTITY_DEF_HEUR("sup2", 178, "\xc2\xb2"), ENTITY_DEF("lessapprox", 10885, "\xe2\xaa\x85"), ENTITY_DEF("gdot", 289, "\xc4\xa1"), ENTITY_DEF("angmsdae", 10668, "\xe2\xa6\xac"), @@ -1552,7 +1554,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("midast", 42, "\x2a"), ENTITY_DEF("lscr", 120001, "\xf0\x9d\x93\x81"), ENTITY_DEF("nGt", 8811, "\xe2\x89\xab\xe2\x83\x92"), - ENTITY_DEF("Euml", 203, "\xc3\x8b"), + ENTITY_DEF_HEUR("Euml", 203, "\xc3\x8b"), ENTITY_DEF("blacktriangledown", 9662, "\xe2\x96\xbe"), ENTITY_DEF("Rcy", 1056, "\xd0\xa0"), ENTITY_DEF("dfisht", 10623, "\xe2\xa5\xbf"), @@ -1588,14 +1590,14 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("plusb", 8862, "\xe2\x8a\x9e"), ENTITY_DEF("odsold", 10684, "\xe2\xa6\xbc"), ENTITY_DEF("varsupsetneqq", 10956, "\xe2\xab\x8c\xef\xb8\x80"), - ENTITY_DEF("otilde", 245, "\xc3\xb5"), + ENTITY_DEF_HEUR("otilde", 245, "\xc3\xb5"), ENTITY_DEF("gtcir", 10874, "\xe2\xa9\xba"), ENTITY_DEF("lltri", 9722, "\xe2\x97\xba"), ENTITY_DEF("rx", 8478, "\xe2\x84\x9e"), ENTITY_DEF("ljcy", 1113, "\xd1\x99"), ENTITY_DEF("parsim", 10995, "\xe2\xab\xb3"), ENTITY_DEF("NotElement", 8713, "\xe2\x88\x89"), - ENTITY_DEF("plusmn", 177, "\xc2\xb1"), + ENTITY_DEF_HEUR("plusmn", 177, "\xc2\xb1"), ENTITY_DEF("varsubsetneq", 8842, "\xe2\x8a\x8a\xef\xb8\x80"), ENTITY_DEF("subset", 8834, "\xe2\x8a\x82"), ENTITY_DEF("awint", 10769, "\xe2\xa8\x91"), @@ -1622,12 +1624,12 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("nharr", 8622, "\xe2\x86\xae"), ENTITY_DEF("varnothing", 8709, "\xe2\x88\x85"), ENTITY_DEF("ShortDownArrow", 8595, "\xe2\x86\x93"), - ENTITY_DEF("nbsp", 160, " "), + ENTITY_DEF_HEUR("nbsp", 160, " "), ENTITY_DEF("asympeq", 8781, "\xe2\x89\x8d"), ENTITY_DEF("rbrkslu", 10640, "\xe2\xa6\x90"), ENTITY_DEF("rho", 961, "\xcf\x81"), ENTITY_DEF("Mscr", 8499, "\xe2\x84\xb3"), - ENTITY_DEF("eth", 240, "\xc3\xb0"), + ENTITY_DEF_HEUR("eth", 240, "\xc3\xb0"), ENTITY_DEF("suplarr", 10619, "\xe2\xa5\xbb"), ENTITY_DEF("Tab", 9, "\x09"), ENTITY_DEF("omicron", 959, "\xce\xbf"), @@ -1692,7 +1694,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("vartheta", 977, "\xcf\x91"), ENTITY_DEF("nsub", 8836, "\xe2\x8a\x84"), ENTITY_DEF("DownTee", 8868, "\xe2\x8a\xa4"), - ENTITY_DEF("acute", 180, "\xc2\xb4"), + ENTITY_DEF_HEUR("acute", 180, "\xc2\xb4"), ENTITY_DEF("GreaterLess", 8823, "\xe2\x89\xb7"), ENTITY_DEF("supplus", 10944, "\xe2\xab\x80"), ENTITY_DEF("Vbar", 10987, "\xe2\xab\xab"), @@ -1760,7 +1762,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("LeftRightVector", 10574, "\xe2\xa5\x8e"), ENTITY_DEF("DownLeftVectorBar", 10582, "\xe2\xa5\x96"), ENTITY_DEF("suphsub", 10967, "\xe2\xab\x97"), - ENTITY_DEF("cedil", 184, "\xc2\xb8"), + ENTITY_DEF_HEUR("cedil", 184, "\xc2\xb8"), ENTITY_DEF("prurel", 8880, "\xe2\x8a\xb0"), ENTITY_DEF("imagpart", 8465, "\xe2\x84\x91"), ENTITY_DEF("Hscr", 8459, "\xe2\x84\x8b"), @@ -1772,7 +1774,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("nesim", 8770, "\xe2\x89\x82\xcc\xb8"), ENTITY_DEF("varepsilon", 1013, "\xcf\xb5"), ENTITY_DEF("DoubleRightTee", 8872, "\xe2\x8a\xa8"), - ENTITY_DEF("not", 172, "\xc2\xac"), + ENTITY_DEF_HEUR("not", 172, "\xc2\xac"), ENTITY_DEF("lesdot", 10879, "\xe2\xa9\xbf"), ENTITY_DEF("backepsilon", 1014, "\xcf\xb6"), ENTITY_DEF("srarr", 8594, "\xe2\x86\x92"), @@ -1792,7 +1794,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("NewLine", 10, "\x0a"), ENTITY_DEF("bigotimes", 10754, "\xe2\xa8\x82"), ENTITY_DEF("lAtail", 10523, "\xe2\xa4\x9b"), - ENTITY_DEF("frac14", 188, "\xc2\xbc"), + ENTITY_DEF_HEUR("frac14", 188, "\xc2\xbc"), ENTITY_DEF("or", 8744, "\xe2\x88\xa8"), ENTITY_DEF("subedot", 10947, "\xe2\xab\x83"), ENTITY_DEF("nmid", 8740, "\xe2\x88\xa4"), @@ -1805,7 +1807,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("topcir", 10993, "\xe2\xab\xb1"), ENTITY_DEF("ne", 8800, "\xe2\x89\xa0"), ENTITY_DEF("osol", 8856, "\xe2\x8a\x98"), - ENTITY_DEF("amp", 38, "\x26"), + ENTITY_DEF_HEUR("amp", 38, "\x26"), ENTITY_DEF("ncap", 10819, "\xe2\xa9\x83"), ENTITY_DEF("Sscr", 119982, "\xf0\x9d\x92\xae"), ENTITY_DEF("sung", 9834, "\xe2\x99\xaa"), @@ -1846,7 +1848,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Gcirc", 284, "\xc4\x9c"), ENTITY_DEF("lesdotor", 10883, "\xe2\xaa\x83"), ENTITY_DEF("escr", 8495, "\xe2\x84\xaf"), - ENTITY_DEF("THORN", 222, "\xc3\x9e"), + ENTITY_DEF_HEUR("THORN", 222, "\xc3\x9e"), ENTITY_DEF("UpArrowBar", 10514, "\xe2\xa4\x92"), ENTITY_DEF("nvrtrie", 8885, "\xe2\x8a\xb5\xe2\x83\x92"), ENTITY_DEF("varkappa", 1008, "\xcf\xb0"), @@ -1865,11 +1867,11 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("telrec", 8981, "\xe2\x8c\x95"), ENTITY_DEF("vellip", 8942, "\xe2\x8b\xae"), ENTITY_DEF("nrArr", 8655, "\xe2\x87\x8f"), - ENTITY_DEF("ugrave", 249, "\xc3\xb9"), + ENTITY_DEF_HEUR("ugrave", 249, "\xc3\xb9"), ENTITY_DEF("uring", 367, "\xc5\xaf"), ENTITY_DEF("Bernoullis", 8492, "\xe2\x84\xac"), ENTITY_DEF("nles", 10877, "\xe2\xa9\xbd\xcc\xb8"), - ENTITY_DEF("macr", 175, "\xc2\xaf"), + ENTITY_DEF_HEUR("macr", 175, "\xc2\xaf"), ENTITY_DEF("boxuR", 9560, "\xe2\x95\x98"), ENTITY_DEF("clubsuit", 9827, "\xe2\x99\xa3"), ENTITY_DEF("rightarrowtail", 8611, "\xe2\x86\xa3"), @@ -1881,7 +1883,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("vltri", 8882, "\xe2\x8a\xb2"), ENTITY_DEF("quaternions", 8461, "\xe2\x84\x8d"), ENTITY_DEF("rfr", 120111, "\xf0\x9d\x94\xaf"), - ENTITY_DEF("Ouml", 214, "\xc3\x96"), + ENTITY_DEF_HEUR("Ouml", 214, "\xc3\x96"), ENTITY_DEF("rsh", 8625, "\xe2\x86\xb1"), ENTITY_DEF("emptyv", 8709, "\xe2\x88\x85"), ENTITY_DEF("sqsup", 8848, "\xe2\x8a\x90"), @@ -1891,7 +1893,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("eqsim", 8770, "\xe2\x89\x82"), ENTITY_DEF("NotSucceedsEqual", 10928, "\xe2\xaa\xb0\xcc\xb8"), ENTITY_DEF("primes", 8473, "\xe2\x84\x99"), - ENTITY_DEF("times", 215, "\xc3\x97"), + ENTITY_DEF_HEUR("times", 215, "\xc3\x97"), ENTITY_DEF("rangd", 10642, "\xe2\xa6\x92"), ENTITY_DEF("rightharpoonup", 8640, "\xe2\x87\x80"), ENTITY_DEF("lrhard", 10605, "\xe2\xa5\xad"), @@ -1925,7 +1927,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("pi", 960, "\xcf\x80"), ENTITY_DEF("lesg", 8922, "\xe2\x8b\x9a\xef\xb8\x80"), ENTITY_DEF("orderof", 8500, "\xe2\x84\xb4"), - ENTITY_DEF("uacute", 250, "\xc3\xba"), + ENTITY_DEF_HEUR("uacute", 250, "\xc3\xba"), ENTITY_DEF("Barv", 10983, "\xe2\xab\xa7"), ENTITY_DEF("Theta", 920, "\xce\x98"), ENTITY_DEF("leftrightsquigarrow", 8621, "\xe2\x86\xad"), @@ -1972,7 +1974,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("angmsdab", 10665, "\xe2\xa6\xa9"), ENTITY_DEF("wedgeq", 8793, "\xe2\x89\x99"), ENTITY_DEF("iogon", 303, "\xc4\xaf"), - ENTITY_DEF("laquo", 171, "\xc2\xab"), + ENTITY_DEF_HEUR("laquo", 171, "\xc2\xab"), ENTITY_DEF("NestedGreaterGreater", 8811, "\xe2\x89\xab"), ENTITY_DEF("UnionPlus", 8846, "\xe2\x8a\x8e"), ENTITY_DEF("CircleDot", 8857, "\xe2\x8a\x99"), @@ -1991,7 +1993,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("egsdot", 10904, "\xe2\xaa\x98"), ENTITY_DEF("target", 8982, "\xe2\x8c\x96"), ENTITY_DEF("lesges", 10899, "\xe2\xaa\x93"), - ENTITY_DEF("curren", 164, "\xc2\xa4"), + ENTITY_DEF_HEUR("curren", 164, "\xc2\xa4"), ENTITY_DEF("yopf", 120170, "\xf0\x9d\x95\xaa"), ENTITY_DEF("frac23", 8532, "\xe2\x85\x94"), ENTITY_DEF("NotSucceedsTilde", 8831, "\xe2\x89\xbf\xcc\xb8"), @@ -2009,7 +2011,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("lessgtr", 8822, "\xe2\x89\xb6"), ENTITY_DEF("thickapprox", 8776, "\xe2\x89\x88"), ENTITY_DEF("lbrksld", 10639, "\xe2\xa6\x8f"), - ENTITY_DEF("oslash", 248, "\xc3\xb8"), + ENTITY_DEF_HEUR("oslash", 248, "\xc3\xb8"), ENTITY_DEF("NotCupCap", 8813, "\xe2\x89\xad"), ENTITY_DEF("elinters", 9191, "\xe2\x8f\xa7"), ENTITY_DEF("Assign", 8788, "\xe2\x89\x94"), @@ -2024,7 +2026,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("race", 8765, "\xe2\x88\xbd\xcc\xb1"), ENTITY_DEF("Ascr", 119964, "\xf0\x9d\x92\x9c"), ENTITY_DEF("Xscr", 119987, "\xf0\x9d\x92\xb3"), - ENTITY_DEF("acirc", 226, "\xc3\xa2"), + ENTITY_DEF_HEUR("acirc", 226, "\xc3\xa2"), ENTITY_DEF("otimesas", 10806, "\xe2\xa8\xb6"), ENTITY_DEF("gscr", 8458, "\xe2\x84\x8a"), ENTITY_DEF("gcy", 1075, "\xd0\xb3"), @@ -2033,8 +2035,8 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("Acy", 1040, "\xd0\x90"), ENTITY_DEF("NotGreaterLess", 8825, "\xe2\x89\xb9"), ENTITY_DEF("dtdot", 8945, "\xe2\x8b\xb1"), - ENTITY_DEF("quot", 34, "\x22"), - ENTITY_DEF("micro", 181, "\xc2\xb5"), + ENTITY_DEF_HEUR("quot", 34, "\x22"), + ENTITY_DEF_HEUR("micro", 181, "\xc2\xb5"), ENTITY_DEF("simplus", 10788, "\xe2\xa8\xa4"), ENTITY_DEF("nsupseteq", 8841, "\xe2\x8a\x89"), ENTITY_DEF("Ufr", 120088, "\xf0\x9d\x94\x98"), @@ -2062,7 +2064,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("dcy", 1076, "\xd0\xb4"), ENTITY_DEF("boxvl", 9508, "\xe2\x94\xa4"), ENTITY_DEF("RightArrowBar", 8677, "\xe2\x87\xa5"), - ENTITY_DEF("yuml", 255, "\xc3\xbf"), + ENTITY_DEF_HEUR("yuml", 255, "\xc3\xbf"), ENTITY_DEF("parallel", 8741, "\xe2\x88\xa5"), ENTITY_DEF("succneqq", 10934, "\xe2\xaa\xb6"), ENTITY_DEF("bemptyv", 10672, "\xe2\xa6\xb0"), @@ -2156,7 +2158,7 @@ static const auto html_entities_array = rspamd::array_of( ENTITY_DEF("cylcty", 9005, "\xe2\x8c\xad"), ENTITY_DEF("sube", 8838, "\xe2\x8a\x86"), ENTITY_DEF("NotEqualTilde", 8770, "\xe2\x89\x82\xcc\xb8"), - ENTITY_DEF("Yuml", 376, "\xc5\xb8"), + ENTITY_DEF_HEUR("Yuml", 376, "\xc5\xb8"), ENTITY_DEF("comp", 8705, "\xe2\x88\x81"), ENTITY_DEF("dotminus", 8760, "\xe2\x88\xb8"), ENTITY_DEF("crarr", 8629, "\xe2\x86\xb5"), @@ -2245,13 +2247,15 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) if (!entity_def && h - e > lookup_len) { entity_def = html_entities_defs.by_name({entity, lookup_len}); - if (entity_def) { + if (entity_def && entity_def->allow_heuristic) { replace_entity(); /* Adjust h back */ h = e + lookup_len; return true; } + + entity_def = nullptr; } return false;