diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-06-19 17:34:47 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-06-19 17:34:47 +0100 |
commit | 4395db507d29539c1b828813c0a7e3e2d09cdaa3 (patch) | |
tree | 80abbb41f92ac7769c1205ec82b6313db79ba533 | |
parent | 0e753fe22aa4e92cec5710dced0bb5e572180783 (diff) | |
download | rspamd-4395db507d29539c1b828813c0a7e3e2d09cdaa3.tar.gz rspamd-4395db507d29539c1b828813c0a7e3e2d09cdaa3.zip |
[Minor] Use a separate hash table for heuristic elements in entities
-rw-r--r-- | src/libserver/html/html_entities.cxx | 26 |
1 file changed, 20 insertions, 6 deletions
```diff
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 50bf34f18..1d72574b3 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2169,6 +2169,7 @@ static const auto html_entities_array = rspamd::array_of<html_entity_def>(
 
 class html_entities_storage {
 	robin_hood::unordered_flat_map<std::string_view, html_entity_def> entity_by_name;
+	robin_hood::unordered_flat_map<std::string_view, html_entity_def> entity_by_name_heur;
 	robin_hood::unordered_flat_map<unsigned, html_entity_def> entity_by_id;
 public:
 	html_entities_storage() {
@@ -2178,13 +2179,25 @@ public:
 		for (const auto &e : html_entities_array) {
 			entity_by_name[e.name] = e;
 			entity_by_id[e.code] = e;
+
+			if (e.allow_heuristic) {
+				entity_by_name_heur[e.name] = e;
+			}
 		}
 	}
 
-	auto by_name(std::string_view name) const -> const html_entity_def * {
-		auto it = entity_by_name.find(name);
+	auto by_name(std::string_view name, bool use_heuristic = false) const -> const html_entity_def * {
+		const decltype(entity_by_name)* htb;
+
+		if (use_heuristic) {
+			htb = &entity_by_name_heur;
+		}
+		else {
+			htb = &entity_by_name;
+		}
+		auto it = htb->find(name);
 
-		if (it != entity_by_name.end()) {
+		if (it != htb->end()) {
 			return &(it->second);
 		}
 
@@ -2229,7 +2242,8 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 
 	auto replace_named_entity = [&](const char *entity, std::size_t len) -> bool {
 		const auto *entity_def = html_entities_defs.by_name({entity,
-				(std::size_t) (h - entity)});
+				(std::size_t) (h - entity)},
+				false);
 
 		auto replace_entity = [&]() -> void {
 			auto l = entity_def->replacement.size();
@@ -2245,9 +2259,9 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 		/* Try heuristic */
 		auto heuristic_lookup_func = [&](std::size_t lookup_len) -> bool {
 			if (!entity_def && h - e > lookup_len) {
-				entity_def = html_entities_defs.by_name({entity, lookup_len});
+				entity_def = html_entities_defs.by_name({entity, lookup_len}, true);
 
-				if (entity_def && entity_def->allow_heuristic) {
+				if (entity_def) {
 					replace_entity();
 					/* Adjust h back */
 					h = e + lookup_len;
```