1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "config.h"
- #include "util.h"
- #include "rspamd.h"
- #include "message.h"
- #include "html.h"
- #include "html_tags.h"
- #include "html_colors.h"
- #include "html_entities.h"
- #include "url.h"
- #include "contrib/libucl/khash.h"
- #include "libmime/images.h"
-
- #include <unicode/uversion.h>
- #include <unicode/ucnv.h>
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- #include <unicode/uidna.h>
- #endif
-
- static sig_atomic_t tags_sorted = 0;
- static sig_atomic_t entities_sorted = 0;
- static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
-
- struct html_tag_def {
- const gchar *name;
- gint16 id;
- guint16 len;
- guint flags;
- };
-
- #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
- rspamd_html_log_id, "html", pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-
- INIT_LOG_MODULE(html)
-
- #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
-
- static struct html_tag_def tag_defs[] = {
- /* W3C defined elements */
- TAG_DEF(Tag_A, "a", FL_HREF),
- TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
- TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
- TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
- TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
- TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY | FL_HREF)),
- TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
- TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
- TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
- TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
- TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
- TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
- TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
- TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
- TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
- TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
- TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
- TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_EM, "em", (CM_INLINE)),
- TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
- TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
- TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
- TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
- TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
- TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_I, "i", (CM_INLINE)),
- TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
- TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
- TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
- TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
- TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY|FL_HREF)),
- TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
- TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
- TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
- TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
- TAG_DEF(Tag_Q, "q", (CM_INLINE)),
- TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
- TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
- TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
- TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
- TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
- TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
- TAG_DEF(Tag_S, "s", (CM_INLINE)),
- TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
- TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
- TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
- TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
- TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
- TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
- TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
- TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
- TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
- TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
- TAG_DEF(Tag_U, "u", (CM_INLINE)),
- TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
- TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
-
- /* proprietary elements */
- TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
- TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
- TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
- TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
- TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
- TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
- TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
- TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
- TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
- TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
- TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
- TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
- };
-
- KHASH_MAP_INIT_INT (entity_by_number, const char *);
- KHASH_MAP_INIT_STR (entity_by_name, const char *);
- KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
- KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
- KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
- rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
-
- khash_t(entity_by_number) *html_entity_by_number;
- khash_t(entity_by_name) *html_entity_by_name;
- khash_t(tag_by_name) *html_tag_by_name;
- khash_t(tag_by_id) *html_tag_by_id;
- khash_t(color_by_name) *html_color_by_name;
-
- static void
- rspamd_html_library_init (void)
- {
- guint i;
- khiter_t k;
- gint rc;
-
- if (!tags_sorted) {
- html_tag_by_id = kh_init (tag_by_id);
- html_tag_by_name = kh_init (tag_by_name);
- kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
- kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
-
- for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
- k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
- kh_val (html_tag_by_id, k) = tag_defs[i];
-
- k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
- kh_val (html_tag_by_name, k) = tag_defs[i];
- }
-
- tags_sorted = 1;
- }
-
- if (!entities_sorted) {
- html_entity_by_number = kh_init (entity_by_number);
- html_entity_by_name = kh_init (entity_by_name);
- kh_resize (entity_by_number, html_entity_by_number,
- G_N_ELEMENTS (entities_defs));
- kh_resize (entity_by_name, html_entity_by_name,
- G_N_ELEMENTS (entities_defs));
-
- for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
- if (entities_defs[i].code != 0) {
- k = kh_put (entity_by_number, html_entity_by_number,
- entities_defs[i].code, &rc);
- kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
- }
-
- k = kh_put (entity_by_name, html_entity_by_name,
- entities_defs[i].name, &rc);
- kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
- }
-
- html_color_by_name = kh_init (color_by_name);
- kh_resize (color_by_name, html_color_by_name,
- G_N_ELEMENTS (html_colornames));
-
- rspamd_ftok_t *keys;
-
- keys = g_malloc0 (sizeof (rspamd_ftok_t) *
- G_N_ELEMENTS (html_colornames));
-
- for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
- struct html_color c;
-
- keys[i].begin = html_colornames[i].name;
- keys[i].len = strlen (html_colornames[i].name);
- k = kh_put (color_by_name, html_color_by_name,
- &keys[i], &rc);
- c.valid = true;
- c.d.comp.r = html_colornames[i].rgb.r;
- c.d.comp.g = html_colornames[i].rgb.g;
- c.d.comp.b = html_colornames[i].rgb.b;
- c.d.comp.alpha = 255;
- kh_val (html_color_by_name, k) = c;
-
- }
-
- entities_sorted = 1;
- }
- }
-
- static gboolean
- rspamd_html_check_balance (GNode * node, GNode ** cur_level)
- {
- struct html_tag *arg = node->data, *tmp;
- GNode *cur;
-
- if (arg->flags & FL_CLOSING) {
- /* First of all check whether this tag is closing tag for parent node */
- cur = node->parent;
- while (cur && cur->data) {
- tmp = cur->data;
- if (tmp->id == arg->id &&
- (tmp->flags & FL_CLOSED) == 0) {
- tmp->flags |= FL_CLOSED;
- /* Destroy current node as we find corresponding parent node */
- g_node_destroy (node);
- /* Change level */
- *cur_level = cur->parent;
- return TRUE;
- }
- cur = cur->parent;
- }
- }
- else {
- return TRUE;
- }
-
- return FALSE;
- }
-
- gint
- rspamd_html_tag_by_name (const gchar *name)
- {
- khiter_t k;
-
- k = kh_get (tag_by_name, html_tag_by_name, name);
-
- if (k != kh_end (html_tag_by_name)) {
- return kh_val (html_tag_by_name, k).id;
- }
-
- return -1;
- }
-
- gboolean
- rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
- {
- gint id;
-
- g_assert (hc != NULL);
- g_assert (hc->tags_seen != NULL);
-
- id = rspamd_html_tag_by_name (tagname);
-
- if (id != -1) {
- return isset (hc->tags_seen, id);
- }
-
- return FALSE;
- }
-
- const gchar *
- rspamd_html_tag_by_id (gint id)
- {
- khiter_t k;
-
- k = kh_get (tag_by_id, html_tag_by_id, id);
-
- if (k != kh_end (html_tag_by_id)) {
- return kh_val (html_tag_by_id, k).name;
- }
-
- return NULL;
- }
-
- /* Decode HTML entitles in text */
- guint
- rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
- {
- goffset l, rep_len;
- gchar *t = s, *h = s, *e = s, *end_ptr;
- const gchar *end;
- const gchar *entity;
- gint state = 0, base;
- UChar32 uc;
- khiter_t k;
-
- if (len == 0) {
- l = strlen (s);
- }
- else {
- l = len;
- }
-
- end = s + l;
-
- while (h - s < l) {
- switch (state) {
- /* Out of entity */
- case 0:
- if (*h == '&') {
- state = 1;
- e = h;
- h++;
- continue;
- }
- else {
- *t = *h;
- h++;
- t++;
- }
- break;
- case 1:
- if (*h == ';' && h > e) {
- /* Determine base */
- /* First find in entities table */
- *h = '\0';
- entity = e + 1;
- uc = 0;
-
- if (*entity != '#') {
- k = kh_get (entity_by_name, html_entity_by_name, entity);
- *h = ';';
-
- if (k != kh_end (html_entity_by_name)) {
- if (kh_val (html_entity_by_name, k)) {
- rep_len = strlen (kh_val (html_entity_by_name, k));
-
- if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_name, k),
- rep_len);
- t += rep_len;
- }
- } else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else if (e + 2 < h) {
- if (*(e + 2) == 'x' || *(e + 2) == 'X') {
- base = 16;
- }
- else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
- base = 8;
- }
- else {
- base = 10;
- }
-
- if (base == 10) {
- uc = strtoul ((e + 2), &end_ptr, base);
- }
- else {
- uc = strtoul ((e + 3), &end_ptr, base);
- }
-
- if (end_ptr != NULL && *end_ptr != '\0') {
- /* Skip undecoded */
- *h = ';';
-
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- else {
- /* Search for a replacement */
- *h = ';';
- k = kh_get (entity_by_number, html_entity_by_number, uc);
-
- if (k != kh_end (html_entity_by_number)) {
- if (kh_val (html_entity_by_number, k)) {
- rep_len = strlen (kh_val (html_entity_by_number, k));
-
- if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_number, k),
- rep_len);
- t += rep_len;
- }
- } else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else {
- /* Unicode point */
- goffset off = t - s;
- UBool is_error = 0;
-
- if (uc > 0) {
- U8_APPEND (s, off, len, uc, is_error);
- if (!is_error) {
- t = s + off;
- }
- else {
- /* Leave invalid entities as is */
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- }
-
- state = 0;
- }
- else if (*h == '&') {
- /* Previous `&` was bogus */
- state = 1;
-
- if (end - t > h - e) {
- memmove (t, e, h - e);
- t += h - e;
- }
-
- e = h;
- }
-
- h++;
-
- break;
- }
- }
-
- /* Leftover */
- if (state == 1 && h > e) {
- /* Unfinished entity, copy as is */
- if (end - t > h - e) {
- memmove (t, e, h - e);
- t += h - e;
- }
- }
-
- return (t - s);
- }
-
- static gboolean
- rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
- {
- const gchar *p1, *p2;
-
- p1 = t1->begin + t1->len - 1;
- p2 = t2->begin + t2->len - 1;
-
- /* Skip trailing dots */
- while (p1 > t1->begin) {
- if (*p1 != '.') {
- break;
- }
-
- p1 --;
- }
-
- while (p2 > t2->begin) {
- if (*p2 != '.') {
- break;
- }
-
- p2 --;
- }
-
- while (p1 > t1->begin && p2 > t2->begin) {
- if (*p1 != *p2) {
- break;
- }
-
- p1 --;
- p2 --;
- }
-
- if (p2 == t2->begin) {
- /* p2 can be subdomain of p1 if *p1 is '.' */
- if (p1 != t1->begin && *(p1 - 1) == '.') {
- return TRUE;
- }
- }
- else if (p1 == t1->begin) {
- if (p2 != t2->begin && *(p2 - 1) == '.') {
- return TRUE;
- }
- }
-
- return FALSE;
- }
-
- static void
- rspamd_html_url_is_phished (rspamd_mempool_t *pool,
- struct rspamd_url *href_url,
- const guchar *url_text,
- gsize len,
- gboolean *url_found,
- struct rspamd_url **ptext_url)
- {
- struct rspamd_url *text_url;
- rspamd_ftok_t phished_tld, disp_tok, href_tok;
- gint rc;
- goffset url_pos;
- gchar *url_str = NULL, *idn_hbuf;
- const guchar *end = url_text + len, *p;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- static UIDNA *udn;
- UErrorCode uc_err = U_ZERO_ERROR;
- UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
- #endif
-
- *url_found = FALSE;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (udn == NULL) {
- udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
- }
- }
- #endif
-
- while (url_text < end && g_ascii_isspace (*url_text)) {
- url_text ++;
- }
-
- if (end > url_text + 4 &&
- rspamd_url_find (pool, url_text, end - url_text, &url_str,
- RSPAMD_URL_FIND_ALL,
- &url_pos, NULL) &&
- url_str != NULL) {
- if (url_pos > 0) {
- /*
- * We have some url at some offset, so we need to check what is
- * at the start of the text
- */
- p = url_text;
-
- while (p < url_text + url_pos) {
- if (!g_ascii_isspace (*p)) {
- *url_found = FALSE;
- return;
- }
-
- p++;
- }
- }
- text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK) {
- disp_tok.len = text_url->hostlen;
- disp_tok.begin = text_url->host;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (text_url->host,
- text_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
- text_url->host, text_url->hostlen,
- idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- disp_tok.len = text_url->hostlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
- #endif
- href_tok.len = href_url->hostlen;
- href_tok.begin = href_url->host;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (href_url->host,
- href_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8 (udn,
- href_url->host, href_url->hostlen,
- idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- href_tok.len = href_url->hostlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
- #endif
- if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
- text_url->tldlen > 0 && href_url->tldlen > 0) {
-
- /* Apply the same logic for TLD */
- disp_tok.len = text_url->tldlen;
- disp_tok.begin = text_url->tld;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (text_url->tld,
- text_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
- text_url->tld, text_url->tldlen,
- idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- disp_tok.len = text_url->tldlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
- #endif
- href_tok.len = href_url->tldlen;
- href_tok.begin = href_url->tld;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (href_url->tld,
- href_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8 (udn,
- href_url->tld, href_url->tldlen,
- idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- href_tok.len = href_url->tldlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
- #endif
- if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
- /* Check if one url is a subdomain for another */
-
- if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
- href_url->phished_url = text_url;
- phished_tld.begin = href_tok.begin;
- phished_tld.len = href_tok.len;
- text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- }
- }
- }
-
- *ptext_url = text_url;
- *url_found = TRUE;
- }
- else {
- msg_info_pool ("extract of url '%s' failed: %s",
- url_str,
- rspamd_url_strerror (rc));
- }
- }
-
- }
-
- static gboolean
- rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
- struct html_tag *tag, GNode **cur_level, gboolean *balanced)
- {
- GNode *nnode;
- struct html_tag *parent;
-
- if (hc->html_tags == NULL) {
- nnode = g_node_new (NULL);
- *cur_level = nnode;
- hc->html_tags = nnode;
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_node_destroy,
- nnode);
- }
-
- if (hc->total_tags > max_tags) {
- hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
- }
-
- if (tag->id == -1) {
- /* Ignore unknown tags */
- hc->total_tags ++;
- return FALSE;
- }
-
- tag->parent = *cur_level;
-
- if (!(tag->flags & CM_INLINE)) {
- /* Block tag */
- if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
- if (!*cur_level) {
- msg_debug_html ("bad parent node");
- return FALSE;
- }
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (*cur_level, nnode);
-
- if (!rspamd_html_check_balance (nnode, cur_level)) {
- msg_debug_html (
- "mark part as unbalanced as it has not pairable closing tags");
- hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
- } else {
- *balanced = TRUE;
- }
-
- hc->total_tags ++;
- }
- }
- else {
- parent = (*cur_level)->data;
-
- if (parent) {
- if ((parent->flags & FL_IGNORE)) {
- tag->flags |= FL_IGNORE;
- }
-
- if (!(tag->flags & FL_CLOSED) &&
- !(parent->flags & FL_BLOCK)) {
- /* We likely have some bad nesting */
- if (parent->id == tag->id) {
- /* Something like <a>bla<a>foo... */
- hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
- tag->parent = parent->parent;
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (parent->parent, nnode);
- *cur_level = nnode;
- hc->total_tags ++;
- }
-
- return TRUE;
- }
- }
-
- parent->content_length += tag->content_length;
- }
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (*cur_level, nnode);
-
- if ((tag->flags & FL_CLOSED) == 0) {
- *cur_level = nnode;
- }
-
- hc->total_tags ++;
- }
-
- if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
- tag->flags |= FL_IGNORE;
-
- return FALSE;
- }
-
- }
- }
- else {
- /* Inline tag */
- parent = (*cur_level)->data;
-
- if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
- tag->flags |= FL_IGNORE;
-
- return FALSE;
- }
- }
-
- return TRUE;
- }
-
- #define NEW_COMPONENT(comp_type) do { \
- comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
- comp->type = (comp_type); \
- comp->start = NULL; \
- comp->len = 0; \
- g_queue_push_tail (tag->params, comp); \
- ret = TRUE; \
- } while(0)
-
- static gboolean
- rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
- const guchar *begin, const guchar *end,
- struct html_tag *tag)
- {
- struct html_tag_component *comp;
- gint len;
- gboolean ret = FALSE;
- gchar *p;
-
- g_assert (end >= begin);
- p = rspamd_mempool_alloc (pool, end - begin);
- memcpy (p, begin, end - begin);
- len = rspamd_html_decode_entitles_inplace (p, end - begin);
-
- if (len == 3) {
- if (g_ascii_strncasecmp (p, "src", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- }
- else if (len == 4) {
- if (g_ascii_strncasecmp (p, "href", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- }
-
- if (tag->id == Tag_IMG) {
- /* Check width and height if presented */
- if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
- }
- else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- }
- else if (tag->id == Tag_FONT) {
- if (len == 5){
- if (g_ascii_strncasecmp (p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp (p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- else if (len == 4) {
- if (g_ascii_strncasecmp (p, "size", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
- }
- }
- }
- else if (tag->flags & FL_BLOCK) {
- if (len == 5){
- if (g_ascii_strncasecmp (p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp (p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- }
-
- return ret;
- }
-
- static inline void
- rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
- struct html_content *hc, struct html_tag *tag, const guchar *in,
- gint *statep, guchar const **savep)
- {
- enum {
- parse_start = 0,
- parse_name,
- parse_attr_name,
- parse_equal,
- parse_start_dquote,
- parse_dqvalue,
- parse_end_dquote,
- parse_start_squote,
- parse_sqvalue,
- parse_end_squote,
- parse_value,
- spaces_after_name,
- spaces_before_eq,
- spaces_after_eq,
- spaces_after_param,
- ignore_bad_tag
- } state;
- struct html_tag_def *found;
- gboolean store = FALSE;
- struct html_tag_component *comp;
-
- state = *statep;
-
- switch (state) {
- case parse_start:
- if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = ignore_bad_tag;
- tag->id = -1;
- tag->flags |= FL_BROKEN;
- }
- else if (g_ascii_isalpha (*in)) {
- state = parse_name;
- tag->name.start = in;
- }
- break;
-
- case parse_name:
- if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
- g_assert (in >= tag->name.start);
-
- if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
-
- tag->name.len = in - tag->name.start;
-
- if (tag->name.len == 0) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- tag->id = -1;
- tag->flags |= FL_BROKEN;
- state = ignore_bad_tag;
- }
- else {
- gchar *s;
- khiter_t k;
- /* We CANNOT safely modify tag's name here, as it is already parsed */
-
- s = rspamd_mempool_alloc (pool, tag->name.len + 1);
- memcpy (s, tag->name.start, tag->name.len);
- tag->name.len = rspamd_html_decode_entitles_inplace (s,
- tag->name.len);
- tag->name.start = s;
- tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
- s[tag->name.len] = '\0';
-
- k = kh_get (tag_by_name, html_tag_by_name, s);
-
- if (k == kh_end (html_tag_by_name)) {
- hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
- tag->id = -1;
- }
- else {
- found = &kh_val (html_tag_by_name, k);
- tag->id = found->id;
- tag->flags = found->flags;
- }
-
- state = spaces_after_name;
- }
- }
- break;
-
- case parse_attr_name:
- if (*savep == NULL) {
- state = ignore_bad_tag;
- }
- else {
- const guchar *attr_name_end = in;
-
- if (*in == '=') {
- state = parse_equal;
- }
- else if (*in == '"') {
- /* No equal or something sane but we have quote character */
- state = parse_start_dquote;
- attr_name_end = in - 1;
-
- while (attr_name_end > *savep) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end --;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end ++;
- }
- else if (g_ascii_isspace (*in)) {
- state = spaces_before_eq;
- }
- else if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (!g_ascii_isgraph (*in)) {
- state = parse_value;
- attr_name_end = in - 1;
-
- while (attr_name_end > *savep) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end --;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end ++;
- }
- else {
- return;
- }
-
- if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
- /* Ignore unknown params */
- *savep = NULL;
- }
- else if (state == parse_value) {
- *savep = in + 1;
- }
- }
-
- break;
-
- case spaces_after_name:
- if (!g_ascii_isspace (*in)) {
- *savep = in;
- if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (*in != '>') {
- state = parse_attr_name;
- }
- }
- break;
-
- case spaces_before_eq:
- if (*in == '=') {
- state = parse_equal;
- }
- else if (!g_ascii_isspace (*in)) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- tag->flags |= FL_BROKEN;
- state = ignore_bad_tag;
- }
- break;
-
- case spaces_after_eq:
- if (*in == '"') {
- state = parse_start_dquote;
- }
- else if (*in == '\'') {
- state = parse_start_squote;
- }
- else if (!g_ascii_isspace (*in)) {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_value;
- }
- break;
-
- case parse_equal:
- if (g_ascii_isspace (*in)) {
- state = spaces_after_eq;
- }
- else if (*in == '"') {
- state = parse_start_dquote;
- }
- else if (*in == '\'') {
- state = parse_start_squote;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_value;
- }
- break;
-
- case parse_start_dquote:
- if (*in == '"') {
- if (*savep != NULL) {
- /* We have an empty attribute value */
- savep = NULL;
- }
- state = spaces_after_param;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_dqvalue;
- }
- break;
-
- case parse_start_squote:
- if (*in == '\'') {
- if (*savep != NULL) {
- /* We have an empty attribute value */
- savep = NULL;
- }
- state = spaces_after_param;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_sqvalue;
- }
- break;
-
- case parse_dqvalue:
- if (*in == '"') {
- store = TRUE;
- state = parse_end_dquote;
- }
-
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_sqvalue:
- if (*in == '\'') {
- store = TRUE;
- state = parse_end_squote;
- }
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_value:
- if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- store = TRUE;
- }
- else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
- store = TRUE;
- state = spaces_after_param;
- }
-
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_end_dquote:
- case parse_end_squote:
- if (g_ascii_isspace (*in)) {
- state = spaces_after_param;
- }
- else if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- }
- break;
-
- case spaces_after_param:
- if (!g_ascii_isspace (*in)) {
- if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- }
-
- state = parse_attr_name;
- *savep = in;
- }
- break;
-
- case ignore_bad_tag:
- break;
- }
-
- *statep = state;
- }
-
-
-
- struct rspamd_url *
- rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
- struct html_tag_component *comp)
- {
- struct rspamd_url *url;
- guint saved_flags = 0;
- gchar *decoded;
- gint rc;
- gsize decoded_len;
- const gchar *p, *s, *prefix = "http://";
- gchar *d;
- guint i, dlen;
- gboolean has_bad_chars = FALSE, no_prefix = FALSE;
- static const gchar hexdigests[16] = "0123456789abcdef";
-
- p = start;
-
- /* Strip spaces from the url */
- /* Head spaces */
- while (p < start + len && g_ascii_isspace (*p)) {
- p ++;
- start ++;
- len --;
- }
-
- if (comp) {
- comp->start = p;
- comp->len = len;
- }
-
- /* Trailing spaces */
- p = start + len - 1;
-
- while (p >= start && g_ascii_isspace (*p)) {
- p --;
- len --;
-
- if (comp) {
- comp->len --;
- }
- }
-
- s = start;
- dlen = 0;
-
- for (i = 0; i < len; i ++) {
- if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
- dlen += 3;
- }
- else {
- dlen ++;
- }
- }
-
- if (rspamd_substring_search (start, len, "://", 3) == -1) {
- if (len >= sizeof ("mailto:") &&
- (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
- memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
- memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
- /* Exclusion, has valid but 'strange' prefix */
- }
- else {
- for (i = 0; i < len; i ++) {
- if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
- if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
- prefix = "http:";
- dlen += sizeof ("http:") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == '@') {
- /* Likely email prefix */
- prefix = "mailto://";
- dlen += sizeof ("mailto://") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == ':' && i != 0) {
- /* Special case */
- no_prefix = FALSE;
- }
- else {
- if (i == 0) {
- /* No valid data */
- return NULL;
- }
- else {
- no_prefix = TRUE;
- dlen += strlen (prefix);
- }
- }
-
- break;
- }
- }
- }
- }
-
- decoded = rspamd_mempool_alloc (pool, dlen + 1);
- d = decoded;
-
- if (no_prefix) {
- gsize plen = strlen (prefix);
- memcpy (d, prefix, plen);
- d += plen;
- }
-
- /*
- * We also need to remove all internal newlines, spaces
- * and encode unsafe characters
- */
- for (i = 0; i < len; i ++) {
- if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
- continue;
- }
- else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
- /* URL encode */
- *d++ = '%';
- *d++ = hexdigests[(s[i] >> 4) & 0xf];
- *d++ = hexdigests[s[i] & 0xf];
- has_bad_chars = TRUE;
- }
- else {
- *d++ = s[i];
- }
- }
-
- *d = '\0';
- dlen = d - decoded;
-
- url = rspamd_mempool_alloc0 (pool, sizeof (*url));
-
- enum rspamd_normalise_result norm_res;
-
- norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
-
- if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
- saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
-
- if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
- saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
-
- if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
- saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
- }
- }
-
- rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
- /* Filter some completely damaged urls */
- if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
- !((url->flags & RSPAMD_URL_FLAG_OBSCURED) && (url->protocol & PROTOCOL_UNKNOWN))) {
- url->flags |= saved_flags;
-
- if (has_bad_chars) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (no_prefix) {
- url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
- }
-
- decoded = url->string;
- decoded_len = url->urllen;
-
- if (comp) {
- comp->start = decoded;
- comp->len = decoded_len;
- }
- /* Spaces in href usually mean an attempt to obfuscate URL */
- /* See https://github.com/vstakhov/rspamd/issues/593 */
- #if 0
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
- #endif
-
- return url;
- }
-
- return NULL;
- }
-
- static struct rspamd_url *
- rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- GList *cur;
- struct rspamd_url *url;
- const gchar *start;
- gsize len;
-
- cur = tag->params->head;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- start = comp->start;
- len = comp->len;
-
- /* Check base url */
- if (hc && hc->base_url && comp->len > 2) {
- /*
- * Relative url canot start from the following:
- * schema://
- * slash
- */
- gchar *buf;
- gsize orig_len;
-
- if (rspamd_substring_search (start, len, "://", 3) == -1) {
- /* Assume relative url */
-
- gboolean need_slash = FALSE;
-
- orig_len = len;
- len += hc->base_url->urllen;
-
- if (hc->base_url->string[hc->base_url->urllen - 1] != '/') {
- need_slash = TRUE;
- len ++;
- }
-
- buf = rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf (buf, len + 1, "%*s%s%*s",
- hc->base_url->urllen, hc->base_url->string,
- need_slash ? "/" : "",
- (gint)orig_len, start);
- start = buf;
- }
- else if (start[0] == '/' && start[1] != '/') {
- /* Relative to the hostname */
- orig_len = len;
- len += hc->base_url->hostlen + hc->base_url->protocollen +
- 3 /* for :// */;
- buf = rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
- hc->base_url->protocollen, hc->base_url->string,
- hc->base_url->hostlen, hc->base_url->host,
- (gint)orig_len, start);
- start = buf;
- }
- }
-
- url = rspamd_html_process_url (pool, start, len, comp);
-
- if (url && tag->extra == NULL) {
- tag->extra = url;
- }
-
- return url;
- }
-
- cur = g_list_next (cur);
- }
-
- return NULL;
- }
-
- static void
- rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
- GHashTable *tbl_urls, GHashTable *tbl_emails)
- {
- GHashTable *target_tbl;
- struct rspamd_url *query_url, *existing;
- gchar *url_str;
- gint rc;
- gboolean prefix_added;
-
- if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (url->querylen > 0) {
-
- if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
- RSPAMD_URL_FIND_ALL,
- NULL, &prefix_added)) {
- query_url = rspamd_mempool_alloc0 (pool,
- sizeof (struct rspamd_url));
-
- rc = rspamd_url_parse (query_url,
- url_str,
- strlen (url_str),
- pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK &&
- query_url->hostlen > 0) {
- msg_debug_html ("found url %s in query of url"
- " %*s", url_str, url->querylen, url->query);
-
- if (query_url->protocol == PROTOCOL_MAILTO) {
- target_tbl = tbl_emails;
- }
- else {
- target_tbl = tbl_urls;
- }
-
- if (prefix_added) {
- query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
- }
-
- if (query_url->flags
- & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED|
- RSPAMD_URL_FLAG_NUMERIC)) {
- /* Set obscured flag if query url is bad */
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- /* And vice-versa */
- if (url->flags & RSPAMD_URL_FLAG_OBSCURED) {
- query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if ((existing = g_hash_table_lookup (target_tbl,
- query_url)) == NULL) {
- g_hash_table_insert (target_tbl,
- query_url,
- query_url);
- }
- else {
- existing->count ++;
- }
- }
- }
- }
- }
-
- static void
- rspamd_html_process_data_image (rspamd_mempool_t *pool,
- struct html_image *img,
- struct html_tag_component *src)
- {
- /*
- * Here, we do very basic processing of the data:
- * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
- * We only parse base64 encoded data.
- * We ignore content type so far
- */
- struct rspamd_image *parsed_image;
- const gchar *semicolon_pos = NULL, *end = src->start + src->len;
-
- semicolon_pos = src->start;
-
- while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
- if (end - semicolon_pos > sizeof ("base64,")) {
- if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
- const gchar *data_pos = semicolon_pos + sizeof ("base64,");
- gchar *decoded;
- gsize encoded_len = end - data_pos, decoded_len;
- rspamd_ftok_t inp;
-
- decoded_len = (encoded_len / 4 * 3) + 12;
- decoded = rspamd_mempool_alloc (pool, decoded_len);
- rspamd_cryptobox_base64_decode (data_pos, encoded_len,
- decoded, &decoded_len);
- inp.begin = decoded;
- inp.len = decoded_len;
-
- parsed_image = rspamd_maybe_process_image (pool, &inp);
-
- if (parsed_image) {
- msg_debug_html ("detected %s image of size %ud x %ud in data url",
- rspamd_image_type_str (parsed_image->type),
- parsed_image->width, parsed_image->height);
- img->embedded_image = parsed_image;
- }
- }
-
- break;
- }
- else {
- /* Nothing useful */
- return;
- }
-
- semicolon_pos ++;
- }
- }
-
- static void
- rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- struct html_image *img;
- rspamd_ftok_t fstr;
- const guchar *p;
- GList *cur;
- gulong val;
- gboolean seen_width = FALSE, seen_height = FALSE;
- goffset pos;
-
- cur = tag->params->head;
- img = rspamd_mempool_alloc0 (pool, sizeof (*img));
- img->tag = tag;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- fstr.begin = (gchar *)comp->start;
- fstr.len = comp->len;
- img->src = rspamd_mempool_ftokdup (pool, &fstr);
-
- if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
- "cid:", sizeof ("cid:") - 1) == 0) {
- /* We have an embedded image */
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
- }
- else {
- if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
- "data:", sizeof ("data:") - 1) == 0) {
- /* We have an embedded image in HTML tag */
- img->flags |=
- (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
- rspamd_html_process_data_image (pool, img, comp);
- hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
- }
- else {
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
- if (img->src) {
- img->url = rspamd_html_process_url (pool,
- img->src, fstr.len, NULL);
- }
- }
- }
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
- rspamd_strtoul (comp->start, comp->len, &val);
- img->height = val;
- seen_height = TRUE;
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
- rspamd_strtoul (comp->start, comp->len, &val);
- img->width = val;
- seen_width = TRUE;
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
- /* Try to search for height= or width= in style tag */
- if (!seen_height && comp->len > 0) {
- pos = rspamd_substring_search_caseless (comp->start, comp->len,
- "height", sizeof ("height") - 1);
-
- if (pos != -1) {
- p = comp->start + pos + sizeof ("height") - 1;
-
- while (p < comp->start + comp->len) {
- if (g_ascii_isdigit (*p)) {
- rspamd_strtoul (p, comp->len - (p - comp->start), &val);
- img->height = val;
- break;
- }
- else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
- /* Fallback */
- break;
- }
- p ++;
- }
- }
- }
-
- if (!seen_width && comp->len > 0) {
- pos = rspamd_substring_search_caseless (comp->start, comp->len,
- "width", sizeof ("width") - 1);
-
- if (pos != -1) {
- p = comp->start + pos + sizeof ("width") - 1;
-
- while (p < comp->start + comp->len) {
- if (g_ascii_isdigit (*p)) {
- rspamd_strtoul (p, comp->len - (p - comp->start), &val);
- img->width = val;
- break;
- }
- else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
- /* Fallback */
- break;
- }
- p ++;
- }
- }
- }
- }
-
- cur = g_list_next (cur);
- }
-
- if (hc->images == NULL) {
- hc->images = g_ptr_array_sized_new (4);
- rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
- hc->images);
- }
-
- if (img->embedded_image) {
- if (!seen_height) {
- img->height = img->embedded_image->height;
- }
- if (!seen_width) {
- img->width = img->embedded_image->width;
- }
- }
-
- g_ptr_array_add (hc->images, img);
- tag->extra = img;
- }
-
- static void
- rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
- {
- const gchar *p = line, *end = line + len;
- char hexbuf[7];
- rspamd_ftok_t search;
- struct html_color *el;
-
- memset (cl, 0, sizeof (*cl));
-
- if (*p == '#') {
- /* HEX color */
- p ++;
- rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
- cl->d.val = strtoul (hexbuf, NULL, 16);
- cl->d.comp.alpha = 255;
- cl->valid = TRUE;
- }
- else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
- /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
- enum {
- obrace,
- num1,
- num2,
- num3,
- num4,
- skip_spaces
- } state = skip_spaces, next_state = obrace;
- gulong r = 0, g = 0, b = 0, opacity = 255;
- const gchar *c;
- gboolean valid = FALSE;
-
- p += 3;
-
- if (*p == 'a') {
- p ++;
- }
-
- c = p;
-
- while (p < end) {
- switch (state) {
- case obrace:
- if (*p == '(') {
- p ++;
- state = skip_spaces;
- next_state = num1;
- }
- else if (g_ascii_isspace (*p)) {
- state = skip_spaces;
- next_state = obrace;
- }
- else {
- goto stop;
- }
- break;
- case num1:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &r)) {
- goto stop;
- }
-
- p ++;
- state = skip_spaces;
- next_state = num2;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num2:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &g)) {
- goto stop;
- }
-
- p ++;
- state = skip_spaces;
- next_state = num3;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num3:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &b)) {
- goto stop;
- }
-
- valid = TRUE;
- p ++;
- state = skip_spaces;
- next_state = num4;
- }
- else if (*p == ')') {
- if (!rspamd_strtoul (c, p - c, &b)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num4:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &opacity)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (*p == ')') {
- if (!rspamd_strtoul (c, p - c, &opacity)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case skip_spaces:
- if (!g_ascii_isspace (*p)) {
- c = p;
- state = next_state;
- }
- else {
- p ++;
- }
- break;
- }
- }
-
- stop:
-
- if (valid) {
- cl->d.comp.r = r;
- cl->d.comp.g = g;
- cl->d.comp.b = b;
- cl->d.comp.alpha = opacity;
- cl->valid = TRUE;
- }
- }
- else {
- khiter_t k;
- /* Compare color by name */
- search.begin = line;
- search.len = len;
-
- k = kh_get (color_by_name, html_color_by_name, &search);
-
- if (k != kh_end (html_color_by_name)) {
- el = &kh_val (html_color_by_name, k);
- memcpy (cl, el, sizeof (*cl));
- cl->d.comp.alpha = 255; /* Non transparent */
- }
- }
- }
-
- /*
- * Target is used for in and out if this function returns TRUE
- */
- static gboolean
- rspamd_html_process_css_size (const gchar *suffix, gsize len,
- gdouble *tgt)
- {
- gdouble sz = *tgt;
- gboolean ret = FALSE;
-
- if (len >= 2) {
- if (memcmp (suffix, "px", 2) == 0) {
- sz = (guint) sz; /* Round to number */
- ret = TRUE;
- }
- else if (memcmp (suffix, "em", 2) == 0) {
- /* EM is 16 px, so multiply and round */
- sz = (guint) (sz * 16.0);
- ret = TRUE;
- }
- else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
- /* equal to EM in our case */
- sz = (guint) (sz * 16.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "ex", 2) == 0) {
- /*
- * Represents the x-height of the element's font.
- * On fonts with the "x" letter, this is generally the height
- * of lowercase letters in the font; 1ex = 0.5em in many fonts.
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "vw", 2) == 0) {
- /*
- * Vewport width in percentages:
- * we assume 1% of viewport width as 8px
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "vh", 2) == 0) {
- /*
- * Vewport height in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 6.0);
- ret = TRUE;
- }
- else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
- /*
- * Vewport width in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
- /*
- * Vewport height in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 6.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "pt", 2) == 0) {
- sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
- ret = TRUE;
- }
- else if (memcmp (suffix, "cm", 2) == 0) {
- sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
- ret = TRUE;
- }
- else if (memcmp (suffix, "mm", 2) == 0) {
- sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
- ret = TRUE;
- }
- else if (memcmp (suffix, "in", 2) == 0) {
- sz = (guint) (sz * 96.0); /* 96px */
- ret = TRUE;
- }
- else if (memcmp (suffix, "pc", 2) == 0) {
- sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
- ret = TRUE;
- }
- }
- else if (suffix[0] == '%') {
- /* Percentages from 16 px */
- sz = (guint)(sz / 100.0 * 16.0);
- ret = TRUE;
- }
-
- if (ret) {
- *tgt = sz;
- }
-
- return ret;
- }
-
- static void
- rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
- gboolean is_css)
- {
- const gchar *p = line, *end = line + len;
- gchar *err = NULL, numbuf[64];
- gdouble sz = 0;
- gboolean failsafe = FALSE;
-
- while (p < end && g_ascii_isspace (*p)) {
- p ++;
- len --;
- }
-
- if (g_ascii_isdigit (*p)) {
- rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
- sz = strtod (numbuf, &err);
-
- /* Now check leftover */
- if (sz < 0) {
- sz = 0;
- }
- }
- else {
- /* Ignore the rest */
- failsafe = TRUE;
- sz = is_css ? 16 : 1;
- /* TODO: add textual fonts descriptions */
- }
-
- if (err && *err != '\0') {
- const gchar *e = err;
- gsize slen;
-
- /* Skip spaces */
- while (*e && g_ascii_isspace (*e)) {
- e ++;
- }
-
- /* Lowercase */
- slen = strlen (e);
- rspamd_str_lc ((gchar *)e, slen);
-
- if (!rspamd_html_process_css_size (e, slen, &sz)) {
- failsafe = TRUE;
- }
- }
- else {
- /* Failsafe naked number */
- failsafe = TRUE;
- }
-
- if (failsafe) {
- if (is_css) {
- /*
- * In css mode we usually ignore sizes, but let's treat
- * small sizes specially
- */
- if (sz < 1) {
- sz = 0;
- } else {
- sz = 16; /* Ignore */
- }
- } else {
- /* In non-css mode we have to check legacy size */
- sz = sz >= 1 ? sz * 16 : 16;
- }
- }
-
- if (sz > 32) {
- sz = 32;
- }
-
- *fs = sz;
- }
-
- static void
- rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
- struct html_content *hc, const gchar *style, guint len)
- {
- const gchar *p, *c, *end, *key = NULL;
- enum {
- read_key,
- read_colon,
- read_value,
- skip_spaces,
- } state = skip_spaces, next_state = read_key;
- guint klen = 0;
- gdouble opacity = 1.0;
-
- p = style;
- c = p;
- end = p + len;
-
- while (p <= end) {
- switch(state) {
- case read_key:
- if (p == end || *p == ':') {
- key = c;
- klen = p - c;
- state = skip_spaces;
- next_state = read_value;
- }
- else if (g_ascii_isspace (*p)) {
- key = c;
- klen = p - c;
- state = skip_spaces;
- next_state = read_colon;
- }
-
- p ++;
- break;
-
- case read_colon:
- if (p == end || *p == ':') {
- state = skip_spaces;
- next_state = read_value;
- }
-
- p ++;
- break;
-
- case read_value:
- if (p == end || *p == ';') {
- if (key && klen && p - c > 0) {
- if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
- || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
-
- rspamd_html_process_color (c, p - c, &bl->font_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
- }
- else if ((klen == 16 && g_ascii_strncasecmp (key,
- "background-color", 16) == 0) ||
- (klen == 10 && g_ascii_strncasecmp (key,
- "background", 10) == 0)) {
-
- rspamd_html_process_color (c, p - c, &bl->background_color);
- msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
- }
- else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
- if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
- "none", 4) != -1) {
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
- }
- else if (klen == 9 &&
- g_ascii_strncasecmp (key, "font-size", 9) == 0) {
- rspamd_html_process_font_size (c, p - c,
- &bl->font_size, TRUE);
- msg_debug_html ("got font size: %ud", bl->font_size);
- }
- else if (klen == 7 &&
- g_ascii_strncasecmp (key, "opacity", 7) == 0) {
- gchar numbuf[64];
-
- rspamd_strlcpy (numbuf, c,
- MIN (sizeof (numbuf), p - c + 1));
- opacity = strtod (numbuf, NULL);
-
- if (opacity > 1) {
- opacity = 1;
- }
- else if (opacity < 0) {
- opacity = 0;
- }
-
- bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
- }
- else if (klen == 10 &&
- g_ascii_strncasecmp (key, "visibility", 10) == 0) {
- if (p - c >= 6 && rspamd_substring_search_caseless (c,
- p - c,
- "hidden", 6) != -1) {
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
- }
- }
-
- key = NULL;
- klen = 0;
- state = skip_spaces;
- next_state = read_key;
- }
-
- p ++;
- break;
-
- case skip_spaces:
- if (p < end && !g_ascii_isspace (*p)) {
- c = p;
- state = next_state;
- }
- else {
- p ++;
- }
-
- break;
- }
- }
- }
-
- static void
- rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- struct html_block *bl;
- rspamd_ftok_t fstr;
- GList *cur;
-
- cur = tag->params->head;
- bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
- bl->tag = tag;
- bl->visible = TRUE;
- bl->font_size = (guint)-1;
- bl->font_color.d.comp.alpha = 255;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->len > 0) {
- switch (comp->type) {
- case RSPAMD_HTML_COMPONENT_COLOR:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- rspamd_html_process_color (comp->start, comp->len,
- &bl->font_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
- break;
- case RSPAMD_HTML_COMPONENT_BGCOLOR:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- rspamd_html_process_color (comp->start, comp->len,
- &bl->background_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
-
- if (tag->id == Tag_BODY) {
- /* Set global background color */
- memcpy (&hc->bgcolor, &bl->background_color,
- sizeof (hc->bgcolor));
- }
- break;
- case RSPAMD_HTML_COMPONENT_STYLE:
- bl->style.len = comp->len;
- bl->style.start = comp->start;
- msg_debug_html ("got style: %*s", (gint) bl->style.len,
- bl->style.start);
- rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
- break;
- case RSPAMD_HTML_COMPONENT_CLASS:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
- msg_debug_html ("got class: %s", bl->html_class);
- break;
- case RSPAMD_HTML_COMPONENT_SIZE:
- /* Not supported by html5 */
- /* FIXME maybe support it */
- bl->font_size = 16;
- msg_debug_html ("got size: %*s", (gint)comp->len, comp->start);
- break;
- default:
- /* NYI */
- break;
- }
- }
-
- cur = g_list_next (cur);
- }
-
- if (hc->blocks == NULL) {
- hc->blocks = g_ptr_array_sized_new (64);
- rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
- hc->blocks);
- }
-
- g_ptr_array_add (hc->blocks, bl);
- tag->extra = bl;
- }
-
- static void
- rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
- GList **exceptions, GHashTable *urls, GHashTable *emails,
- GByteArray *dest, GHashTable *target_tbl,
- gint href_offset,
- struct rspamd_url *url)
- {
- struct rspamd_url *displayed_url = NULL;
- struct rspamd_url *turl;
- gboolean url_found = FALSE;
- struct rspamd_process_exception *ex;
-
- if (href_offset <= 0) {
- /* No dispalyed url, just some text within <a> tag */
- return;
- }
-
- url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
- rspamd_strlcpy (url->visible_part, dest->data + href_offset,
- dest->len - href_offset + 1);
- g_strstrip (url->visible_part);
-
- rspamd_html_url_is_phished (pool, url,
- dest->data + href_offset,
- dest->len - href_offset,
- &url_found, &displayed_url);
-
- if (url_found) {
- url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
- }
- if (exceptions && url_found) {
- ex = rspamd_mempool_alloc (pool,
- sizeof (*ex));
- ex->pos = href_offset;
- ex->len = dest->len - href_offset;
- ex->type = RSPAMD_EXCEPTION_URL;
- ex->ptr = url;
-
- *exceptions = g_list_prepend (*exceptions,
- ex);
- }
-
- if (displayed_url) {
- if (displayed_url->protocol ==
- PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
-
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl,
- displayed_url);
-
- if (turl != NULL) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags &
- RSPAMD_URL_FLAG_FROM_TEXT) {
- turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count ++;
- }
- else {
- g_hash_table_insert (target_tbl,
- displayed_url,
- displayed_url);
- }
- }
- }
- }
-
- static gboolean
- rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
- {
- GNode *child;
- struct html_tag *tag = node->data, *cld_tag;
-
- if (tag) {
- child = node->children;
-
- /* Summarize content length from children */
- while (child) {
- cld_tag = child->data;
- tag->content_length += cld_tag->content_length;
- child = child->next;
- }
- }
-
- return FALSE;
- }
-
- static void
- rspamd_html_propagate_style (struct html_content *hc,
- struct html_tag *tag,
- struct html_block *bl,
- GQueue *blocks)
- {
- struct html_block *bl_parent;
- gboolean push_block = FALSE;
-
-
- /* Propagate from the parent if needed */
- bl_parent = g_queue_peek_tail (blocks);
-
- if (bl_parent) {
- if (!bl->background_color.valid) {
- /* Try to propagate background color from parent nodes */
- if (bl_parent->background_color.valid) {
- memcpy (&bl->background_color, &bl_parent->background_color,
- sizeof (bl->background_color));
- }
- }
- else {
- push_block = TRUE;
- }
-
- if (!bl->font_color.valid) {
- /* Try to propagate background color from parent nodes */
- if (bl_parent->font_color.valid) {
- memcpy (&bl->font_color, &bl_parent->font_color,
- sizeof (bl->font_color));
- }
- }
- else {
- push_block = TRUE;
- }
-
- /* Propagate font size */
- if (bl->font_size == (guint)-1) {
- if (bl_parent->font_size != (guint)-1) {
- bl->font_size = bl_parent->font_size;
- }
- }
- else {
- push_block = TRUE;
- }
- }
-
- /* Set bgcolor to the html bgcolor and font color to black as a last resort */
- if (!bl->font_color.valid) {
- /* Don't touch opacity as it can be set separately */
- bl->font_color.d.comp.r = 0;
- bl->font_color.d.comp.g = 0;
- bl->font_color.d.comp.b = 0;
- bl->font_color.valid = TRUE;
- }
- else {
- push_block = TRUE;
- }
-
- if (!bl->background_color.valid) {
- memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
- }
- else {
- push_block = TRUE;
- }
-
- if (bl->font_size == (guint)-1) {
- bl->font_size = 16; /* Default for browsers */
- }
- else {
- push_block = TRUE;
- }
-
- if (push_block && !(tag->flags & FL_CLOSED)) {
- g_queue_push_tail (blocks, bl);
- }
- }
-
- GByteArray*
- rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
- GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
- {
- const guchar *p, *c, *end, *savep = NULL;
- guchar t;
- gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
- balanced;
- GByteArray *dest;
- GHashTable *target_tbl;
- guint obrace = 0, ebrace = 0;
- GNode *cur_level = NULL;
- gint substate = 0, len, href_offset = -1;
- struct html_tag *cur_tag = NULL, *content_tag = NULL;
- struct rspamd_url *url = NULL, *turl;
- GQueue *styles_blocks;
-
- enum {
- parse_start = 0,
- tag_begin,
- sgml_tag,
- xml_tag,
- compound_tag,
- comment_tag,
- comment_content,
- sgml_content,
- tag_content,
- tag_end,
- xml_tag_end,
- content_ignore,
- content_write,
- content_ignore_sp
- } state = parse_start;
-
- g_assert (in != NULL);
- g_assert (hc != NULL);
- g_assert (pool != NULL);
-
- rspamd_html_library_init ();
- hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (G_N_ELEMENTS (tag_defs)));
-
- /* Set white background color by default */
- hc->bgcolor.d.comp.alpha = 0;
- hc->bgcolor.d.comp.r = 255;
- hc->bgcolor.d.comp.g = 255;
- hc->bgcolor.d.comp.b = 255;
- hc->bgcolor.valid = TRUE;
-
- dest = g_byte_array_sized_new (in->len / 3 * 2);
- styles_blocks = g_queue_new ();
-
- p = in->data;
- c = p;
- end = p + in->len;
-
- while (p < end) {
- t = *p;
-
- switch (state) {
- case parse_start:
- if (t == '<') {
- state = tag_begin;
- }
- else {
- /* We have no starting tag, so assume that it's content */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
- state = content_write;
- }
-
- break;
- case tag_begin:
- switch (t) {
- case '<':
- p ++;
- closing = FALSE;
- break;
- case '!':
- state = sgml_tag;
- p ++;
- break;
- case '?':
- state = xml_tag;
- hc->flags |= RSPAMD_HTML_FLAG_XML;
- p ++;
- break;
- case '/':
- closing = TRUE;
- p ++;
- break;
- case '>':
- /* Empty tag */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- continue;
- default:
- state = tag_content;
- substate = 0;
- savep = NULL;
- cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
- cur_tag->params = g_queue_new ();
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
- break;
- }
-
- break;
-
- case sgml_tag:
- switch (t) {
- case '[':
- state = compound_tag;
- obrace = 1;
- ebrace = 0;
- p ++;
- break;
- case '-':
- state = comment_tag;
- p ++;
- break;
- default:
- state = sgml_content;
- break;
- }
-
- break;
-
- case xml_tag:
- if (t == '?') {
- state = xml_tag_end;
- }
- else if (t == '>') {
- /* Misformed xml tag */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- continue;
- }
- /* We efficiently ignore xml tags */
- p ++;
- break;
-
- case xml_tag_end:
- if (t == '>') {
- state = tag_end;
- continue;
- }
- else {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- p ++;
- }
- break;
-
- case compound_tag:
- if (t == '[') {
- obrace ++;
- }
- else if (t == ']') {
- ebrace ++;
- }
- else if (t == '>' && obrace == ebrace) {
- state = tag_end;
- continue;
- }
- p ++;
- break;
-
- case comment_tag:
- if (t != '-') {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- }
- else {
- p++;
- ebrace = 0;
- /*
- * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
- * ... the text must not start with a single
- * U+003E GREATER-THAN SIGN character (>),
- * nor start with a "-" (U+002D) character followed by
- * a U+003E GREATER-THAN SIGN (>) character,
- * nor contain two consecutive U+002D HYPHEN-MINUS
- * characters (--), nor end with a "-" (U+002D) character.
- */
- if (p[0] == '-' && p + 1 < end && p[1] == '>') {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- p ++;
- state = tag_end;
- }
- else if (*p == '>') {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- }
- else {
- state = comment_content;
- }
- }
- break;
-
- case comment_content:
- if (t == '-') {
- ebrace ++;
- }
- else if (t == '>' && ebrace >= 2) {
- state = tag_end;
- continue;
- }
- else {
- ebrace = 0;
- }
-
- p ++;
- break;
-
- case content_ignore:
- if (t != '<') {
- p ++;
- }
- else {
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c;
- }
- state = tag_begin;
- }
- break;
-
- case content_write:
-
- if (t != '<') {
- if (t == '&') {
- need_decode = TRUE;
- }
- else if (g_ascii_isspace (t)) {
- save_space = TRUE;
-
- if (p > c) {
- if (need_decode) {
- goffset old_offset = dest->len;
-
- g_byte_array_append (dest, c, (p - c));
-
- len = rspamd_html_decode_entitles_inplace (
- dest->data + old_offset,
- p - c);
- dest->len = dest->len + len - (p - c);
- }
- else {
- len = p - c;
- g_byte_array_append (dest, c, len);
- }
-
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c + 1;
- }
- }
-
- c = p;
- state = content_ignore_sp;
- }
- else {
- if (save_space) {
- /* Append one space if needed */
- if (dest->len > 0 &&
- !g_ascii_isspace (dest->data[dest->len - 1])) {
- g_byte_array_append (dest, " ", 1);
- }
- save_space = FALSE;
- }
- }
- }
- else {
- if (c != p) {
-
- if (need_decode) {
- goffset old_offset = dest->len;
-
- g_byte_array_append (dest, c, (p - c));
- len = rspamd_html_decode_entitles_inplace (
- dest->data + old_offset,
- p - c);
- dest->len = dest->len + len - (p - c);
- }
- else {
- len = p - c;
- g_byte_array_append (dest, c, len);
- }
-
-
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c;
- }
- }
-
- content_tag = NULL;
-
- state = tag_begin;
- continue;
- }
-
- p ++;
- break;
-
- case content_ignore_sp:
- if (!g_ascii_isspace (t)) {
- c = p;
- state = content_write;
- continue;
- }
-
- if (content_tag) {
- content_tag->content_length ++;
- }
-
- p ++;
- break;
-
- case sgml_content:
- /* TODO: parse DOCTYPE here */
- if (t == '>') {
- state = tag_end;
- /* We don't know a lot about sgml tags, ignore them */
- cur_tag = NULL;
- continue;
- }
- p ++;
- break;
-
- case tag_content:
- rspamd_html_parse_tag_content (pool, hc, cur_tag,
- p, &substate, &savep);
- if (t == '>') {
- if (closing) {
- cur_tag->flags |= FL_CLOSING;
-
- if (cur_tag->flags & FL_CLOSED) {
- /* Bad mix of closed and closing */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- }
-
- closing = FALSE;
- }
-
- state = tag_end;
- continue;
- }
- p ++;
- break;
-
- case tag_end:
- substate = 0;
- savep = NULL;
-
- if (cur_tag != NULL) {
- balanced = TRUE;
-
- if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
- &balanced)) {
- state = content_write;
- need_decode = FALSE;
- }
- else {
- state = content_ignore;
- }
-
- if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
- if (cur_tag->flags & CM_UNIQUE) {
- if (isset (hc->tags_seen, cur_tag->id)) {
- /* Duplicate tag has been found */
- hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
- }
- }
- setbit (hc->tags_seen, cur_tag->id);
- }
-
- if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
- content_tag = cur_tag;
- }
-
- /* Handle newlines */
- if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest, "\r\n", 2);
- }
- save_space = FALSE;
- }
-
- if ((cur_tag->id == Tag_P ||
- cur_tag->id == Tag_TR ||
- cur_tag->id == Tag_DIV)) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest, "\r\n", 2);
- }
- save_space = FALSE;
- }
-
- if (cur_tag->flags & FL_HREF) {
- if (!(cur_tag->flags & (FL_CLOSING))) {
- url = rspamd_html_process_url_tag (pool, cur_tag, hc);
-
- if (url != NULL) {
-
- if (url->protocol == PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
-
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl, url);
-
- if (turl == NULL) {
- g_hash_table_insert (target_tbl, url, url);
- }
- else {
- turl->count ++;
- url = NULL;
- }
-
- if (turl == NULL && url != NULL) {
- rspamd_process_html_url (pool,
- url,
- urls, emails);
- }
- }
-
- href_offset = dest->len;
- }
- }
-
- if (cur_tag->id == Tag_A) {
- if (!balanced && cur_level && cur_level->prev) {
- struct html_tag *prev_tag;
- struct rspamd_url *prev_url;
-
- prev_tag = cur_level->prev->data;
-
- if (prev_tag->id == Tag_A &&
- !(prev_tag->flags & (FL_CLOSING)) &&
- prev_tag->extra) {
- prev_url = prev_tag->extra;
-
- rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
- prev_url);
- }
- }
-
- if (cur_tag->flags & (FL_CLOSING)) {
-
- /* Insert exception */
- if (url != NULL && (gint) dest->len > href_offset) {
- rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
- url);
-
- }
-
- href_offset = -1;
- url = NULL;
- }
- }
- }
- else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
- struct html_tag *prev_tag = NULL;
-
- if (cur_level && cur_level->parent) {
- prev_tag = cur_level->parent->data;
- }
-
- /*
- * Base is allowed only within head tag but we slightly
- * relax that
- */
- if (!prev_tag || prev_tag->id == Tag_HEAD ||
- prev_tag->id == Tag_HTML) {
- url = rspamd_html_process_url_tag (pool, cur_tag, hc);
-
- if (url != NULL) {
- if (hc->base_url == NULL) {
- /* We have a base tag available */
- hc->base_url = url;
- }
-
- cur_tag->extra = url;
- }
- }
- }
-
- if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
- rspamd_html_process_img_tag (pool, cur_tag, hc);
- }
- else if (cur_tag->flags & FL_BLOCK) {
- struct html_block *bl;
-
- if (cur_tag->flags & FL_CLOSING) {
- /* Just remove block element from the queue if any */
- if (styles_blocks->length > 0) {
- g_queue_pop_tail (styles_blocks);
- }
- }
- else {
- rspamd_html_process_block_tag (pool, cur_tag, hc);
- bl = cur_tag->extra;
-
- if (bl) {
- rspamd_html_propagate_style (hc, cur_tag,
- cur_tag->extra, styles_blocks);
-
- /* Check visibility */
- if (bl->font_size < 3 ||
- bl->font_color.d.comp.alpha < 10) {
-
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
-
- if (!bl->visible) {
- state = content_ignore;
- }
- }
- }
- }
- }
- else {
- state = content_write;
- }
-
-
- p++;
- c = p;
- cur_tag = NULL;
- break;
- }
- }
-
- if (hc->html_tags) {
- g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
- rspamd_html_propagate_lengths, NULL);
- }
-
- g_queue_free (styles_blocks);
-
- return dest;
- }
-
- GByteArray*
- rspamd_html_process_part (rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in)
- {
- return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
- }
|