123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "config.h"
- #include "util.h"
- #include "rspamd.h"
- #include "message.h"
- #include "html.h"
- #include "html_tags.h"
- #include "html_colors.h"
- #include "html_entities.h"
- #include "url.h"
- #include "contrib/libucl/khash.h"
- #include "libmime/images.h"
-
- #include <unicode/uversion.h>
- #include <unicode/ucnv.h>
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- #include <unicode/uidna.h>
- #endif
-
- static sig_atomic_t tags_sorted = 0;
- static sig_atomic_t entities_sorted = 0;
- static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
-
- struct html_tag_def {
- const gchar *name;
- gint16 id;
- guint16 len;
- guint flags;
- };
-
- #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
- rspamd_html_log_id, "html", pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-
- INIT_LOG_MODULE(html)
-
- #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
-
- static struct html_tag_def tag_defs[] = {
- /* W3C defined elements */
- TAG_DEF(Tag_A, "a", 0),
- TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
- TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
- TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
- TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
- TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
- TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
- TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
- TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
- TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
- TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
- TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
- TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
- TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
- TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
- TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_EM, "em", (CM_INLINE)),
- TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
- TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
- TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
- TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)),
- TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
- TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_I, "i", (CM_INLINE)),
- TAG_DEF(Tag_IFRAME, "iframe", (0)),
- TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
- TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
- TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
- TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_MAP, "map", (CM_INLINE)),
- TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
- TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
- TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
- TAG_DEF(Tag_Q, "q", (CM_INLINE)),
- TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
- TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
- TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
- TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
- TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
- TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
- TAG_DEF(Tag_S, "s", (CM_INLINE)),
- TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
- TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
- TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
- TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
- TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
- TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
- TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
- TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
- TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
- TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
- TAG_DEF(Tag_U, "u", (CM_INLINE)),
- TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
- TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
-
- /* proprietary elements */
- TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
- TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
- TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
- TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
- TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
- TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
- TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
- TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
- TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
- TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
- TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
- TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
- };
-
- KHASH_MAP_INIT_INT (entity_by_number, const char *);
- KHASH_MAP_INIT_STR (entity_by_name, const char *);
- KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
- KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
- KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
- rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
-
- khash_t(entity_by_number) *html_entity_by_number;
- khash_t(entity_by_name) *html_entity_by_name;
- khash_t(tag_by_name) *html_tag_by_name;
- khash_t(tag_by_id) *html_tag_by_id;
- khash_t(color_by_name) *html_color_by_name;
-
- static void
- rspamd_html_library_init (void)
- {
- guint i;
- khiter_t k;
- gint rc;
-
- if (!tags_sorted) {
- html_tag_by_id = kh_init (tag_by_id);
- html_tag_by_name = kh_init (tag_by_name);
- kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
- kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
-
- for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
- k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
- kh_val (html_tag_by_id, k) = tag_defs[i];
-
- k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
- kh_val (html_tag_by_name, k) = tag_defs[i];
- }
-
- tags_sorted = 1;
- }
-
- if (!entities_sorted) {
- html_entity_by_number = kh_init (entity_by_number);
- html_entity_by_name = kh_init (entity_by_name);
- kh_resize (entity_by_number, html_entity_by_number,
- G_N_ELEMENTS (entities_defs));
- kh_resize (entity_by_name, html_entity_by_name,
- G_N_ELEMENTS (entities_defs));
-
- for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
- k = kh_put (entity_by_number, html_entity_by_number,
- entities_defs[i].code, &rc);
- kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
-
- k = kh_put (entity_by_name, html_entity_by_name,
- entities_defs[i].name, &rc);
- kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
- }
-
- html_color_by_name = kh_init (color_by_name);
- kh_resize (color_by_name, html_color_by_name,
- G_N_ELEMENTS (html_colornames));
-
- rspamd_ftok_t *keys;
-
- keys = g_malloc0 (sizeof (rspamd_ftok_t) *
- G_N_ELEMENTS (html_colornames));
-
- for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
- struct html_color c;
-
- keys[i].begin = html_colornames[i].name;
- keys[i].len = strlen (html_colornames[i].name);
- k = kh_put (color_by_name, html_color_by_name,
- &keys[i], &rc);
- c.valid = true;
- c.d.comp.r = html_colornames[i].rgb.r;
- c.d.comp.g = html_colornames[i].rgb.g;
- c.d.comp.b = html_colornames[i].rgb.b;
- c.d.comp.alpha = 255;
- kh_val (html_color_by_name, k) = c;
-
- }
-
- entities_sorted = 1;
- }
- }
-
- static gboolean
- rspamd_html_check_balance (GNode * node, GNode ** cur_level)
- {
- struct html_tag *arg = node->data, *tmp;
- GNode *cur;
-
- if (arg->flags & FL_CLOSING) {
- /* First of all check whether this tag is closing tag for parent node */
- cur = node->parent;
- while (cur && cur->data) {
- tmp = cur->data;
- if (tmp->id == arg->id &&
- (tmp->flags & FL_CLOSED) == 0) {
- tmp->flags |= FL_CLOSED;
- /* Destroy current node as we find corresponding parent node */
- g_node_destroy (node);
- /* Change level */
- *cur_level = cur->parent;
- return TRUE;
- }
- cur = cur->parent;
- }
- }
- else {
- return TRUE;
- }
-
- return FALSE;
- }
-
- gint
- rspamd_html_tag_by_name (const gchar *name)
- {
- khiter_t k;
-
- k = kh_get (tag_by_name, html_tag_by_name, name);
-
- if (k != kh_end (html_tag_by_name)) {
- return kh_val (html_tag_by_name, k).id;
- }
-
- return -1;
- }
-
- gboolean
- rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
- {
- gint id;
-
- g_assert (hc != NULL);
- g_assert (hc->tags_seen != NULL);
-
- id = rspamd_html_tag_by_name (tagname);
-
- if (id != -1) {
- return isset (hc->tags_seen, id);
- }
-
- return FALSE;
- }
-
- const gchar *
- rspamd_html_tag_by_id (gint id)
- {
- khiter_t k;
-
- k = kh_get (tag_by_id, html_tag_by_id, id);
-
- if (k != kh_end (html_tag_by_id)) {
- return kh_val (html_tag_by_id, k).name;
- }
-
- return NULL;
- }
-
- /* Decode HTML entitles in text */
- guint
- rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
- {
- goffset l, rep_len;
- gchar *t = s, *h = s, *e = s, *end_ptr;
- const gchar *end;
- const gchar *entity;
- gint state = 0, base;
- UChar32 uc;
- khiter_t k;
-
- if (len == 0) {
- l = strlen (s);
- }
- else {
- l = len;
- }
-
- end = s + l;
-
- while (h - s < l) {
- switch (state) {
- /* Out of entity */
- case 0:
- if (*h == '&') {
- state = 1;
- e = h;
- h++;
- continue;
- }
- else {
- *t = *h;
- h++;
- t++;
- }
- break;
- case 1:
- if (*h == ';' && h > e) {
- /* Determine base */
- /* First find in entities table */
- *h = '\0';
- entity = e + 1;
- uc = 0;
-
- if (*entity != '#') {
- k = kh_get (entity_by_name, html_entity_by_name, entity);
- *h = ';';
-
- if (k != kh_end (html_entity_by_name)) {
- if (kh_val (html_entity_by_name, k)) {
- rep_len = strlen (kh_val (html_entity_by_name, k));
-
- if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_name, k),
- rep_len);
- t += rep_len;
- }
- } else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else if (e + 2 < h) {
- if (*(e + 2) == 'x' || *(e + 2) == 'X') {
- base = 16;
- }
- else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
- base = 8;
- }
- else {
- base = 10;
- }
-
- if (base == 10) {
- uc = strtoul ((e + 2), &end_ptr, base);
- }
- else {
- uc = strtoul ((e + 3), &end_ptr, base);
- }
-
- if (end_ptr != NULL && *end_ptr != '\0') {
- /* Skip undecoded */
- *h = ';';
-
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- else {
- /* Search for a replacement */
- *h = ';';
- k = kh_get (entity_by_number, html_entity_by_number, uc);
-
- if (k != kh_end (html_entity_by_number)) {
- if (kh_val (html_entity_by_number, k)) {
- rep_len = strlen (kh_val (html_entity_by_number, k));
-
- if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_number, k),
- rep_len);
- t += rep_len;
- }
- } else {
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else {
- /* Unicode point */
- goffset off = t - s;
- UBool is_error = 0;
-
- if (uc > 0) {
- U8_APPEND (s, off, len, uc, is_error);
- if (!is_error) {
- t = s + off;
- }
- else {
- /* Leave invalid entities as is */
- if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- else if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
- t += h - e + 1;
- }
- }
- }
- }
-
- state = 0;
- }
- else if (*h == '&') {
- /* Previous `&` was bogus */
- state = 1;
-
- if (end - t > h - e) {
- memmove (t, e, h - e);
- t += h - e;
- }
-
- e = h;
- }
-
- h++;
-
- break;
- }
- }
-
- /* Leftover */
- if (state == 1 && h > e) {
- /* Unfinished entity, copy as is */
- if (end - t > h - e) {
- memmove (t, e, h - e);
- t += h - e;
- }
- }
-
- return (t - s);
- }
-
- static gboolean
- rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
- {
- const gchar *p1, *p2;
-
- p1 = t1->begin + t1->len - 1;
- p2 = t2->begin + t2->len - 1;
-
- /* Skip trailing dots */
- while (p1 > t1->begin) {
- if (*p1 != '.') {
- break;
- }
-
- p1 --;
- }
-
- while (p2 > t2->begin) {
- if (*p2 != '.') {
- break;
- }
-
- p2 --;
- }
-
- while (p1 > t1->begin && p2 > t2->begin) {
- if (*p1 != *p2) {
- break;
- }
-
- p1 --;
- p2 --;
- }
-
- if (p2 == t2->begin) {
- /* p2 can be subdomain of p1 if *p1 is '.' */
- if (p1 != t1->begin && *(p1 - 1) == '.') {
- return TRUE;
- }
- }
- else if (p1 == t1->begin) {
- if (p2 != t2->begin && *(p2 - 1) == '.') {
- return TRUE;
- }
- }
-
- return FALSE;
- }
-
- static void
- rspamd_html_url_is_phished (rspamd_mempool_t *pool,
- struct rspamd_url *href_url,
- const guchar *url_text,
- gsize len,
- gboolean *url_found,
- struct rspamd_url **ptext_url)
- {
- struct rspamd_url *text_url;
- rspamd_ftok_t phished_tld, disp_tok, href_tok;
- gint rc;
- goffset url_pos;
- gchar *url_str = NULL, *idn_hbuf;
- const guchar *end = url_text + len, *p;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- static UIDNA *udn;
- UErrorCode uc_err = U_ZERO_ERROR;
- UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
- #endif
-
- *url_found = FALSE;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (udn == NULL) {
- udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
- }
- }
- #endif
-
- while (url_text < end && g_ascii_isspace (*url_text)) {
- url_text ++;
- }
-
- if (end > url_text + 4 &&
- rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
- &url_pos, NULL) &&
- url_str != NULL) {
- if (url_pos > 0) {
- /*
- * We have some url at some offset, so we need to check what is
- * at the start of the text
- */
- p = url_text;
-
- while (p < url_text + url_pos) {
- if (!g_ascii_isspace (*p)) {
- *url_found = FALSE;
- return;
- }
-
- p++;
- }
- }
- text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK) {
- disp_tok.len = text_url->hostlen;
- disp_tok.begin = text_url->host;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (text_url->host,
- text_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
- text_url->host, text_url->hostlen,
- idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- disp_tok.len = text_url->hostlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
- #endif
- href_tok.len = href_url->hostlen;
- href_tok.begin = href_url->host;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (href_url->host,
- href_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8 (udn,
- href_url->host, href_url->hostlen,
- idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- href_tok.len = href_url->hostlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
- #endif
- if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
-
- /* Apply the same logic for TLD */
- disp_tok.len = text_url->tldlen;
- disp_tok.begin = text_url->tld;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (text_url->tld,
- text_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
- text_url->tld, text_url->tldlen,
- idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- disp_tok.len = text_url->tldlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
- #endif
- href_tok.len = href_url->tldlen;
- href_tok.begin = href_url->tld;
- #if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (href_url->tld,
- href_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8 (udn,
- href_url->tld, href_url->tldlen,
- idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
- href_tok.len = href_url->tldlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
- #endif
- if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
- /* Check if one url is a subdomain for another */
-
- if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
- href_url->phished_url = text_url;
- phished_tld.begin = href_tok.begin;
- phished_tld.len = href_tok.len;
- rspamd_url_add_tag (text_url, "phishing",
- rspamd_mempool_ftokdup (pool, &phished_tld),
- pool);
- text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- }
- }
- }
-
- *ptext_url = text_url;
- *url_found = TRUE;
- }
- else {
- msg_info_pool ("extract of url '%s' failed: %s",
- url_str,
- rspamd_url_strerror (rc));
- }
- }
-
- }
-
- static gboolean
- rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
- struct html_tag *tag, GNode **cur_level, gboolean *balanced)
- {
- GNode *nnode;
- struct html_tag *parent;
-
- if (hc->html_tags == NULL) {
- nnode = g_node_new (NULL);
- *cur_level = nnode;
- hc->html_tags = nnode;
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_node_destroy,
- nnode);
- }
-
- if (hc->total_tags > max_tags) {
- hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
- }
-
- if (tag->id == -1) {
- /* Ignore unknown tags */
- hc->total_tags ++;
- return FALSE;
- }
-
- tag->parent = *cur_level;
-
- if (!(tag->flags & CM_INLINE)) {
- /* Block tag */
- if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
- if (!*cur_level) {
- msg_debug_html ("bad parent node");
- return FALSE;
- }
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (*cur_level, nnode);
-
- if (!rspamd_html_check_balance (nnode, cur_level)) {
- msg_debug_html (
- "mark part as unbalanced as it has not pairable closing tags");
- hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
- } else {
- *balanced = TRUE;
- }
-
- hc->total_tags ++;
- }
- }
- else {
- parent = (*cur_level)->data;
-
- if (parent) {
- if ((parent->flags & FL_IGNORE)) {
- tag->flags |= FL_IGNORE;
- }
-
- if (!(tag->flags & FL_CLOSED) &&
- !(parent->flags & FL_BLOCK)) {
- /* We likely have some bad nesting */
- if (parent->id == tag->id) {
- /* Something like <a>bla<a>foo... */
- hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
- tag->parent = parent->parent;
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (parent->parent, nnode);
- *cur_level = nnode;
- hc->total_tags ++;
- }
-
- return TRUE;
- }
- }
-
- parent->content_length += tag->content_length;
- }
-
- if (hc->total_tags < max_tags) {
- nnode = g_node_new (tag);
- g_node_append (*cur_level, nnode);
-
- if ((tag->flags & FL_CLOSED) == 0) {
- *cur_level = nnode;
- }
-
- hc->total_tags ++;
- }
-
- if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
- tag->flags |= FL_IGNORE;
-
- return FALSE;
- }
-
- }
- }
- else {
- /* Inline tag */
- parent = (*cur_level)->data;
-
- if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
- tag->flags |= FL_IGNORE;
-
- return FALSE;
- }
- }
-
- return TRUE;
- }
-
- #define NEW_COMPONENT(comp_type) do { \
- comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
- comp->type = (comp_type); \
- comp->start = NULL; \
- comp->len = 0; \
- g_queue_push_tail (tag->params, comp); \
- ret = TRUE; \
- } while(0)
-
- static gboolean
- rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
- const guchar *begin, const guchar *end,
- struct html_tag *tag)
- {
- struct html_tag_component *comp;
- gint len;
- gboolean ret = FALSE;
- gchar *p;
-
- g_assert (end >= begin);
- p = rspamd_mempool_alloc (pool, end - begin);
- memcpy (p, begin, end - begin);
- len = rspamd_html_decode_entitles_inplace (p, end - begin);
-
- if (len == 3) {
- if (g_ascii_strncasecmp (p, "src", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- }
- else if (len == 4) {
- if (g_ascii_strncasecmp (p, "href", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- }
-
- if (tag->id == Tag_IMG) {
- /* Check width and height if presented */
- if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
- }
- else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- }
- else if (tag->id == Tag_FONT) {
- if (len == 5){
- if (g_ascii_strncasecmp (p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp (p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- else if (len == 4) {
- if (g_ascii_strncasecmp (p, "size", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
- }
- }
- }
- else if (tag->flags & FL_BLOCK) {
- if (len == 5){
- if (g_ascii_strncasecmp (p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp (p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp (p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- }
-
- return ret;
- }
-
- static inline void
- rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
- struct html_content *hc, struct html_tag *tag, const guchar *in,
- gint *statep, guchar const **savep)
- {
- enum {
- parse_start = 0,
- parse_name,
- parse_attr_name,
- parse_equal,
- parse_start_dquote,
- parse_dqvalue,
- parse_end_dquote,
- parse_start_squote,
- parse_sqvalue,
- parse_end_squote,
- parse_value,
- spaces_after_name,
- spaces_before_eq,
- spaces_after_eq,
- spaces_after_param,
- ignore_bad_tag
- } state;
- struct html_tag_def *found;
- gboolean store = FALSE;
- struct html_tag_component *comp;
-
- state = *statep;
-
- switch (state) {
- case parse_start:
- if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = ignore_bad_tag;
- tag->id = -1;
- tag->flags |= FL_BROKEN;
- }
- else if (g_ascii_isalpha (*in)) {
- state = parse_name;
- tag->name.start = in;
- }
- break;
-
- case parse_name:
- if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
- g_assert (in >= tag->name.start);
-
- if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
-
- tag->name.len = in - tag->name.start;
-
- if (tag->name.len == 0) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- tag->id = -1;
- tag->flags |= FL_BROKEN;
- state = ignore_bad_tag;
- }
- else {
- gchar *s;
- khiter_t k;
- /* We CANNOT safely modify tag's name here, as it is already parsed */
-
- s = rspamd_mempool_alloc (pool, tag->name.len + 1);
- memcpy (s, tag->name.start, tag->name.len);
- tag->name.len = rspamd_html_decode_entitles_inplace (s,
- tag->name.len);
- tag->name.start = s;
- s[tag->name.len] = '\0';
- rspamd_str_lc_utf8 (s, tag->name.len);
-
- k = kh_get (tag_by_name, html_tag_by_name, s);
-
- if (k == kh_end (html_tag_by_name)) {
- hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
- tag->id = -1;
- }
- else {
- found = &kh_val (html_tag_by_name, k);
- tag->id = found->id;
- tag->flags = found->flags;
- }
-
- state = spaces_after_name;
- }
- }
- break;
-
- case parse_attr_name:
- if (*savep == NULL) {
- state = ignore_bad_tag;
- }
- else {
- const guchar *attr_name_end = in;
-
- if (*in == '=') {
- state = parse_equal;
- }
- else if (*in == '"') {
- /* No equal or something sane but we have quote character */
- state = parse_start_dquote;
- attr_name_end = in - 1;
-
- while (attr_name_end > *savep) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end --;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end ++;
- }
- else if (g_ascii_isspace (*in)) {
- state = spaces_before_eq;
- }
- else if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (!g_ascii_isgraph (*in)) {
- state = parse_value;
- attr_name_end = in - 1;
-
- while (attr_name_end > *savep) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end --;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end ++;
- }
- else {
- return;
- }
-
- if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
- /* Ignore unknown params */
- *savep = NULL;
- }
- else if (state == parse_value) {
- *savep = in + 1;
- }
- }
-
- break;
-
- case spaces_after_name:
- if (!g_ascii_isspace (*in)) {
- *savep = in;
- if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (*in != '>') {
- state = parse_attr_name;
- }
- }
- break;
-
- case spaces_before_eq:
- if (*in == '=') {
- state = parse_equal;
- }
- else if (!g_ascii_isspace (*in)) {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- tag->flags |= FL_BROKEN;
- state = ignore_bad_tag;
- }
- break;
-
- case spaces_after_eq:
- if (*in == '"') {
- state = parse_start_dquote;
- }
- else if (*in == '\'') {
- state = parse_start_squote;
- }
- else if (!g_ascii_isspace (*in)) {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_value;
- }
- break;
-
- case parse_equal:
- if (g_ascii_isspace (*in)) {
- state = spaces_after_eq;
- }
- else if (*in == '"') {
- state = parse_start_dquote;
- }
- else if (*in == '\'') {
- state = parse_start_squote;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_value;
- }
- break;
-
- case parse_start_dquote:
- if (*in == '"') {
- if (*savep != NULL) {
- /* We have an empty attribute value */
- savep = NULL;
- }
- state = spaces_after_param;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_dqvalue;
- }
- break;
-
- case parse_start_squote:
- if (*in == '\'') {
- if (*savep != NULL) {
- /* We have an empty attribute value */
- savep = NULL;
- }
- state = spaces_after_param;
- }
- else {
- if (*savep != NULL) {
- /* We need to save this param */
- *savep = in;
- }
- state = parse_sqvalue;
- }
- break;
-
- case parse_dqvalue:
- if (*in == '"') {
- store = TRUE;
- state = parse_end_dquote;
- }
-
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_sqvalue:
- if (*in == '\'') {
- store = TRUE;
- state = parse_end_squote;
- }
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_value:
- if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- store = TRUE;
- }
- else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
- store = TRUE;
- state = spaces_after_param;
- }
-
- if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = g_queue_peek_tail (tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = rspamd_mempool_alloc (pool, comp->len);
- memcpy (s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
- comp->start = s;
- *savep = NULL;
- }
- }
- break;
-
- case parse_end_dquote:
- case parse_end_squote:
- if (g_ascii_isspace (*in)) {
- state = spaces_after_param;
- }
- else if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- }
- break;
-
- case spaces_after_param:
- if (!g_ascii_isspace (*in)) {
- if (*in == '/' && *(in + 1) == '>') {
- tag->flags |= FL_CLOSED;
- }
-
- state = parse_attr_name;
- *savep = in;
- }
- break;
-
- case ignore_bad_tag:
- break;
- }
-
- *statep = state;
- }
-
-
-
- struct rspamd_url *
- rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
- struct html_tag_component *comp)
- {
- struct rspamd_url *url;
- guint saved_flags = 0;
- gchar *decoded;
- gint rc;
- gsize decoded_len;
- const gchar *p, *s;
- gchar *d;
- guint i, dlen;
- gboolean has_bad_chars = FALSE, no_prefix = FALSE;
- static const gchar hexdigests[16] = "0123456789abcdef";
-
- p = start;
-
- /* Strip spaces from the url */
- /* Head spaces */
- while (p < start + len && g_ascii_isspace (*p)) {
- p ++;
- start ++;
- len --;
- }
-
- if (comp) {
- comp->start = p;
- comp->len = len;
- }
-
- /* Trailing spaces */
- p = start + len - 1;
-
- while (p >= start && g_ascii_isspace (*p)) {
- p --;
- len --;
-
- if (comp) {
- comp->len --;
- }
- }
-
- s = start;
- dlen = 0;
-
- for (i = 0; i < len; i ++) {
- if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
- dlen += 3;
- }
- else {
- dlen ++;
- }
- }
-
- if (memchr (s, ':', len) == NULL) {
- /* We have no prefix */
- dlen += sizeof ("http://") - 1;
- no_prefix = TRUE;
- }
-
- decoded = rspamd_mempool_alloc (pool, dlen + 1);
- d = decoded;
-
- if (no_prefix) {
- if (s[0] == '/' && (len > 2 && s[1] == '/')) {
- /* //bla case */
- memcpy (d, "http:", sizeof ("http:") - 1);
- d += sizeof ("http:") - 1;
- }
- else {
- memcpy (d, "http://", sizeof ("http://") - 1);
- d += sizeof ("http://") - 1;
- }
- }
-
- /*
- * We also need to remove all internal newlines, spaces
- * and encode unsafe characters
- */
- for (i = 0; i < len; i ++) {
- if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
- continue;
- }
- else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
- /* URL encode */
- *d++ = '%';
- *d++ = hexdigests[(s[i] >> 4) & 0xf];
- *d++ = hexdigests[s[i] & 0xf];
- has_bad_chars = TRUE;
- }
- else {
- *d++ = s[i];
- }
- }
-
- *d = '\0';
- dlen = d - decoded;
-
- url = rspamd_mempool_alloc0 (pool, sizeof (*url));
-
- enum rspamd_normalise_result norm_res;
-
- norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
-
- if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
- saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
-
- if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
- saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
-
- if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
- saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
- }
- }
-
- rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
- if (rc == URI_ERRNO_OK) {
- url->flags |= saved_flags;
-
- if (has_bad_chars) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (no_prefix) {
- url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
- }
-
- decoded = url->string;
- decoded_len = url->urllen;
-
- if (comp) {
- comp->start = decoded;
- comp->len = decoded_len;
- }
- /* Spaces in href usually mean an attempt to obfuscate URL */
- /* See https://github.com/vstakhov/rspamd/issues/593 */
- #if 0
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
- #endif
-
- return url;
- }
-
- return NULL;
- }
-
- static struct rspamd_url *
- rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- GList *cur;
- struct rspamd_url *url;
- const gchar *start;
- gsize len;
-
- cur = tag->params->head;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- start = comp->start;
- len = comp->len;
-
- /* Check base url */
- if (hc && hc->base_url && comp->len > 2) {
- /*
- * Relative url canot start from the following:
- * schema://
- * slash
- */
- gchar *buf;
- gsize orig_len;
-
- if (rspamd_substring_search (start, len, "://", 3) == -1) {
- /* Assume relative url */
-
- gboolean need_slash = FALSE;
-
- orig_len = len;
- len += hc->base_url->urllen;
-
- if (hc->base_url->string[hc->base_url->urllen - 1] != '/') {
- need_slash = TRUE;
- len ++;
- }
-
- buf = rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf (buf, len + 1, "%*s%s%*s",
- hc->base_url->urllen, hc->base_url->string,
- need_slash ? "/" : "",
- (gint)orig_len, start);
- start = buf;
- }
- else if (start[0] == '/' && start[1] != '/') {
- /* Relative to the hostname */
- orig_len = len;
- len += hc->base_url->hostlen + hc->base_url->protocollen +
- 3 /* for :// */;
- buf = rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
- hc->base_url->protocollen, hc->base_url->string,
- hc->base_url->hostlen, hc->base_url->host,
- (gint)orig_len, start);
- start = buf;
- }
- }
-
- url = rspamd_html_process_url (pool, start, len, comp);
-
- if (url && tag->extra == NULL) {
- tag->extra = url;
- }
-
- return url;
- }
-
- cur = g_list_next (cur);
- }
-
- return NULL;
- }
-
- static void
- rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
- GHashTable *tbl_urls, GHashTable *tbl_emails)
- {
- GHashTable *target_tbl;
- struct rspamd_url *query_url, *existing;
- gchar *url_str;
- gint rc;
- gboolean prefix_added;
-
- if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (url->querylen > 0) {
-
- if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
- NULL, &prefix_added)) {
- query_url = rspamd_mempool_alloc0 (pool,
- sizeof (struct rspamd_url));
-
- rc = rspamd_url_parse (query_url,
- url_str,
- strlen (url_str),
- pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK &&
- query_url->hostlen > 0) {
- msg_debug_html ("found url %s in query of url"
- " %*s", url_str, url->querylen, url->query);
-
- if (query_url->protocol == PROTOCOL_MAILTO) {
- target_tbl = tbl_emails;
- }
- else {
- target_tbl = tbl_urls;
- }
-
- if (prefix_added) {
- query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
- }
-
- if (query_url->flags
- & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED|
- RSPAMD_URL_FLAG_NUMERIC)) {
- /* Set obscured flag if query url is bad */
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- /* And vice-versa */
- if (url->flags & RSPAMD_URL_FLAG_OBSCURED) {
- query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if ((existing = g_hash_table_lookup (target_tbl,
- query_url)) == NULL) {
- g_hash_table_insert (target_tbl,
- query_url,
- query_url);
- }
- else {
- existing->count ++;
- }
- }
- }
- }
- }
-
- static void
- rspamd_html_process_data_image (rspamd_mempool_t *pool,
- struct html_image *img,
- struct html_tag_component *src)
- {
- /*
- * Here, we do very basic processing of the data:
- * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
- * We only parse base64 encoded data.
- * We ignore content type so far
- */
- struct rspamd_image *parsed_image;
- const gchar *semicolon_pos = NULL, *end = src->start + src->len;
-
- semicolon_pos = src->start;
-
- while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
- if (end - semicolon_pos > sizeof ("base64,")) {
- if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
- const gchar *data_pos = semicolon_pos + sizeof ("base64,");
- gchar *decoded;
- gsize encoded_len = end - data_pos, decoded_len;
- rspamd_ftok_t inp;
-
- decoded_len = (encoded_len / 4 * 3) + 12;
- decoded = rspamd_mempool_alloc (pool, decoded_len);
- rspamd_cryptobox_base64_decode (data_pos, encoded_len,
- decoded, &decoded_len);
- inp.begin = decoded;
- inp.len = decoded_len;
-
- parsed_image = rspamd_maybe_process_image (pool, &inp);
-
- if (parsed_image) {
- msg_debug_html ("detected %s image of size %ud x %ud in data url",
- rspamd_image_type_str (parsed_image->type),
- parsed_image->width, parsed_image->height);
- img->embedded_image = parsed_image;
- }
- }
-
- break;
- }
- else {
- /* Nothing useful */
- return;
- }
-
- semicolon_pos ++;
- }
- }
-
- static void
- rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- struct html_image *img;
- rspamd_ftok_t fstr;
- const guchar *p;
- GList *cur;
- gulong val;
- gboolean seen_width = FALSE, seen_height = FALSE;
- goffset pos;
-
- cur = tag->params->head;
- img = rspamd_mempool_alloc0 (pool, sizeof (*img));
- img->tag = tag;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- fstr.begin = (gchar *)comp->start;
- fstr.len = comp->len;
- img->src = rspamd_mempool_ftokdup (pool, &fstr);
-
- if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
- "cid:", sizeof ("cid:") - 1) == 0) {
- /* We have an embedded image */
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
- }
- if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
- "data:", sizeof ("data:") - 1) == 0) {
- /* We have an embedded image in HTML tag */
- img->flags |=
- (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED|RSPAMD_HTML_FLAG_IMAGE_DATA);
- rspamd_html_process_data_image (pool, img, comp);
- hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
- }
- else {
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
- if (img->src) {
- img->url = rspamd_html_process_url (pool,
- img->src, fstr.len, NULL);
- }
- }
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
- rspamd_strtoul (comp->start, comp->len, &val);
- img->height = val;
- seen_height = TRUE;
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
- rspamd_strtoul (comp->start, comp->len, &val);
- img->width = val;
- seen_width = TRUE;
- }
- else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
- /* Try to search for height= or width= in style tag */
- if (!seen_height && comp->len > 0) {
- pos = rspamd_substring_search_caseless (comp->start, comp->len,
- "height", sizeof ("height") - 1);
-
- if (pos != -1) {
- p = comp->start + pos + sizeof ("height") - 1;
-
- while (p < comp->start + comp->len) {
- if (g_ascii_isdigit (*p)) {
- rspamd_strtoul (p, comp->len - (p - comp->start), &val);
- img->height = val;
- break;
- }
- else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
- /* Fallback */
- break;
- }
- p ++;
- }
- }
- }
-
- if (!seen_width && comp->len > 0) {
- pos = rspamd_substring_search_caseless (comp->start, comp->len,
- "width", sizeof ("width") - 1);
-
- if (pos != -1) {
- p = comp->start + pos + sizeof ("width") - 1;
-
- while (p < comp->start + comp->len) {
- if (g_ascii_isdigit (*p)) {
- rspamd_strtoul (p, comp->len - (p - comp->start), &val);
- img->width = val;
- break;
- }
- else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
- /* Fallback */
- break;
- }
- p ++;
- }
- }
- }
- }
-
- cur = g_list_next (cur);
- }
-
- if (hc->images == NULL) {
- hc->images = g_ptr_array_sized_new (4);
- rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
- hc->images);
- }
-
- if (img->embedded_image) {
- if (!seen_height) {
- img->height = img->embedded_image->height;
- }
- if (!seen_width) {
- img->width = img->embedded_image->width;
- }
- }
-
- g_ptr_array_add (hc->images, img);
- tag->extra = img;
- }
-
- static void
- rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
- {
- const gchar *p = line, *end = line + len;
- char hexbuf[7];
- rspamd_ftok_t search;
- struct html_color *el;
-
- memset (cl, 0, sizeof (*cl));
-
- if (*p == '#') {
- /* HEX color */
- p ++;
- rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
- cl->d.val = strtoul (hexbuf, NULL, 16);
- cl->d.comp.alpha = 255;
- cl->valid = TRUE;
- }
- else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
- /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
- enum {
- obrace,
- num1,
- num2,
- num3,
- num4,
- skip_spaces
- } state = skip_spaces, next_state = obrace;
- gulong r = 0, g = 0, b = 0, opacity = 255;
- const gchar *c;
- gboolean valid = FALSE;
-
- p += 3;
-
- if (*p == 'a') {
- p ++;
- }
-
- c = p;
-
- while (p < end) {
- switch (state) {
- case obrace:
- if (*p == '(') {
- p ++;
- state = skip_spaces;
- next_state = num1;
- }
- else if (g_ascii_isspace (*p)) {
- state = skip_spaces;
- next_state = obrace;
- }
- else {
- goto stop;
- }
- break;
- case num1:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &r)) {
- goto stop;
- }
-
- p ++;
- state = skip_spaces;
- next_state = num2;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num2:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &g)) {
- goto stop;
- }
-
- p ++;
- state = skip_spaces;
- next_state = num3;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num3:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &b)) {
- goto stop;
- }
-
- valid = TRUE;
- p ++;
- state = skip_spaces;
- next_state = num4;
- }
- else if (*p == ')') {
- if (!rspamd_strtoul (c, p - c, &b)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case num4:
- if (*p == ',') {
- if (!rspamd_strtoul (c, p - c, &opacity)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (*p == ')') {
- if (!rspamd_strtoul (c, p - c, &opacity)) {
- goto stop;
- }
-
- valid = TRUE;
- goto stop;
- }
- else if (!g_ascii_isdigit (*p)) {
- goto stop;
- }
- else {
- p ++;
- }
- break;
- case skip_spaces:
- if (!g_ascii_isspace (*p)) {
- c = p;
- state = next_state;
- }
- else {
- p ++;
- }
- break;
- }
- }
-
- stop:
-
- if (valid) {
- cl->d.comp.r = r;
- cl->d.comp.g = g;
- cl->d.comp.b = b;
- cl->d.comp.alpha = opacity;
- cl->valid = TRUE;
- }
- }
- else {
- khiter_t k;
- /* Compare color by name */
- search.begin = line;
- search.len = len;
-
- k = kh_get (color_by_name, html_color_by_name, &search);
-
- if (k != kh_end (html_color_by_name)) {
- el = &kh_val (html_color_by_name, k);
- memcpy (cl, el, sizeof (*cl));
- cl->d.comp.alpha = 255; /* Non transparent */
- }
- }
- }
-
- /*
- * Target is used for in and out if this function returns TRUE
- */
- static gboolean
- rspamd_html_process_css_size (const gchar *suffix, gsize len,
- gdouble *tgt)
- {
- gdouble sz = *tgt;
- gboolean ret = FALSE;
-
- if (len >= 2) {
- if (memcmp (suffix, "px", 2) == 0) {
- sz = (guint) sz; /* Round to number */
- ret = TRUE;
- }
- else if (memcmp (suffix, "em", 2) == 0) {
- /* EM is 16 px, so multiply and round */
- sz = (guint) (sz * 16.0);
- ret = TRUE;
- }
- else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
- /* equal to EM in our case */
- sz = (guint) (sz * 16.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "ex", 2) == 0) {
- /*
- * Represents the x-height of the element's font.
- * On fonts with the "x" letter, this is generally the height
- * of lowercase letters in the font; 1ex = 0.5em in many fonts.
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "vw", 2) == 0) {
- /*
- * Vewport width in percentages:
- * we assume 1% of viewport width as 8px
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "vh", 2) == 0) {
- /*
- * Vewport height in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 6.0);
- ret = TRUE;
- }
- else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
- /*
- * Vewport width in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 8.0);
- ret = TRUE;
- }
- else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
- /*
- * Vewport height in percentages
- * we assume 1% of viewport width as 6px
- */
- sz = (guint) (sz * 6.0);
- ret = TRUE;
- }
- else if (memcmp (suffix, "pt", 2) == 0) {
- sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
- ret = TRUE;
- }
- else if (memcmp (suffix, "cm", 2) == 0) {
- sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
- ret = TRUE;
- }
- else if (memcmp (suffix, "mm", 2) == 0) {
- sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
- ret = TRUE;
- }
- else if (memcmp (suffix, "in", 2) == 0) {
- sz = (guint) (sz * 96.0); /* 96px */
- ret = TRUE;
- }
- else if (memcmp (suffix, "pc", 2) == 0) {
- sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
- ret = TRUE;
- }
- }
- else if (suffix[0] == '%') {
- /* Percentages from 16 px */
- sz = (guint)(sz / 100.0 * 16.0);
- ret = TRUE;
- }
-
- if (ret) {
- *tgt = sz;
- }
-
- return ret;
- }
-
- static void
- rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
- gboolean is_css)
- {
- const gchar *p = line, *end = line + len;
- gchar *err = NULL, numbuf[64];
- gdouble sz = 0;
- gboolean failsafe = FALSE;
-
- while (p < end && g_ascii_isspace (*p)) {
- p ++;
- len --;
- }
-
- if (g_ascii_isdigit (*p)) {
- rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
- sz = strtod (numbuf, &err);
-
- /* Now check leftover */
- if (sz < 0) {
- sz = 0;
- }
- }
- else {
- /* Ignore the rest */
- failsafe = TRUE;
- sz = is_css ? 16 : 1;
- /* TODO: add textual fonts descriptions */
- }
-
- if (err && *err != '\0') {
- const gchar *e = err;
- gsize slen;
-
- /* Skip spaces */
- while (*e && g_ascii_isspace (*e)) {
- e ++;
- }
-
- /* Lowercase */
- slen = strlen (e);
- rspamd_str_lc ((gchar *)e, slen);
-
- if (!rspamd_html_process_css_size (e, slen, &sz)) {
- failsafe = TRUE;
- }
- }
- else {
- /* Failsafe naked number */
- failsafe = TRUE;
- }
-
- if (failsafe) {
- if (is_css) {
- /*
- * In css mode we usually ignore sizes, but let's treat
- * small sizes specially
- */
- if (sz < 1) {
- sz = 0;
- } else {
- sz = 16; /* Ignore */
- }
- } else {
- /* In non-css mode we have to check legacy size */
- sz = sz >= 1 ? sz * 16 : 16;
- }
- }
-
- if (sz > 32) {
- sz = 32;
- }
-
- *fs = sz;
- }
-
- static void
- rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
- struct html_content *hc, const gchar *style, guint len)
- {
- const gchar *p, *c, *end, *key = NULL;
- enum {
- read_key,
- read_colon,
- read_value,
- skip_spaces,
- } state = skip_spaces, next_state = read_key;
- guint klen = 0;
- gdouble opacity = 1.0;
-
- p = style;
- c = p;
- end = p + len;
-
- while (p <= end) {
- switch(state) {
- case read_key:
- if (p == end || *p == ':') {
- key = c;
- klen = p - c;
- state = skip_spaces;
- next_state = read_value;
- }
- else if (g_ascii_isspace (*p)) {
- key = c;
- klen = p - c;
- state = skip_spaces;
- next_state = read_colon;
- }
-
- p ++;
- break;
-
- case read_colon:
- if (p == end || *p == ':') {
- state = skip_spaces;
- next_state = read_value;
- }
-
- p ++;
- break;
-
- case read_value:
- if (p == end || *p == ';') {
- if (key && klen && p - c > 0) {
- if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
- || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
-
- rspamd_html_process_color (c, p - c, &bl->font_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
- }
- else if ((klen == 16 && g_ascii_strncasecmp (key,
- "background-color", 16) == 0) ||
- (klen == 10 && g_ascii_strncasecmp (key,
- "background", 10) == 0)) {
-
- rspamd_html_process_color (c, p - c, &bl->background_color);
- msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
- }
- else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
- if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
- "none", 4) != -1) {
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
- }
- else if (klen == 9 &&
- g_ascii_strncasecmp (key, "font-size", 9) == 0) {
- rspamd_html_process_font_size (c, p - c,
- &bl->font_size, TRUE);
- msg_debug_html ("got font size: %ud", bl->font_size);
- }
- else if (klen == 7 &&
- g_ascii_strncasecmp (key, "opacity", 7) == 0) {
- gchar numbuf[64];
-
- rspamd_strlcpy (numbuf, c,
- MIN (sizeof (numbuf), p - c + 1));
- opacity = strtod (numbuf, NULL);
-
- if (opacity > 1) {
- opacity = 1;
- }
- else if (opacity < 0) {
- opacity = 0;
- }
-
- bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
- }
- else if (klen == 10 &&
- g_ascii_strncasecmp (key, "visibility", 10) == 0) {
- if (p - c >= 6 && rspamd_substring_search_caseless (c,
- p - c,
- "hidden", 6) != -1) {
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
- }
- }
-
- key = NULL;
- klen = 0;
- state = skip_spaces;
- next_state = read_key;
- }
-
- p ++;
- break;
-
- case skip_spaces:
- if (p < end && !g_ascii_isspace (*p)) {
- c = p;
- state = next_state;
- }
- else {
- p ++;
- }
-
- break;
- }
- }
- }
-
- static void
- rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc)
- {
- struct html_tag_component *comp;
- struct html_block *bl;
- rspamd_ftok_t fstr;
- GList *cur;
-
- cur = tag->params->head;
- bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
- bl->tag = tag;
- bl->visible = TRUE;
- bl->font_size = (guint)-1;
- bl->font_color.d.comp.alpha = 255;
-
- while (cur) {
- comp = cur->data;
-
- if (comp->len > 0) {
- switch (comp->type) {
- case RSPAMD_HTML_COMPONENT_COLOR:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- rspamd_html_process_color (comp->start, comp->len,
- &bl->font_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
- break;
- case RSPAMD_HTML_COMPONENT_BGCOLOR:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- rspamd_html_process_color (comp->start, comp->len,
- &bl->background_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
-
- if (tag->id == Tag_BODY) {
- /* Set global background color */
- memcpy (&hc->bgcolor, &bl->background_color,
- sizeof (hc->bgcolor));
- }
- break;
- case RSPAMD_HTML_COMPONENT_STYLE:
- bl->style.len = comp->len;
- bl->style.start = comp->start;
- msg_debug_html ("got style: %*s", (gint) bl->style.len,
- bl->style.start);
- rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
- break;
- case RSPAMD_HTML_COMPONENT_CLASS:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- bl->class = rspamd_mempool_ftokdup (pool, &fstr);
- msg_debug_html ("got class: %s", bl->class);
- break;
- case RSPAMD_HTML_COMPONENT_SIZE:
- fstr.begin = (gchar *) comp->start;
- fstr.len = comp->len;
- rspamd_html_process_color (comp->start, comp->len,
- &bl->font_color);
- msg_debug_html ("got color: %xd", bl->font_color.d.val);
- break;
- default:
- /* NYI */
- break;
- }
- }
-
- cur = g_list_next (cur);
- }
-
- if (hc->blocks == NULL) {
- hc->blocks = g_ptr_array_sized_new (64);
- rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
- hc->blocks);
- }
-
- g_ptr_array_add (hc->blocks, bl);
- tag->extra = bl;
- }
-
- static void
- rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
- GList **exceptions, GHashTable *urls, GHashTable *emails,
- GByteArray *dest, GHashTable *target_tbl,
- gint href_offset,
- struct rspamd_url *url)
- {
- struct rspamd_url *displayed_url = NULL;
- struct rspamd_url *turl;
- gboolean url_found = FALSE;
- struct rspamd_process_exception *ex;
-
- if (href_offset <= 0) {
- /* No dispalyed url, just some text within <a> tag */
- return;
- }
-
- rspamd_html_url_is_phished (pool, url,
- dest->data + href_offset,
- dest->len - href_offset,
- &url_found, &displayed_url);
-
- if (exceptions && url_found) {
- ex = rspamd_mempool_alloc (pool,
- sizeof (*ex));
- ex->pos = href_offset;
- ex->len = dest->len - href_offset;
- ex->type = RSPAMD_EXCEPTION_URL;
- ex->ptr = url;
-
- *exceptions = g_list_prepend (*exceptions,
- ex);
- }
-
- if (displayed_url) {
- if (displayed_url->protocol ==
- PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
-
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl,
- displayed_url);
-
- if (turl != NULL) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags &
- RSPAMD_URL_FLAG_FROM_TEXT) {
- turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count ++;
- }
- else {
- g_hash_table_insert (target_tbl,
- displayed_url,
- displayed_url);
- }
- }
- }
- }
-
- static gboolean
- rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
- {
- GNode *child;
- struct html_tag *tag = node->data, *cld_tag;
-
- if (tag) {
- child = node->children;
-
- /* Summarize content length from children */
- while (child) {
- cld_tag = child->data;
- tag->content_length += cld_tag->content_length;
- child = child->next;
- }
- }
-
- return FALSE;
- }
-
- static void
- rspamd_html_propagate_style (struct html_content *hc,
- struct html_tag *tag,
- struct html_block *bl,
- GQueue *blocks)
- {
- struct html_block *bl_parent;
- gboolean push_block = FALSE;
-
-
- /* Propagate from the parent if needed */
- bl_parent = g_queue_peek_tail (blocks);
-
- if (bl_parent) {
- if (!bl->background_color.valid) {
- /* Try to propagate background color from parent nodes */
- if (bl_parent->background_color.valid) {
- memcpy (&bl->background_color, &bl_parent->background_color,
- sizeof (bl->background_color));
- }
- }
- else {
- push_block = TRUE;
- }
-
- if (!bl->font_color.valid) {
- /* Try to propagate background color from parent nodes */
- if (bl_parent->font_color.valid) {
- memcpy (&bl->font_color, &bl_parent->font_color,
- sizeof (bl->font_color));
- }
- }
- else {
- push_block = TRUE;
- }
-
- /* Propagate font size */
- if (bl->font_size == (guint)-1) {
- if (bl_parent->font_size != (guint)-1) {
- bl->font_size = bl_parent->font_size;
- }
- }
- else {
- push_block = TRUE;
- }
- }
-
- /* Set bgcolor to the html bgcolor and font color to black as a last resort */
- if (!bl->font_color.valid) {
- /* Don't touch opacity as it can be set separately */
- bl->font_color.d.comp.r = 0;
- bl->font_color.d.comp.g = 0;
- bl->font_color.d.comp.b = 0;
- bl->font_color.valid = TRUE;
- }
- else {
- push_block = TRUE;
- }
-
- if (!bl->background_color.valid) {
- memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
- }
- else {
- push_block = TRUE;
- }
-
- if (bl->font_size == (guint)-1) {
- bl->font_size = 16; /* Default for browsers */
- }
- else {
- push_block = TRUE;
- }
-
- if (push_block && !(tag->flags & FL_CLOSED)) {
- g_queue_push_tail (blocks, bl);
- }
- }
-
- GByteArray*
- rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
- GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
- {
- const guchar *p, *c, *end, *savep = NULL;
- guchar t;
- gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
- balanced;
- GByteArray *dest;
- GHashTable *target_tbl;
- guint obrace = 0, ebrace = 0;
- GNode *cur_level = NULL;
- gint substate = 0, len, href_offset = -1;
- struct html_tag *cur_tag = NULL, *content_tag = NULL;
- struct rspamd_url *url = NULL, *turl;
- GQueue *styles_blocks;
-
- enum {
- parse_start = 0,
- tag_begin,
- sgml_tag,
- xml_tag,
- compound_tag,
- comment_tag,
- comment_content,
- sgml_content,
- tag_content,
- tag_end,
- xml_tag_end,
- content_ignore,
- content_write,
- content_ignore_sp
- } state = parse_start;
-
- g_assert (in != NULL);
- g_assert (hc != NULL);
- g_assert (pool != NULL);
-
- rspamd_html_library_init ();
- hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (G_N_ELEMENTS (tag_defs)));
-
- /* Set white background color by default */
- hc->bgcolor.d.comp.alpha = 0;
- hc->bgcolor.d.comp.r = 255;
- hc->bgcolor.d.comp.g = 255;
- hc->bgcolor.d.comp.b = 255;
- hc->bgcolor.valid = TRUE;
-
- dest = g_byte_array_sized_new (in->len / 3 * 2);
- styles_blocks = g_queue_new ();
-
- p = in->data;
- c = p;
- end = p + in->len;
-
- while (p < end) {
- t = *p;
-
- switch (state) {
- case parse_start:
- if (t == '<') {
- state = tag_begin;
- }
- else {
- /* We have no starting tag, so assume that it's content */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
- state = content_write;
- }
-
- break;
- case tag_begin:
- switch (t) {
- case '<':
- p ++;
- closing = FALSE;
- break;
- case '!':
- state = sgml_tag;
- p ++;
- break;
- case '?':
- state = xml_tag;
- hc->flags |= RSPAMD_HTML_FLAG_XML;
- p ++;
- break;
- case '/':
- closing = TRUE;
- p ++;
- break;
- case '>':
- /* Empty tag */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- continue;
- default:
- state = tag_content;
- substate = 0;
- savep = NULL;
- cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
- cur_tag->params = g_queue_new ();
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
- break;
- }
-
- break;
-
- case sgml_tag:
- switch (t) {
- case '[':
- state = compound_tag;
- obrace = 1;
- ebrace = 0;
- p ++;
- break;
- case '-':
- state = comment_tag;
- p ++;
- break;
- default:
- state = sgml_content;
- break;
- }
-
- break;
-
- case xml_tag:
- if (t == '?') {
- state = xml_tag_end;
- }
- else if (t == '>') {
- /* Misformed xml tag */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- state = tag_end;
- continue;
- }
- /* We efficiently ignore xml tags */
- p ++;
- break;
-
- case xml_tag_end:
- if (t == '>') {
- state = tag_end;
- continue;
- }
- else {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- p ++;
- }
- break;
-
- case compound_tag:
- if (t == '[') {
- obrace ++;
- }
- else if (t == ']') {
- ebrace ++;
- }
- else if (t == '>' && obrace == ebrace) {
- state = tag_end;
- continue;
- }
- p ++;
- break;
-
- case comment_tag:
- if (t != '-') {
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- }
- p ++;
- ebrace = 0;
- state = comment_content;
- break;
-
- case comment_content:
- if (t == '-') {
- ebrace ++;
- }
- else if (t == '>' && ebrace >= 2) {
- state = tag_end;
- continue;
- }
- else {
- ebrace = 0;
- }
-
- p ++;
- break;
-
- case content_ignore:
- if (t != '<') {
- p ++;
- }
- else {
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c;
- }
- state = tag_begin;
- }
- break;
-
- case content_write:
-
- if (t != '<') {
- if (t == '&') {
- need_decode = TRUE;
- }
- else if (g_ascii_isspace (t)) {
- save_space = TRUE;
-
- if (p > c) {
- if (need_decode) {
- goffset old_offset = dest->len;
-
- g_byte_array_append (dest, c, (p - c));
-
- len = rspamd_html_decode_entitles_inplace (
- dest->data + old_offset,
- p - c);
- dest->len = dest->len + len - (p - c);
- }
- else {
- len = p - c;
- g_byte_array_append (dest, c, len);
- }
-
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c + 1;
- }
- }
-
- c = p;
- state = content_ignore_sp;
- }
- else {
- if (save_space) {
- /* Append one space if needed */
- if (dest->len > 0 &&
- !g_ascii_isspace (dest->data[dest->len - 1])) {
- g_byte_array_append (dest, " ", 1);
- }
- save_space = FALSE;
- }
- }
- }
- else {
- if (c != p) {
-
- if (need_decode) {
- goffset old_offset = dest->len;
-
- g_byte_array_append (dest, c, (p - c));
- len = rspamd_html_decode_entitles_inplace (
- dest->data + old_offset,
- p - c);
- dest->len = dest->len + len - (p - c);
- }
- else {
- len = p - c;
- g_byte_array_append (dest, c, len);
- }
-
-
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c;
- }
- }
-
- content_tag = NULL;
-
- state = tag_begin;
- continue;
- }
-
- p ++;
- break;
-
- case content_ignore_sp:
- if (!g_ascii_isspace (t)) {
- c = p;
- state = content_write;
- continue;
- }
-
- if (content_tag) {
- content_tag->content_length ++;
- }
-
- p ++;
- break;
-
- case sgml_content:
- /* TODO: parse DOCTYPE here */
- if (t == '>') {
- state = tag_end;
- /* We don't know a lot about sgml tags, ignore them */
- cur_tag = NULL;
- continue;
- }
- p ++;
- break;
-
- case tag_content:
- rspamd_html_parse_tag_content (pool, hc, cur_tag,
- p, &substate, &savep);
- if (t == '>') {
- if (closing) {
- cur_tag->flags |= FL_CLOSING;
-
- if (cur_tag->flags & FL_CLOSED) {
- /* Bad mix of closed and closing */
- hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- }
-
- closing = FALSE;
- }
-
- state = tag_end;
- continue;
- }
- p ++;
- break;
-
- case tag_end:
- substate = 0;
- savep = NULL;
-
- if (cur_tag != NULL) {
- balanced = TRUE;
-
- if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
- &balanced)) {
- state = content_write;
- need_decode = FALSE;
- }
- else {
- state = content_ignore;
- }
-
- if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
- if (cur_tag->flags & CM_UNIQUE) {
- if (isset (hc->tags_seen, cur_tag->id)) {
- /* Duplicate tag has been found */
- hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
- }
- }
- setbit (hc->tags_seen, cur_tag->id);
- }
-
- if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
- content_tag = cur_tag;
- }
-
- /* Handle newlines */
- if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest, "\r\n", 2);
- }
- save_space = FALSE;
- }
-
- if ((cur_tag->id == Tag_P ||
- cur_tag->id == Tag_TR ||
- cur_tag->id == Tag_DIV)) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest, "\r\n", 2);
- }
- save_space = FALSE;
- }
-
- if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
- if (!(cur_tag->flags & (FL_CLOSING))) {
- url = rspamd_html_process_url_tag (pool, cur_tag, hc);
-
- if (url != NULL) {
-
- if (url->protocol == PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
-
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl, url);
-
- if (turl == NULL) {
- g_hash_table_insert (target_tbl, url, url);
- }
- else {
- turl->count ++;
- url = NULL;
- }
-
- if (turl == NULL && url != NULL) {
- rspamd_process_html_url (pool,
- url,
- urls, emails);
- }
- }
-
- href_offset = dest->len;
- }
- }
-
- if (cur_tag->id == Tag_A) {
- if (!balanced && cur_level && cur_level->prev) {
- struct html_tag *prev_tag;
- struct rspamd_url *prev_url;
-
- prev_tag = cur_level->prev->data;
-
- if (prev_tag->id == Tag_A &&
- !(prev_tag->flags & (FL_CLOSING)) &&
- prev_tag->extra) {
- prev_url = prev_tag->extra;
-
- rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
- prev_url);
- }
- }
-
- if (cur_tag->flags & (FL_CLOSING)) {
-
- /* Insert exception */
- if (url != NULL && (gint) dest->len > href_offset) {
- rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
- url);
-
- }
-
- href_offset = -1;
- url = NULL;
- }
- }
- }
- else if (cur_tag->id == Tag_LINK) {
- url = rspamd_html_process_url_tag (pool, cur_tag, hc);
- }
- else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
- struct html_tag *prev_tag = NULL;
-
- if (cur_level && cur_level->parent) {
- prev_tag = cur_level->parent->data;
- }
-
- /*
- * Base is allowed only within head tag but we slightly
- * relax that
- */
- if (!prev_tag || prev_tag->id == Tag_HEAD ||
- prev_tag->id == Tag_HTML) {
- url = rspamd_html_process_url_tag (pool, cur_tag, hc);
-
- if (url != NULL && hc->base_url == NULL) {
- /* We have a base tag available */
- hc->base_url = url;
- }
- }
- }
-
- if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
- rspamd_html_process_img_tag (pool, cur_tag, hc);
- }
- else if (cur_tag->flags & FL_BLOCK) {
- struct html_block *bl;
-
- if (cur_tag->flags & FL_CLOSING) {
- /* Just remove block element from the queue if any */
- if (styles_blocks->length > 0) {
- g_queue_pop_tail (styles_blocks);
- }
- }
- else {
- rspamd_html_process_block_tag (pool, cur_tag, hc);
- bl = cur_tag->extra;
-
- if (bl) {
- rspamd_html_propagate_style (hc, cur_tag,
- cur_tag->extra, styles_blocks);
-
- /* Check visibility */
- if (bl->font_size < 3 ||
- bl->font_color.d.comp.alpha < 10) {
-
- bl->visible = FALSE;
- msg_debug_html ("tag is not visible");
- }
-
- if (!bl->visible) {
- state = content_ignore;
- }
- }
- }
- }
- }
- else {
- state = content_write;
- }
-
-
- p++;
- c = p;
- cur_tag = NULL;
- break;
- }
- }
-
- if (hc->html_tags) {
- g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
- rspamd_html_propagate_lengths, NULL);
- }
-
- g_queue_free (styles_blocks);
-
- return dest;
- }
-
- GByteArray*
- rspamd_html_process_part (rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in)
- {
- return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
- }
|