您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "html.h"
  21. #include "html_tags.h"
  22. #include "html_colors.h"
  23. #include "html_entities.h"
  24. #include "url.h"
  25. #include "contrib/libucl/khash.h"
  26. #include "libmime/images.h"
  27. #include <unicode/uversion.h>
  28. #include <unicode/ucnv.h>
  29. #if U_ICU_VERSION_MAJOR_NUM >= 46
  30. #include <unicode/uidna.h>
  31. #endif
  32. static sig_atomic_t tags_sorted = 0;
  33. static sig_atomic_t entities_sorted = 0;
  34. static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
  35. struct html_tag_def {
  36. const gchar *name;
  37. gint16 id;
  38. guint16 len;
  39. guint flags;
  40. };
  41. #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
  42. rspamd_html_log_id, "html", pool->tag.uid, \
  43. G_STRFUNC, \
  44. __VA_ARGS__)
  45. INIT_LOG_MODULE(html)
  46. #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
  47. static struct html_tag_def tag_defs[] = {
  48. /* W3C defined elements */
  49. TAG_DEF(Tag_A, "a", 0),
  50. TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
  51. TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
  52. TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
  53. TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  54. TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)),
  55. TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
  56. TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
  57. TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
  58. TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
  59. TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
  60. TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
  61. TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
  62. TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
  63. TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
  64. TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
  65. TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
  66. TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
  67. TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
  68. TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
  69. TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
  70. TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  71. TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  72. TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
  73. TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
  74. TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
  75. TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
  76. TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  77. TAG_DEF(Tag_EM, "em", (CM_INLINE)),
  78. TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
  79. TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
  80. TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
  81. TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)),
  82. TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
  83. TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
  84. TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
  85. TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
  86. TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
  87. TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
  88. TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
  89. TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  90. TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
  91. TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  92. TAG_DEF(Tag_I, "i", (CM_INLINE)),
  93. TAG_DEF(Tag_IFRAME, "iframe", (0)),
  94. TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
  95. TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
  96. TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  97. TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
  98. TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
  99. TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
  100. TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
  101. TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  102. TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY)),
  103. TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
  104. TAG_DEF(Tag_MAP, "map", (CM_INLINE)),
  105. TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
  106. TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
  107. TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
  108. TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  109. TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
  110. TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
  111. TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
  112. TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
  113. TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
  114. TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
  115. TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
  116. TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
  117. TAG_DEF(Tag_Q, "q", (CM_INLINE)),
  118. TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
  119. TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
  120. TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
  121. TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
  122. TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
  123. TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
  124. TAG_DEF(Tag_S, "s", (CM_INLINE)),
  125. TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
  126. TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
  127. TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
  128. TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
  129. TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
  130. TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
  131. TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
  132. TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
  133. TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
  134. TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
  135. TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
  136. TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
  137. TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  138. TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
  139. TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  140. TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  141. TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  142. TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
  143. TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
  144. TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
  145. TAG_DEF(Tag_U, "u", (CM_INLINE)),
  146. TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
  147. TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
  148. TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
  149. TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
  150. /* proprietary elements */
  151. TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
  152. TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
  153. TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
  154. TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
  155. TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
  156. TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
  157. TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
  158. TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
  159. TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
  160. TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
  161. TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
  162. TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
  163. TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  164. TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
  165. TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
  166. TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  167. TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
  168. TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
  169. };
  170. KHASH_MAP_INIT_INT (entity_by_number, const char *);
  171. KHASH_MAP_INIT_STR (entity_by_name, const char *);
  172. KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
  173. KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
  174. KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
  175. rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
  176. khash_t(entity_by_number) *html_entity_by_number;
  177. khash_t(entity_by_name) *html_entity_by_name;
  178. khash_t(tag_by_name) *html_tag_by_name;
  179. khash_t(tag_by_id) *html_tag_by_id;
  180. khash_t(color_by_name) *html_color_by_name;
  181. static void
  182. rspamd_html_library_init (void)
  183. {
  184. guint i;
  185. khiter_t k;
  186. gint rc;
  187. if (!tags_sorted) {
  188. html_tag_by_id = kh_init (tag_by_id);
  189. html_tag_by_name = kh_init (tag_by_name);
  190. kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
  191. kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
  192. for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
  193. k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
  194. kh_val (html_tag_by_id, k) = tag_defs[i];
  195. k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
  196. kh_val (html_tag_by_name, k) = tag_defs[i];
  197. }
  198. tags_sorted = 1;
  199. }
  200. if (!entities_sorted) {
  201. html_entity_by_number = kh_init (entity_by_number);
  202. html_entity_by_name = kh_init (entity_by_name);
  203. kh_resize (entity_by_number, html_entity_by_number,
  204. G_N_ELEMENTS (entities_defs));
  205. kh_resize (entity_by_name, html_entity_by_name,
  206. G_N_ELEMENTS (entities_defs));
  207. for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
  208. k = kh_put (entity_by_number, html_entity_by_number,
  209. entities_defs[i].code, &rc);
  210. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  211. k = kh_put (entity_by_name, html_entity_by_name,
  212. entities_defs[i].name, &rc);
  213. kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
  214. }
  215. html_color_by_name = kh_init (color_by_name);
  216. kh_resize (color_by_name, html_color_by_name,
  217. G_N_ELEMENTS (html_colornames));
  218. rspamd_ftok_t *keys;
  219. keys = g_malloc0 (sizeof (rspamd_ftok_t) *
  220. G_N_ELEMENTS (html_colornames));
  221. for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
  222. struct html_color c;
  223. keys[i].begin = html_colornames[i].name;
  224. keys[i].len = strlen (html_colornames[i].name);
  225. k = kh_put (color_by_name, html_color_by_name,
  226. &keys[i], &rc);
  227. c.valid = true;
  228. c.d.comp.r = html_colornames[i].rgb.r;
  229. c.d.comp.g = html_colornames[i].rgb.g;
  230. c.d.comp.b = html_colornames[i].rgb.b;
  231. c.d.comp.alpha = 255;
  232. kh_val (html_color_by_name, k) = c;
  233. }
  234. entities_sorted = 1;
  235. }
  236. }
  237. static gboolean
  238. rspamd_html_check_balance (GNode * node, GNode ** cur_level)
  239. {
  240. struct html_tag *arg = node->data, *tmp;
  241. GNode *cur;
  242. if (arg->flags & FL_CLOSING) {
  243. /* First of all check whether this tag is closing tag for parent node */
  244. cur = node->parent;
  245. while (cur && cur->data) {
  246. tmp = cur->data;
  247. if (tmp->id == arg->id &&
  248. (tmp->flags & FL_CLOSED) == 0) {
  249. tmp->flags |= FL_CLOSED;
  250. /* Destroy current node as we find corresponding parent node */
  251. g_node_destroy (node);
  252. /* Change level */
  253. *cur_level = cur->parent;
  254. return TRUE;
  255. }
  256. cur = cur->parent;
  257. }
  258. }
  259. else {
  260. return TRUE;
  261. }
  262. return FALSE;
  263. }
  264. gint
  265. rspamd_html_tag_by_name (const gchar *name)
  266. {
  267. khiter_t k;
  268. k = kh_get (tag_by_name, html_tag_by_name, name);
  269. if (k != kh_end (html_tag_by_name)) {
  270. return kh_val (html_tag_by_name, k).id;
  271. }
  272. return -1;
  273. }
  274. gboolean
  275. rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
  276. {
  277. gint id;
  278. g_assert (hc != NULL);
  279. g_assert (hc->tags_seen != NULL);
  280. id = rspamd_html_tag_by_name (tagname);
  281. if (id != -1) {
  282. return isset (hc->tags_seen, id);
  283. }
  284. return FALSE;
  285. }
  286. const gchar *
  287. rspamd_html_tag_by_id (gint id)
  288. {
  289. khiter_t k;
  290. k = kh_get (tag_by_id, html_tag_by_id, id);
  291. if (k != kh_end (html_tag_by_id)) {
  292. return kh_val (html_tag_by_id, k).name;
  293. }
  294. return NULL;
  295. }
  296. /* Decode HTML entitles in text */
  297. guint
  298. rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
  299. {
  300. goffset l, rep_len;
  301. gchar *t = s, *h = s, *e = s, *end_ptr;
  302. const gchar *end;
  303. const gchar *entity;
  304. gint state = 0, base;
  305. UChar32 uc;
  306. khiter_t k;
  307. if (len == 0) {
  308. l = strlen (s);
  309. }
  310. else {
  311. l = len;
  312. }
  313. end = s + l;
  314. while (h - s < l) {
  315. switch (state) {
  316. /* Out of entity */
  317. case 0:
  318. if (*h == '&') {
  319. state = 1;
  320. e = h;
  321. h++;
  322. continue;
  323. }
  324. else {
  325. *t = *h;
  326. h++;
  327. t++;
  328. }
  329. break;
  330. case 1:
  331. if (*h == ';' && h > e) {
  332. /* Determine base */
  333. /* First find in entities table */
  334. *h = '\0';
  335. entity = e + 1;
  336. uc = 0;
  337. if (*entity != '#') {
  338. k = kh_get (entity_by_name, html_entity_by_name, entity);
  339. *h = ';';
  340. if (k != kh_end (html_entity_by_name)) {
  341. if (kh_val (html_entity_by_name, k)) {
  342. rep_len = strlen (kh_val (html_entity_by_name, k));
  343. if (end - t >= rep_len) {
  344. memcpy (t, kh_val (html_entity_by_name, k),
  345. rep_len);
  346. t += rep_len;
  347. }
  348. } else {
  349. if (end - t > h - e + 1) {
  350. memmove (t, e, h - e + 1);
  351. t += h - e + 1;
  352. }
  353. }
  354. }
  355. else {
  356. if (end - t > h - e + 1) {
  357. memmove (t, e, h - e + 1);
  358. t += h - e + 1;
  359. }
  360. }
  361. }
  362. else if (e + 2 < h) {
  363. if (*(e + 2) == 'x' || *(e + 2) == 'X') {
  364. base = 16;
  365. }
  366. else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
  367. base = 8;
  368. }
  369. else {
  370. base = 10;
  371. }
  372. if (base == 10) {
  373. uc = strtoul ((e + 2), &end_ptr, base);
  374. }
  375. else {
  376. uc = strtoul ((e + 3), &end_ptr, base);
  377. }
  378. if (end_ptr != NULL && *end_ptr != '\0') {
  379. /* Skip undecoded */
  380. *h = ';';
  381. if (end - t > h - e + 1) {
  382. memmove (t, e, h - e + 1);
  383. t += h - e + 1;
  384. }
  385. }
  386. else {
  387. /* Search for a replacement */
  388. *h = ';';
  389. k = kh_get (entity_by_number, html_entity_by_number, uc);
  390. if (k != kh_end (html_entity_by_number)) {
  391. if (kh_val (html_entity_by_number, k)) {
  392. rep_len = strlen (kh_val (html_entity_by_number, k));
  393. if (end - t >= rep_len) {
  394. memcpy (t, kh_val (html_entity_by_number, k),
  395. rep_len);
  396. t += rep_len;
  397. }
  398. } else {
  399. if (end - t > h - e + 1) {
  400. memmove (t, e, h - e + 1);
  401. t += h - e + 1;
  402. }
  403. }
  404. }
  405. else {
  406. /* Unicode point */
  407. goffset off = t - s;
  408. UBool is_error = 0;
  409. if (uc > 0) {
  410. U8_APPEND (s, off, len, uc, is_error);
  411. if (!is_error) {
  412. t = s + off;
  413. }
  414. else {
  415. /* Leave invalid entities as is */
  416. if (end - t > h - e + 1) {
  417. memmove (t, e, h - e + 1);
  418. t += h - e + 1;
  419. }
  420. }
  421. }
  422. else if (end - t > h - e + 1) {
  423. memmove (t, e, h - e + 1);
  424. t += h - e + 1;
  425. }
  426. }
  427. }
  428. }
  429. state = 0;
  430. }
  431. else if (*h == '&') {
  432. /* Previous `&` was bogus */
  433. state = 1;
  434. if (end - t > h - e) {
  435. memmove (t, e, h - e);
  436. t += h - e;
  437. }
  438. e = h;
  439. }
  440. h++;
  441. break;
  442. }
  443. }
  444. /* Leftover */
  445. if (state == 1 && h > e) {
  446. /* Unfinished entity, copy as is */
  447. if (end - t > h - e) {
  448. memmove (t, e, h - e);
  449. t += h - e;
  450. }
  451. }
  452. return (t - s);
  453. }
  454. static gboolean
  455. rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
  456. {
  457. const gchar *p1, *p2;
  458. p1 = t1->begin + t1->len - 1;
  459. p2 = t2->begin + t2->len - 1;
  460. /* Skip trailing dots */
  461. while (p1 > t1->begin) {
  462. if (*p1 != '.') {
  463. break;
  464. }
  465. p1 --;
  466. }
  467. while (p2 > t2->begin) {
  468. if (*p2 != '.') {
  469. break;
  470. }
  471. p2 --;
  472. }
  473. while (p1 > t1->begin && p2 > t2->begin) {
  474. if (*p1 != *p2) {
  475. break;
  476. }
  477. p1 --;
  478. p2 --;
  479. }
  480. if (p2 == t2->begin) {
  481. /* p2 can be subdomain of p1 if *p1 is '.' */
  482. if (p1 != t1->begin && *(p1 - 1) == '.') {
  483. return TRUE;
  484. }
  485. }
  486. else if (p1 == t1->begin) {
  487. if (p2 != t2->begin && *(p2 - 1) == '.') {
  488. return TRUE;
  489. }
  490. }
  491. return FALSE;
  492. }
  493. static void
  494. rspamd_html_url_is_phished (rspamd_mempool_t *pool,
  495. struct rspamd_url *href_url,
  496. const guchar *url_text,
  497. gsize len,
  498. gboolean *url_found,
  499. struct rspamd_url **ptext_url)
  500. {
  501. struct rspamd_url *text_url;
  502. rspamd_ftok_t phished_tld, disp_tok, href_tok;
  503. gint rc;
  504. goffset url_pos;
  505. gchar *url_str = NULL, *idn_hbuf;
  506. const guchar *end = url_text + len, *p;
  507. #if U_ICU_VERSION_MAJOR_NUM >= 46
  508. static UIDNA *udn;
  509. UErrorCode uc_err = U_ZERO_ERROR;
  510. UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
  511. #endif
  512. *url_found = FALSE;
  513. #if U_ICU_VERSION_MAJOR_NUM >= 46
  514. if (udn == NULL) {
  515. udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
  516. if (uc_err != U_ZERO_ERROR) {
  517. msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
  518. }
  519. }
  520. #endif
  521. while (url_text < end && g_ascii_isspace (*url_text)) {
  522. url_text ++;
  523. }
  524. if (end > url_text + 4 &&
  525. rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
  526. &url_pos, NULL) &&
  527. url_str != NULL) {
  528. if (url_pos > 0) {
  529. /*
  530. * We have some url at some offset, so we need to check what is
  531. * at the start of the text
  532. */
  533. p = url_text;
  534. while (p < url_text + url_pos) {
  535. if (!g_ascii_isspace (*p)) {
  536. *url_found = FALSE;
  537. return;
  538. }
  539. p++;
  540. }
  541. }
  542. text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  543. rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
  544. RSPAMD_URL_PARSE_TEXT);
  545. if (rc == URI_ERRNO_OK) {
  546. disp_tok.len = text_url->hostlen;
  547. disp_tok.begin = text_url->host;
  548. #if U_ICU_VERSION_MAJOR_NUM >= 46
  549. if (rspamd_substring_search_caseless (text_url->host,
  550. text_url->hostlen, "xn--", 4) != -1) {
  551. idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
  552. /* We need to convert it to the normal value first */
  553. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  554. text_url->host, text_url->hostlen,
  555. idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
  556. if (uc_err != U_ZERO_ERROR) {
  557. msg_err_pool ("cannot convert to IDN: %s",
  558. u_errorName (uc_err));
  559. disp_tok.len = text_url->hostlen;
  560. }
  561. else {
  562. disp_tok.begin = idn_hbuf;
  563. }
  564. }
  565. #endif
  566. href_tok.len = href_url->hostlen;
  567. href_tok.begin = href_url->host;
  568. #if U_ICU_VERSION_MAJOR_NUM >= 46
  569. if (rspamd_substring_search_caseless (href_url->host,
  570. href_url->hostlen, "xn--", 4) != -1) {
  571. idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
  572. /* We need to convert it to the normal value first */
  573. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  574. href_url->host, href_url->hostlen,
  575. idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
  576. if (uc_err != U_ZERO_ERROR) {
  577. msg_err_pool ("cannot convert to IDN: %s",
  578. u_errorName (uc_err));
  579. href_tok.len = href_url->hostlen;
  580. }
  581. else {
  582. href_tok.begin = idn_hbuf;
  583. }
  584. }
  585. #endif
  586. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
  587. /* Apply the same logic for TLD */
  588. disp_tok.len = text_url->tldlen;
  589. disp_tok.begin = text_url->tld;
  590. #if U_ICU_VERSION_MAJOR_NUM >= 46
  591. if (rspamd_substring_search_caseless (text_url->tld,
  592. text_url->tldlen, "xn--", 4) != -1) {
  593. idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
  594. /* We need to convert it to the normal value first */
  595. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  596. text_url->tld, text_url->tldlen,
  597. idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
  598. if (uc_err != U_ZERO_ERROR) {
  599. msg_err_pool ("cannot convert to IDN: %s",
  600. u_errorName (uc_err));
  601. disp_tok.len = text_url->tldlen;
  602. }
  603. else {
  604. disp_tok.begin = idn_hbuf;
  605. }
  606. }
  607. #endif
  608. href_tok.len = href_url->tldlen;
  609. href_tok.begin = href_url->tld;
  610. #if U_ICU_VERSION_MAJOR_NUM >= 46
  611. if (rspamd_substring_search_caseless (href_url->tld,
  612. href_url->tldlen, "xn--", 4) != -1) {
  613. idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
  614. /* We need to convert it to the normal value first */
  615. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  616. href_url->tld, href_url->tldlen,
  617. idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
  618. if (uc_err != U_ZERO_ERROR) {
  619. msg_err_pool ("cannot convert to IDN: %s",
  620. u_errorName (uc_err));
  621. href_tok.len = href_url->tldlen;
  622. }
  623. else {
  624. href_tok.begin = idn_hbuf;
  625. }
  626. }
  627. #endif
  628. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
  629. /* Check if one url is a subdomain for another */
  630. if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
  631. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  632. href_url->phished_url = text_url;
  633. phished_tld.begin = href_tok.begin;
  634. phished_tld.len = href_tok.len;
  635. rspamd_url_add_tag (text_url, "phishing",
  636. rspamd_mempool_ftokdup (pool, &phished_tld),
  637. pool);
  638. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  639. }
  640. }
  641. }
  642. *ptext_url = text_url;
  643. *url_found = TRUE;
  644. }
  645. else {
  646. msg_info_pool ("extract of url '%s' failed: %s",
  647. url_str,
  648. rspamd_url_strerror (rc));
  649. }
  650. }
  651. }
  652. static gboolean
  653. rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
  654. struct html_tag *tag, GNode **cur_level, gboolean *balanced)
  655. {
  656. GNode *nnode;
  657. struct html_tag *parent;
  658. if (hc->html_tags == NULL) {
  659. nnode = g_node_new (NULL);
  660. *cur_level = nnode;
  661. hc->html_tags = nnode;
  662. rspamd_mempool_add_destructor (pool,
  663. (rspamd_mempool_destruct_t) g_node_destroy,
  664. nnode);
  665. }
  666. if (hc->total_tags > max_tags) {
  667. hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
  668. }
  669. if (tag->id == -1) {
  670. /* Ignore unknown tags */
  671. hc->total_tags ++;
  672. return FALSE;
  673. }
  674. tag->parent = *cur_level;
  675. if (!(tag->flags & CM_INLINE)) {
  676. /* Block tag */
  677. if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
  678. if (!*cur_level) {
  679. msg_debug_html ("bad parent node");
  680. return FALSE;
  681. }
  682. if (hc->total_tags < max_tags) {
  683. nnode = g_node_new (tag);
  684. g_node_append (*cur_level, nnode);
  685. if (!rspamd_html_check_balance (nnode, cur_level)) {
  686. msg_debug_html (
  687. "mark part as unbalanced as it has not pairable closing tags");
  688. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  689. *balanced = FALSE;
  690. } else {
  691. *balanced = TRUE;
  692. }
  693. hc->total_tags ++;
  694. }
  695. }
  696. else {
  697. parent = (*cur_level)->data;
  698. if (parent) {
  699. if ((parent->flags & FL_IGNORE)) {
  700. tag->flags |= FL_IGNORE;
  701. }
  702. if (!(tag->flags & FL_CLOSED) &&
  703. !(parent->flags & FL_BLOCK)) {
  704. /* We likely have some bad nesting */
  705. if (parent->id == tag->id) {
  706. /* Something like <a>bla<a>foo... */
  707. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  708. *balanced = FALSE;
  709. tag->parent = parent->parent;
  710. if (hc->total_tags < max_tags) {
  711. nnode = g_node_new (tag);
  712. g_node_append (parent->parent, nnode);
  713. *cur_level = nnode;
  714. hc->total_tags ++;
  715. }
  716. return TRUE;
  717. }
  718. }
  719. parent->content_length += tag->content_length;
  720. }
  721. if (hc->total_tags < max_tags) {
  722. nnode = g_node_new (tag);
  723. g_node_append (*cur_level, nnode);
  724. if ((tag->flags & FL_CLOSED) == 0) {
  725. *cur_level = nnode;
  726. }
  727. hc->total_tags ++;
  728. }
  729. if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
  730. tag->flags |= FL_IGNORE;
  731. return FALSE;
  732. }
  733. }
  734. }
  735. else {
  736. /* Inline tag */
  737. parent = (*cur_level)->data;
  738. if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
  739. tag->flags |= FL_IGNORE;
  740. return FALSE;
  741. }
  742. }
  743. return TRUE;
  744. }
  745. #define NEW_COMPONENT(comp_type) do { \
  746. comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
  747. comp->type = (comp_type); \
  748. comp->start = NULL; \
  749. comp->len = 0; \
  750. g_queue_push_tail (tag->params, comp); \
  751. ret = TRUE; \
  752. } while(0)
  753. static gboolean
  754. rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
  755. const guchar *begin, const guchar *end,
  756. struct html_tag *tag)
  757. {
  758. struct html_tag_component *comp;
  759. gint len;
  760. gboolean ret = FALSE;
  761. gchar *p;
  762. g_assert (end >= begin);
  763. p = rspamd_mempool_alloc (pool, end - begin);
  764. memcpy (p, begin, end - begin);
  765. len = rspamd_html_decode_entitles_inplace (p, end - begin);
  766. if (len == 3) {
  767. if (g_ascii_strncasecmp (p, "src", len) == 0) {
  768. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  769. }
  770. }
  771. else if (len == 4) {
  772. if (g_ascii_strncasecmp (p, "href", len) == 0) {
  773. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  774. }
  775. }
  776. if (tag->id == Tag_IMG) {
  777. /* Check width and height if presented */
  778. if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
  779. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
  780. }
  781. else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
  782. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
  783. }
  784. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  785. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  786. }
  787. }
  788. else if (tag->id == Tag_FONT) {
  789. if (len == 5){
  790. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  791. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  792. }
  793. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  794. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  795. }
  796. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  797. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  798. }
  799. }
  800. else if (len == 7) {
  801. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  802. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  803. }
  804. }
  805. else if (len == 4) {
  806. if (g_ascii_strncasecmp (p, "size", len) == 0) {
  807. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
  808. }
  809. }
  810. }
  811. else if (tag->flags & FL_BLOCK) {
  812. if (len == 5){
  813. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  814. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  815. }
  816. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  817. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  818. }
  819. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  820. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  821. }
  822. }
  823. else if (len == 7) {
  824. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  825. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  826. }
  827. }
  828. }
  829. return ret;
  830. }
  831. static inline void
  832. rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
  833. struct html_content *hc, struct html_tag *tag, const guchar *in,
  834. gint *statep, guchar const **savep)
  835. {
  836. enum {
  837. parse_start = 0,
  838. parse_name,
  839. parse_attr_name,
  840. parse_equal,
  841. parse_start_dquote,
  842. parse_dqvalue,
  843. parse_end_dquote,
  844. parse_start_squote,
  845. parse_sqvalue,
  846. parse_end_squote,
  847. parse_value,
  848. spaces_after_name,
  849. spaces_before_eq,
  850. spaces_after_eq,
  851. spaces_after_param,
  852. ignore_bad_tag
  853. } state;
  854. struct html_tag_def *found;
  855. gboolean store = FALSE;
  856. struct html_tag_component *comp;
  857. state = *statep;
  858. switch (state) {
  859. case parse_start:
  860. if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
  861. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  862. state = ignore_bad_tag;
  863. tag->id = -1;
  864. tag->flags |= FL_BROKEN;
  865. }
  866. else if (g_ascii_isalpha (*in)) {
  867. state = parse_name;
  868. tag->name.start = in;
  869. }
  870. break;
  871. case parse_name:
  872. if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
  873. g_assert (in >= tag->name.start);
  874. if (*in == '/') {
  875. tag->flags |= FL_CLOSED;
  876. }
  877. tag->name.len = in - tag->name.start;
  878. if (tag->name.len == 0) {
  879. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  880. tag->id = -1;
  881. tag->flags |= FL_BROKEN;
  882. state = ignore_bad_tag;
  883. }
  884. else {
  885. gchar *s;
  886. khiter_t k;
  887. /* We CANNOT safely modify tag's name here, as it is already parsed */
  888. s = rspamd_mempool_alloc (pool, tag->name.len + 1);
  889. memcpy (s, tag->name.start, tag->name.len);
  890. tag->name.len = rspamd_html_decode_entitles_inplace (s,
  891. tag->name.len);
  892. tag->name.start = s;
  893. s[tag->name.len] = '\0';
  894. rspamd_str_lc_utf8 (s, tag->name.len);
  895. k = kh_get (tag_by_name, html_tag_by_name, s);
  896. if (k == kh_end (html_tag_by_name)) {
  897. hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
  898. tag->id = -1;
  899. }
  900. else {
  901. found = &kh_val (html_tag_by_name, k);
  902. tag->id = found->id;
  903. tag->flags = found->flags;
  904. }
  905. state = spaces_after_name;
  906. }
  907. }
  908. break;
  909. case parse_attr_name:
  910. if (*savep == NULL) {
  911. state = ignore_bad_tag;
  912. }
  913. else {
  914. const guchar *attr_name_end = in;
  915. if (*in == '=') {
  916. state = parse_equal;
  917. }
  918. else if (*in == '"') {
  919. /* No equal or something sane but we have quote character */
  920. state = parse_start_dquote;
  921. attr_name_end = in - 1;
  922. while (attr_name_end > *savep) {
  923. if (!g_ascii_isalnum (*attr_name_end)) {
  924. attr_name_end --;
  925. }
  926. else {
  927. break;
  928. }
  929. }
  930. /* One character forward to obtain length */
  931. attr_name_end ++;
  932. }
  933. else if (g_ascii_isspace (*in)) {
  934. state = spaces_before_eq;
  935. }
  936. else if (*in == '/') {
  937. tag->flags |= FL_CLOSED;
  938. }
  939. else if (!g_ascii_isgraph (*in)) {
  940. state = parse_value;
  941. attr_name_end = in - 1;
  942. while (attr_name_end > *savep) {
  943. if (!g_ascii_isalnum (*attr_name_end)) {
  944. attr_name_end --;
  945. }
  946. else {
  947. break;
  948. }
  949. }
  950. /* One character forward to obtain length */
  951. attr_name_end ++;
  952. }
  953. else {
  954. return;
  955. }
  956. if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
  957. /* Ignore unknown params */
  958. *savep = NULL;
  959. }
  960. else if (state == parse_value) {
  961. *savep = in + 1;
  962. }
  963. }
  964. break;
  965. case spaces_after_name:
  966. if (!g_ascii_isspace (*in)) {
  967. *savep = in;
  968. if (*in == '/') {
  969. tag->flags |= FL_CLOSED;
  970. }
  971. else if (*in != '>') {
  972. state = parse_attr_name;
  973. }
  974. }
  975. break;
  976. case spaces_before_eq:
  977. if (*in == '=') {
  978. state = parse_equal;
  979. }
  980. else if (!g_ascii_isspace (*in)) {
  981. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  982. tag->flags |= FL_BROKEN;
  983. state = ignore_bad_tag;
  984. }
  985. break;
  986. case spaces_after_eq:
  987. if (*in == '"') {
  988. state = parse_start_dquote;
  989. }
  990. else if (*in == '\'') {
  991. state = parse_start_squote;
  992. }
  993. else if (!g_ascii_isspace (*in)) {
  994. if (*savep != NULL) {
  995. /* We need to save this param */
  996. *savep = in;
  997. }
  998. state = parse_value;
  999. }
  1000. break;
  1001. case parse_equal:
  1002. if (g_ascii_isspace (*in)) {
  1003. state = spaces_after_eq;
  1004. }
  1005. else if (*in == '"') {
  1006. state = parse_start_dquote;
  1007. }
  1008. else if (*in == '\'') {
  1009. state = parse_start_squote;
  1010. }
  1011. else {
  1012. if (*savep != NULL) {
  1013. /* We need to save this param */
  1014. *savep = in;
  1015. }
  1016. state = parse_value;
  1017. }
  1018. break;
  1019. case parse_start_dquote:
  1020. if (*in == '"') {
  1021. if (*savep != NULL) {
  1022. /* We have an empty attribute value */
  1023. savep = NULL;
  1024. }
  1025. state = spaces_after_param;
  1026. }
  1027. else {
  1028. if (*savep != NULL) {
  1029. /* We need to save this param */
  1030. *savep = in;
  1031. }
  1032. state = parse_dqvalue;
  1033. }
  1034. break;
  1035. case parse_start_squote:
  1036. if (*in == '\'') {
  1037. if (*savep != NULL) {
  1038. /* We have an empty attribute value */
  1039. savep = NULL;
  1040. }
  1041. state = spaces_after_param;
  1042. }
  1043. else {
  1044. if (*savep != NULL) {
  1045. /* We need to save this param */
  1046. *savep = in;
  1047. }
  1048. state = parse_sqvalue;
  1049. }
  1050. break;
  1051. case parse_dqvalue:
  1052. if (*in == '"') {
  1053. store = TRUE;
  1054. state = parse_end_dquote;
  1055. }
  1056. if (store) {
  1057. if (*savep != NULL) {
  1058. gchar *s;
  1059. g_assert (tag->params != NULL);
  1060. comp = g_queue_peek_tail (tag->params);
  1061. g_assert (comp != NULL);
  1062. comp->len = in - *savep;
  1063. s = rspamd_mempool_alloc (pool, comp->len);
  1064. memcpy (s, *savep, comp->len);
  1065. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1066. comp->start = s;
  1067. *savep = NULL;
  1068. }
  1069. }
  1070. break;
  1071. case parse_sqvalue:
  1072. if (*in == '\'') {
  1073. store = TRUE;
  1074. state = parse_end_squote;
  1075. }
  1076. if (store) {
  1077. if (*savep != NULL) {
  1078. gchar *s;
  1079. g_assert (tag->params != NULL);
  1080. comp = g_queue_peek_tail (tag->params);
  1081. g_assert (comp != NULL);
  1082. comp->len = in - *savep;
  1083. s = rspamd_mempool_alloc (pool, comp->len);
  1084. memcpy (s, *savep, comp->len);
  1085. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1086. comp->start = s;
  1087. *savep = NULL;
  1088. }
  1089. }
  1090. break;
  1091. case parse_value:
  1092. if (*in == '/' && *(in + 1) == '>') {
  1093. tag->flags |= FL_CLOSED;
  1094. store = TRUE;
  1095. }
  1096. else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
  1097. store = TRUE;
  1098. state = spaces_after_param;
  1099. }
  1100. if (store) {
  1101. if (*savep != NULL) {
  1102. gchar *s;
  1103. g_assert (tag->params != NULL);
  1104. comp = g_queue_peek_tail (tag->params);
  1105. g_assert (comp != NULL);
  1106. comp->len = in - *savep;
  1107. s = rspamd_mempool_alloc (pool, comp->len);
  1108. memcpy (s, *savep, comp->len);
  1109. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1110. comp->start = s;
  1111. *savep = NULL;
  1112. }
  1113. }
  1114. break;
  1115. case parse_end_dquote:
  1116. case parse_end_squote:
  1117. if (g_ascii_isspace (*in)) {
  1118. state = spaces_after_param;
  1119. }
  1120. else if (*in == '/' && *(in + 1) == '>') {
  1121. tag->flags |= FL_CLOSED;
  1122. }
  1123. break;
  1124. case spaces_after_param:
  1125. if (!g_ascii_isspace (*in)) {
  1126. if (*in == '/' && *(in + 1) == '>') {
  1127. tag->flags |= FL_CLOSED;
  1128. }
  1129. state = parse_attr_name;
  1130. *savep = in;
  1131. }
  1132. break;
  1133. case ignore_bad_tag:
  1134. break;
  1135. }
  1136. *statep = state;
  1137. }
  1138. struct rspamd_url *
  1139. rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
  1140. struct html_tag_component *comp)
  1141. {
  1142. struct rspamd_url *url;
  1143. guint saved_flags = 0;
  1144. gchar *decoded;
  1145. gint rc;
  1146. gsize decoded_len;
  1147. const gchar *p, *s;
  1148. gchar *d;
  1149. guint i, dlen;
  1150. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  1151. static const gchar hexdigests[16] = "0123456789abcdef";
  1152. p = start;
  1153. /* Strip spaces from the url */
  1154. /* Head spaces */
  1155. while (p < start + len && g_ascii_isspace (*p)) {
  1156. p ++;
  1157. start ++;
  1158. len --;
  1159. }
  1160. if (comp) {
  1161. comp->start = p;
  1162. comp->len = len;
  1163. }
  1164. /* Trailing spaces */
  1165. p = start + len - 1;
  1166. while (p >= start && g_ascii_isspace (*p)) {
  1167. p --;
  1168. len --;
  1169. if (comp) {
  1170. comp->len --;
  1171. }
  1172. }
  1173. s = start;
  1174. dlen = 0;
  1175. for (i = 0; i < len; i ++) {
  1176. if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1177. dlen += 3;
  1178. }
  1179. else {
  1180. dlen ++;
  1181. }
  1182. }
  1183. if (memchr (s, ':', len) == NULL) {
  1184. /* We have no prefix */
  1185. dlen += sizeof ("http://") - 1;
  1186. no_prefix = TRUE;
  1187. }
  1188. decoded = rspamd_mempool_alloc (pool, dlen + 1);
  1189. d = decoded;
  1190. if (no_prefix) {
  1191. if (s[0] == '/' && (len > 2 && s[1] == '/')) {
  1192. /* //bla case */
  1193. memcpy (d, "http:", sizeof ("http:") - 1);
  1194. d += sizeof ("http:") - 1;
  1195. }
  1196. else {
  1197. memcpy (d, "http://", sizeof ("http://") - 1);
  1198. d += sizeof ("http://") - 1;
  1199. }
  1200. }
  1201. /*
  1202. * We also need to remove all internal newlines, spaces
  1203. * and encode unsafe characters
  1204. */
  1205. for (i = 0; i < len; i ++) {
  1206. if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
  1207. continue;
  1208. }
  1209. else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1210. /* URL encode */
  1211. *d++ = '%';
  1212. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  1213. *d++ = hexdigests[s[i] & 0xf];
  1214. has_bad_chars = TRUE;
  1215. }
  1216. else {
  1217. *d++ = s[i];
  1218. }
  1219. }
  1220. *d = '\0';
  1221. dlen = d - decoded;
  1222. url = rspamd_mempool_alloc0 (pool, sizeof (*url));
  1223. enum rspamd_normalise_result norm_res;
  1224. norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
  1225. if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
  1226. saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
  1227. }
  1228. if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
  1229. saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
  1230. if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
  1231. saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
  1232. }
  1233. }
  1234. rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  1235. if (rc == URI_ERRNO_OK) {
  1236. url->flags |= saved_flags;
  1237. if (has_bad_chars) {
  1238. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1239. }
  1240. if (no_prefix) {
  1241. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1242. }
  1243. decoded = url->string;
  1244. decoded_len = url->urllen;
  1245. if (comp) {
  1246. comp->start = decoded;
  1247. comp->len = decoded_len;
  1248. }
  1249. /* Spaces in href usually mean an attempt to obfuscate URL */
  1250. /* See https://github.com/vstakhov/rspamd/issues/593 */
  1251. #if 0
  1252. if (has_spaces) {
  1253. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1254. }
  1255. #endif
  1256. return url;
  1257. }
  1258. return NULL;
  1259. }
  1260. static struct rspamd_url *
  1261. rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1262. struct html_content *hc)
  1263. {
  1264. struct html_tag_component *comp;
  1265. GList *cur;
  1266. struct rspamd_url *url;
  1267. const gchar *start;
  1268. gsize len;
  1269. cur = tag->params->head;
  1270. while (cur) {
  1271. comp = cur->data;
  1272. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1273. start = comp->start;
  1274. len = comp->len;
  1275. /* Check base url */
  1276. if (hc && hc->base_url && comp->len > 2) {
  1277. /*
  1278. * Relative url canot start from the following:
  1279. * schema://
  1280. * slash
  1281. */
  1282. gchar *buf;
  1283. gsize orig_len;
  1284. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1285. /* Assume relative url */
  1286. gboolean need_slash = FALSE;
  1287. orig_len = len;
  1288. len += hc->base_url->urllen;
  1289. if (hc->base_url->string[hc->base_url->urllen - 1] != '/') {
  1290. need_slash = TRUE;
  1291. len ++;
  1292. }
  1293. buf = rspamd_mempool_alloc (pool, len + 1);
  1294. rspamd_snprintf (buf, len + 1, "%*s%s%*s",
  1295. hc->base_url->urllen, hc->base_url->string,
  1296. need_slash ? "/" : "",
  1297. (gint)orig_len, start);
  1298. start = buf;
  1299. }
  1300. else if (start[0] == '/' && start[1] != '/') {
  1301. /* Relative to the hostname */
  1302. orig_len = len;
  1303. len += hc->base_url->hostlen + hc->base_url->protocollen +
  1304. 3 /* for :// */;
  1305. buf = rspamd_mempool_alloc (pool, len + 1);
  1306. rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
  1307. hc->base_url->protocollen, hc->base_url->string,
  1308. hc->base_url->hostlen, hc->base_url->host,
  1309. (gint)orig_len, start);
  1310. start = buf;
  1311. }
  1312. }
  1313. url = rspamd_html_process_url (pool, start, len, comp);
  1314. if (url && tag->extra == NULL) {
  1315. tag->extra = url;
  1316. }
  1317. return url;
  1318. }
  1319. cur = g_list_next (cur);
  1320. }
  1321. return NULL;
  1322. }
  1323. static void
  1324. rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
  1325. GHashTable *tbl_urls, GHashTable *tbl_emails)
  1326. {
  1327. GHashTable *target_tbl;
  1328. struct rspamd_url *query_url, *existing;
  1329. gchar *url_str;
  1330. gint rc;
  1331. gboolean prefix_added;
  1332. if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
  1333. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1334. }
  1335. if (url->querylen > 0) {
  1336. if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
  1337. NULL, &prefix_added)) {
  1338. query_url = rspamd_mempool_alloc0 (pool,
  1339. sizeof (struct rspamd_url));
  1340. rc = rspamd_url_parse (query_url,
  1341. url_str,
  1342. strlen (url_str),
  1343. pool,
  1344. RSPAMD_URL_PARSE_TEXT);
  1345. if (rc == URI_ERRNO_OK &&
  1346. query_url->hostlen > 0) {
  1347. msg_debug_html ("found url %s in query of url"
  1348. " %*s", url_str, url->querylen, url->query);
  1349. if (query_url->protocol == PROTOCOL_MAILTO) {
  1350. target_tbl = tbl_emails;
  1351. }
  1352. else {
  1353. target_tbl = tbl_urls;
  1354. }
  1355. if (prefix_added) {
  1356. query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1357. }
  1358. if (query_url->flags
  1359. & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED|
  1360. RSPAMD_URL_FLAG_NUMERIC)) {
  1361. /* Set obscured flag if query url is bad */
  1362. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1363. }
  1364. /* And vice-versa */
  1365. if (url->flags & RSPAMD_URL_FLAG_OBSCURED) {
  1366. query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1367. }
  1368. if ((existing = g_hash_table_lookup (target_tbl,
  1369. query_url)) == NULL) {
  1370. g_hash_table_insert (target_tbl,
  1371. query_url,
  1372. query_url);
  1373. }
  1374. else {
  1375. existing->count ++;
  1376. }
  1377. }
  1378. }
  1379. }
  1380. }
  1381. static void
  1382. rspamd_html_process_data_image (rspamd_mempool_t *pool,
  1383. struct html_image *img,
  1384. struct html_tag_component *src)
  1385. {
  1386. /*
  1387. * Here, we do very basic processing of the data:
  1388. * detect if we have something like: ``
  1389. * We only parse base64 encoded data.
  1390. * We ignore content type so far
  1391. */
  1392. struct rspamd_image *parsed_image;
  1393. const gchar *semicolon_pos = NULL, *end = src->start + src->len;
  1394. semicolon_pos = src->start;
  1395. while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
  1396. if (end - semicolon_pos > sizeof ("base64,")) {
  1397. if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
  1398. const gchar *data_pos = semicolon_pos + sizeof ("base64,");
  1399. gchar *decoded;
  1400. gsize encoded_len = end - data_pos, decoded_len;
  1401. rspamd_ftok_t inp;
  1402. decoded_len = (encoded_len / 4 * 3) + 12;
  1403. decoded = rspamd_mempool_alloc (pool, decoded_len);
  1404. rspamd_cryptobox_base64_decode (data_pos, encoded_len,
  1405. decoded, &decoded_len);
  1406. inp.begin = decoded;
  1407. inp.len = decoded_len;
  1408. parsed_image = rspamd_maybe_process_image (pool, &inp);
  1409. if (parsed_image) {
  1410. msg_debug_html ("detected %s image of size %ud x %ud in data url",
  1411. rspamd_image_type_str (parsed_image->type),
  1412. parsed_image->width, parsed_image->height);
  1413. img->embedded_image = parsed_image;
  1414. }
  1415. }
  1416. break;
  1417. }
  1418. else {
  1419. /* Nothing useful */
  1420. return;
  1421. }
  1422. semicolon_pos ++;
  1423. }
  1424. }
  1425. static void
  1426. rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1427. struct html_content *hc)
  1428. {
  1429. struct html_tag_component *comp;
  1430. struct html_image *img;
  1431. rspamd_ftok_t fstr;
  1432. const guchar *p;
  1433. GList *cur;
  1434. gulong val;
  1435. gboolean seen_width = FALSE, seen_height = FALSE;
  1436. goffset pos;
  1437. cur = tag->params->head;
  1438. img = rspamd_mempool_alloc0 (pool, sizeof (*img));
  1439. img->tag = tag;
  1440. while (cur) {
  1441. comp = cur->data;
  1442. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1443. fstr.begin = (gchar *)comp->start;
  1444. fstr.len = comp->len;
  1445. img->src = rspamd_mempool_ftokdup (pool, &fstr);
  1446. if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
  1447. "cid:", sizeof ("cid:") - 1) == 0) {
  1448. /* We have an embedded image */
  1449. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
  1450. }
  1451. if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
  1452. "data:", sizeof ("data:") - 1) == 0) {
  1453. /* We have an embedded image in HTML tag */
  1454. img->flags |=
  1455. (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED|RSPAMD_HTML_FLAG_IMAGE_DATA);
  1456. rspamd_html_process_data_image (pool, img, comp);
  1457. hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
  1458. }
  1459. else {
  1460. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
  1461. if (img->src) {
  1462. img->url = rspamd_html_process_url (pool,
  1463. img->src, fstr.len, NULL);
  1464. }
  1465. }
  1466. }
  1467. else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
  1468. rspamd_strtoul (comp->start, comp->len, &val);
  1469. img->height = val;
  1470. seen_height = TRUE;
  1471. }
  1472. else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
  1473. rspamd_strtoul (comp->start, comp->len, &val);
  1474. img->width = val;
  1475. seen_width = TRUE;
  1476. }
  1477. else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
  1478. /* Try to search for height= or width= in style tag */
  1479. if (!seen_height && comp->len > 0) {
  1480. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1481. "height", sizeof ("height") - 1);
  1482. if (pos != -1) {
  1483. p = comp->start + pos + sizeof ("height") - 1;
  1484. while (p < comp->start + comp->len) {
  1485. if (g_ascii_isdigit (*p)) {
  1486. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1487. img->height = val;
  1488. break;
  1489. }
  1490. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1491. /* Fallback */
  1492. break;
  1493. }
  1494. p ++;
  1495. }
  1496. }
  1497. }
  1498. if (!seen_width && comp->len > 0) {
  1499. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1500. "width", sizeof ("width") - 1);
  1501. if (pos != -1) {
  1502. p = comp->start + pos + sizeof ("width") - 1;
  1503. while (p < comp->start + comp->len) {
  1504. if (g_ascii_isdigit (*p)) {
  1505. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1506. img->width = val;
  1507. break;
  1508. }
  1509. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1510. /* Fallback */
  1511. break;
  1512. }
  1513. p ++;
  1514. }
  1515. }
  1516. }
  1517. }
  1518. cur = g_list_next (cur);
  1519. }
  1520. if (hc->images == NULL) {
  1521. hc->images = g_ptr_array_sized_new (4);
  1522. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  1523. hc->images);
  1524. }
  1525. if (img->embedded_image) {
  1526. if (!seen_height) {
  1527. img->height = img->embedded_image->height;
  1528. }
  1529. if (!seen_width) {
  1530. img->width = img->embedded_image->width;
  1531. }
  1532. }
  1533. g_ptr_array_add (hc->images, img);
  1534. tag->extra = img;
  1535. }
  1536. static void
  1537. rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
  1538. {
  1539. const gchar *p = line, *end = line + len;
  1540. char hexbuf[7];
  1541. rspamd_ftok_t search;
  1542. struct html_color *el;
  1543. memset (cl, 0, sizeof (*cl));
  1544. if (*p == '#') {
  1545. /* HEX color */
  1546. p ++;
  1547. rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
  1548. cl->d.val = strtoul (hexbuf, NULL, 16);
  1549. cl->d.comp.alpha = 255;
  1550. cl->valid = TRUE;
  1551. }
  1552. else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
  1553. /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
  1554. enum {
  1555. obrace,
  1556. num1,
  1557. num2,
  1558. num3,
  1559. num4,
  1560. skip_spaces
  1561. } state = skip_spaces, next_state = obrace;
  1562. gulong r = 0, g = 0, b = 0, opacity = 255;
  1563. const gchar *c;
  1564. gboolean valid = FALSE;
  1565. p += 3;
  1566. if (*p == 'a') {
  1567. p ++;
  1568. }
  1569. c = p;
  1570. while (p < end) {
  1571. switch (state) {
  1572. case obrace:
  1573. if (*p == '(') {
  1574. p ++;
  1575. state = skip_spaces;
  1576. next_state = num1;
  1577. }
  1578. else if (g_ascii_isspace (*p)) {
  1579. state = skip_spaces;
  1580. next_state = obrace;
  1581. }
  1582. else {
  1583. goto stop;
  1584. }
  1585. break;
  1586. case num1:
  1587. if (*p == ',') {
  1588. if (!rspamd_strtoul (c, p - c, &r)) {
  1589. goto stop;
  1590. }
  1591. p ++;
  1592. state = skip_spaces;
  1593. next_state = num2;
  1594. }
  1595. else if (!g_ascii_isdigit (*p)) {
  1596. goto stop;
  1597. }
  1598. else {
  1599. p ++;
  1600. }
  1601. break;
  1602. case num2:
  1603. if (*p == ',') {
  1604. if (!rspamd_strtoul (c, p - c, &g)) {
  1605. goto stop;
  1606. }
  1607. p ++;
  1608. state = skip_spaces;
  1609. next_state = num3;
  1610. }
  1611. else if (!g_ascii_isdigit (*p)) {
  1612. goto stop;
  1613. }
  1614. else {
  1615. p ++;
  1616. }
  1617. break;
  1618. case num3:
  1619. if (*p == ',') {
  1620. if (!rspamd_strtoul (c, p - c, &b)) {
  1621. goto stop;
  1622. }
  1623. valid = TRUE;
  1624. p ++;
  1625. state = skip_spaces;
  1626. next_state = num4;
  1627. }
  1628. else if (*p == ')') {
  1629. if (!rspamd_strtoul (c, p - c, &b)) {
  1630. goto stop;
  1631. }
  1632. valid = TRUE;
  1633. goto stop;
  1634. }
  1635. else if (!g_ascii_isdigit (*p)) {
  1636. goto stop;
  1637. }
  1638. else {
  1639. p ++;
  1640. }
  1641. break;
  1642. case num4:
  1643. if (*p == ',') {
  1644. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1645. goto stop;
  1646. }
  1647. valid = TRUE;
  1648. goto stop;
  1649. }
  1650. else if (*p == ')') {
  1651. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1652. goto stop;
  1653. }
  1654. valid = TRUE;
  1655. goto stop;
  1656. }
  1657. else if (!g_ascii_isdigit (*p)) {
  1658. goto stop;
  1659. }
  1660. else {
  1661. p ++;
  1662. }
  1663. break;
  1664. case skip_spaces:
  1665. if (!g_ascii_isspace (*p)) {
  1666. c = p;
  1667. state = next_state;
  1668. }
  1669. else {
  1670. p ++;
  1671. }
  1672. break;
  1673. }
  1674. }
  1675. stop:
  1676. if (valid) {
  1677. cl->d.comp.r = r;
  1678. cl->d.comp.g = g;
  1679. cl->d.comp.b = b;
  1680. cl->d.comp.alpha = opacity;
  1681. cl->valid = TRUE;
  1682. }
  1683. }
  1684. else {
  1685. khiter_t k;
  1686. /* Compare color by name */
  1687. search.begin = line;
  1688. search.len = len;
  1689. k = kh_get (color_by_name, html_color_by_name, &search);
  1690. if (k != kh_end (html_color_by_name)) {
  1691. el = &kh_val (html_color_by_name, k);
  1692. memcpy (cl, el, sizeof (*cl));
  1693. cl->d.comp.alpha = 255; /* Non transparent */
  1694. }
  1695. }
  1696. }
  1697. /*
  1698. * Target is used for in and out if this function returns TRUE
  1699. */
  1700. static gboolean
  1701. rspamd_html_process_css_size (const gchar *suffix, gsize len,
  1702. gdouble *tgt)
  1703. {
  1704. gdouble sz = *tgt;
  1705. gboolean ret = FALSE;
  1706. if (len >= 2) {
  1707. if (memcmp (suffix, "px", 2) == 0) {
  1708. sz = (guint) sz; /* Round to number */
  1709. ret = TRUE;
  1710. }
  1711. else if (memcmp (suffix, "em", 2) == 0) {
  1712. /* EM is 16 px, so multiply and round */
  1713. sz = (guint) (sz * 16.0);
  1714. ret = TRUE;
  1715. }
  1716. else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
  1717. /* equal to EM in our case */
  1718. sz = (guint) (sz * 16.0);
  1719. ret = TRUE;
  1720. }
  1721. else if (memcmp (suffix, "ex", 2) == 0) {
  1722. /*
  1723. * Represents the x-height of the element's font.
  1724. * On fonts with the "x" letter, this is generally the height
  1725. * of lowercase letters in the font; 1ex = 0.5em in many fonts.
  1726. */
  1727. sz = (guint) (sz * 8.0);
  1728. ret = TRUE;
  1729. }
  1730. else if (memcmp (suffix, "vw", 2) == 0) {
  1731. /*
  1732. * Vewport width in percentages:
  1733. * we assume 1% of viewport width as 8px
  1734. */
  1735. sz = (guint) (sz * 8.0);
  1736. ret = TRUE;
  1737. }
  1738. else if (memcmp (suffix, "vh", 2) == 0) {
  1739. /*
  1740. * Vewport height in percentages
  1741. * we assume 1% of viewport width as 6px
  1742. */
  1743. sz = (guint) (sz * 6.0);
  1744. ret = TRUE;
  1745. }
  1746. else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
  1747. /*
  1748. * Vewport width in percentages
  1749. * we assume 1% of viewport width as 6px
  1750. */
  1751. sz = (guint) (sz * 8.0);
  1752. ret = TRUE;
  1753. }
  1754. else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
  1755. /*
  1756. * Vewport height in percentages
  1757. * we assume 1% of viewport width as 6px
  1758. */
  1759. sz = (guint) (sz * 6.0);
  1760. ret = TRUE;
  1761. }
  1762. else if (memcmp (suffix, "pt", 2) == 0) {
  1763. sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
  1764. ret = TRUE;
  1765. }
  1766. else if (memcmp (suffix, "cm", 2) == 0) {
  1767. sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
  1768. ret = TRUE;
  1769. }
  1770. else if (memcmp (suffix, "mm", 2) == 0) {
  1771. sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
  1772. ret = TRUE;
  1773. }
  1774. else if (memcmp (suffix, "in", 2) == 0) {
  1775. sz = (guint) (sz * 96.0); /* 96px */
  1776. ret = TRUE;
  1777. }
  1778. else if (memcmp (suffix, "pc", 2) == 0) {
  1779. sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
  1780. ret = TRUE;
  1781. }
  1782. }
  1783. else if (suffix[0] == '%') {
  1784. /* Percentages from 16 px */
  1785. sz = (guint)(sz / 100.0 * 16.0);
  1786. ret = TRUE;
  1787. }
  1788. if (ret) {
  1789. *tgt = sz;
  1790. }
  1791. return ret;
  1792. }
  1793. static void
  1794. rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
  1795. gboolean is_css)
  1796. {
  1797. const gchar *p = line, *end = line + len;
  1798. gchar *err = NULL, numbuf[64];
  1799. gdouble sz = 0;
  1800. gboolean failsafe = FALSE;
  1801. while (p < end && g_ascii_isspace (*p)) {
  1802. p ++;
  1803. len --;
  1804. }
  1805. if (g_ascii_isdigit (*p)) {
  1806. rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
  1807. sz = strtod (numbuf, &err);
  1808. /* Now check leftover */
  1809. if (sz < 0) {
  1810. sz = 0;
  1811. }
  1812. }
  1813. else {
  1814. /* Ignore the rest */
  1815. failsafe = TRUE;
  1816. sz = is_css ? 16 : 1;
  1817. /* TODO: add textual fonts descriptions */
  1818. }
  1819. if (err && *err != '\0') {
  1820. const gchar *e = err;
  1821. gsize slen;
  1822. /* Skip spaces */
  1823. while (*e && g_ascii_isspace (*e)) {
  1824. e ++;
  1825. }
  1826. /* Lowercase */
  1827. slen = strlen (e);
  1828. rspamd_str_lc ((gchar *)e, slen);
  1829. if (!rspamd_html_process_css_size (e, slen, &sz)) {
  1830. failsafe = TRUE;
  1831. }
  1832. }
  1833. else {
  1834. /* Failsafe naked number */
  1835. failsafe = TRUE;
  1836. }
  1837. if (failsafe) {
  1838. if (is_css) {
  1839. /*
  1840. * In css mode we usually ignore sizes, but let's treat
  1841. * small sizes specially
  1842. */
  1843. if (sz < 1) {
  1844. sz = 0;
  1845. } else {
  1846. sz = 16; /* Ignore */
  1847. }
  1848. } else {
  1849. /* In non-css mode we have to check legacy size */
  1850. sz = sz >= 1 ? sz * 16 : 16;
  1851. }
  1852. }
  1853. if (sz > 32) {
  1854. sz = 32;
  1855. }
  1856. *fs = sz;
  1857. }
  1858. static void
  1859. rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
  1860. struct html_content *hc, const gchar *style, guint len)
  1861. {
  1862. const gchar *p, *c, *end, *key = NULL;
  1863. enum {
  1864. read_key,
  1865. read_colon,
  1866. read_value,
  1867. skip_spaces,
  1868. } state = skip_spaces, next_state = read_key;
  1869. guint klen = 0;
  1870. gdouble opacity = 1.0;
  1871. p = style;
  1872. c = p;
  1873. end = p + len;
  1874. while (p <= end) {
  1875. switch(state) {
  1876. case read_key:
  1877. if (p == end || *p == ':') {
  1878. key = c;
  1879. klen = p - c;
  1880. state = skip_spaces;
  1881. next_state = read_value;
  1882. }
  1883. else if (g_ascii_isspace (*p)) {
  1884. key = c;
  1885. klen = p - c;
  1886. state = skip_spaces;
  1887. next_state = read_colon;
  1888. }
  1889. p ++;
  1890. break;
  1891. case read_colon:
  1892. if (p == end || *p == ':') {
  1893. state = skip_spaces;
  1894. next_state = read_value;
  1895. }
  1896. p ++;
  1897. break;
  1898. case read_value:
  1899. if (p == end || *p == ';') {
  1900. if (key && klen && p - c > 0) {
  1901. if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
  1902. || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
  1903. rspamd_html_process_color (c, p - c, &bl->font_color);
  1904. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  1905. }
  1906. else if ((klen == 16 && g_ascii_strncasecmp (key,
  1907. "background-color", 16) == 0) ||
  1908. (klen == 10 && g_ascii_strncasecmp (key,
  1909. "background", 10) == 0)) {
  1910. rspamd_html_process_color (c, p - c, &bl->background_color);
  1911. msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
  1912. }
  1913. else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
  1914. if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
  1915. "none", 4) != -1) {
  1916. bl->visible = FALSE;
  1917. msg_debug_html ("tag is not visible");
  1918. }
  1919. }
  1920. else if (klen == 9 &&
  1921. g_ascii_strncasecmp (key, "font-size", 9) == 0) {
  1922. rspamd_html_process_font_size (c, p - c,
  1923. &bl->font_size, TRUE);
  1924. msg_debug_html ("got font size: %ud", bl->font_size);
  1925. }
  1926. else if (klen == 7 &&
  1927. g_ascii_strncasecmp (key, "opacity", 7) == 0) {
  1928. gchar numbuf[64];
  1929. rspamd_strlcpy (numbuf, c,
  1930. MIN (sizeof (numbuf), p - c + 1));
  1931. opacity = strtod (numbuf, NULL);
  1932. if (opacity > 1) {
  1933. opacity = 1;
  1934. }
  1935. else if (opacity < 0) {
  1936. opacity = 0;
  1937. }
  1938. bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
  1939. }
  1940. else if (klen == 10 &&
  1941. g_ascii_strncasecmp (key, "visibility", 10) == 0) {
  1942. if (p - c >= 6 && rspamd_substring_search_caseless (c,
  1943. p - c,
  1944. "hidden", 6) != -1) {
  1945. bl->visible = FALSE;
  1946. msg_debug_html ("tag is not visible");
  1947. }
  1948. }
  1949. }
  1950. key = NULL;
  1951. klen = 0;
  1952. state = skip_spaces;
  1953. next_state = read_key;
  1954. }
  1955. p ++;
  1956. break;
  1957. case skip_spaces:
  1958. if (p < end && !g_ascii_isspace (*p)) {
  1959. c = p;
  1960. state = next_state;
  1961. }
  1962. else {
  1963. p ++;
  1964. }
  1965. break;
  1966. }
  1967. }
  1968. }
  1969. static void
  1970. rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1971. struct html_content *hc)
  1972. {
  1973. struct html_tag_component *comp;
  1974. struct html_block *bl;
  1975. rspamd_ftok_t fstr;
  1976. GList *cur;
  1977. cur = tag->params->head;
  1978. bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
  1979. bl->tag = tag;
  1980. bl->visible = TRUE;
  1981. bl->font_size = (guint)-1;
  1982. bl->font_color.d.comp.alpha = 255;
  1983. while (cur) {
  1984. comp = cur->data;
  1985. if (comp->len > 0) {
  1986. switch (comp->type) {
  1987. case RSPAMD_HTML_COMPONENT_COLOR:
  1988. fstr.begin = (gchar *) comp->start;
  1989. fstr.len = comp->len;
  1990. rspamd_html_process_color (comp->start, comp->len,
  1991. &bl->font_color);
  1992. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  1993. break;
  1994. case RSPAMD_HTML_COMPONENT_BGCOLOR:
  1995. fstr.begin = (gchar *) comp->start;
  1996. fstr.len = comp->len;
  1997. rspamd_html_process_color (comp->start, comp->len,
  1998. &bl->background_color);
  1999. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  2000. if (tag->id == Tag_BODY) {
  2001. /* Set global background color */
  2002. memcpy (&hc->bgcolor, &bl->background_color,
  2003. sizeof (hc->bgcolor));
  2004. }
  2005. break;
  2006. case RSPAMD_HTML_COMPONENT_STYLE:
  2007. bl->style.len = comp->len;
  2008. bl->style.start = comp->start;
  2009. msg_debug_html ("got style: %*s", (gint) bl->style.len,
  2010. bl->style.start);
  2011. rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
  2012. break;
  2013. case RSPAMD_HTML_COMPONENT_CLASS:
  2014. fstr.begin = (gchar *) comp->start;
  2015. fstr.len = comp->len;
  2016. bl->class = rspamd_mempool_ftokdup (pool, &fstr);
  2017. msg_debug_html ("got class: %s", bl->class);
  2018. break;
  2019. case RSPAMD_HTML_COMPONENT_SIZE:
  2020. fstr.begin = (gchar *) comp->start;
  2021. fstr.len = comp->len;
  2022. rspamd_html_process_color (comp->start, comp->len,
  2023. &bl->font_color);
  2024. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  2025. break;
  2026. default:
  2027. /* NYI */
  2028. break;
  2029. }
  2030. }
  2031. cur = g_list_next (cur);
  2032. }
  2033. if (hc->blocks == NULL) {
  2034. hc->blocks = g_ptr_array_sized_new (64);
  2035. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  2036. hc->blocks);
  2037. }
  2038. g_ptr_array_add (hc->blocks, bl);
  2039. tag->extra = bl;
  2040. }
  2041. static void
  2042. rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
  2043. GList **exceptions, GHashTable *urls, GHashTable *emails,
  2044. GByteArray *dest, GHashTable *target_tbl,
  2045. gint href_offset,
  2046. struct rspamd_url *url)
  2047. {
  2048. struct rspamd_url *displayed_url = NULL;
  2049. struct rspamd_url *turl;
  2050. gboolean url_found = FALSE;
  2051. struct rspamd_process_exception *ex;
  2052. if (href_offset <= 0) {
  2053. /* No dispalyed url, just some text within <a> tag */
  2054. return;
  2055. }
  2056. rspamd_html_url_is_phished (pool, url,
  2057. dest->data + href_offset,
  2058. dest->len - href_offset,
  2059. &url_found, &displayed_url);
  2060. if (exceptions && url_found) {
  2061. ex = rspamd_mempool_alloc (pool,
  2062. sizeof (*ex));
  2063. ex->pos = href_offset;
  2064. ex->len = dest->len - href_offset;
  2065. ex->type = RSPAMD_EXCEPTION_URL;
  2066. ex->ptr = url;
  2067. *exceptions = g_list_prepend (*exceptions,
  2068. ex);
  2069. }
  2070. if (displayed_url) {
  2071. if (displayed_url->protocol ==
  2072. PROTOCOL_MAILTO) {
  2073. target_tbl = emails;
  2074. }
  2075. else {
  2076. target_tbl = urls;
  2077. }
  2078. if (target_tbl != NULL) {
  2079. turl = g_hash_table_lookup (target_tbl,
  2080. displayed_url);
  2081. if (turl != NULL) {
  2082. /* Here, we assume the following:
  2083. * if we have a URL in the text part which
  2084. * is the same as displayed URL in the
  2085. * HTML part, we assume that it is also
  2086. * hint only.
  2087. */
  2088. if (turl->flags &
  2089. RSPAMD_URL_FLAG_FROM_TEXT) {
  2090. turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  2091. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  2092. }
  2093. turl->count ++;
  2094. }
  2095. else {
  2096. g_hash_table_insert (target_tbl,
  2097. displayed_url,
  2098. displayed_url);
  2099. }
  2100. }
  2101. }
  2102. }
  2103. static gboolean
  2104. rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
  2105. {
  2106. GNode *child;
  2107. struct html_tag *tag = node->data, *cld_tag;
  2108. if (tag) {
  2109. child = node->children;
  2110. /* Summarize content length from children */
  2111. while (child) {
  2112. cld_tag = child->data;
  2113. tag->content_length += cld_tag->content_length;
  2114. child = child->next;
  2115. }
  2116. }
  2117. return FALSE;
  2118. }
  2119. static void
  2120. rspamd_html_propagate_style (struct html_content *hc,
  2121. struct html_tag *tag,
  2122. struct html_block *bl,
  2123. GQueue *blocks)
  2124. {
  2125. struct html_block *bl_parent;
  2126. gboolean push_block = FALSE;
  2127. /* Propagate from the parent if needed */
  2128. bl_parent = g_queue_peek_tail (blocks);
  2129. if (bl_parent) {
  2130. if (!bl->background_color.valid) {
  2131. /* Try to propagate background color from parent nodes */
  2132. if (bl_parent->background_color.valid) {
  2133. memcpy (&bl->background_color, &bl_parent->background_color,
  2134. sizeof (bl->background_color));
  2135. }
  2136. }
  2137. else {
  2138. push_block = TRUE;
  2139. }
  2140. if (!bl->font_color.valid) {
  2141. /* Try to propagate background color from parent nodes */
  2142. if (bl_parent->font_color.valid) {
  2143. memcpy (&bl->font_color, &bl_parent->font_color,
  2144. sizeof (bl->font_color));
  2145. }
  2146. }
  2147. else {
  2148. push_block = TRUE;
  2149. }
  2150. /* Propagate font size */
  2151. if (bl->font_size == (guint)-1) {
  2152. if (bl_parent->font_size != (guint)-1) {
  2153. bl->font_size = bl_parent->font_size;
  2154. }
  2155. }
  2156. else {
  2157. push_block = TRUE;
  2158. }
  2159. }
  2160. /* Set bgcolor to the html bgcolor and font color to black as a last resort */
  2161. if (!bl->font_color.valid) {
  2162. /* Don't touch opacity as it can be set separately */
  2163. bl->font_color.d.comp.r = 0;
  2164. bl->font_color.d.comp.g = 0;
  2165. bl->font_color.d.comp.b = 0;
  2166. bl->font_color.valid = TRUE;
  2167. }
  2168. else {
  2169. push_block = TRUE;
  2170. }
  2171. if (!bl->background_color.valid) {
  2172. memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
  2173. }
  2174. else {
  2175. push_block = TRUE;
  2176. }
  2177. if (bl->font_size == (guint)-1) {
  2178. bl->font_size = 16; /* Default for browsers */
  2179. }
  2180. else {
  2181. push_block = TRUE;
  2182. }
  2183. if (push_block && !(tag->flags & FL_CLOSED)) {
  2184. g_queue_push_tail (blocks, bl);
  2185. }
  2186. }
  2187. GByteArray*
  2188. rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
  2189. GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
  2190. {
  2191. const guchar *p, *c, *end, *savep = NULL;
  2192. guchar t;
  2193. gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
  2194. balanced;
  2195. GByteArray *dest;
  2196. GHashTable *target_tbl;
  2197. guint obrace = 0, ebrace = 0;
  2198. GNode *cur_level = NULL;
  2199. gint substate = 0, len, href_offset = -1;
  2200. struct html_tag *cur_tag = NULL, *content_tag = NULL;
  2201. struct rspamd_url *url = NULL, *turl;
  2202. GQueue *styles_blocks;
  2203. enum {
  2204. parse_start = 0,
  2205. tag_begin,
  2206. sgml_tag,
  2207. xml_tag,
  2208. compound_tag,
  2209. comment_tag,
  2210. comment_content,
  2211. sgml_content,
  2212. tag_content,
  2213. tag_end,
  2214. xml_tag_end,
  2215. content_ignore,
  2216. content_write,
  2217. content_ignore_sp
  2218. } state = parse_start;
  2219. g_assert (in != NULL);
  2220. g_assert (hc != NULL);
  2221. g_assert (pool != NULL);
  2222. rspamd_html_library_init ();
  2223. hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (G_N_ELEMENTS (tag_defs)));
  2224. /* Set white background color by default */
  2225. hc->bgcolor.d.comp.alpha = 0;
  2226. hc->bgcolor.d.comp.r = 255;
  2227. hc->bgcolor.d.comp.g = 255;
  2228. hc->bgcolor.d.comp.b = 255;
  2229. hc->bgcolor.valid = TRUE;
  2230. dest = g_byte_array_sized_new (in->len / 3 * 2);
  2231. styles_blocks = g_queue_new ();
  2232. p = in->data;
  2233. c = p;
  2234. end = p + in->len;
  2235. while (p < end) {
  2236. t = *p;
  2237. switch (state) {
  2238. case parse_start:
  2239. if (t == '<') {
  2240. state = tag_begin;
  2241. }
  2242. else {
  2243. /* We have no starting tag, so assume that it's content */
  2244. hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
  2245. state = content_write;
  2246. }
  2247. break;
  2248. case tag_begin:
  2249. switch (t) {
  2250. case '<':
  2251. p ++;
  2252. closing = FALSE;
  2253. break;
  2254. case '!':
  2255. state = sgml_tag;
  2256. p ++;
  2257. break;
  2258. case '?':
  2259. state = xml_tag;
  2260. hc->flags |= RSPAMD_HTML_FLAG_XML;
  2261. p ++;
  2262. break;
  2263. case '/':
  2264. closing = TRUE;
  2265. p ++;
  2266. break;
  2267. case '>':
  2268. /* Empty tag */
  2269. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2270. state = tag_end;
  2271. continue;
  2272. default:
  2273. state = tag_content;
  2274. substate = 0;
  2275. savep = NULL;
  2276. cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
  2277. cur_tag->params = g_queue_new ();
  2278. rspamd_mempool_add_destructor (pool,
  2279. (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
  2280. break;
  2281. }
  2282. break;
  2283. case sgml_tag:
  2284. switch (t) {
  2285. case '[':
  2286. state = compound_tag;
  2287. obrace = 1;
  2288. ebrace = 0;
  2289. p ++;
  2290. break;
  2291. case '-':
  2292. state = comment_tag;
  2293. p ++;
  2294. break;
  2295. default:
  2296. state = sgml_content;
  2297. break;
  2298. }
  2299. break;
  2300. case xml_tag:
  2301. if (t == '?') {
  2302. state = xml_tag_end;
  2303. }
  2304. else if (t == '>') {
  2305. /* Misformed xml tag */
  2306. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2307. state = tag_end;
  2308. continue;
  2309. }
  2310. /* We efficiently ignore xml tags */
  2311. p ++;
  2312. break;
  2313. case xml_tag_end:
  2314. if (t == '>') {
  2315. state = tag_end;
  2316. continue;
  2317. }
  2318. else {
  2319. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2320. p ++;
  2321. }
  2322. break;
  2323. case compound_tag:
  2324. if (t == '[') {
  2325. obrace ++;
  2326. }
  2327. else if (t == ']') {
  2328. ebrace ++;
  2329. }
  2330. else if (t == '>' && obrace == ebrace) {
  2331. state = tag_end;
  2332. continue;
  2333. }
  2334. p ++;
  2335. break;
  2336. case comment_tag:
  2337. if (t != '-') {
  2338. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2339. }
  2340. p ++;
  2341. ebrace = 0;
  2342. state = comment_content;
  2343. break;
  2344. case comment_content:
  2345. if (t == '-') {
  2346. ebrace ++;
  2347. }
  2348. else if (t == '>' && ebrace >= 2) {
  2349. state = tag_end;
  2350. continue;
  2351. }
  2352. else {
  2353. ebrace = 0;
  2354. }
  2355. p ++;
  2356. break;
  2357. case content_ignore:
  2358. if (t != '<') {
  2359. p ++;
  2360. }
  2361. else {
  2362. if (content_tag) {
  2363. if (content_tag->content == NULL) {
  2364. content_tag->content = c;
  2365. }
  2366. content_tag->content_length += p - c;
  2367. }
  2368. state = tag_begin;
  2369. }
  2370. break;
  2371. case content_write:
  2372. if (t != '<') {
  2373. if (t == '&') {
  2374. need_decode = TRUE;
  2375. }
  2376. else if (g_ascii_isspace (t)) {
  2377. save_space = TRUE;
  2378. if (p > c) {
  2379. if (need_decode) {
  2380. goffset old_offset = dest->len;
  2381. g_byte_array_append (dest, c, (p - c));
  2382. len = rspamd_html_decode_entitles_inplace (
  2383. dest->data + old_offset,
  2384. p - c);
  2385. dest->len = dest->len + len - (p - c);
  2386. }
  2387. else {
  2388. len = p - c;
  2389. g_byte_array_append (dest, c, len);
  2390. }
  2391. if (content_tag) {
  2392. if (content_tag->content == NULL) {
  2393. content_tag->content = c;
  2394. }
  2395. content_tag->content_length += p - c + 1;
  2396. }
  2397. }
  2398. c = p;
  2399. state = content_ignore_sp;
  2400. }
  2401. else {
  2402. if (save_space) {
  2403. /* Append one space if needed */
  2404. if (dest->len > 0 &&
  2405. !g_ascii_isspace (dest->data[dest->len - 1])) {
  2406. g_byte_array_append (dest, " ", 1);
  2407. }
  2408. save_space = FALSE;
  2409. }
  2410. }
  2411. }
  2412. else {
  2413. if (c != p) {
  2414. if (need_decode) {
  2415. goffset old_offset = dest->len;
  2416. g_byte_array_append (dest, c, (p - c));
  2417. len = rspamd_html_decode_entitles_inplace (
  2418. dest->data + old_offset,
  2419. p - c);
  2420. dest->len = dest->len + len - (p - c);
  2421. }
  2422. else {
  2423. len = p - c;
  2424. g_byte_array_append (dest, c, len);
  2425. }
  2426. if (content_tag) {
  2427. if (content_tag->content == NULL) {
  2428. content_tag->content = c;
  2429. }
  2430. content_tag->content_length += p - c;
  2431. }
  2432. }
  2433. content_tag = NULL;
  2434. state = tag_begin;
  2435. continue;
  2436. }
  2437. p ++;
  2438. break;
  2439. case content_ignore_sp:
  2440. if (!g_ascii_isspace (t)) {
  2441. c = p;
  2442. state = content_write;
  2443. continue;
  2444. }
  2445. if (content_tag) {
  2446. content_tag->content_length ++;
  2447. }
  2448. p ++;
  2449. break;
  2450. case sgml_content:
  2451. /* TODO: parse DOCTYPE here */
  2452. if (t == '>') {
  2453. state = tag_end;
  2454. /* We don't know a lot about sgml tags, ignore them */
  2455. cur_tag = NULL;
  2456. continue;
  2457. }
  2458. p ++;
  2459. break;
  2460. case tag_content:
  2461. rspamd_html_parse_tag_content (pool, hc, cur_tag,
  2462. p, &substate, &savep);
  2463. if (t == '>') {
  2464. if (closing) {
  2465. cur_tag->flags |= FL_CLOSING;
  2466. if (cur_tag->flags & FL_CLOSED) {
  2467. /* Bad mix of closed and closing */
  2468. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2469. }
  2470. closing = FALSE;
  2471. }
  2472. state = tag_end;
  2473. continue;
  2474. }
  2475. p ++;
  2476. break;
  2477. case tag_end:
  2478. substate = 0;
  2479. savep = NULL;
  2480. if (cur_tag != NULL) {
  2481. balanced = TRUE;
  2482. if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
  2483. &balanced)) {
  2484. state = content_write;
  2485. need_decode = FALSE;
  2486. }
  2487. else {
  2488. state = content_ignore;
  2489. }
  2490. if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
  2491. if (cur_tag->flags & CM_UNIQUE) {
  2492. if (isset (hc->tags_seen, cur_tag->id)) {
  2493. /* Duplicate tag has been found */
  2494. hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
  2495. }
  2496. }
  2497. setbit (hc->tags_seen, cur_tag->id);
  2498. }
  2499. if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
  2500. content_tag = cur_tag;
  2501. }
  2502. /* Handle newlines */
  2503. if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
  2504. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2505. g_byte_array_append (dest, "\r\n", 2);
  2506. }
  2507. save_space = FALSE;
  2508. }
  2509. if ((cur_tag->id == Tag_P ||
  2510. cur_tag->id == Tag_TR ||
  2511. cur_tag->id == Tag_DIV)) {
  2512. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2513. g_byte_array_append (dest, "\r\n", 2);
  2514. }
  2515. save_space = FALSE;
  2516. }
  2517. if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
  2518. if (!(cur_tag->flags & (FL_CLOSING))) {
  2519. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2520. if (url != NULL) {
  2521. if (url->protocol == PROTOCOL_MAILTO) {
  2522. target_tbl = emails;
  2523. }
  2524. else {
  2525. target_tbl = urls;
  2526. }
  2527. if (target_tbl != NULL) {
  2528. turl = g_hash_table_lookup (target_tbl, url);
  2529. if (turl == NULL) {
  2530. g_hash_table_insert (target_tbl, url, url);
  2531. }
  2532. else {
  2533. turl->count ++;
  2534. url = NULL;
  2535. }
  2536. if (turl == NULL && url != NULL) {
  2537. rspamd_process_html_url (pool,
  2538. url,
  2539. urls, emails);
  2540. }
  2541. }
  2542. href_offset = dest->len;
  2543. }
  2544. }
  2545. if (cur_tag->id == Tag_A) {
  2546. if (!balanced && cur_level && cur_level->prev) {
  2547. struct html_tag *prev_tag;
  2548. struct rspamd_url *prev_url;
  2549. prev_tag = cur_level->prev->data;
  2550. if (prev_tag->id == Tag_A &&
  2551. !(prev_tag->flags & (FL_CLOSING)) &&
  2552. prev_tag->extra) {
  2553. prev_url = prev_tag->extra;
  2554. rspamd_html_check_displayed_url (pool,
  2555. exceptions, urls, emails,
  2556. dest, target_tbl, href_offset,
  2557. prev_url);
  2558. }
  2559. }
  2560. if (cur_tag->flags & (FL_CLOSING)) {
  2561. /* Insert exception */
  2562. if (url != NULL && (gint) dest->len > href_offset) {
  2563. rspamd_html_check_displayed_url (pool,
  2564. exceptions, urls, emails,
  2565. dest, target_tbl, href_offset,
  2566. url);
  2567. }
  2568. href_offset = -1;
  2569. url = NULL;
  2570. }
  2571. }
  2572. }
  2573. else if (cur_tag->id == Tag_LINK) {
  2574. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2575. }
  2576. else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
  2577. struct html_tag *prev_tag = NULL;
  2578. if (cur_level && cur_level->parent) {
  2579. prev_tag = cur_level->parent->data;
  2580. }
  2581. /*
  2582. * Base is allowed only within head tag but we slightly
  2583. * relax that
  2584. */
  2585. if (!prev_tag || prev_tag->id == Tag_HEAD ||
  2586. prev_tag->id == Tag_HTML) {
  2587. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2588. if (url != NULL && hc->base_url == NULL) {
  2589. /* We have a base tag available */
  2590. hc->base_url = url;
  2591. }
  2592. }
  2593. }
  2594. if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
  2595. rspamd_html_process_img_tag (pool, cur_tag, hc);
  2596. }
  2597. else if (cur_tag->flags & FL_BLOCK) {
  2598. struct html_block *bl;
  2599. if (cur_tag->flags & FL_CLOSING) {
  2600. /* Just remove block element from the queue if any */
  2601. if (styles_blocks->length > 0) {
  2602. g_queue_pop_tail (styles_blocks);
  2603. }
  2604. }
  2605. else {
  2606. rspamd_html_process_block_tag (pool, cur_tag, hc);
  2607. bl = cur_tag->extra;
  2608. if (bl) {
  2609. rspamd_html_propagate_style (hc, cur_tag,
  2610. cur_tag->extra, styles_blocks);
  2611. /* Check visibility */
  2612. if (bl->font_size < 3 ||
  2613. bl->font_color.d.comp.alpha < 10) {
  2614. bl->visible = FALSE;
  2615. msg_debug_html ("tag is not visible");
  2616. }
  2617. if (!bl->visible) {
  2618. state = content_ignore;
  2619. }
  2620. }
  2621. }
  2622. }
  2623. }
  2624. else {
  2625. state = content_write;
  2626. }
  2627. p++;
  2628. c = p;
  2629. cur_tag = NULL;
  2630. break;
  2631. }
  2632. }
  2633. if (hc->html_tags) {
  2634. g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
  2635. rspamd_html_propagate_lengths, NULL);
  2636. }
  2637. g_queue_free (styles_blocks);
  2638. return dest;
  2639. }
  2640. GByteArray*
  2641. rspamd_html_process_part (rspamd_mempool_t *pool,
  2642. struct html_content *hc,
  2643. GByteArray *in)
  2644. {
  2645. return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
  2646. }