You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

html.c 73KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "html.h"
  21. #include "html_tags.h"
  22. #include "html_colors.h"
  23. #include "html_entities.h"
  24. #include "url.h"
  25. #include "contrib/libucl/khash.h"
  26. #include "libmime/images.h"
  27. #include <unicode/uversion.h>
  28. #include <unicode/ucnv.h>
  29. #if U_ICU_VERSION_MAJOR_NUM >= 46
  30. #include <unicode/uidna.h>
  31. #endif
  32. static sig_atomic_t tags_sorted = 0;
  33. static sig_atomic_t entities_sorted = 0;
  34. static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
  35. struct html_tag_def {
  36. const gchar *name;
  37. gint16 id;
  38. guint16 len;
  39. guint flags;
  40. };
  41. #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
  42. rspamd_html_log_id, "html", pool->tag.uid, \
  43. G_STRFUNC, \
  44. __VA_ARGS__)
  45. INIT_LOG_MODULE(html)
  46. #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
  47. static struct html_tag_def tag_defs[] = {
  48. /* W3C defined elements */
  49. TAG_DEF(Tag_A, "a", FL_HREF),
  50. TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
  51. TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
  52. TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
  53. TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  54. TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
  55. TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
  56. TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
  57. TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
  58. TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
  59. TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
  60. TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
  61. TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
  62. TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
  63. TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
  64. TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
  65. TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
  66. TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
  67. TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
  68. TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
  69. TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
  70. TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  71. TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  72. TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
  73. TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
  74. TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
  75. TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
  76. TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  77. TAG_DEF(Tag_EM, "em", (CM_INLINE)),
  78. TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
  79. TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
  80. TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
  81. TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
  82. TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
  83. TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
  84. TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
  85. TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
  86. TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
  87. TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
  88. TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
  89. TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  90. TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
  91. TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  92. TAG_DEF(Tag_I, "i", (CM_INLINE)),
  93. TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
  94. TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
  95. TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
  96. TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  97. TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
  98. TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
  99. TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
  100. TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
  101. TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  102. TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY|FL_HREF)),
  103. TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
  104. TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
  105. TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
  106. TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
  107. TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
  108. TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  109. TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
  110. TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
  111. TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
  112. TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
  113. TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
  114. TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
  115. TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
  116. TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
  117. TAG_DEF(Tag_Q, "q", (CM_INLINE)),
  118. TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
  119. TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
  120. TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
  121. TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
  122. TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
  123. TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
  124. TAG_DEF(Tag_S, "s", (CM_INLINE)),
  125. TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
  126. TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
  127. TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
  128. TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
  129. TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
  130. TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
  131. TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
  132. TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
  133. TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
  134. TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
  135. TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
  136. TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
  137. TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  138. TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
  139. TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  140. TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  141. TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  142. TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
  143. TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
  144. TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
  145. TAG_DEF(Tag_U, "u", (CM_INLINE)),
  146. TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
  147. TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
  148. TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
  149. TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
  150. /* proprietary elements */
  151. TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
  152. TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
  153. TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
  154. TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
  155. TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
  156. TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
  157. TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
  158. TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
  159. TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
  160. TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
  161. TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
  162. TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
  163. TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  164. TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
  165. TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
  166. TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  167. TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
  168. TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
  169. };
  170. KHASH_MAP_INIT_INT (entity_by_number, const char *);
  171. KHASH_MAP_INIT_STR (entity_by_name, const char *);
  172. KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
  173. KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
  174. KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
  175. rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
  176. khash_t(entity_by_number) *html_entity_by_number;
  177. khash_t(entity_by_name) *html_entity_by_name;
  178. khash_t(tag_by_name) *html_tag_by_name;
  179. khash_t(tag_by_id) *html_tag_by_id;
  180. khash_t(color_by_name) *html_color_by_name;
  181. static void
  182. rspamd_html_library_init (void)
  183. {
  184. guint i;
  185. khiter_t k;
  186. gint rc;
  187. if (!tags_sorted) {
  188. html_tag_by_id = kh_init (tag_by_id);
  189. html_tag_by_name = kh_init (tag_by_name);
  190. kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
  191. kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
  192. for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
  193. k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
  194. kh_val (html_tag_by_id, k) = tag_defs[i];
  195. k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
  196. kh_val (html_tag_by_name, k) = tag_defs[i];
  197. }
  198. tags_sorted = 1;
  199. }
  200. if (!entities_sorted) {
  201. html_entity_by_number = kh_init (entity_by_number);
  202. html_entity_by_name = kh_init (entity_by_name);
  203. kh_resize (entity_by_number, html_entity_by_number,
  204. G_N_ELEMENTS (entities_defs));
  205. kh_resize (entity_by_name, html_entity_by_name,
  206. G_N_ELEMENTS (entities_defs));
  207. for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
  208. if (entities_defs[i].code != 0) {
  209. k = kh_put (entity_by_number, html_entity_by_number,
  210. entities_defs[i].code, &rc);
  211. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  212. }
  213. k = kh_put (entity_by_name, html_entity_by_name,
  214. entities_defs[i].name, &rc);
  215. kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
  216. }
  217. html_color_by_name = kh_init (color_by_name);
  218. kh_resize (color_by_name, html_color_by_name,
  219. G_N_ELEMENTS (html_colornames));
  220. rspamd_ftok_t *keys;
  221. keys = g_malloc0 (sizeof (rspamd_ftok_t) *
  222. G_N_ELEMENTS (html_colornames));
  223. for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
  224. struct html_color c;
  225. keys[i].begin = html_colornames[i].name;
  226. keys[i].len = strlen (html_colornames[i].name);
  227. k = kh_put (color_by_name, html_color_by_name,
  228. &keys[i], &rc);
  229. c.valid = true;
  230. c.d.comp.r = html_colornames[i].rgb.r;
  231. c.d.comp.g = html_colornames[i].rgb.g;
  232. c.d.comp.b = html_colornames[i].rgb.b;
  233. c.d.comp.alpha = 255;
  234. kh_val (html_color_by_name, k) = c;
  235. }
  236. entities_sorted = 1;
  237. }
  238. }
  239. static gboolean
  240. rspamd_html_check_balance (GNode * node, GNode ** cur_level)
  241. {
  242. struct html_tag *arg = node->data, *tmp;
  243. GNode *cur;
  244. if (arg->flags & FL_CLOSING) {
  245. /* First of all check whether this tag is closing tag for parent node */
  246. cur = node->parent;
  247. while (cur && cur->data) {
  248. tmp = cur->data;
  249. if (tmp->id == arg->id &&
  250. (tmp->flags & FL_CLOSED) == 0) {
  251. tmp->flags |= FL_CLOSED;
  252. /* Destroy current node as we find corresponding parent node */
  253. g_node_destroy (node);
  254. /* Change level */
  255. *cur_level = cur->parent;
  256. return TRUE;
  257. }
  258. cur = cur->parent;
  259. }
  260. }
  261. else {
  262. return TRUE;
  263. }
  264. return FALSE;
  265. }
  266. gint
  267. rspamd_html_tag_by_name (const gchar *name)
  268. {
  269. khiter_t k;
  270. k = kh_get (tag_by_name, html_tag_by_name, name);
  271. if (k != kh_end (html_tag_by_name)) {
  272. return kh_val (html_tag_by_name, k).id;
  273. }
  274. return -1;
  275. }
  276. gboolean
  277. rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
  278. {
  279. gint id;
  280. g_assert (hc != NULL);
  281. g_assert (hc->tags_seen != NULL);
  282. id = rspamd_html_tag_by_name (tagname);
  283. if (id != -1) {
  284. return isset (hc->tags_seen, id);
  285. }
  286. return FALSE;
  287. }
  288. const gchar *
  289. rspamd_html_tag_by_id (gint id)
  290. {
  291. khiter_t k;
  292. k = kh_get (tag_by_id, html_tag_by_id, id);
  293. if (k != kh_end (html_tag_by_id)) {
  294. return kh_val (html_tag_by_id, k).name;
  295. }
  296. return NULL;
  297. }
  298. /* Decode HTML entitles in text */
  299. guint
  300. rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
  301. {
  302. goffset l, rep_len;
  303. gchar *t = s, *h = s, *e = s, *end_ptr;
  304. const gchar *end;
  305. const gchar *entity;
  306. gint state = 0, base;
  307. UChar32 uc;
  308. khiter_t k;
  309. if (len == 0) {
  310. return 0;
  311. }
  312. else {
  313. l = len;
  314. }
  315. end = s + l;
  316. while (h - s < l) {
  317. switch (state) {
  318. /* Out of entity */
  319. case 0:
  320. if (*h == '&') {
  321. state = 1;
  322. e = h;
  323. h++;
  324. continue;
  325. }
  326. else {
  327. *t = *h;
  328. h++;
  329. t++;
  330. }
  331. break;
  332. case 1:
  333. if (*h == ';' && h > e) {
  334. /* Determine base */
  335. /* First find in entities table */
  336. *h = '\0';
  337. entity = e + 1;
  338. uc = 0;
  339. if (*entity != '#') {
  340. k = kh_get (entity_by_name, html_entity_by_name, entity);
  341. *h = ';';
  342. if (k != kh_end (html_entity_by_name)) {
  343. if (kh_val (html_entity_by_name, k)) {
  344. rep_len = strlen (kh_val (html_entity_by_name, k));
  345. if (end - t >= rep_len) {
  346. memcpy (t, kh_val (html_entity_by_name, k),
  347. rep_len);
  348. t += rep_len;
  349. }
  350. } else {
  351. if (end - t > h - e + 1) {
  352. memmove (t, e, h - e + 1);
  353. t += h - e + 1;
  354. }
  355. }
  356. }
  357. else {
  358. if (end - t > h - e + 1) {
  359. memmove (t, e, h - e + 1);
  360. t += h - e + 1;
  361. }
  362. }
  363. }
  364. else if (e + 2 < h) {
  365. if (*(e + 2) == 'x' || *(e + 2) == 'X') {
  366. base = 16;
  367. }
  368. else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
  369. base = 8;
  370. }
  371. else {
  372. base = 10;
  373. }
  374. if (base == 10) {
  375. uc = strtoul ((e + 2), &end_ptr, base);
  376. }
  377. else {
  378. uc = strtoul ((e + 3), &end_ptr, base);
  379. }
  380. if (end_ptr != NULL && *end_ptr != '\0') {
  381. /* Skip undecoded */
  382. *h = ';';
  383. if (end - t > h - e + 1) {
  384. memmove (t, e, h - e + 1);
  385. t += h - e + 1;
  386. }
  387. }
  388. else {
  389. /* Search for a replacement */
  390. *h = ';';
  391. k = kh_get (entity_by_number, html_entity_by_number, uc);
  392. if (k != kh_end (html_entity_by_number)) {
  393. if (kh_val (html_entity_by_number, k)) {
  394. rep_len = strlen (kh_val (html_entity_by_number, k));
  395. if (end - t >= rep_len) {
  396. memcpy (t, kh_val (html_entity_by_number, k),
  397. rep_len);
  398. t += rep_len;
  399. }
  400. } else {
  401. if (end - t > h - e + 1) {
  402. memmove (t, e, h - e + 1);
  403. t += h - e + 1;
  404. }
  405. }
  406. }
  407. else {
  408. /* Unicode point */
  409. goffset off = t - s;
  410. UBool is_error = 0;
  411. if (uc > 0) {
  412. U8_APPEND (s, off, len, uc, is_error);
  413. if (!is_error) {
  414. t = s + off;
  415. }
  416. else {
  417. /* Leave invalid entities as is */
  418. if (end - t > h - e + 1) {
  419. memmove (t, e, h - e + 1);
  420. t += h - e + 1;
  421. }
  422. }
  423. }
  424. else if (end - t > h - e + 1) {
  425. memmove (t, e, h - e + 1);
  426. t += h - e + 1;
  427. }
  428. }
  429. }
  430. }
  431. state = 0;
  432. }
  433. else if (*h == '&') {
  434. /* Previous `&` was bogus */
  435. state = 1;
  436. if (end - t > h - e) {
  437. memmove (t, e, h - e);
  438. t += h - e;
  439. }
  440. e = h;
  441. }
  442. h++;
  443. break;
  444. }
  445. }
  446. /* Leftover */
  447. if (state == 1 && h > e) {
  448. /* Unfinished entity, copy as is */
  449. if (end - t > h - e) {
  450. memmove (t, e, h - e);
  451. t += h - e;
  452. }
  453. }
  454. return (t - s);
  455. }
  456. static gboolean
  457. rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
  458. {
  459. const gchar *p1, *p2;
  460. p1 = t1->begin + t1->len - 1;
  461. p2 = t2->begin + t2->len - 1;
  462. /* Skip trailing dots */
  463. while (p1 > t1->begin) {
  464. if (*p1 != '.') {
  465. break;
  466. }
  467. p1 --;
  468. }
  469. while (p2 > t2->begin) {
  470. if (*p2 != '.') {
  471. break;
  472. }
  473. p2 --;
  474. }
  475. while (p1 > t1->begin && p2 > t2->begin) {
  476. if (*p1 != *p2) {
  477. break;
  478. }
  479. p1 --;
  480. p2 --;
  481. }
  482. if (p2 == t2->begin) {
  483. /* p2 can be subdomain of p1 if *p1 is '.' */
  484. if (p1 != t1->begin && *(p1 - 1) == '.') {
  485. return TRUE;
  486. }
  487. }
  488. else if (p1 == t1->begin) {
  489. if (p2 != t2->begin && *(p2 - 1) == '.') {
  490. return TRUE;
  491. }
  492. }
  493. return FALSE;
  494. }
  495. static void
  496. rspamd_html_url_is_phished (rspamd_mempool_t *pool,
  497. struct rspamd_url *href_url,
  498. const guchar *url_text,
  499. gsize len,
  500. gboolean *url_found,
  501. struct rspamd_url **ptext_url)
  502. {
  503. struct rspamd_url *text_url;
  504. rspamd_ftok_t disp_tok, href_tok;
  505. gint rc;
  506. goffset url_pos;
  507. gchar *url_str = NULL, *idn_hbuf;
  508. const guchar *end = url_text + len, *p;
  509. #if U_ICU_VERSION_MAJOR_NUM >= 46
  510. static UIDNA *udn;
  511. UErrorCode uc_err = U_ZERO_ERROR;
  512. UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
  513. #endif
  514. *url_found = FALSE;
  515. #if U_ICU_VERSION_MAJOR_NUM >= 46
  516. if (udn == NULL) {
  517. udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
  518. if (uc_err != U_ZERO_ERROR) {
  519. msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
  520. }
  521. }
  522. #endif
  523. while (url_text < end && g_ascii_isspace (*url_text)) {
  524. url_text ++;
  525. }
  526. if (end > url_text + 4 &&
  527. rspamd_url_find (pool, url_text, end - url_text, &url_str,
  528. RSPAMD_URL_FIND_ALL,
  529. &url_pos, NULL) &&
  530. url_str != NULL) {
  531. if (url_pos > 0) {
  532. /*
  533. * We have some url at some offset, so we need to check what is
  534. * at the start of the text
  535. */
  536. p = url_text;
  537. while (p < url_text + url_pos) {
  538. if (!g_ascii_isspace (*p)) {
  539. *url_found = FALSE;
  540. return;
  541. }
  542. p++;
  543. }
  544. }
  545. text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  546. rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
  547. RSPAMD_URL_PARSE_TEXT);
  548. if (rc == URI_ERRNO_OK) {
  549. disp_tok.len = text_url->hostlen;
  550. disp_tok.begin = text_url->host;
  551. #if U_ICU_VERSION_MAJOR_NUM >= 46
  552. if (rspamd_substring_search_caseless (text_url->host,
  553. text_url->hostlen, "xn--", 4) != -1) {
  554. idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
  555. /* We need to convert it to the normal value first */
  556. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  557. text_url->host, text_url->hostlen,
  558. idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
  559. if (uc_err != U_ZERO_ERROR) {
  560. msg_err_pool ("cannot convert to IDN: %s",
  561. u_errorName (uc_err));
  562. disp_tok.len = text_url->hostlen;
  563. }
  564. else {
  565. disp_tok.begin = idn_hbuf;
  566. }
  567. }
  568. #endif
  569. href_tok.len = href_url->hostlen;
  570. href_tok.begin = href_url->host;
  571. #if U_ICU_VERSION_MAJOR_NUM >= 46
  572. if (rspamd_substring_search_caseless (href_url->host,
  573. href_url->hostlen, "xn--", 4) != -1) {
  574. idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
  575. /* We need to convert it to the normal value first */
  576. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  577. href_url->host, href_url->hostlen,
  578. idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
  579. if (uc_err != U_ZERO_ERROR) {
  580. msg_err_pool ("cannot convert to IDN: %s",
  581. u_errorName (uc_err));
  582. href_tok.len = href_url->hostlen;
  583. }
  584. else {
  585. href_tok.begin = idn_hbuf;
  586. }
  587. }
  588. #endif
  589. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
  590. text_url->tldlen > 0 && href_url->tldlen > 0) {
  591. /* Apply the same logic for TLD */
  592. disp_tok.len = text_url->tldlen;
  593. disp_tok.begin = text_url->tld;
  594. #if U_ICU_VERSION_MAJOR_NUM >= 46
  595. if (rspamd_substring_search_caseless (text_url->tld,
  596. text_url->tldlen, "xn--", 4) != -1) {
  597. idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
  598. /* We need to convert it to the normal value first */
  599. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  600. text_url->tld, text_url->tldlen,
  601. idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
  602. if (uc_err != U_ZERO_ERROR) {
  603. msg_err_pool ("cannot convert to IDN: %s",
  604. u_errorName (uc_err));
  605. disp_tok.len = text_url->tldlen;
  606. }
  607. else {
  608. disp_tok.begin = idn_hbuf;
  609. }
  610. }
  611. #endif
  612. href_tok.len = href_url->tldlen;
  613. href_tok.begin = href_url->tld;
  614. #if U_ICU_VERSION_MAJOR_NUM >= 46
  615. if (rspamd_substring_search_caseless (href_url->tld,
  616. href_url->tldlen, "xn--", 4) != -1) {
  617. idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
  618. /* We need to convert it to the normal value first */
  619. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  620. href_url->tld, href_url->tldlen,
  621. idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
  622. if (uc_err != U_ZERO_ERROR) {
  623. msg_err_pool ("cannot convert to IDN: %s",
  624. u_errorName (uc_err));
  625. href_tok.len = href_url->tldlen;
  626. }
  627. else {
  628. href_tok.begin = idn_hbuf;
  629. }
  630. }
  631. #endif
  632. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
  633. /* Check if one url is a subdomain for another */
  634. if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
  635. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  636. href_url->phished_url = text_url;
  637. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  638. }
  639. }
  640. }
  641. *ptext_url = text_url;
  642. *url_found = TRUE;
  643. }
  644. else {
  645. msg_info_pool ("extract of url '%s' failed: %s",
  646. url_str,
  647. rspamd_url_strerror (rc));
  648. }
  649. }
  650. }
  651. static gboolean
  652. rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
  653. struct html_tag *tag, GNode **cur_level, gboolean *balanced)
  654. {
  655. GNode *nnode;
  656. struct html_tag *parent;
  657. if (hc->html_tags == NULL) {
  658. nnode = g_node_new (NULL);
  659. *cur_level = nnode;
  660. hc->html_tags = nnode;
  661. rspamd_mempool_add_destructor (pool,
  662. (rspamd_mempool_destruct_t) g_node_destroy,
  663. nnode);
  664. }
  665. if (hc->total_tags > max_tags) {
  666. hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
  667. }
  668. if (tag->id == -1) {
  669. /* Ignore unknown tags */
  670. hc->total_tags ++;
  671. return FALSE;
  672. }
  673. tag->parent = *cur_level;
  674. if (!(tag->flags & CM_INLINE)) {
  675. /* Block tag */
  676. if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
  677. if (!*cur_level) {
  678. msg_debug_html ("bad parent node");
  679. return FALSE;
  680. }
  681. if (hc->total_tags < max_tags) {
  682. nnode = g_node_new (tag);
  683. g_node_append (*cur_level, nnode);
  684. if (!rspamd_html_check_balance (nnode, cur_level)) {
  685. msg_debug_html (
  686. "mark part as unbalanced as it has not pairable closing tags");
  687. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  688. *balanced = FALSE;
  689. } else {
  690. *balanced = TRUE;
  691. }
  692. hc->total_tags ++;
  693. }
  694. }
  695. else {
  696. parent = (*cur_level)->data;
  697. if (parent) {
  698. if ((parent->flags & FL_IGNORE)) {
  699. tag->flags |= FL_IGNORE;
  700. }
  701. if (!(tag->flags & FL_CLOSED) &&
  702. !(parent->flags & FL_BLOCK)) {
  703. /* We likely have some bad nesting */
  704. if (parent->id == tag->id) {
  705. /* Something like <a>bla<a>foo... */
  706. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  707. *balanced = FALSE;
  708. tag->parent = parent->parent;
  709. if (hc->total_tags < max_tags) {
  710. nnode = g_node_new (tag);
  711. g_node_append (parent->parent, nnode);
  712. *cur_level = nnode;
  713. hc->total_tags ++;
  714. }
  715. return TRUE;
  716. }
  717. }
  718. }
  719. if (hc->total_tags < max_tags) {
  720. nnode = g_node_new (tag);
  721. g_node_append (*cur_level, nnode);
  722. if ((tag->flags & FL_CLOSED) == 0) {
  723. *cur_level = nnode;
  724. }
  725. hc->total_tags ++;
  726. }
  727. if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
  728. tag->flags |= FL_IGNORE;
  729. return FALSE;
  730. }
  731. }
  732. }
  733. else {
  734. /* Inline tag */
  735. parent = (*cur_level)->data;
  736. if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
  737. tag->flags |= FL_IGNORE;
  738. return FALSE;
  739. }
  740. }
  741. return TRUE;
  742. }
  743. #define NEW_COMPONENT(comp_type) do { \
  744. comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
  745. comp->type = (comp_type); \
  746. comp->start = NULL; \
  747. comp->len = 0; \
  748. g_queue_push_tail (tag->params, comp); \
  749. ret = TRUE; \
  750. } while(0)
  751. static gboolean
  752. rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
  753. const guchar *begin, const guchar *end,
  754. struct html_tag *tag)
  755. {
  756. struct html_tag_component *comp;
  757. gint len;
  758. gboolean ret = FALSE;
  759. gchar *p;
  760. if (end <= begin) {
  761. return FALSE;
  762. }
  763. p = rspamd_mempool_alloc (pool, end - begin);
  764. memcpy (p, begin, end - begin);
  765. len = rspamd_html_decode_entitles_inplace (p, end - begin);
  766. if (len == 3) {
  767. if (g_ascii_strncasecmp (p, "src", len) == 0) {
  768. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  769. }
  770. }
  771. else if (len == 4) {
  772. if (g_ascii_strncasecmp (p, "href", len) == 0) {
  773. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  774. }
  775. }
  776. if (tag->id == Tag_IMG) {
  777. /* Check width and height if presented */
  778. if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
  779. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
  780. }
  781. else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
  782. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
  783. }
  784. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  785. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  786. }
  787. }
  788. else if (tag->id == Tag_FONT) {
  789. if (len == 5){
  790. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  791. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  792. }
  793. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  794. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  795. }
  796. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  797. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  798. }
  799. }
  800. else if (len == 7) {
  801. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  802. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  803. }
  804. }
  805. else if (len == 4) {
  806. if (g_ascii_strncasecmp (p, "size", len) == 0) {
  807. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
  808. }
  809. }
  810. }
  811. else if (tag->flags & FL_BLOCK) {
  812. if (len == 5){
  813. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  814. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  815. }
  816. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  817. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  818. }
  819. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  820. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  821. }
  822. }
  823. else if (len == 7) {
  824. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  825. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  826. }
  827. }
  828. }
  829. return ret;
  830. }
  831. static inline void
  832. rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
  833. struct html_content *hc, struct html_tag *tag, const guchar *in,
  834. gint *statep, guchar const **savep)
  835. {
  836. enum {
  837. parse_start = 0,
  838. parse_name,
  839. parse_attr_name,
  840. parse_equal,
  841. parse_start_dquote,
  842. parse_dqvalue,
  843. parse_end_dquote,
  844. parse_start_squote,
  845. parse_sqvalue,
  846. parse_end_squote,
  847. parse_value,
  848. spaces_after_name,
  849. spaces_before_eq,
  850. spaces_after_eq,
  851. spaces_after_param,
  852. ignore_bad_tag
  853. } state;
  854. struct html_tag_def *found;
  855. gboolean store = FALSE;
  856. struct html_tag_component *comp;
  857. state = *statep;
  858. switch (state) {
  859. case parse_start:
  860. if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
  861. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  862. state = ignore_bad_tag;
  863. tag->id = -1;
  864. tag->flags |= FL_BROKEN;
  865. }
  866. else if (g_ascii_isalpha (*in)) {
  867. state = parse_name;
  868. tag->name.start = in;
  869. }
  870. break;
  871. case parse_name:
  872. if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
  873. g_assert (in >= tag->name.start);
  874. if (*in == '/') {
  875. tag->flags |= FL_CLOSED;
  876. }
  877. tag->name.len = in - tag->name.start;
  878. if (tag->name.len == 0) {
  879. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  880. tag->id = -1;
  881. tag->flags |= FL_BROKEN;
  882. state = ignore_bad_tag;
  883. }
  884. else {
  885. gchar *s;
  886. khiter_t k;
  887. /* We CANNOT safely modify tag's name here, as it is already parsed */
  888. s = rspamd_mempool_alloc (pool, tag->name.len + 1);
  889. memcpy (s, tag->name.start, tag->name.len);
  890. tag->name.len = rspamd_html_decode_entitles_inplace (s,
  891. tag->name.len);
  892. tag->name.start = s;
  893. tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
  894. s[tag->name.len] = '\0';
  895. k = kh_get (tag_by_name, html_tag_by_name, s);
  896. if (k == kh_end (html_tag_by_name)) {
  897. hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
  898. tag->id = -1;
  899. }
  900. else {
  901. found = &kh_val (html_tag_by_name, k);
  902. tag->id = found->id;
  903. tag->flags = found->flags;
  904. }
  905. state = spaces_after_name;
  906. }
  907. }
  908. break;
  909. case parse_attr_name:
  910. if (*savep == NULL) {
  911. state = ignore_bad_tag;
  912. }
  913. else {
  914. const guchar *attr_name_end = in;
  915. if (*in == '=') {
  916. state = parse_equal;
  917. }
  918. else if (*in == '"') {
  919. /* No equal or something sane but we have quote character */
  920. state = parse_start_dquote;
  921. attr_name_end = in - 1;
  922. while (attr_name_end > *savep) {
  923. if (!g_ascii_isalnum (*attr_name_end)) {
  924. attr_name_end --;
  925. }
  926. else {
  927. break;
  928. }
  929. }
  930. /* One character forward to obtain length */
  931. attr_name_end ++;
  932. }
  933. else if (g_ascii_isspace (*in)) {
  934. state = spaces_before_eq;
  935. }
  936. else if (*in == '/') {
  937. tag->flags |= FL_CLOSED;
  938. }
  939. else if (!g_ascii_isgraph (*in)) {
  940. state = parse_value;
  941. attr_name_end = in - 1;
  942. while (attr_name_end > *savep) {
  943. if (!g_ascii_isalnum (*attr_name_end)) {
  944. attr_name_end --;
  945. }
  946. else {
  947. break;
  948. }
  949. }
  950. /* One character forward to obtain length */
  951. attr_name_end ++;
  952. }
  953. else {
  954. return;
  955. }
  956. if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
  957. /* Ignore unknown params */
  958. *savep = NULL;
  959. }
  960. else if (state == parse_value) {
  961. *savep = in + 1;
  962. }
  963. }
  964. break;
  965. case spaces_after_name:
  966. if (!g_ascii_isspace (*in)) {
  967. *savep = in;
  968. if (*in == '/') {
  969. tag->flags |= FL_CLOSED;
  970. }
  971. else if (*in != '>') {
  972. state = parse_attr_name;
  973. }
  974. }
  975. break;
  976. case spaces_before_eq:
  977. if (*in == '=') {
  978. state = parse_equal;
  979. }
  980. else if (!g_ascii_isspace (*in)) {
  981. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  982. tag->flags |= FL_BROKEN;
  983. state = ignore_bad_tag;
  984. }
  985. break;
  986. case spaces_after_eq:
  987. if (*in == '"') {
  988. state = parse_start_dquote;
  989. }
  990. else if (*in == '\'') {
  991. state = parse_start_squote;
  992. }
  993. else if (!g_ascii_isspace (*in)) {
  994. if (*savep != NULL) {
  995. /* We need to save this param */
  996. *savep = in;
  997. }
  998. state = parse_value;
  999. }
  1000. break;
  1001. case parse_equal:
  1002. if (g_ascii_isspace (*in)) {
  1003. state = spaces_after_eq;
  1004. }
  1005. else if (*in == '"') {
  1006. state = parse_start_dquote;
  1007. }
  1008. else if (*in == '\'') {
  1009. state = parse_start_squote;
  1010. }
  1011. else {
  1012. if (*savep != NULL) {
  1013. /* We need to save this param */
  1014. *savep = in;
  1015. }
  1016. state = parse_value;
  1017. }
  1018. break;
  1019. case parse_start_dquote:
  1020. if (*in == '"') {
  1021. if (*savep != NULL) {
  1022. /* We have an empty attribute value */
  1023. savep = NULL;
  1024. }
  1025. state = spaces_after_param;
  1026. }
  1027. else {
  1028. if (*savep != NULL) {
  1029. /* We need to save this param */
  1030. *savep = in;
  1031. }
  1032. state = parse_dqvalue;
  1033. }
  1034. break;
  1035. case parse_start_squote:
  1036. if (*in == '\'') {
  1037. if (*savep != NULL) {
  1038. /* We have an empty attribute value */
  1039. savep = NULL;
  1040. }
  1041. state = spaces_after_param;
  1042. }
  1043. else {
  1044. if (*savep != NULL) {
  1045. /* We need to save this param */
  1046. *savep = in;
  1047. }
  1048. state = parse_sqvalue;
  1049. }
  1050. break;
  1051. case parse_dqvalue:
  1052. if (*in == '"') {
  1053. store = TRUE;
  1054. state = parse_end_dquote;
  1055. }
  1056. if (store) {
  1057. if (*savep != NULL) {
  1058. gchar *s;
  1059. g_assert (tag->params != NULL);
  1060. comp = g_queue_peek_tail (tag->params);
  1061. g_assert (comp != NULL);
  1062. comp->len = in - *savep;
  1063. s = rspamd_mempool_alloc (pool, comp->len);
  1064. memcpy (s, *savep, comp->len);
  1065. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1066. comp->start = s;
  1067. *savep = NULL;
  1068. }
  1069. }
  1070. break;
  1071. case parse_sqvalue:
  1072. if (*in == '\'') {
  1073. store = TRUE;
  1074. state = parse_end_squote;
  1075. }
  1076. if (store) {
  1077. if (*savep != NULL) {
  1078. gchar *s;
  1079. g_assert (tag->params != NULL);
  1080. comp = g_queue_peek_tail (tag->params);
  1081. g_assert (comp != NULL);
  1082. comp->len = in - *savep;
  1083. s = rspamd_mempool_alloc (pool, comp->len);
  1084. memcpy (s, *savep, comp->len);
  1085. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1086. comp->start = s;
  1087. *savep = NULL;
  1088. }
  1089. }
  1090. break;
  1091. case parse_value:
  1092. if (*in == '/' && *(in + 1) == '>') {
  1093. tag->flags |= FL_CLOSED;
  1094. store = TRUE;
  1095. }
  1096. else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
  1097. store = TRUE;
  1098. state = spaces_after_param;
  1099. }
  1100. if (store) {
  1101. if (*savep != NULL) {
  1102. gchar *s;
  1103. g_assert (tag->params != NULL);
  1104. comp = g_queue_peek_tail (tag->params);
  1105. g_assert (comp != NULL);
  1106. comp->len = in - *savep;
  1107. s = rspamd_mempool_alloc (pool, comp->len);
  1108. memcpy (s, *savep, comp->len);
  1109. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1110. comp->start = s;
  1111. *savep = NULL;
  1112. }
  1113. }
  1114. break;
  1115. case parse_end_dquote:
  1116. case parse_end_squote:
  1117. if (g_ascii_isspace (*in)) {
  1118. state = spaces_after_param;
  1119. }
  1120. else if (*in == '/' && *(in + 1) == '>') {
  1121. tag->flags |= FL_CLOSED;
  1122. }
  1123. break;
  1124. case spaces_after_param:
  1125. if (!g_ascii_isspace (*in)) {
  1126. if (*in == '/' && *(in + 1) == '>') {
  1127. tag->flags |= FL_CLOSED;
  1128. }
  1129. state = parse_attr_name;
  1130. *savep = in;
  1131. }
  1132. break;
  1133. case ignore_bad_tag:
  1134. break;
  1135. }
  1136. *statep = state;
  1137. }
  1138. struct rspamd_url *
  1139. rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
  1140. struct html_tag_component *comp)
  1141. {
  1142. struct rspamd_url *url;
  1143. guint saved_flags = 0;
  1144. gchar *decoded;
  1145. gint rc;
  1146. gsize decoded_len;
  1147. const gchar *p, *s, *prefix = "http://";
  1148. gchar *d;
  1149. guint i, dlen;
  1150. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  1151. static const gchar hexdigests[16] = "0123456789abcdef";
  1152. p = start;
  1153. /* Strip spaces from the url */
  1154. /* Head spaces */
  1155. while (p < start + len && g_ascii_isspace (*p)) {
  1156. p ++;
  1157. start ++;
  1158. len --;
  1159. }
  1160. if (comp) {
  1161. comp->start = p;
  1162. comp->len = len;
  1163. }
  1164. /* Trailing spaces */
  1165. p = start + len - 1;
  1166. while (p >= start && g_ascii_isspace (*p)) {
  1167. p --;
  1168. len --;
  1169. if (comp) {
  1170. comp->len --;
  1171. }
  1172. }
  1173. s = start;
  1174. dlen = 0;
  1175. for (i = 0; i < len; i ++) {
  1176. if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1177. dlen += 3;
  1178. }
  1179. else {
  1180. dlen ++;
  1181. }
  1182. }
  1183. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1184. if (len >= sizeof ("mailto:") &&
  1185. (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
  1186. memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
  1187. memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
  1188. /* Exclusion, has valid but 'strange' prefix */
  1189. }
  1190. else {
  1191. for (i = 0; i < len; i ++) {
  1192. if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
  1193. if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
  1194. prefix = "http:";
  1195. dlen += sizeof ("http:") - 1;
  1196. no_prefix = TRUE;
  1197. }
  1198. else if (s[i] == '@') {
  1199. /* Likely email prefix */
  1200. prefix = "mailto://";
  1201. dlen += sizeof ("mailto://") - 1;
  1202. no_prefix = TRUE;
  1203. }
  1204. else if (s[i] == ':' && i != 0) {
  1205. /* Special case */
  1206. no_prefix = FALSE;
  1207. }
  1208. else {
  1209. if (i == 0) {
  1210. /* No valid data */
  1211. return NULL;
  1212. }
  1213. else {
  1214. no_prefix = TRUE;
  1215. dlen += strlen (prefix);
  1216. }
  1217. }
  1218. break;
  1219. }
  1220. }
  1221. }
  1222. }
  1223. decoded = rspamd_mempool_alloc (pool, dlen + 1);
  1224. d = decoded;
  1225. if (no_prefix) {
  1226. gsize plen = strlen (prefix);
  1227. memcpy (d, prefix, plen);
  1228. d += plen;
  1229. }
  1230. /*
  1231. * We also need to remove all internal newlines, spaces
  1232. * and encode unsafe characters
  1233. */
  1234. for (i = 0; i < len; i ++) {
  1235. if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
  1236. continue;
  1237. }
  1238. else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1239. /* URL encode */
  1240. *d++ = '%';
  1241. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  1242. *d++ = hexdigests[s[i] & 0xf];
  1243. has_bad_chars = TRUE;
  1244. }
  1245. else {
  1246. *d++ = s[i];
  1247. }
  1248. }
  1249. *d = '\0';
  1250. dlen = d - decoded;
  1251. url = rspamd_mempool_alloc0 (pool, sizeof (*url));
  1252. enum rspamd_normalise_result norm_res;
  1253. norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
  1254. if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
  1255. saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
  1256. }
  1257. if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
  1258. saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
  1259. if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
  1260. saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
  1261. }
  1262. }
  1263. rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  1264. /* Filter some completely damaged urls */
  1265. if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
  1266. !((url->flags & RSPAMD_URL_FLAG_OBSCURED) && (url->protocol & PROTOCOL_UNKNOWN))) {
  1267. url->flags |= saved_flags;
  1268. if (has_bad_chars) {
  1269. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1270. }
  1271. if (no_prefix) {
  1272. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1273. }
  1274. decoded = url->string;
  1275. decoded_len = url->urllen;
  1276. if (comp) {
  1277. comp->start = decoded;
  1278. comp->len = decoded_len;
  1279. }
  1280. /* Spaces in href usually mean an attempt to obfuscate URL */
  1281. /* See https://github.com/vstakhov/rspamd/issues/593 */
  1282. #if 0
  1283. if (has_spaces) {
  1284. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1285. }
  1286. #endif
  1287. return url;
  1288. }
  1289. return NULL;
  1290. }
  1291. static struct rspamd_url *
  1292. rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1293. struct html_content *hc)
  1294. {
  1295. struct html_tag_component *comp;
  1296. GList *cur;
  1297. struct rspamd_url *url;
  1298. const gchar *start;
  1299. gsize len;
  1300. cur = tag->params->head;
  1301. while (cur) {
  1302. comp = cur->data;
  1303. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1304. start = comp->start;
  1305. len = comp->len;
  1306. /* Check base url */
  1307. if (hc && hc->base_url && comp->len > 2) {
  1308. /*
  1309. * Relative url canot start from the following:
  1310. * schema://
  1311. * slash
  1312. */
  1313. gchar *buf;
  1314. gsize orig_len;
  1315. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1316. /* Assume relative url */
  1317. gboolean need_slash = FALSE;
  1318. orig_len = len;
  1319. len += hc->base_url->urllen;
  1320. if (hc->base_url->string[hc->base_url->urllen - 1] != '/') {
  1321. need_slash = TRUE;
  1322. len ++;
  1323. }
  1324. buf = rspamd_mempool_alloc (pool, len + 1);
  1325. rspamd_snprintf (buf, len + 1, "%*s%s%*s",
  1326. hc->base_url->urllen, hc->base_url->string,
  1327. need_slash ? "/" : "",
  1328. (gint)orig_len, start);
  1329. start = buf;
  1330. }
  1331. else if (start[0] == '/' && start[1] != '/') {
  1332. /* Relative to the hostname */
  1333. orig_len = len;
  1334. len += hc->base_url->hostlen + hc->base_url->protocollen +
  1335. 3 /* for :// */;
  1336. buf = rspamd_mempool_alloc (pool, len + 1);
  1337. rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
  1338. hc->base_url->protocollen, hc->base_url->string,
  1339. hc->base_url->hostlen, hc->base_url->host,
  1340. (gint)orig_len, start);
  1341. start = buf;
  1342. }
  1343. }
  1344. url = rspamd_html_process_url (pool, start, len, comp);
  1345. if (url && tag->extra == NULL) {
  1346. tag->extra = url;
  1347. }
  1348. return url;
  1349. }
  1350. cur = g_list_next (cur);
  1351. }
  1352. return NULL;
  1353. }
  1354. static void
  1355. rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
  1356. GHashTable *tbl_urls, GHashTable *tbl_emails)
  1357. {
  1358. GHashTable *target_tbl;
  1359. struct rspamd_url *query_url, *existing;
  1360. gchar *url_str;
  1361. gint rc;
  1362. gboolean prefix_added;
  1363. if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
  1364. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1365. }
  1366. if (url->querylen > 0) {
  1367. if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
  1368. RSPAMD_URL_FIND_ALL,
  1369. NULL, &prefix_added)) {
  1370. query_url = rspamd_mempool_alloc0 (pool,
  1371. sizeof (struct rspamd_url));
  1372. rc = rspamd_url_parse (query_url,
  1373. url_str,
  1374. strlen (url_str),
  1375. pool,
  1376. RSPAMD_URL_PARSE_TEXT);
  1377. if (rc == URI_ERRNO_OK &&
  1378. query_url->hostlen > 0) {
  1379. msg_debug_html ("found url %s in query of url"
  1380. " %*s", url_str, url->querylen, url->query);
  1381. if (query_url->protocol == PROTOCOL_MAILTO) {
  1382. target_tbl = tbl_emails;
  1383. }
  1384. else {
  1385. target_tbl = tbl_urls;
  1386. }
  1387. if (prefix_added) {
  1388. query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1389. }
  1390. if (query_url->flags
  1391. & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED|
  1392. RSPAMD_URL_FLAG_NUMERIC)) {
  1393. /* Set obscured flag if query url is bad */
  1394. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1395. }
  1396. /* And vice-versa */
  1397. if (url->flags & RSPAMD_URL_FLAG_OBSCURED) {
  1398. query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1399. }
  1400. if ((existing = g_hash_table_lookup (target_tbl,
  1401. query_url)) == NULL) {
  1402. g_hash_table_insert (target_tbl,
  1403. query_url,
  1404. query_url);
  1405. }
  1406. else {
  1407. existing->count ++;
  1408. }
  1409. }
  1410. }
  1411. }
  1412. }
  1413. static void
  1414. rspamd_html_process_data_image (rspamd_mempool_t *pool,
  1415. struct html_image *img,
  1416. struct html_tag_component *src)
  1417. {
  1418. /*
  1419. * Here, we do very basic processing of the data:
  1420. * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
  1421. * We only parse base64 encoded data.
  1422. * We ignore content type so far
  1423. */
  1424. struct rspamd_image *parsed_image;
  1425. const gchar *semicolon_pos = NULL, *end = src->start + src->len;
  1426. semicolon_pos = src->start;
  1427. while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
  1428. if (end - semicolon_pos > sizeof ("base64,")) {
  1429. if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
  1430. const gchar *data_pos = semicolon_pos + sizeof ("base64,");
  1431. gchar *decoded;
  1432. gsize encoded_len = end - data_pos, decoded_len;
  1433. rspamd_ftok_t inp;
  1434. decoded_len = (encoded_len / 4 * 3) + 12;
  1435. decoded = rspamd_mempool_alloc (pool, decoded_len);
  1436. rspamd_cryptobox_base64_decode (data_pos, encoded_len,
  1437. decoded, &decoded_len);
  1438. inp.begin = decoded;
  1439. inp.len = decoded_len;
  1440. parsed_image = rspamd_maybe_process_image (pool, &inp);
  1441. if (parsed_image) {
  1442. msg_debug_html ("detected %s image of size %ud x %ud in data url",
  1443. rspamd_image_type_str (parsed_image->type),
  1444. parsed_image->width, parsed_image->height);
  1445. img->embedded_image = parsed_image;
  1446. }
  1447. }
  1448. break;
  1449. }
  1450. else {
  1451. /* Nothing useful */
  1452. return;
  1453. }
  1454. semicolon_pos ++;
  1455. }
  1456. }
  1457. static void
  1458. rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1459. struct html_content *hc, GHashTable *urls)
  1460. {
  1461. struct html_tag_component *comp;
  1462. struct html_image *img;
  1463. rspamd_ftok_t fstr;
  1464. const guchar *p;
  1465. GList *cur;
  1466. gulong val;
  1467. gboolean seen_width = FALSE, seen_height = FALSE;
  1468. goffset pos;
  1469. cur = tag->params->head;
  1470. img = rspamd_mempool_alloc0 (pool, sizeof (*img));
  1471. img->tag = tag;
  1472. while (cur) {
  1473. comp = cur->data;
  1474. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1475. fstr.begin = (gchar *)comp->start;
  1476. fstr.len = comp->len;
  1477. img->src = rspamd_mempool_ftokdup (pool, &fstr);
  1478. if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
  1479. "cid:", sizeof ("cid:") - 1) == 0) {
  1480. /* We have an embedded image */
  1481. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
  1482. }
  1483. else {
  1484. if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
  1485. "data:", sizeof ("data:") - 1) == 0) {
  1486. /* We have an embedded image in HTML tag */
  1487. img->flags |=
  1488. (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
  1489. rspamd_html_process_data_image (pool, img, comp);
  1490. hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
  1491. }
  1492. else {
  1493. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
  1494. if (img->src) {
  1495. img->url = rspamd_html_process_url (pool,
  1496. img->src, fstr.len, NULL);
  1497. if (img->url) {
  1498. struct rspamd_url *turl = g_hash_table_lookup (urls,
  1499. img->url);
  1500. img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
  1501. if (turl == NULL) {
  1502. g_hash_table_insert (urls, img->url, img->url);
  1503. }
  1504. else {
  1505. turl->count++;
  1506. }
  1507. }
  1508. }
  1509. }
  1510. }
  1511. }
  1512. else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
  1513. rspamd_strtoul (comp->start, comp->len, &val);
  1514. img->height = val;
  1515. seen_height = TRUE;
  1516. }
  1517. else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
  1518. rspamd_strtoul (comp->start, comp->len, &val);
  1519. img->width = val;
  1520. seen_width = TRUE;
  1521. }
  1522. else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
  1523. /* Try to search for height= or width= in style tag */
  1524. if (!seen_height && comp->len > 0) {
  1525. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1526. "height", sizeof ("height") - 1);
  1527. if (pos != -1) {
  1528. p = comp->start + pos + sizeof ("height") - 1;
  1529. while (p < comp->start + comp->len) {
  1530. if (g_ascii_isdigit (*p)) {
  1531. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1532. img->height = val;
  1533. break;
  1534. }
  1535. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1536. /* Fallback */
  1537. break;
  1538. }
  1539. p ++;
  1540. }
  1541. }
  1542. }
  1543. if (!seen_width && comp->len > 0) {
  1544. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1545. "width", sizeof ("width") - 1);
  1546. if (pos != -1) {
  1547. p = comp->start + pos + sizeof ("width") - 1;
  1548. while (p < comp->start + comp->len) {
  1549. if (g_ascii_isdigit (*p)) {
  1550. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1551. img->width = val;
  1552. break;
  1553. }
  1554. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1555. /* Fallback */
  1556. break;
  1557. }
  1558. p ++;
  1559. }
  1560. }
  1561. }
  1562. }
  1563. cur = g_list_next (cur);
  1564. }
  1565. if (hc->images == NULL) {
  1566. hc->images = g_ptr_array_sized_new (4);
  1567. rspamd_mempool_notify_alloc (pool, 4 * sizeof (gpointer) + sizeof (GPtrArray));
  1568. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  1569. hc->images);
  1570. }
  1571. if (img->embedded_image) {
  1572. if (!seen_height) {
  1573. img->height = img->embedded_image->height;
  1574. }
  1575. if (!seen_width) {
  1576. img->width = img->embedded_image->width;
  1577. }
  1578. }
  1579. g_ptr_array_add (hc->images, img);
  1580. tag->extra = img;
  1581. }
  1582. static void
  1583. rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
  1584. {
  1585. const gchar *p = line, *end = line + len;
  1586. char hexbuf[7];
  1587. rspamd_ftok_t search;
  1588. struct html_color *el;
  1589. memset (cl, 0, sizeof (*cl));
  1590. if (*p == '#') {
  1591. /* HEX color */
  1592. p ++;
  1593. rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
  1594. cl->d.val = strtoul (hexbuf, NULL, 16);
  1595. cl->d.comp.alpha = 255;
  1596. cl->valid = TRUE;
  1597. }
  1598. else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
  1599. /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
  1600. enum {
  1601. obrace,
  1602. num1,
  1603. num2,
  1604. num3,
  1605. num4,
  1606. skip_spaces
  1607. } state = skip_spaces, next_state = obrace;
  1608. gulong r = 0, g = 0, b = 0, opacity = 255;
  1609. const gchar *c;
  1610. gboolean valid = FALSE;
  1611. p += 3;
  1612. if (*p == 'a') {
  1613. p ++;
  1614. }
  1615. c = p;
  1616. while (p < end) {
  1617. switch (state) {
  1618. case obrace:
  1619. if (*p == '(') {
  1620. p ++;
  1621. state = skip_spaces;
  1622. next_state = num1;
  1623. }
  1624. else if (g_ascii_isspace (*p)) {
  1625. state = skip_spaces;
  1626. next_state = obrace;
  1627. }
  1628. else {
  1629. goto stop;
  1630. }
  1631. break;
  1632. case num1:
  1633. if (*p == ',') {
  1634. if (!rspamd_strtoul (c, p - c, &r)) {
  1635. goto stop;
  1636. }
  1637. p ++;
  1638. state = skip_spaces;
  1639. next_state = num2;
  1640. }
  1641. else if (!g_ascii_isdigit (*p)) {
  1642. goto stop;
  1643. }
  1644. else {
  1645. p ++;
  1646. }
  1647. break;
  1648. case num2:
  1649. if (*p == ',') {
  1650. if (!rspamd_strtoul (c, p - c, &g)) {
  1651. goto stop;
  1652. }
  1653. p ++;
  1654. state = skip_spaces;
  1655. next_state = num3;
  1656. }
  1657. else if (!g_ascii_isdigit (*p)) {
  1658. goto stop;
  1659. }
  1660. else {
  1661. p ++;
  1662. }
  1663. break;
  1664. case num3:
  1665. if (*p == ',') {
  1666. if (!rspamd_strtoul (c, p - c, &b)) {
  1667. goto stop;
  1668. }
  1669. valid = TRUE;
  1670. p ++;
  1671. state = skip_spaces;
  1672. next_state = num4;
  1673. }
  1674. else if (*p == ')') {
  1675. if (!rspamd_strtoul (c, p - c, &b)) {
  1676. goto stop;
  1677. }
  1678. valid = TRUE;
  1679. goto stop;
  1680. }
  1681. else if (!g_ascii_isdigit (*p)) {
  1682. goto stop;
  1683. }
  1684. else {
  1685. p ++;
  1686. }
  1687. break;
  1688. case num4:
  1689. if (*p == ',') {
  1690. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1691. goto stop;
  1692. }
  1693. valid = TRUE;
  1694. goto stop;
  1695. }
  1696. else if (*p == ')') {
  1697. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1698. goto stop;
  1699. }
  1700. valid = TRUE;
  1701. goto stop;
  1702. }
  1703. else if (!g_ascii_isdigit (*p)) {
  1704. goto stop;
  1705. }
  1706. else {
  1707. p ++;
  1708. }
  1709. break;
  1710. case skip_spaces:
  1711. if (!g_ascii_isspace (*p)) {
  1712. c = p;
  1713. state = next_state;
  1714. }
  1715. else {
  1716. p ++;
  1717. }
  1718. break;
  1719. }
  1720. }
  1721. stop:
  1722. if (valid) {
  1723. cl->d.comp.r = r;
  1724. cl->d.comp.g = g;
  1725. cl->d.comp.b = b;
  1726. cl->d.comp.alpha = opacity;
  1727. cl->valid = TRUE;
  1728. }
  1729. }
  1730. else {
  1731. khiter_t k;
  1732. /* Compare color by name */
  1733. search.begin = line;
  1734. search.len = len;
  1735. k = kh_get (color_by_name, html_color_by_name, &search);
  1736. if (k != kh_end (html_color_by_name)) {
  1737. el = &kh_val (html_color_by_name, k);
  1738. memcpy (cl, el, sizeof (*cl));
  1739. cl->d.comp.alpha = 255; /* Non transparent */
  1740. }
  1741. }
  1742. }
  1743. /*
  1744. * Target is used for in and out if this function returns TRUE
  1745. */
  1746. static gboolean
  1747. rspamd_html_process_css_size (const gchar *suffix, gsize len,
  1748. gdouble *tgt)
  1749. {
  1750. gdouble sz = *tgt;
  1751. gboolean ret = FALSE;
  1752. if (len >= 2) {
  1753. if (memcmp (suffix, "px", 2) == 0) {
  1754. sz = (guint) sz; /* Round to number */
  1755. ret = TRUE;
  1756. }
  1757. else if (memcmp (suffix, "em", 2) == 0) {
  1758. /* EM is 16 px, so multiply and round */
  1759. sz = (guint) (sz * 16.0);
  1760. ret = TRUE;
  1761. }
  1762. else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
  1763. /* equal to EM in our case */
  1764. sz = (guint) (sz * 16.0);
  1765. ret = TRUE;
  1766. }
  1767. else if (memcmp (suffix, "ex", 2) == 0) {
  1768. /*
  1769. * Represents the x-height of the element's font.
  1770. * On fonts with the "x" letter, this is generally the height
  1771. * of lowercase letters in the font; 1ex = 0.5em in many fonts.
  1772. */
  1773. sz = (guint) (sz * 8.0);
  1774. ret = TRUE;
  1775. }
  1776. else if (memcmp (suffix, "vw", 2) == 0) {
  1777. /*
  1778. * Vewport width in percentages:
  1779. * we assume 1% of viewport width as 8px
  1780. */
  1781. sz = (guint) (sz * 8.0);
  1782. ret = TRUE;
  1783. }
  1784. else if (memcmp (suffix, "vh", 2) == 0) {
  1785. /*
  1786. * Vewport height in percentages
  1787. * we assume 1% of viewport width as 6px
  1788. */
  1789. sz = (guint) (sz * 6.0);
  1790. ret = TRUE;
  1791. }
  1792. else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
  1793. /*
  1794. * Vewport width in percentages
  1795. * we assume 1% of viewport width as 6px
  1796. */
  1797. sz = (guint) (sz * 8.0);
  1798. ret = TRUE;
  1799. }
  1800. else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
  1801. /*
  1802. * Vewport height in percentages
  1803. * we assume 1% of viewport width as 6px
  1804. */
  1805. sz = (guint) (sz * 6.0);
  1806. ret = TRUE;
  1807. }
  1808. else if (memcmp (suffix, "pt", 2) == 0) {
  1809. sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
  1810. ret = TRUE;
  1811. }
  1812. else if (memcmp (suffix, "cm", 2) == 0) {
  1813. sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
  1814. ret = TRUE;
  1815. }
  1816. else if (memcmp (suffix, "mm", 2) == 0) {
  1817. sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
  1818. ret = TRUE;
  1819. }
  1820. else if (memcmp (suffix, "in", 2) == 0) {
  1821. sz = (guint) (sz * 96.0); /* 96px */
  1822. ret = TRUE;
  1823. }
  1824. else if (memcmp (suffix, "pc", 2) == 0) {
  1825. sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
  1826. ret = TRUE;
  1827. }
  1828. }
  1829. else if (suffix[0] == '%') {
  1830. /* Percentages from 16 px */
  1831. sz = (guint)(sz / 100.0 * 16.0);
  1832. ret = TRUE;
  1833. }
  1834. if (ret) {
  1835. *tgt = sz;
  1836. }
  1837. return ret;
  1838. }
  1839. static void
  1840. rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
  1841. gboolean is_css)
  1842. {
  1843. const gchar *p = line, *end = line + len;
  1844. gchar *err = NULL, numbuf[64];
  1845. gdouble sz = 0;
  1846. gboolean failsafe = FALSE;
  1847. while (p < end && g_ascii_isspace (*p)) {
  1848. p ++;
  1849. len --;
  1850. }
  1851. if (g_ascii_isdigit (*p)) {
  1852. rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
  1853. sz = strtod (numbuf, &err);
  1854. /* Now check leftover */
  1855. if (sz < 0) {
  1856. sz = 0;
  1857. }
  1858. }
  1859. else {
  1860. /* Ignore the rest */
  1861. failsafe = TRUE;
  1862. sz = is_css ? 16 : 1;
  1863. /* TODO: add textual fonts descriptions */
  1864. }
  1865. if (err && *err != '\0') {
  1866. const gchar *e = err;
  1867. gsize slen;
  1868. /* Skip spaces */
  1869. while (*e && g_ascii_isspace (*e)) {
  1870. e ++;
  1871. }
  1872. /* Lowercase */
  1873. slen = strlen (e);
  1874. rspamd_str_lc ((gchar *)e, slen);
  1875. if (!rspamd_html_process_css_size (e, slen, &sz)) {
  1876. failsafe = TRUE;
  1877. }
  1878. }
  1879. else {
  1880. /* Failsafe naked number */
  1881. failsafe = TRUE;
  1882. }
  1883. if (failsafe) {
  1884. if (is_css) {
  1885. /*
  1886. * In css mode we usually ignore sizes, but let's treat
  1887. * small sizes specially
  1888. */
  1889. if (sz < 1) {
  1890. sz = 0;
  1891. } else {
  1892. sz = 16; /* Ignore */
  1893. }
  1894. } else {
  1895. /* In non-css mode we have to check legacy size */
  1896. sz = sz >= 1 ? sz * 16 : 16;
  1897. }
  1898. }
  1899. if (sz > 32) {
  1900. sz = 32;
  1901. }
  1902. *fs = sz;
  1903. }
  1904. static void
  1905. rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
  1906. struct html_content *hc, const gchar *style, guint len)
  1907. {
  1908. const gchar *p, *c, *end, *key = NULL;
  1909. enum {
  1910. read_key,
  1911. read_colon,
  1912. read_value,
  1913. skip_spaces,
  1914. } state = skip_spaces, next_state = read_key;
  1915. guint klen = 0;
  1916. gdouble opacity = 1.0;
  1917. p = style;
  1918. c = p;
  1919. end = p + len;
  1920. while (p <= end) {
  1921. switch(state) {
  1922. case read_key:
  1923. if (p == end || *p == ':') {
  1924. key = c;
  1925. klen = p - c;
  1926. state = skip_spaces;
  1927. next_state = read_value;
  1928. }
  1929. else if (g_ascii_isspace (*p)) {
  1930. key = c;
  1931. klen = p - c;
  1932. state = skip_spaces;
  1933. next_state = read_colon;
  1934. }
  1935. p ++;
  1936. break;
  1937. case read_colon:
  1938. if (p == end || *p == ':') {
  1939. state = skip_spaces;
  1940. next_state = read_value;
  1941. }
  1942. p ++;
  1943. break;
  1944. case read_value:
  1945. if (p == end || *p == ';') {
  1946. if (key && klen && p - c > 0) {
  1947. if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
  1948. || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
  1949. rspamd_html_process_color (c, p - c, &bl->font_color);
  1950. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  1951. }
  1952. else if ((klen == 16 && g_ascii_strncasecmp (key,
  1953. "background-color", 16) == 0) ||
  1954. (klen == 10 && g_ascii_strncasecmp (key,
  1955. "background", 10) == 0)) {
  1956. rspamd_html_process_color (c, p - c, &bl->background_color);
  1957. msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
  1958. }
  1959. else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
  1960. if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
  1961. "none", 4) != -1) {
  1962. bl->visible = FALSE;
  1963. msg_debug_html ("tag is not visible");
  1964. }
  1965. }
  1966. else if (klen == 9 &&
  1967. g_ascii_strncasecmp (key, "font-size", 9) == 0) {
  1968. rspamd_html_process_font_size (c, p - c,
  1969. &bl->font_size, TRUE);
  1970. msg_debug_html ("got font size: %ud", bl->font_size);
  1971. }
  1972. else if (klen == 7 &&
  1973. g_ascii_strncasecmp (key, "opacity", 7) == 0) {
  1974. gchar numbuf[64];
  1975. rspamd_strlcpy (numbuf, c,
  1976. MIN (sizeof (numbuf), p - c + 1));
  1977. opacity = strtod (numbuf, NULL);
  1978. if (opacity > 1) {
  1979. opacity = 1;
  1980. }
  1981. else if (opacity < 0) {
  1982. opacity = 0;
  1983. }
  1984. bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
  1985. }
  1986. else if (klen == 10 &&
  1987. g_ascii_strncasecmp (key, "visibility", 10) == 0) {
  1988. if (p - c >= 6 && rspamd_substring_search_caseless (c,
  1989. p - c,
  1990. "hidden", 6) != -1) {
  1991. bl->visible = FALSE;
  1992. msg_debug_html ("tag is not visible");
  1993. }
  1994. }
  1995. }
  1996. key = NULL;
  1997. klen = 0;
  1998. state = skip_spaces;
  1999. next_state = read_key;
  2000. }
  2001. p ++;
  2002. break;
  2003. case skip_spaces:
  2004. if (p < end && !g_ascii_isspace (*p)) {
  2005. c = p;
  2006. state = next_state;
  2007. }
  2008. else {
  2009. p ++;
  2010. }
  2011. break;
  2012. }
  2013. }
  2014. }
  2015. static void
  2016. rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  2017. struct html_content *hc)
  2018. {
  2019. struct html_tag_component *comp;
  2020. struct html_block *bl;
  2021. rspamd_ftok_t fstr;
  2022. GList *cur;
  2023. cur = tag->params->head;
  2024. bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
  2025. bl->tag = tag;
  2026. bl->visible = TRUE;
  2027. bl->font_size = (guint)-1;
  2028. bl->font_color.d.comp.alpha = 255;
  2029. while (cur) {
  2030. comp = cur->data;
  2031. if (comp->len > 0) {
  2032. switch (comp->type) {
  2033. case RSPAMD_HTML_COMPONENT_COLOR:
  2034. fstr.begin = (gchar *) comp->start;
  2035. fstr.len = comp->len;
  2036. rspamd_html_process_color (comp->start, comp->len,
  2037. &bl->font_color);
  2038. msg_debug_html ("tag %*s; got color: %xd",
  2039. tag->name.len, tag->name.start, bl->font_color.d.val);
  2040. break;
  2041. case RSPAMD_HTML_COMPONENT_BGCOLOR:
  2042. fstr.begin = (gchar *) comp->start;
  2043. fstr.len = comp->len;
  2044. rspamd_html_process_color (comp->start, comp->len,
  2045. &bl->background_color);
  2046. msg_debug_html ("tag %*s; got color: %xd",
  2047. tag->name.len, tag->name.start, bl->font_color.d.val);
  2048. if (tag->id == Tag_BODY) {
  2049. /* Set global background color */
  2050. memcpy (&hc->bgcolor, &bl->background_color,
  2051. sizeof (hc->bgcolor));
  2052. }
  2053. break;
  2054. case RSPAMD_HTML_COMPONENT_STYLE:
  2055. bl->style.len = comp->len;
  2056. bl->style.start = comp->start;
  2057. msg_debug_html ("tag: %*s; got style: %*s",
  2058. tag->name.len, tag->name.start,
  2059. (gint) bl->style.len, bl->style.start);
  2060. rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
  2061. break;
  2062. case RSPAMD_HTML_COMPONENT_CLASS:
  2063. fstr.begin = (gchar *) comp->start;
  2064. fstr.len = comp->len;
  2065. bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
  2066. msg_debug_html ("tag: %*s; got class: %s",
  2067. tag->name.len, tag->name.start, bl->html_class);
  2068. break;
  2069. case RSPAMD_HTML_COMPONENT_SIZE:
  2070. /* Not supported by html5 */
  2071. /* FIXME maybe support it */
  2072. bl->font_size = 16;
  2073. msg_debug_html ("tag %*s; got size: %*s",
  2074. tag->name.len, tag->name.start,
  2075. (gint)comp->len, comp->start);
  2076. break;
  2077. default:
  2078. /* NYI */
  2079. break;
  2080. }
  2081. }
  2082. cur = g_list_next (cur);
  2083. }
  2084. if (hc->blocks == NULL) {
  2085. hc->blocks = g_ptr_array_sized_new (64);
  2086. rspamd_mempool_notify_alloc (pool, 64 * sizeof (gpointer) + sizeof (GPtrArray));
  2087. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  2088. hc->blocks);
  2089. }
  2090. g_ptr_array_add (hc->blocks, bl);
  2091. tag->extra = bl;
  2092. }
  2093. static void
  2094. rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
  2095. GList **exceptions, GHashTable *urls, GHashTable *emails,
  2096. GByteArray *dest, GHashTable *target_tbl,
  2097. gint href_offset,
  2098. struct rspamd_url *url)
  2099. {
  2100. struct rspamd_url *displayed_url = NULL;
  2101. struct rspamd_url *turl;
  2102. gboolean url_found = FALSE;
  2103. struct rspamd_process_exception *ex;
  2104. if (href_offset <= 0) {
  2105. /* No dispalyed url, just some text within <a> tag */
  2106. return;
  2107. }
  2108. url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
  2109. rspamd_strlcpy (url->visible_part, dest->data + href_offset,
  2110. dest->len - href_offset + 1);
  2111. g_strstrip (url->visible_part);
  2112. rspamd_html_url_is_phished (pool, url,
  2113. dest->data + href_offset,
  2114. dest->len - href_offset,
  2115. &url_found, &displayed_url);
  2116. if (url_found) {
  2117. url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
  2118. }
  2119. if (exceptions && url_found) {
  2120. ex = rspamd_mempool_alloc (pool,
  2121. sizeof (*ex));
  2122. ex->pos = href_offset;
  2123. ex->len = dest->len - href_offset;
  2124. ex->type = RSPAMD_EXCEPTION_URL;
  2125. ex->ptr = url;
  2126. *exceptions = g_list_prepend (*exceptions,
  2127. ex);
  2128. }
  2129. if (displayed_url) {
  2130. if (displayed_url->protocol ==
  2131. PROTOCOL_MAILTO) {
  2132. target_tbl = emails;
  2133. }
  2134. else {
  2135. target_tbl = urls;
  2136. }
  2137. if (target_tbl != NULL) {
  2138. turl = g_hash_table_lookup (target_tbl,
  2139. displayed_url);
  2140. if (turl != NULL) {
  2141. /* Here, we assume the following:
  2142. * if we have a URL in the text part which
  2143. * is the same as displayed URL in the
  2144. * HTML part, we assume that it is also
  2145. * hint only.
  2146. */
  2147. if (turl->flags &
  2148. RSPAMD_URL_FLAG_FROM_TEXT) {
  2149. turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  2150. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  2151. }
  2152. turl->count ++;
  2153. }
  2154. else {
  2155. g_hash_table_insert (target_tbl,
  2156. displayed_url,
  2157. displayed_url);
  2158. }
  2159. }
  2160. }
  2161. }
  2162. static gboolean
  2163. rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
  2164. {
  2165. GNode *child;
  2166. struct html_tag *tag = node->data, *cld_tag;
  2167. if (tag) {
  2168. child = node->children;
  2169. /* Summarize content length from children */
  2170. while (child) {
  2171. cld_tag = child->data;
  2172. tag->content_length += cld_tag->content_length;
  2173. child = child->next;
  2174. }
  2175. }
  2176. return FALSE;
  2177. }
  2178. static void
  2179. rspamd_html_propagate_style (struct html_content *hc,
  2180. struct html_tag *tag,
  2181. struct html_block *bl,
  2182. GQueue *blocks)
  2183. {
  2184. struct html_block *bl_parent;
  2185. gboolean push_block = FALSE;
  2186. /* Propagate from the parent if needed */
  2187. bl_parent = g_queue_peek_tail (blocks);
  2188. if (bl_parent) {
  2189. if (!bl->background_color.valid) {
  2190. /* Try to propagate background color from parent nodes */
  2191. if (bl_parent->background_color.valid) {
  2192. memcpy (&bl->background_color, &bl_parent->background_color,
  2193. sizeof (bl->background_color));
  2194. }
  2195. }
  2196. else {
  2197. push_block = TRUE;
  2198. }
  2199. if (!bl->font_color.valid) {
  2200. /* Try to propagate background color from parent nodes */
  2201. if (bl_parent->font_color.valid) {
  2202. memcpy (&bl->font_color, &bl_parent->font_color,
  2203. sizeof (bl->font_color));
  2204. }
  2205. }
  2206. else {
  2207. push_block = TRUE;
  2208. }
  2209. /* Propagate font size */
  2210. if (bl->font_size == (guint)-1) {
  2211. if (bl_parent->font_size != (guint)-1) {
  2212. bl->font_size = bl_parent->font_size;
  2213. }
  2214. }
  2215. else {
  2216. push_block = TRUE;
  2217. }
  2218. }
  2219. /* Set bgcolor to the html bgcolor and font color to black as a last resort */
  2220. if (!bl->font_color.valid) {
  2221. /* Don't touch opacity as it can be set separately */
  2222. bl->font_color.d.comp.r = 0;
  2223. bl->font_color.d.comp.g = 0;
  2224. bl->font_color.d.comp.b = 0;
  2225. bl->font_color.valid = TRUE;
  2226. }
  2227. else {
  2228. push_block = TRUE;
  2229. }
  2230. if (!bl->background_color.valid) {
  2231. memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
  2232. }
  2233. else {
  2234. push_block = TRUE;
  2235. }
  2236. if (bl->font_size == (guint)-1) {
  2237. bl->font_size = 16; /* Default for browsers */
  2238. }
  2239. else {
  2240. push_block = TRUE;
  2241. }
  2242. if (push_block && !(tag->flags & FL_CLOSED)) {
  2243. g_queue_push_tail (blocks, bl);
  2244. }
  2245. }
  2246. GByteArray*
  2247. rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
  2248. GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
  2249. {
  2250. const guchar *p, *c, *end, *savep = NULL;
  2251. guchar t;
  2252. gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
  2253. balanced;
  2254. GByteArray *dest;
  2255. GHashTable *target_tbl;
  2256. guint obrace = 0, ebrace = 0;
  2257. GNode *cur_level = NULL;
  2258. gint substate = 0, len, href_offset = -1;
  2259. struct html_tag *cur_tag = NULL, *content_tag = NULL;
  2260. struct rspamd_url *url = NULL, *turl;
  2261. GQueue *styles_blocks;
  2262. enum {
  2263. parse_start = 0,
  2264. tag_begin,
  2265. sgml_tag,
  2266. xml_tag,
  2267. compound_tag,
  2268. comment_tag,
  2269. comment_content,
  2270. sgml_content,
  2271. tag_content,
  2272. tag_end,
  2273. xml_tag_end,
  2274. content_ignore,
  2275. content_write,
  2276. content_ignore_sp
  2277. } state = parse_start;
  2278. g_assert (in != NULL);
  2279. g_assert (hc != NULL);
  2280. g_assert (pool != NULL);
  2281. rspamd_html_library_init ();
  2282. hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (G_N_ELEMENTS (tag_defs)));
  2283. /* Set white background color by default */
  2284. hc->bgcolor.d.comp.alpha = 0;
  2285. hc->bgcolor.d.comp.r = 255;
  2286. hc->bgcolor.d.comp.g = 255;
  2287. hc->bgcolor.d.comp.b = 255;
  2288. hc->bgcolor.valid = TRUE;
  2289. dest = g_byte_array_sized_new (in->len / 3 * 2);
  2290. styles_blocks = g_queue_new ();
  2291. p = in->data;
  2292. c = p;
  2293. end = p + in->len;
  2294. while (p < end) {
  2295. t = *p;
  2296. switch (state) {
  2297. case parse_start:
  2298. if (t == '<') {
  2299. state = tag_begin;
  2300. }
  2301. else {
  2302. /* We have no starting tag, so assume that it's content */
  2303. hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
  2304. state = content_write;
  2305. }
  2306. break;
  2307. case tag_begin:
  2308. switch (t) {
  2309. case '<':
  2310. p ++;
  2311. closing = FALSE;
  2312. break;
  2313. case '!':
  2314. state = sgml_tag;
  2315. p ++;
  2316. break;
  2317. case '?':
  2318. state = xml_tag;
  2319. hc->flags |= RSPAMD_HTML_FLAG_XML;
  2320. p ++;
  2321. break;
  2322. case '/':
  2323. closing = TRUE;
  2324. p ++;
  2325. break;
  2326. case '>':
  2327. /* Empty tag */
  2328. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2329. state = tag_end;
  2330. continue;
  2331. default:
  2332. state = tag_content;
  2333. substate = 0;
  2334. savep = NULL;
  2335. cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
  2336. cur_tag->params = g_queue_new ();
  2337. rspamd_mempool_add_destructor (pool,
  2338. (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
  2339. break;
  2340. }
  2341. break;
  2342. case sgml_tag:
  2343. switch (t) {
  2344. case '[':
  2345. state = compound_tag;
  2346. obrace = 1;
  2347. ebrace = 0;
  2348. p ++;
  2349. break;
  2350. case '-':
  2351. state = comment_tag;
  2352. p ++;
  2353. break;
  2354. default:
  2355. state = sgml_content;
  2356. break;
  2357. }
  2358. break;
  2359. case xml_tag:
  2360. if (t == '?') {
  2361. state = xml_tag_end;
  2362. }
  2363. else if (t == '>') {
  2364. /* Misformed xml tag */
  2365. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2366. state = tag_end;
  2367. continue;
  2368. }
  2369. /* We efficiently ignore xml tags */
  2370. p ++;
  2371. break;
  2372. case xml_tag_end:
  2373. if (t == '>') {
  2374. state = tag_end;
  2375. continue;
  2376. }
  2377. else {
  2378. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2379. p ++;
  2380. }
  2381. break;
  2382. case compound_tag:
  2383. if (t == '[') {
  2384. obrace ++;
  2385. }
  2386. else if (t == ']') {
  2387. ebrace ++;
  2388. }
  2389. else if (t == '>' && obrace == ebrace) {
  2390. state = tag_end;
  2391. continue;
  2392. }
  2393. p ++;
  2394. break;
  2395. case comment_tag:
  2396. if (t != '-') {
  2397. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2398. state = tag_end;
  2399. }
  2400. else {
  2401. p++;
  2402. ebrace = 0;
  2403. /*
  2404. * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
  2405. * ... the text must not start with a single
  2406. * U+003E GREATER-THAN SIGN character (>),
  2407. * nor start with a "-" (U+002D) character followed by
  2408. * a U+003E GREATER-THAN SIGN (>) character,
  2409. * nor contain two consecutive U+002D HYPHEN-MINUS
  2410. * characters (--), nor end with a "-" (U+002D) character.
  2411. */
  2412. if (p[0] == '-' && p + 1 < end && p[1] == '>') {
  2413. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2414. p ++;
  2415. state = tag_end;
  2416. }
  2417. else if (*p == '>') {
  2418. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2419. state = tag_end;
  2420. }
  2421. else {
  2422. state = comment_content;
  2423. }
  2424. }
  2425. break;
  2426. case comment_content:
  2427. if (t == '-') {
  2428. ebrace ++;
  2429. }
  2430. else if (t == '>' && ebrace >= 2) {
  2431. state = tag_end;
  2432. continue;
  2433. }
  2434. else {
  2435. ebrace = 0;
  2436. }
  2437. p ++;
  2438. break;
  2439. case content_ignore:
  2440. if (t != '<') {
  2441. p ++;
  2442. }
  2443. else {
  2444. state = tag_begin;
  2445. }
  2446. break;
  2447. case content_write:
  2448. if (t != '<') {
  2449. if (t == '&') {
  2450. need_decode = TRUE;
  2451. }
  2452. else if (g_ascii_isspace (t)) {
  2453. save_space = TRUE;
  2454. if (p > c) {
  2455. if (need_decode) {
  2456. goffset old_offset = dest->len;
  2457. if (content_tag) {
  2458. if (content_tag->content_length == 0) {
  2459. content_tag->content_offset = old_offset;
  2460. }
  2461. }
  2462. g_byte_array_append (dest, c, (p - c));
  2463. len = rspamd_html_decode_entitles_inplace (
  2464. dest->data + old_offset,
  2465. p - c);
  2466. dest->len = dest->len + len - (p - c);
  2467. if (content_tag) {
  2468. content_tag->content_length += len;
  2469. }
  2470. }
  2471. else {
  2472. len = p - c;
  2473. if (content_tag) {
  2474. if (content_tag->content_length == 0) {
  2475. content_tag->content_offset = dest->len;
  2476. }
  2477. content_tag->content_length += len;
  2478. }
  2479. g_byte_array_append (dest, c, len);
  2480. }
  2481. }
  2482. c = p;
  2483. state = content_ignore_sp;
  2484. }
  2485. else {
  2486. if (save_space) {
  2487. /* Append one space if needed */
  2488. if (dest->len > 0 &&
  2489. !g_ascii_isspace (dest->data[dest->len - 1])) {
  2490. g_byte_array_append (dest, " ", 1);
  2491. if (content_tag) {
  2492. if (content_tag->content_length == 0) {
  2493. /*
  2494. * Special case
  2495. * we have a space at the beginning but
  2496. * we have no set content_offset
  2497. * so we need to do it here
  2498. */
  2499. content_tag->content_offset = dest->len;
  2500. }
  2501. else {
  2502. content_tag->content_length++;
  2503. }
  2504. }
  2505. }
  2506. save_space = FALSE;
  2507. }
  2508. }
  2509. }
  2510. else {
  2511. if (c != p) {
  2512. if (need_decode) {
  2513. goffset old_offset = dest->len;
  2514. if (content_tag) {
  2515. if (content_tag->content_length == 0) {
  2516. content_tag->content_offset = dest->len;
  2517. }
  2518. }
  2519. g_byte_array_append (dest, c, (p - c));
  2520. len = rspamd_html_decode_entitles_inplace (
  2521. dest->data + old_offset,
  2522. p - c);
  2523. dest->len = dest->len + len - (p - c);
  2524. if (content_tag) {
  2525. content_tag->content_length += len;
  2526. }
  2527. }
  2528. else {
  2529. len = p - c;
  2530. if (content_tag) {
  2531. if (content_tag->content_length == 0) {
  2532. content_tag->content_offset = dest->len;
  2533. }
  2534. content_tag->content_length += len;
  2535. }
  2536. g_byte_array_append (dest, c, len);
  2537. }
  2538. }
  2539. content_tag = NULL;
  2540. state = tag_begin;
  2541. continue;
  2542. }
  2543. p ++;
  2544. break;
  2545. case content_ignore_sp:
  2546. if (!g_ascii_isspace (t)) {
  2547. c = p;
  2548. state = content_write;
  2549. continue;
  2550. }
  2551. p ++;
  2552. break;
  2553. case sgml_content:
  2554. /* TODO: parse DOCTYPE here */
  2555. if (t == '>') {
  2556. state = tag_end;
  2557. /* We don't know a lot about sgml tags, ignore them */
  2558. cur_tag = NULL;
  2559. continue;
  2560. }
  2561. p ++;
  2562. break;
  2563. case tag_content:
  2564. rspamd_html_parse_tag_content (pool, hc, cur_tag,
  2565. p, &substate, &savep);
  2566. if (t == '>') {
  2567. if (closing) {
  2568. cur_tag->flags |= FL_CLOSING;
  2569. if (cur_tag->flags & FL_CLOSED) {
  2570. /* Bad mix of closed and closing */
  2571. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2572. }
  2573. closing = FALSE;
  2574. }
  2575. state = tag_end;
  2576. continue;
  2577. }
  2578. p ++;
  2579. break;
  2580. case tag_end:
  2581. substate = 0;
  2582. savep = NULL;
  2583. if (cur_tag != NULL) {
  2584. balanced = TRUE;
  2585. if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
  2586. &balanced)) {
  2587. state = content_write;
  2588. need_decode = FALSE;
  2589. }
  2590. else {
  2591. state = content_ignore;
  2592. }
  2593. if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
  2594. if (cur_tag->flags & CM_UNIQUE) {
  2595. if (isset (hc->tags_seen, cur_tag->id)) {
  2596. /* Duplicate tag has been found */
  2597. hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
  2598. }
  2599. }
  2600. setbit (hc->tags_seen, cur_tag->id);
  2601. }
  2602. if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
  2603. content_tag = cur_tag;
  2604. }
  2605. /* Handle newlines */
  2606. if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
  2607. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2608. g_byte_array_append (dest, "\r\n", 2);
  2609. if (content_tag) {
  2610. if (content_tag->content_length == 0) {
  2611. /*
  2612. * Special case
  2613. * we have a \r\n at the beginning but
  2614. * we have no set content_offset
  2615. * so we need to do it here
  2616. */
  2617. content_tag->content_offset = dest->len;
  2618. }
  2619. else {
  2620. content_tag->content_length += 2;
  2621. }
  2622. }
  2623. }
  2624. save_space = FALSE;
  2625. }
  2626. if ((cur_tag->id == Tag_P ||
  2627. cur_tag->id == Tag_TR ||
  2628. cur_tag->id == Tag_DIV)) {
  2629. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2630. g_byte_array_append (dest, "\r\n", 2);
  2631. if (content_tag) {
  2632. if (content_tag->content_length == 0) {
  2633. /*
  2634. * Special case
  2635. * we have a \r\n at the beginning but
  2636. * we have no set content_offset
  2637. * so we need to get it here
  2638. */
  2639. content_tag->content_offset = dest->len;
  2640. }
  2641. else {
  2642. content_tag->content_length += 2;
  2643. }
  2644. }
  2645. }
  2646. save_space = FALSE;
  2647. }
  2648. if (cur_tag->flags & FL_HREF) {
  2649. if (!(cur_tag->flags & (FL_CLOSING))) {
  2650. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2651. if (url != NULL) {
  2652. if (url->protocol == PROTOCOL_MAILTO) {
  2653. target_tbl = emails;
  2654. }
  2655. else {
  2656. target_tbl = urls;
  2657. }
  2658. if (target_tbl != NULL) {
  2659. turl = g_hash_table_lookup (target_tbl, url);
  2660. if (turl == NULL) {
  2661. g_hash_table_insert (target_tbl, url, url);
  2662. }
  2663. else {
  2664. turl->count ++;
  2665. url = NULL;
  2666. }
  2667. if (turl == NULL && url != NULL) {
  2668. rspamd_process_html_url (pool,
  2669. url,
  2670. urls, emails);
  2671. }
  2672. }
  2673. href_offset = dest->len;
  2674. }
  2675. }
  2676. if (cur_tag->id == Tag_A) {
  2677. if (!balanced && cur_level && cur_level->prev) {
  2678. struct html_tag *prev_tag;
  2679. struct rspamd_url *prev_url;
  2680. prev_tag = cur_level->prev->data;
  2681. if (prev_tag->id == Tag_A &&
  2682. !(prev_tag->flags & (FL_CLOSING)) &&
  2683. prev_tag->extra) {
  2684. prev_url = prev_tag->extra;
  2685. rspamd_html_check_displayed_url (pool,
  2686. exceptions, urls, emails,
  2687. dest, target_tbl, href_offset,
  2688. prev_url);
  2689. }
  2690. }
  2691. if (cur_tag->flags & (FL_CLOSING)) {
  2692. /* Insert exception */
  2693. if (url != NULL && (gint) dest->len > href_offset) {
  2694. rspamd_html_check_displayed_url (pool,
  2695. exceptions, urls, emails,
  2696. dest, target_tbl, href_offset,
  2697. url);
  2698. }
  2699. href_offset = -1;
  2700. url = NULL;
  2701. }
  2702. }
  2703. }
  2704. else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
  2705. /*
  2706. * Base is allowed only within head tag but HTML is retarded
  2707. */
  2708. if (hc->base_url == NULL) {
  2709. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2710. if (url != NULL) {
  2711. msg_debug_html ("got valid base tag");
  2712. hc->base_url = url;
  2713. cur_tag->extra = url;
  2714. }
  2715. else {
  2716. msg_debug_html ("got invalid base tag!");
  2717. }
  2718. }
  2719. }
  2720. if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
  2721. rspamd_html_process_img_tag (pool, cur_tag, hc, urls);
  2722. }
  2723. else if (cur_tag->flags & FL_BLOCK) {
  2724. struct html_block *bl;
  2725. if (cur_tag->flags & FL_CLOSING) {
  2726. /* Just remove block element from the queue if any */
  2727. if (styles_blocks->length > 0) {
  2728. g_queue_pop_tail (styles_blocks);
  2729. }
  2730. }
  2731. else {
  2732. rspamd_html_process_block_tag (pool, cur_tag, hc);
  2733. bl = cur_tag->extra;
  2734. if (bl) {
  2735. rspamd_html_propagate_style (hc, cur_tag,
  2736. cur_tag->extra, styles_blocks);
  2737. /* Check visibility */
  2738. if (bl->font_size < 3 ||
  2739. bl->font_color.d.comp.alpha < 10) {
  2740. bl->visible = FALSE;
  2741. msg_debug_html ("tag is not visible: font size: "
  2742. "%d, alpha: %d",
  2743. (int)bl->font_size,
  2744. (int)bl->font_color.d.comp.alpha);
  2745. }
  2746. if (!bl->visible) {
  2747. state = content_ignore;
  2748. }
  2749. }
  2750. }
  2751. }
  2752. }
  2753. else {
  2754. state = content_write;
  2755. }
  2756. p++;
  2757. c = p;
  2758. cur_tag = NULL;
  2759. break;
  2760. }
  2761. }
  2762. if (hc->html_tags) {
  2763. g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
  2764. rspamd_html_propagate_lengths, NULL);
  2765. }
  2766. g_queue_free (styles_blocks);
  2767. hc->parsed = dest;
  2768. return dest;
  2769. }
  2770. GByteArray*
  2771. rspamd_html_process_part (rspamd_mempool_t *pool,
  2772. struct html_content *hc,
  2773. GByteArray *in)
  2774. {
  2775. return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
  2776. }