You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

html.c 80KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "html.h"
  21. #include "html_tags.h"
  22. #include "html_colors.h"
  23. #include "html_entities.h"
  24. #include "url.h"
  25. #include "contrib/libucl/khash.h"
  26. #include "libmime/images.h"
  27. #include "css/css.h"
  28. #include <unicode/uversion.h>
  29. #include <unicode/ucnv.h>
  30. #if U_ICU_VERSION_MAJOR_NUM >= 46
  31. #include <unicode/uidna.h>
  32. #endif
  /* Set to 1 by rspamd_html_library_init() once the tag lookup tables are filled */
  static sig_atomic_t tags_sorted = 0;
  /* Set to 1 by rspamd_html_library_init() once the entity and color tables are filled */
  static sig_atomic_t entities_sorted = 0;
  static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
  /* Static description of one known HTML tag; instances are built with TAG_DEF */
  struct html_tag_def {
      const gchar *name; /* tag name */
      gint16 id; /* Tag_* identifier */
      guint16 len; /* length of `name`, not counting the terminating NUL */
      guint flags; /* bitwise OR of CM_* and FL_* flags */
  };
  /*
   * Debug logging for this module: forwards to rspamd_conditional_debug_fast
   * with the `html` module name and rspamd_html_log_id. The expansion reads
   * `pool->tag.uid`, so a variable named `pool` must be in scope wherever the
   * macro is used.
   */
  #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
          rspamd_html_log_id, "html", pool->tag.uid, \
          G_STRFUNC, \
          __VA_ARGS__)
  /* NOTE(review): presumably this is what provides rspamd_html_log_id used above - confirm in the logger header */
  INIT_LOG_MODULE(html)
  /*
   * Builds a struct html_tag_def initializer in field order {name, id, len, flags}.
   * `len` is computed as sizeof(name) - 1, so `name` has to be a string literal
   * (sizeof applied to a pointer would not yield the string length).
   */
  #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
  /*
   * Table of all tags known to the parser. rspamd_html_library_init() copies
   * every entry into the by-id and by-name hash tables.
   */
  static struct html_tag_def tag_defs[] = {
      /* W3C defined elements */
      TAG_DEF(Tag_A, "a", FL_HREF),
      TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
      TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
      TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
      TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
      TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
      TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
      TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
      TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
      TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
      TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
      TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
      TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
      TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
      TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
      TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
      TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
      TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
      TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
      TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
      TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
      TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
      TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
      TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
      TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
      TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
      TAG_DEF(Tag_EM, "em", (CM_INLINE)),
      TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
      TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
      TAG_DEF(Tag_FORM, "form", (CM_BLOCK|FL_HREF)),
      TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
      TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
      TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
      TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
      TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
      TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
      TAG_DEF(Tag_I, "i", (CM_INLINE)),
      TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
      TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
      TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
      TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
      TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
      TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
      TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
      TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
      TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
      TAG_DEF(Tag_LINK, "link", (CM_EMPTY|FL_HREF)),
      TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
      TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
      TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
      TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
      TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
      TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
      TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
      TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
      TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
      TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
      TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
      TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
      TAG_DEF(Tag_Q, "q", (CM_INLINE)),
      TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
      TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
      TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
      TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
      TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
      TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
      TAG_DEF(Tag_S, "s", (CM_INLINE)),
      TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
      TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
      TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
      TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
      TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
      TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
      TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
      TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
      TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
      TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
      TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
      TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
      TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
      TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
      TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
      TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
      TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
      TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
      TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
      TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
      TAG_DEF(Tag_U, "u", (CM_INLINE)),
      TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
      TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
      TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
      TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
      /* proprietary elements */
      TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
      TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
      TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
      TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
      TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
      TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
      TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
      TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
      TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
      TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
      TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
      TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
      TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
      TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
      TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
      TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
      TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
  };
  /*
   * khash table types used by this module:
   *   entity_by_number - integer entity code -> replacement string
   *   entity_by_name   - entity name         -> replacement string
   *   tag_by_name      - tag name            -> struct html_tag_def
   *   tag_by_id        - tag id              -> struct html_tag_def
   *   color_by_name    - color name token (hashed and compared with the
   *                      rspamd_ftok_icase_* helpers) -> struct html_color
   */
  KHASH_MAP_INIT_INT (entity_by_number, const char *);
  KHASH_MAP_INIT_STR (entity_by_name, const char *);
  KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
  KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
  KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
          rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
  /* Module-wide tables; created and filled by rspamd_html_library_init() */
  khash_t(entity_by_number) *html_entity_by_number;
  khash_t(entity_by_name) *html_entity_by_name;
  khash_t(tag_by_name) *html_tag_by_name;
  khash_t(tag_by_id) *html_tag_by_id;
  khash_t(color_by_name) *html_color_by_name;
  /* Forward declaration of a file-local helper whose definition is not in this part of the file */
  static struct rspamd_url *rspamd_html_process_url (rspamd_mempool_t *pool,
          const gchar *start, guint len,
          struct html_tag_component *comp);
  185. static void
  186. rspamd_html_library_init (void)
  187. {
  188. guint i;
  189. khiter_t k;
  190. gint rc;
  191. if (!tags_sorted) {
  192. html_tag_by_id = kh_init (tag_by_id);
  193. html_tag_by_name = kh_init (tag_by_name);
  194. kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
  195. kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
  196. for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
  197. k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
  198. if (rc == 0) {
  199. /* Collision by id */
  200. msg_err ("collision in html tag id: %d (%s) vs %d (%s)",
  201. (int)tag_defs[i].id, tag_defs[i].name,
  202. (int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
  203. }
  204. kh_val (html_tag_by_id, k) = tag_defs[i];
  205. k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
  206. if (rc == 0) {
  207. /* Collision by name */
  208. msg_err ("collision in html tag name: %d (%s) vs %d (%s)",
  209. (int)tag_defs[i].id, tag_defs[i].name,
  210. (int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
  211. }
  212. kh_val (html_tag_by_name, k) = tag_defs[i];
  213. }
  214. tags_sorted = 1;
  215. }
  216. if (!entities_sorted) {
  217. html_entity_by_number = kh_init (entity_by_number);
  218. html_entity_by_name = kh_init (entity_by_name);
  219. kh_resize (entity_by_number, html_entity_by_number,
  220. G_N_ELEMENTS (entities_defs));
  221. kh_resize (entity_by_name, html_entity_by_name,
  222. G_N_ELEMENTS (entities_defs));
  223. for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
  224. if (entities_defs[i].code != 0) {
  225. k = kh_put (entity_by_number, html_entity_by_number,
  226. entities_defs[i].code, &rc);
  227. if (rc == 0) {
  228. /* Collision by id */
  229. gint cmp_res = strcmp (entities_defs[i].replacement,
  230. kh_val (html_entity_by_number, k));
  231. if (cmp_res != 0) {
  232. if (strlen (entities_defs[i].replacement) <
  233. strlen (kh_val (html_entity_by_number, k))) {
  234. /* Shorter replacement is more likely to be valid */
  235. msg_debug ("1 collision in html entity id: %d (%s); replace %s by %s",
  236. (int) entities_defs[i].code, entities_defs[i].name,
  237. kh_val (html_entity_by_number, k),
  238. entities_defs[i].replacement);
  239. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  240. }
  241. else if (strlen (entities_defs[i].replacement) ==
  242. strlen (kh_val (html_entity_by_number, k)) &&
  243. cmp_res < 0) {
  244. /* Identical len but lexicographically shorter */
  245. msg_debug ("collision in html entity id: %d (%s); replace %s by %s",
  246. (int) entities_defs[i].code, entities_defs[i].name,
  247. kh_val (html_entity_by_number, k),
  248. entities_defs[i].replacement);
  249. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  250. }
  251. /* Do not replace otherwise */
  252. }
  253. /* Identic replacement */
  254. }
  255. else {
  256. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  257. }
  258. }
  259. k = kh_put (entity_by_name, html_entity_by_name,
  260. entities_defs[i].name, &rc);
  261. if (rc == 0) {
  262. /* Collision by name */
  263. if (strcmp (kh_val (html_entity_by_number, k),
  264. entities_defs[i].replacement) != 0) {
  265. msg_err ("collision in html entity name: %d (%s)",
  266. (int) entities_defs[i].code, entities_defs[i].name);
  267. }
  268. }
  269. kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
  270. }
  271. html_color_by_name = kh_init (color_by_name);
  272. kh_resize (color_by_name, html_color_by_name,
  273. G_N_ELEMENTS (html_colornames));
  274. rspamd_ftok_t *keys;
  275. keys = g_malloc0 (sizeof (rspamd_ftok_t) *
  276. G_N_ELEMENTS (html_colornames));
  277. for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
  278. struct html_color c;
  279. keys[i].begin = html_colornames[i].name;
  280. keys[i].len = strlen (html_colornames[i].name);
  281. k = kh_put (color_by_name, html_color_by_name,
  282. &keys[i], &rc);
  283. c.valid = true;
  284. c.d.comp.r = html_colornames[i].rgb.r;
  285. c.d.comp.g = html_colornames[i].rgb.g;
  286. c.d.comp.b = html_colornames[i].rgb.b;
  287. c.d.comp.alpha = 255;
  288. kh_val (html_color_by_name, k) = c;
  289. }
  290. entities_sorted = 1;
  291. }
  292. }
  293. static gboolean
  294. rspamd_html_check_balance (GNode * node, GNode ** cur_level)
  295. {
  296. struct html_tag *arg = node->data, *tmp;
  297. GNode *cur;
  298. if (arg->flags & FL_CLOSING) {
  299. /* First of all check whether this tag is closing tag for parent node */
  300. cur = node->parent;
  301. while (cur && cur->data) {
  302. tmp = cur->data;
  303. if (tmp->id == arg->id &&
  304. (tmp->flags & FL_CLOSED) == 0) {
  305. tmp->flags |= FL_CLOSED;
  306. /* Destroy current node as we find corresponding parent node */
  307. g_node_destroy (node);
  308. /* Change level */
  309. *cur_level = cur->parent;
  310. return TRUE;
  311. }
  312. cur = cur->parent;
  313. }
  314. }
  315. else {
  316. return TRUE;
  317. }
  318. return FALSE;
  319. }
  320. gint
  321. rspamd_html_tag_by_name (const gchar *name)
  322. {
  323. khiter_t k;
  324. k = kh_get (tag_by_name, html_tag_by_name, name);
  325. if (k != kh_end (html_tag_by_name)) {
  326. return kh_val (html_tag_by_name, k).id;
  327. }
  328. return -1;
  329. }
  330. gboolean
  331. rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
  332. {
  333. gint id;
  334. g_assert (hc != NULL);
  335. g_assert (hc->tags_seen != NULL);
  336. id = rspamd_html_tag_by_name (tagname);
  337. if (id != -1) {
  338. return isset (hc->tags_seen, id);
  339. }
  340. return FALSE;
  341. }
  342. const gchar *
  343. rspamd_html_tag_by_id (gint id)
  344. {
  345. khiter_t k;
  346. k = kh_get (tag_by_id, html_tag_by_id, id);
  347. if (k != kh_end (html_tag_by_id)) {
  348. return kh_val (html_tag_by_id, k).name;
  349. }
  350. return NULL;
  351. }
  352. /* Decode HTML entitles in text */
  353. guint
  354. rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
  355. {
  356. goffset l, rep_len;
  357. gchar *t = s, *h = s, *e = s, *end_ptr, old_c;
  358. const gchar *end;
  359. const gchar *entity;
  360. gboolean seen_hash = FALSE, seen_hex = FALSE;
  361. enum {
  362. do_undefined,
  363. do_digits_only,
  364. do_mixed,
  365. } seen_digit_only;
  366. gint state = 0, base;
  367. UChar32 uc;
  368. khiter_t k;
  369. if (len == 0) {
  370. return 0;
  371. }
  372. else {
  373. l = len;
  374. }
  375. end = s + l;
  376. while (h - s < l && t <= h) {
  377. switch (state) {
  378. /* Out of entity */
  379. case 0:
  380. if (*h == '&') {
  381. state = 1;
  382. seen_hash = FALSE;
  383. seen_hex = FALSE;
  384. seen_digit_only = do_undefined;
  385. e = h;
  386. h++;
  387. continue;
  388. }
  389. else {
  390. *t = *h;
  391. h++;
  392. t++;
  393. }
  394. break;
  395. case 1:
  396. if (*h == ';' && h > e) {
  397. decode_entity:
  398. /* Determine base */
  399. /* First find in entities table */
  400. old_c = *h;
  401. *h = '\0';
  402. entity = e + 1;
  403. uc = 0;
  404. if (*entity != '#') {
  405. k = kh_get (entity_by_name, html_entity_by_name, entity);
  406. *h = old_c;
  407. if (k != kh_end (html_entity_by_name)) {
  408. if (kh_val (html_entity_by_name, k)) {
  409. rep_len = strlen (kh_val (html_entity_by_name, k));
  410. if (end - t >= rep_len) {
  411. memcpy (t, kh_val (html_entity_by_name, k),
  412. rep_len);
  413. t += rep_len;
  414. }
  415. } else {
  416. if (end - t > h - e + 1) {
  417. memmove (t, e, h - e + 1);
  418. t += h - e + 1;
  419. }
  420. }
  421. }
  422. else {
  423. if (end - t > h - e + 1) {
  424. memmove (t, e, h - e + 1);
  425. t += h - e + 1;
  426. }
  427. }
  428. }
  429. else if (e + 2 < h) {
  430. if (*(e + 2) == 'x' || *(e + 2) == 'X') {
  431. base = 16;
  432. }
  433. else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
  434. base = 8;
  435. }
  436. else {
  437. base = 10;
  438. }
  439. if (base == 10) {
  440. uc = strtoul ((e + 2), &end_ptr, base);
  441. }
  442. else {
  443. uc = strtoul ((e + 3), &end_ptr, base);
  444. }
  445. if (end_ptr != NULL && *end_ptr != '\0') {
  446. /* Skip undecoded */
  447. *h = old_c;
  448. if (end - t > h - e + 1) {
  449. memmove (t, e, h - e + 1);
  450. t += h - e + 1;
  451. }
  452. }
  453. else {
  454. /* Search for a replacement */
  455. *h = old_c;
  456. k = kh_get (entity_by_number, html_entity_by_number, uc);
  457. if (k != kh_end (html_entity_by_number)) {
  458. if (kh_val (html_entity_by_number, k)) {
  459. rep_len = strlen (kh_val (html_entity_by_number, k));
  460. if (end - t >= rep_len) {
  461. memcpy (t, kh_val (html_entity_by_number, k),
  462. rep_len);
  463. t += rep_len;
  464. }
  465. } else {
  466. if (end - t > h - e + 1) {
  467. memmove (t, e, h - e + 1);
  468. t += h - e + 1;
  469. }
  470. }
  471. }
  472. else {
  473. /* Unicode point */
  474. goffset off = t - s;
  475. UBool is_error = 0;
  476. if (uc > 0) {
  477. U8_APPEND (s, off, len, uc, is_error);
  478. if (!is_error) {
  479. t = s + off;
  480. }
  481. else {
  482. /* Leave invalid entities as is */
  483. if (end - t > h - e + 1) {
  484. memmove (t, e, h - e + 1);
  485. t += h - e + 1;
  486. }
  487. }
  488. }
  489. else if (end - t > h - e + 1) {
  490. memmove (t, e, h - e + 1);
  491. t += h - e + 1;
  492. }
  493. }
  494. if (end - t > 0 && old_c != ';') {
  495. /* Fuck email clients, fuck them */
  496. *t++ = old_c;
  497. }
  498. }
  499. }
  500. state = 0;
  501. }
  502. else if (*h == '&') {
  503. /* Previous `&` was bogus */
  504. state = 1;
  505. if (end - t > h - e) {
  506. memmove (t, e, h - e);
  507. t += h - e;
  508. }
  509. e = h;
  510. }
  511. else if (*h == '#') {
  512. seen_hash = TRUE;
  513. if (h + 1 < end && h[1] == 'x') {
  514. seen_hex = TRUE;
  515. /* Skip one more character */
  516. h ++;
  517. }
  518. }
  519. else if (seen_digit_only != do_mixed &&
  520. (g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h)))) {
  521. seen_digit_only = do_digits_only;
  522. }
  523. else {
  524. if (seen_digit_only == do_digits_only && seen_hash && h > e) {
  525. /* We have seen some digits, so we can try to decode, eh */
  526. /* Fuck retarded email clients... */
  527. goto decode_entity;
  528. }
  529. seen_digit_only = do_mixed;
  530. }
  531. h++;
  532. break;
  533. }
  534. }
  535. /* Leftover */
  536. if (state == 1 && h > e) {
  537. /* Unfinished entity, copy as is */
  538. if (end - t >= h - e) {
  539. memmove (t, e, h - e);
  540. t += h - e;
  541. }
  542. }
  543. return (t - s);
  544. }
  545. static gboolean
  546. rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
  547. {
  548. const gchar *p1, *p2;
  549. p1 = t1->begin + t1->len - 1;
  550. p2 = t2->begin + t2->len - 1;
  551. /* Skip trailing dots */
  552. while (p1 > t1->begin) {
  553. if (*p1 != '.') {
  554. break;
  555. }
  556. p1 --;
  557. }
  558. while (p2 > t2->begin) {
  559. if (*p2 != '.') {
  560. break;
  561. }
  562. p2 --;
  563. }
  564. while (p1 > t1->begin && p2 > t2->begin) {
  565. if (*p1 != *p2) {
  566. break;
  567. }
  568. p1 --;
  569. p2 --;
  570. }
  571. if (p2 == t2->begin) {
  572. /* p2 can be subdomain of p1 if *p1 is '.' */
  573. if (p1 != t1->begin && *(p1 - 1) == '.') {
  574. return TRUE;
  575. }
  576. }
  577. else if (p1 == t1->begin) {
  578. if (p2 != t2->begin && *(p2 - 1) == '.') {
  579. return TRUE;
  580. }
  581. }
  582. return FALSE;
  583. }
  584. static void
  585. rspamd_html_url_is_phished (rspamd_mempool_t *pool,
  586. struct rspamd_url *href_url,
  587. const guchar *url_text,
  588. gsize len,
  589. gboolean *url_found,
  590. struct rspamd_url **ptext_url)
  591. {
  592. struct rspamd_url *text_url;
  593. rspamd_ftok_t disp_tok, href_tok;
  594. gint rc;
  595. goffset url_pos;
  596. gchar *url_str = NULL, *idn_hbuf;
  597. const guchar *end = url_text + len, *p;
  598. #if U_ICU_VERSION_MAJOR_NUM >= 46
  599. static UIDNA *udn;
  600. UErrorCode uc_err = U_ZERO_ERROR;
  601. UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
  602. #endif
  603. *url_found = FALSE;
  604. #if U_ICU_VERSION_MAJOR_NUM >= 46
  605. if (udn == NULL) {
  606. udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
  607. if (uc_err != U_ZERO_ERROR) {
  608. msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
  609. }
  610. }
  611. #endif
  612. while (url_text < end && g_ascii_isspace (*url_text)) {
  613. url_text ++;
  614. }
  615. if (end > url_text + 4 &&
  616. rspamd_url_find (pool, url_text, end - url_text, &url_str,
  617. RSPAMD_URL_FIND_ALL,
  618. &url_pos, NULL) &&
  619. url_str != NULL) {
  620. if (url_pos > 0) {
  621. /*
  622. * We have some url at some offset, so we need to check what is
  623. * at the start of the text
  624. */
  625. p = url_text;
  626. while (p < url_text + url_pos) {
  627. if (!g_ascii_isspace (*p)) {
  628. *url_found = FALSE;
  629. return;
  630. }
  631. p++;
  632. }
  633. }
  634. text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  635. rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
  636. RSPAMD_URL_PARSE_TEXT);
  637. if (rc == URI_ERRNO_OK) {
  638. disp_tok.len = text_url->hostlen;
  639. disp_tok.begin = rspamd_url_host_unsafe (text_url);
  640. #if U_ICU_VERSION_MAJOR_NUM >= 46
  641. if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
  642. text_url->hostlen, "xn--", 4) != -1) {
  643. idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
  644. /* We need to convert it to the normal value first */
  645. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  646. rspamd_url_host_unsafe (text_url), text_url->hostlen,
  647. idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
  648. if (uc_err != U_ZERO_ERROR) {
  649. msg_err_pool ("cannot convert to IDN: %s",
  650. u_errorName (uc_err));
  651. disp_tok.len = text_url->hostlen;
  652. }
  653. else {
  654. disp_tok.begin = idn_hbuf;
  655. }
  656. }
  657. #endif
  658. href_tok.len = href_url->hostlen;
  659. href_tok.begin = rspamd_url_host_unsafe (href_url);
  660. #if U_ICU_VERSION_MAJOR_NUM >= 46
  661. if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
  662. href_url->hostlen, "xn--", 4) != -1) {
  663. idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
  664. /* We need to convert it to the normal value first */
  665. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  666. rspamd_url_host_unsafe (href_url), href_url->hostlen,
  667. idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
  668. if (uc_err != U_ZERO_ERROR) {
  669. msg_err_pool ("cannot convert to IDN: %s",
  670. u_errorName (uc_err));
  671. href_tok.len = href_url->hostlen;
  672. }
  673. else {
  674. href_tok.begin = idn_hbuf;
  675. }
  676. }
  677. #endif
  678. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
  679. text_url->tldlen > 0 && href_url->tldlen > 0) {
  680. /* Apply the same logic for TLD */
  681. disp_tok.len = text_url->tldlen;
  682. disp_tok.begin = rspamd_url_tld_unsafe (text_url);
  683. #if U_ICU_VERSION_MAJOR_NUM >= 46
  684. if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (text_url),
  685. text_url->tldlen, "xn--", 4) != -1) {
  686. idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
  687. /* We need to convert it to the normal value first */
  688. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  689. rspamd_url_tld_unsafe (text_url), text_url->tldlen,
  690. idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
  691. if (uc_err != U_ZERO_ERROR) {
  692. msg_err_pool ("cannot convert to IDN: %s",
  693. u_errorName (uc_err));
  694. disp_tok.len = text_url->tldlen;
  695. }
  696. else {
  697. disp_tok.begin = idn_hbuf;
  698. }
  699. }
  700. #endif
  701. href_tok.len = href_url->tldlen;
  702. href_tok.begin = rspamd_url_tld_unsafe (href_url);
  703. #if U_ICU_VERSION_MAJOR_NUM >= 46
  704. if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (href_url),
  705. href_url->tldlen, "xn--", 4) != -1) {
  706. idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
  707. /* We need to convert it to the normal value first */
  708. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  709. rspamd_url_tld_unsafe (href_url), href_url->tldlen,
  710. idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
  711. if (uc_err != U_ZERO_ERROR) {
  712. msg_err_pool ("cannot convert to IDN: %s",
  713. u_errorName (uc_err));
  714. href_tok.len = href_url->tldlen;
  715. }
  716. else {
  717. href_tok.begin = idn_hbuf;
  718. }
  719. }
  720. #endif
  721. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
  722. /* Check if one url is a subdomain for another */
  723. if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
  724. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  725. href_url->phished_url = text_url;
  726. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  727. }
  728. }
  729. }
  730. *ptext_url = text_url;
  731. *url_found = TRUE;
  732. }
  733. else {
  734. /*
  735. * We have found something that looks like an url but it was
  736. * not parsed correctly.
  737. * Sometimes it means an obfuscation attempt, so we have to check
  738. * what's inside of the text
  739. */
  740. gboolean obfuscation_found = FALSE;
  741. if (g_ascii_strncasecmp (url_str, "http", 4) == 0 &&
  742. strstr (url_str, "://") != NULL) {
  743. /* Clearly an obfuscation attempt */
  744. obfuscation_found = TRUE;
  745. }
  746. msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
  747. url_str,
  748. rspamd_url_strerror (rc),
  749. obfuscation_found ? "yes" : "no");
  750. if (obfuscation_found) {
  751. href_url->flags |= RSPAMD_URL_FLAG_PHISHED|RSPAMD_URL_FLAG_OBSCURED;
  752. }
  753. }
  754. }
  755. }
  756. static gboolean
  757. rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
  758. struct html_tag *tag, GNode **cur_level, gboolean *balanced)
  759. {
  760. GNode *nnode;
  761. struct html_tag *parent;
  762. if (hc->html_tags == NULL) {
  763. nnode = g_node_new (NULL);
  764. *cur_level = nnode;
  765. hc->html_tags = nnode;
  766. rspamd_mempool_add_destructor (pool,
  767. (rspamd_mempool_destruct_t) g_node_destroy,
  768. nnode);
  769. }
  770. if (hc->total_tags > max_tags) {
  771. hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
  772. }
  773. if (tag->id == -1) {
  774. /* Ignore unknown tags */
  775. hc->total_tags ++;
  776. return FALSE;
  777. }
  778. tag->parent = *cur_level;
  779. if (!(tag->flags & (CM_INLINE|CM_EMPTY))) {
  780. /* Block tag */
  781. if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
  782. if (!*cur_level) {
  783. msg_debug_html ("bad parent node");
  784. return FALSE;
  785. }
  786. if (hc->total_tags < max_tags) {
  787. nnode = g_node_new (tag);
  788. g_node_append (*cur_level, nnode);
  789. if (!rspamd_html_check_balance (nnode, cur_level)) {
  790. msg_debug_html (
  791. "mark part as unbalanced as it has not pairable closing tags");
  792. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  793. *balanced = FALSE;
  794. } else {
  795. *balanced = TRUE;
  796. }
  797. hc->total_tags ++;
  798. }
  799. }
  800. else {
  801. parent = (*cur_level)->data;
  802. if (parent) {
  803. if ((parent->flags & FL_IGNORE)) {
  804. tag->flags |= FL_IGNORE;
  805. }
  806. if (!(tag->flags & FL_CLOSED) &&
  807. !(parent->flags & FL_BLOCK)) {
  808. /* We likely have some bad nesting */
  809. if (parent->id == tag->id) {
  810. /* Something like <a>bla<a>foo... */
  811. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  812. *balanced = FALSE;
  813. tag->parent = parent->parent;
  814. if (hc->total_tags < max_tags) {
  815. nnode = g_node_new (tag);
  816. g_node_append (parent->parent, nnode);
  817. *cur_level = nnode;
  818. hc->total_tags ++;
  819. }
  820. return TRUE;
  821. }
  822. }
  823. }
  824. if (hc->total_tags < max_tags) {
  825. nnode = g_node_new (tag);
  826. g_node_append (*cur_level, nnode);
  827. if ((tag->flags & FL_CLOSED) == 0) {
  828. *cur_level = nnode;
  829. }
  830. hc->total_tags ++;
  831. }
  832. if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
  833. tag->flags |= FL_IGNORE;
  834. return FALSE;
  835. }
  836. }
  837. }
  838. else {
  839. /* Inline tag */
  840. parent = (*cur_level)->data;
  841. if (parent) {
  842. if (hc->total_tags < max_tags) {
  843. nnode = g_node_new (tag);
  844. g_node_append (*cur_level, nnode);
  845. hc->total_tags ++;
  846. }
  847. if ((parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
  848. tag->flags |= FL_IGNORE;
  849. return FALSE;
  850. }
  851. }
  852. }
  853. return TRUE;
  854. }
  855. #define NEW_COMPONENT(comp_type) do { \
  856. comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
  857. comp->type = (comp_type); \
  858. comp->start = NULL; \
  859. comp->len = 0; \
  860. g_queue_push_tail (tag->params, comp); \
  861. ret = TRUE; \
  862. } while(0)
  863. static gboolean
  864. rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
  865. const guchar *begin, const guchar *end,
  866. struct html_tag *tag)
  867. {
  868. struct html_tag_component *comp;
  869. gint len;
  870. gboolean ret = FALSE;
  871. gchar *p;
  872. if (end <= begin) {
  873. return FALSE;
  874. }
  875. p = rspamd_mempool_alloc (pool, end - begin);
  876. memcpy (p, begin, end - begin);
  877. len = rspamd_html_decode_entitles_inplace (p, end - begin);
  878. if (len == 3) {
  879. if (g_ascii_strncasecmp (p, "src", len) == 0) {
  880. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  881. }
  882. else if (g_ascii_strncasecmp (p, "rel", len) == 0) {
  883. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_REL);
  884. }
  885. else if (g_ascii_strncasecmp (p, "alt", len) == 0) {
  886. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_ALT);
  887. }
  888. }
  889. else if (len == 4) {
  890. if (g_ascii_strncasecmp (p, "href", len) == 0) {
  891. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  892. }
  893. }
  894. else if (len == 6) {
  895. if (g_ascii_strncasecmp (p, "action", len) == 0) {
  896. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  897. }
  898. }
  899. if (tag->id == Tag_IMG) {
  900. /* Check width and height if presented */
  901. if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
  902. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
  903. }
  904. else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
  905. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
  906. }
  907. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  908. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  909. }
  910. }
  911. else if (tag->id == Tag_FONT) {
  912. if (len == 5){
  913. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  914. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  915. }
  916. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  917. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  918. }
  919. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  920. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  921. }
  922. }
  923. else if (len == 7) {
  924. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  925. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  926. }
  927. }
  928. else if (len == 4) {
  929. if (g_ascii_strncasecmp (p, "size", len) == 0) {
  930. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
  931. }
  932. }
  933. }
  934. else if (tag->flags & FL_BLOCK) {
  935. if (len == 5){
  936. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  937. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  938. }
  939. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  940. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  941. }
  942. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  943. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  944. }
  945. }
  946. else if (len == 7) {
  947. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  948. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  949. }
  950. }
  951. }
  952. return ret;
  953. }
  954. static inline void
  955. rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
  956. struct html_content *hc, struct html_tag *tag, const guchar *in,
  957. gint *statep, guchar const **savep)
  958. {
  959. enum {
  960. parse_start = 0,
  961. parse_name,
  962. parse_attr_name,
  963. parse_equal,
  964. parse_start_dquote,
  965. parse_dqvalue,
  966. parse_end_dquote,
  967. parse_start_squote,
  968. parse_sqvalue,
  969. parse_end_squote,
  970. parse_value,
  971. spaces_after_name,
  972. spaces_before_eq,
  973. spaces_after_eq,
  974. spaces_after_param,
  975. ignore_bad_tag
  976. } state;
  977. struct html_tag_def *found;
  978. gboolean store = FALSE;
  979. struct html_tag_component *comp;
  980. state = *statep;
  981. switch (state) {
  982. case parse_start:
  983. if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
  984. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  985. state = ignore_bad_tag;
  986. tag->id = -1;
  987. tag->flags |= FL_BROKEN;
  988. }
  989. else if (g_ascii_isalpha (*in)) {
  990. state = parse_name;
  991. tag->name.start = in;
  992. }
  993. break;
  994. case parse_name:
  995. if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
  996. g_assert (in >= tag->name.start);
  997. if (*in == '/') {
  998. tag->flags |= FL_CLOSED;
  999. }
  1000. tag->name.len = in - tag->name.start;
  1001. if (tag->name.len == 0) {
  1002. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  1003. tag->id = -1;
  1004. tag->flags |= FL_BROKEN;
  1005. state = ignore_bad_tag;
  1006. }
  1007. else {
  1008. gchar *s;
  1009. khiter_t k;
  1010. /* We CANNOT safely modify tag's name here, as it is already parsed */
  1011. s = rspamd_mempool_alloc (pool, tag->name.len + 1);
  1012. memcpy (s, tag->name.start, tag->name.len);
  1013. tag->name.len = rspamd_html_decode_entitles_inplace (s,
  1014. tag->name.len);
  1015. tag->name.start = s;
  1016. tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
  1017. s[tag->name.len] = '\0';
  1018. k = kh_get (tag_by_name, html_tag_by_name, s);
  1019. if (k == kh_end (html_tag_by_name)) {
  1020. hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
  1021. tag->id = -1;
  1022. }
  1023. else {
  1024. found = &kh_val (html_tag_by_name, k);
  1025. tag->id = found->id;
  1026. tag->flags = found->flags;
  1027. }
  1028. state = spaces_after_name;
  1029. }
  1030. }
  1031. break;
  1032. case parse_attr_name:
  1033. if (*savep == NULL) {
  1034. state = ignore_bad_tag;
  1035. }
  1036. else {
  1037. const guchar *attr_name_end = in;
  1038. if (*in == '=') {
  1039. state = parse_equal;
  1040. }
  1041. else if (*in == '"') {
  1042. /* No equal or something sane but we have quote character */
  1043. state = parse_start_dquote;
  1044. attr_name_end = in - 1;
  1045. while (attr_name_end > *savep) {
  1046. if (!g_ascii_isalnum (*attr_name_end)) {
  1047. attr_name_end --;
  1048. }
  1049. else {
  1050. break;
  1051. }
  1052. }
  1053. /* One character forward to obtain length */
  1054. attr_name_end ++;
  1055. }
  1056. else if (g_ascii_isspace (*in)) {
  1057. state = spaces_before_eq;
  1058. }
  1059. else if (*in == '/') {
  1060. tag->flags |= FL_CLOSED;
  1061. }
  1062. else if (!g_ascii_isgraph (*in)) {
  1063. state = parse_value;
  1064. attr_name_end = in - 1;
  1065. while (attr_name_end > *savep) {
  1066. if (!g_ascii_isalnum (*attr_name_end)) {
  1067. attr_name_end --;
  1068. }
  1069. else {
  1070. break;
  1071. }
  1072. }
  1073. /* One character forward to obtain length */
  1074. attr_name_end ++;
  1075. }
  1076. else {
  1077. return;
  1078. }
  1079. if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
  1080. /* Ignore unknown params */
  1081. *savep = NULL;
  1082. }
  1083. else if (state == parse_value) {
  1084. *savep = in + 1;
  1085. }
  1086. }
  1087. break;
  1088. case spaces_after_name:
  1089. if (!g_ascii_isspace (*in)) {
  1090. *savep = in;
  1091. if (*in == '/') {
  1092. tag->flags |= FL_CLOSED;
  1093. }
  1094. else if (*in != '>') {
  1095. state = parse_attr_name;
  1096. }
  1097. }
  1098. break;
  1099. case spaces_before_eq:
  1100. if (*in == '=') {
  1101. state = parse_equal;
  1102. }
  1103. else if (!g_ascii_isspace (*in)) {
  1104. /*
  1105. * HTML defines that crap could still be restored and
  1106. * calculated somehow... So we have to follow this stupid behaviour
  1107. */
  1108. /*
  1109. * TODO: estimate what insane things do email clients in each case
  1110. */
  1111. if (*in == '>') {
  1112. /*
  1113. * Attribtute name followed by end of tag
  1114. * Should be okay (empty attribute). The rest is handled outside
  1115. * this automata.
  1116. */
  1117. }
  1118. else if (*in == '"' || *in == '\'') {
  1119. /* Attribute followed by quote... Missing '=' ? Dunno, need to test */
  1120. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  1121. tag->flags |= FL_BROKEN;
  1122. state = ignore_bad_tag;
  1123. }
  1124. else {
  1125. /*
  1126. * Just start another attribute ignoring an empty attributes for
  1127. * now. We don't use them in fact...
  1128. */
  1129. state = parse_attr_name;
  1130. *savep = in;
  1131. }
  1132. }
  1133. break;
  1134. case spaces_after_eq:
  1135. if (*in == '"') {
  1136. state = parse_start_dquote;
  1137. }
  1138. else if (*in == '\'') {
  1139. state = parse_start_squote;
  1140. }
  1141. else if (!g_ascii_isspace (*in)) {
  1142. if (*savep != NULL) {
  1143. /* We need to save this param */
  1144. *savep = in;
  1145. }
  1146. state = parse_value;
  1147. }
  1148. break;
  1149. case parse_equal:
  1150. if (g_ascii_isspace (*in)) {
  1151. state = spaces_after_eq;
  1152. }
  1153. else if (*in == '"') {
  1154. state = parse_start_dquote;
  1155. }
  1156. else if (*in == '\'') {
  1157. state = parse_start_squote;
  1158. }
  1159. else {
  1160. if (*savep != NULL) {
  1161. /* We need to save this param */
  1162. *savep = in;
  1163. }
  1164. state = parse_value;
  1165. }
  1166. break;
  1167. case parse_start_dquote:
  1168. if (*in == '"') {
  1169. if (*savep != NULL) {
  1170. /* We have an empty attribute value */
  1171. savep = NULL;
  1172. }
  1173. state = spaces_after_param;
  1174. }
  1175. else {
  1176. if (*savep != NULL) {
  1177. /* We need to save this param */
  1178. *savep = in;
  1179. }
  1180. state = parse_dqvalue;
  1181. }
  1182. break;
  1183. case parse_start_squote:
  1184. if (*in == '\'') {
  1185. if (*savep != NULL) {
  1186. /* We have an empty attribute value */
  1187. savep = NULL;
  1188. }
  1189. state = spaces_after_param;
  1190. }
  1191. else {
  1192. if (*savep != NULL) {
  1193. /* We need to save this param */
  1194. *savep = in;
  1195. }
  1196. state = parse_sqvalue;
  1197. }
  1198. break;
  1199. case parse_dqvalue:
  1200. if (*in == '"') {
  1201. store = TRUE;
  1202. state = parse_end_dquote;
  1203. }
  1204. if (store) {
  1205. if (*savep != NULL) {
  1206. gchar *s;
  1207. g_assert (tag->params != NULL);
  1208. comp = g_queue_peek_tail (tag->params);
  1209. g_assert (comp != NULL);
  1210. comp->len = in - *savep;
  1211. s = rspamd_mempool_alloc (pool, comp->len);
  1212. memcpy (s, *savep, comp->len);
  1213. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1214. comp->start = s;
  1215. *savep = NULL;
  1216. }
  1217. }
  1218. break;
  1219. case parse_sqvalue:
  1220. if (*in == '\'') {
  1221. store = TRUE;
  1222. state = parse_end_squote;
  1223. }
  1224. if (store) {
  1225. if (*savep != NULL) {
  1226. gchar *s;
  1227. g_assert (tag->params != NULL);
  1228. comp = g_queue_peek_tail (tag->params);
  1229. g_assert (comp != NULL);
  1230. comp->len = in - *savep;
  1231. s = rspamd_mempool_alloc (pool, comp->len);
  1232. memcpy (s, *savep, comp->len);
  1233. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1234. comp->start = s;
  1235. *savep = NULL;
  1236. }
  1237. }
  1238. break;
  1239. case parse_value:
  1240. if (*in == '/' && *(in + 1) == '>') {
  1241. tag->flags |= FL_CLOSED;
  1242. store = TRUE;
  1243. }
  1244. else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
  1245. store = TRUE;
  1246. state = spaces_after_param;
  1247. }
  1248. if (store) {
  1249. if (*savep != NULL) {
  1250. gchar *s;
  1251. g_assert (tag->params != NULL);
  1252. comp = g_queue_peek_tail (tag->params);
  1253. g_assert (comp != NULL);
  1254. comp->len = in - *savep;
  1255. s = rspamd_mempool_alloc (pool, comp->len);
  1256. memcpy (s, *savep, comp->len);
  1257. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1258. comp->start = s;
  1259. *savep = NULL;
  1260. }
  1261. }
  1262. break;
  1263. case parse_end_dquote:
  1264. case parse_end_squote:
  1265. if (g_ascii_isspace (*in)) {
  1266. state = spaces_after_param;
  1267. }
  1268. else if (*in == '/' && *(in + 1) == '>') {
  1269. tag->flags |= FL_CLOSED;
  1270. }
  1271. else {
  1272. /* No space, proceed immediately to the attribute name */
  1273. state = parse_attr_name;
  1274. *savep = in;
  1275. }
  1276. break;
  1277. case spaces_after_param:
  1278. if (!g_ascii_isspace (*in)) {
  1279. if (*in == '/' && *(in + 1) == '>') {
  1280. tag->flags |= FL_CLOSED;
  1281. }
  1282. state = parse_attr_name;
  1283. *savep = in;
  1284. }
  1285. break;
  1286. case ignore_bad_tag:
  1287. break;
  1288. }
  1289. *statep = state;
  1290. }
  1291. struct rspamd_url *
  1292. rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
  1293. struct html_tag_component *comp)
  1294. {
  1295. struct rspamd_url *url;
  1296. guint saved_flags = 0;
  1297. gchar *decoded;
  1298. gint rc;
  1299. gsize decoded_len;
  1300. const gchar *p, *s, *prefix = "http://";
  1301. gchar *d;
  1302. guint i;
  1303. gsize dlen;
  1304. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  1305. static const gchar hexdigests[16] = "0123456789abcdef";
  1306. p = start;
  1307. /* Strip spaces from the url */
  1308. /* Head spaces */
  1309. while (p < start + len && g_ascii_isspace (*p)) {
  1310. p ++;
  1311. start ++;
  1312. len --;
  1313. }
  1314. if (comp) {
  1315. comp->start = p;
  1316. comp->len = len;
  1317. }
  1318. /* Trailing spaces */
  1319. p = start + len - 1;
  1320. while (p >= start && g_ascii_isspace (*p)) {
  1321. p --;
  1322. len --;
  1323. if (comp) {
  1324. comp->len --;
  1325. }
  1326. }
  1327. s = start;
  1328. dlen = 0;
  1329. for (i = 0; i < len; i ++) {
  1330. if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1331. dlen += 3;
  1332. }
  1333. else {
  1334. dlen ++;
  1335. }
  1336. }
  1337. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1338. if (len >= sizeof ("mailto:") &&
  1339. (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
  1340. memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
  1341. memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
  1342. /* Exclusion, has valid but 'strange' prefix */
  1343. }
  1344. else {
  1345. for (i = 0; i < len; i ++) {
  1346. if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
  1347. if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
  1348. prefix = "http:";
  1349. dlen += sizeof ("http:") - 1;
  1350. no_prefix = TRUE;
  1351. }
  1352. else if (s[i] == '@') {
  1353. /* Likely email prefix */
  1354. prefix = "mailto://";
  1355. dlen += sizeof ("mailto://") - 1;
  1356. no_prefix = TRUE;
  1357. }
  1358. else if (s[i] == ':' && i != 0) {
  1359. /* Special case */
  1360. no_prefix = FALSE;
  1361. }
  1362. else {
  1363. if (i == 0) {
  1364. /* No valid data */
  1365. return NULL;
  1366. }
  1367. else {
  1368. no_prefix = TRUE;
  1369. dlen += strlen (prefix);
  1370. }
  1371. }
  1372. break;
  1373. }
  1374. }
  1375. }
  1376. }
  1377. decoded = rspamd_mempool_alloc (pool, dlen + 1);
  1378. d = decoded;
  1379. if (no_prefix) {
  1380. gsize plen = strlen (prefix);
  1381. memcpy (d, prefix, plen);
  1382. d += plen;
  1383. }
  1384. /*
  1385. * We also need to remove all internal newlines, spaces
  1386. * and encode unsafe characters
  1387. */
  1388. for (i = 0; i < len; i ++) {
  1389. if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
  1390. continue;
  1391. }
  1392. else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1393. /* URL encode */
  1394. *d++ = '%';
  1395. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  1396. *d++ = hexdigests[s[i] & 0xf];
  1397. has_bad_chars = TRUE;
  1398. }
  1399. else {
  1400. *d++ = s[i];
  1401. }
  1402. }
  1403. *d = '\0';
  1404. dlen = d - decoded;
  1405. url = rspamd_mempool_alloc0 (pool, sizeof (*url));
  1406. enum rspamd_normalise_result norm_res;
  1407. norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
  1408. if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
  1409. saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
  1410. }
  1411. if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
  1412. saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
  1413. if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
  1414. saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
  1415. }
  1416. }
  1417. rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  1418. /* Filter some completely damaged urls */
  1419. if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
  1420. !((url->protocol & PROTOCOL_UNKNOWN))) {
  1421. url->flags |= saved_flags;
  1422. if (has_bad_chars) {
  1423. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1424. }
  1425. if (no_prefix) {
  1426. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1427. if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
  1428. /* Ignore urls with both no schema and no tld */
  1429. return NULL;
  1430. }
  1431. }
  1432. decoded = url->string;
  1433. decoded_len = url->urllen;
  1434. if (comp) {
  1435. comp->start = decoded;
  1436. comp->len = decoded_len;
  1437. }
  1438. /* Spaces in href usually mean an attempt to obfuscate URL */
  1439. /* See https://github.com/vstakhov/rspamd/issues/593 */
  1440. #if 0
  1441. if (has_spaces) {
  1442. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1443. }
  1444. #endif
  1445. return url;
  1446. }
  1447. return NULL;
  1448. }
  1449. static struct rspamd_url *
  1450. rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1451. struct html_content *hc)
  1452. {
  1453. struct html_tag_component *comp;
  1454. GList *cur;
  1455. struct rspamd_url *url;
  1456. const gchar *start;
  1457. gsize len;
  1458. cur = tag->params->head;
  1459. while (cur) {
  1460. comp = cur->data;
  1461. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1462. start = comp->start;
  1463. len = comp->len;
  1464. /* Check base url */
  1465. if (hc && hc->base_url && comp->len > 2) {
  1466. /*
  1467. * Relative url cannot start from the following:
  1468. * schema://
  1469. * data:
  1470. * slash
  1471. */
  1472. gchar *buf;
  1473. gsize orig_len;
  1474. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1475. if (len >= sizeof ("data:") &&
  1476. g_ascii_strncasecmp (start, "data:", sizeof ("data:") - 1) == 0) {
  1477. /* Image data url, never insert as url */
  1478. return NULL;
  1479. }
  1480. /* Assume relative url */
  1481. gboolean need_slash = FALSE;
  1482. orig_len = len;
  1483. len += hc->base_url->urllen;
  1484. if (hc->base_url->datalen == 0) {
  1485. need_slash = TRUE;
  1486. len ++;
  1487. }
  1488. buf = rspamd_mempool_alloc (pool, len + 1);
  1489. rspamd_snprintf (buf, len + 1, "%*s%s%*s",
  1490. hc->base_url->urllen, hc->base_url->string,
  1491. need_slash ? "/" : "",
  1492. (gint)orig_len, start);
  1493. start = buf;
  1494. }
  1495. else if (start[0] == '/' && start[1] != '/') {
  1496. /* Relative to the hostname */
  1497. orig_len = len;
  1498. len += hc->base_url->hostlen + hc->base_url->protocollen +
  1499. 3 /* for :// */;
  1500. buf = rspamd_mempool_alloc (pool, len + 1);
  1501. rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
  1502. hc->base_url->protocollen, hc->base_url->string,
  1503. hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
  1504. (gint)orig_len, start);
  1505. start = buf;
  1506. }
  1507. }
  1508. url = rspamd_html_process_url (pool, start, len, comp);
  1509. if (url && tag->extra == NULL) {
  1510. tag->extra = url;
  1511. }
  1512. return url;
  1513. }
  1514. cur = g_list_next (cur);
  1515. }
  1516. return NULL;
  1517. }
  1518. struct rspamd_html_url_query_cbd {
  1519. rspamd_mempool_t *pool;
  1520. khash_t (rspamd_url_hash) *url_set;
  1521. struct rspamd_url *url;
  1522. GPtrArray *part_urls;
  1523. };
  1524. static gboolean
  1525. rspamd_html_url_query_callback (struct rspamd_url *url, gsize start_offset,
  1526. gsize end_offset, gpointer ud)
  1527. {
  1528. struct rspamd_html_url_query_cbd *cbd =
  1529. (struct rspamd_html_url_query_cbd *)ud;
  1530. rspamd_mempool_t *pool;
  1531. pool = cbd->pool;
  1532. if (url->protocol == PROTOCOL_MAILTO) {
  1533. if (url->userlen == 0) {
  1534. return FALSE;
  1535. }
  1536. }
  1537. msg_debug_html ("found url %s in query of url"
  1538. " %*s", url->string,
  1539. cbd->url->querylen, rspamd_url_query_unsafe (cbd->url));
  1540. url->flags |= RSPAMD_URL_FLAG_QUERY;
  1541. if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
  1542. g_ptr_array_add (cbd->part_urls, url);
  1543. }
  1544. return TRUE;
  1545. }
  1546. static void
  1547. rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
  1548. khash_t (rspamd_url_hash) *url_set,
  1549. GPtrArray *part_urls)
  1550. {
  1551. if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
  1552. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1553. }
  1554. if (url->querylen > 0) {
  1555. struct rspamd_html_url_query_cbd qcbd;
  1556. qcbd.pool = pool;
  1557. qcbd.url_set = url_set;
  1558. qcbd.url = url;
  1559. qcbd.part_urls = part_urls;
  1560. rspamd_url_find_multiple(pool,
  1561. rspamd_url_query_unsafe (url), url->querylen,
  1562. RSPAMD_URL_FIND_ALL, NULL,
  1563. rspamd_html_url_query_callback, &qcbd);
  1564. }
  1565. if (part_urls) {
  1566. g_ptr_array_add (part_urls, url);
  1567. }
  1568. }
  1569. static void
  1570. rspamd_html_process_data_image (rspamd_mempool_t *pool,
  1571. struct html_image *img,
  1572. struct html_tag_component *src)
  1573. {
  1574. /*
  1575. * Here, we do very basic processing of the data:
  1576. * detect if we have something like: ``
  1577. * We only parse base64 encoded data.
  1578. * We ignore content type so far
  1579. */
  1580. struct rspamd_image *parsed_image;
  1581. const gchar *semicolon_pos = NULL, *end = src->start + src->len;
  1582. semicolon_pos = src->start;
  1583. while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
  1584. if (end - semicolon_pos > sizeof ("base64,")) {
  1585. if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
  1586. const gchar *data_pos = semicolon_pos + sizeof ("base64,");
  1587. gchar *decoded;
  1588. gsize encoded_len = end - data_pos, decoded_len;
  1589. rspamd_ftok_t inp;
  1590. decoded_len = (encoded_len / 4 * 3) + 12;
  1591. decoded = rspamd_mempool_alloc (pool, decoded_len);
  1592. rspamd_cryptobox_base64_decode (data_pos, encoded_len,
  1593. decoded, &decoded_len);
  1594. inp.begin = decoded;
  1595. inp.len = decoded_len;
  1596. parsed_image = rspamd_maybe_process_image (pool, &inp);
  1597. if (parsed_image) {
  1598. msg_debug_html ("detected %s image of size %ud x %ud in data url",
  1599. rspamd_image_type_str (parsed_image->type),
  1600. parsed_image->width, parsed_image->height);
  1601. img->embedded_image = parsed_image;
  1602. }
  1603. }
  1604. break;
  1605. }
  1606. else {
  1607. /* Nothing useful */
  1608. return;
  1609. }
  1610. semicolon_pos ++;
  1611. }
  1612. }
  1613. static void
  1614. rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1615. struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
  1616. GPtrArray *part_urls,
  1617. GByteArray *dest)
  1618. {
  1619. struct html_tag_component *comp;
  1620. struct html_image *img;
  1621. rspamd_ftok_t fstr;
  1622. const guchar *p;
  1623. GList *cur;
  1624. gulong val;
  1625. gboolean seen_width = FALSE, seen_height = FALSE;
  1626. goffset pos;
  1627. cur = tag->params->head;
  1628. img = rspamd_mempool_alloc0 (pool, sizeof (*img));
  1629. img->tag = tag;
  1630. tag->flags |= FL_IMAGE;
  1631. while (cur) {
  1632. comp = cur->data;
  1633. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1634. fstr.begin = (gchar *)comp->start;
  1635. fstr.len = comp->len;
  1636. img->src = rspamd_mempool_ftokdup (pool, &fstr);
  1637. if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
  1638. "cid:", sizeof ("cid:") - 1) == 0) {
  1639. /* We have an embedded image */
  1640. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
  1641. }
  1642. else {
  1643. if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
  1644. "data:", sizeof ("data:") - 1) == 0) {
  1645. /* We have an embedded image in HTML tag */
  1646. img->flags |=
  1647. (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
  1648. rspamd_html_process_data_image (pool, img, comp);
  1649. hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
  1650. }
  1651. else {
  1652. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
  1653. if (img->src) {
  1654. img->url = rspamd_html_process_url (pool,
  1655. img->src, fstr.len, NULL);
  1656. if (img->url) {
  1657. img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
  1658. if (rspamd_url_set_add_or_increase(url_set, img->url, false) &&
  1659. part_urls) {
  1660. g_ptr_array_add (part_urls, img->url);
  1661. }
  1662. }
  1663. }
  1664. }
  1665. }
  1666. }
  1667. else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
  1668. rspamd_strtoul (comp->start, comp->len, &val);
  1669. img->height = val;
  1670. seen_height = TRUE;
  1671. }
  1672. else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
  1673. rspamd_strtoul (comp->start, comp->len, &val);
  1674. img->width = val;
  1675. seen_width = TRUE;
  1676. }
  1677. else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
  1678. /* Try to search for height= or width= in style tag */
  1679. if (!seen_height && comp->len > 0) {
  1680. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1681. "height", sizeof ("height") - 1);
  1682. if (pos != -1) {
  1683. p = comp->start + pos + sizeof ("height") - 1;
  1684. while (p < comp->start + comp->len) {
  1685. if (g_ascii_isdigit (*p)) {
  1686. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1687. img->height = val;
  1688. break;
  1689. }
  1690. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1691. /* Fallback */
  1692. break;
  1693. }
  1694. p ++;
  1695. }
  1696. }
  1697. }
  1698. if (!seen_width && comp->len > 0) {
  1699. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1700. "width", sizeof ("width") - 1);
  1701. if (pos != -1) {
  1702. p = comp->start + pos + sizeof ("width") - 1;
  1703. while (p < comp->start + comp->len) {
  1704. if (g_ascii_isdigit (*p)) {
  1705. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1706. img->width = val;
  1707. break;
  1708. }
  1709. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1710. /* Fallback */
  1711. break;
  1712. }
  1713. p ++;
  1714. }
  1715. }
  1716. }
  1717. }
  1718. else if (comp->type == RSPAMD_HTML_COMPONENT_ALT && comp->len > 0 && dest != NULL) {
  1719. if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) {
  1720. /* Add a space */
  1721. g_byte_array_append (dest, " ", 1);
  1722. }
  1723. g_byte_array_append (dest, comp->start, comp->len);
  1724. if (!g_ascii_isspace (dest->data[dest->len - 1])) {
  1725. /* Add a space */
  1726. g_byte_array_append (dest, " ", 1);
  1727. }
  1728. }
  1729. cur = g_list_next (cur);
  1730. }
  1731. if (hc->images == NULL) {
  1732. hc->images = g_ptr_array_sized_new (4);
  1733. rspamd_mempool_notify_alloc (pool, 4 * sizeof (gpointer) + sizeof (GPtrArray));
  1734. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  1735. hc->images);
  1736. }
  1737. if (img->embedded_image) {
  1738. if (!seen_height) {
  1739. img->height = img->embedded_image->height;
  1740. }
  1741. if (!seen_width) {
  1742. img->width = img->embedded_image->width;
  1743. }
  1744. }
  1745. g_ptr_array_add (hc->images, img);
  1746. tag->extra = img;
  1747. }
  1748. static void
  1749. rspamd_html_process_link_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1750. struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
  1751. GPtrArray *part_urls)
  1752. {
  1753. struct html_tag_component *comp;
  1754. GList *cur;
  1755. cur = tag->params->head;
  1756. while (cur) {
  1757. comp = cur->data;
  1758. if (comp->type == RSPAMD_HTML_COMPONENT_REL && comp->len > 0) {
  1759. if (comp->len == sizeof ("icon") - 1 &&
  1760. rspamd_lc_cmp (comp->start, "icon", sizeof ("icon") - 1) == 0) {
  1761. rspamd_html_process_img_tag (pool, tag, hc, url_set, part_urls, NULL);
  1762. }
  1763. }
  1764. cur = g_list_next (cur);
  1765. }
  1766. }
  1767. static void
  1768. rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
  1769. {
  1770. const gchar *p = line, *end = line + len;
  1771. char hexbuf[7];
  1772. rspamd_ftok_t search;
  1773. struct html_color *el;
  1774. memset (cl, 0, sizeof (*cl));
  1775. if (*p == '#') {
  1776. /* HEX color */
  1777. p ++;
  1778. rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
  1779. cl->d.val = strtoul (hexbuf, NULL, 16);
  1780. cl->d.comp.alpha = 255;
  1781. cl->valid = TRUE;
  1782. }
  1783. else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
  1784. /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
  1785. enum {
  1786. obrace,
  1787. num1,
  1788. num2,
  1789. num3,
  1790. num4,
  1791. skip_spaces
  1792. } state = skip_spaces, next_state = obrace;
  1793. gulong r = 0, g = 0, b = 0, opacity = 255;
  1794. const gchar *c;
  1795. gboolean valid = FALSE;
  1796. p += 3;
  1797. if (*p == 'a') {
  1798. p ++;
  1799. }
  1800. c = p;
  1801. while (p < end) {
  1802. switch (state) {
  1803. case obrace:
  1804. if (*p == '(') {
  1805. p ++;
  1806. state = skip_spaces;
  1807. next_state = num1;
  1808. }
  1809. else if (g_ascii_isspace (*p)) {
  1810. state = skip_spaces;
  1811. next_state = obrace;
  1812. }
  1813. else {
  1814. goto stop;
  1815. }
  1816. break;
  1817. case num1:
  1818. if (*p == ',') {
  1819. if (!rspamd_strtoul (c, p - c, &r)) {
  1820. goto stop;
  1821. }
  1822. p ++;
  1823. state = skip_spaces;
  1824. next_state = num2;
  1825. }
  1826. else if (!g_ascii_isdigit (*p)) {
  1827. goto stop;
  1828. }
  1829. else {
  1830. p ++;
  1831. }
  1832. break;
  1833. case num2:
  1834. if (*p == ',') {
  1835. if (!rspamd_strtoul (c, p - c, &g)) {
  1836. goto stop;
  1837. }
  1838. p ++;
  1839. state = skip_spaces;
  1840. next_state = num3;
  1841. }
  1842. else if (!g_ascii_isdigit (*p)) {
  1843. goto stop;
  1844. }
  1845. else {
  1846. p ++;
  1847. }
  1848. break;
  1849. case num3:
  1850. if (*p == ',') {
  1851. if (!rspamd_strtoul (c, p - c, &b)) {
  1852. goto stop;
  1853. }
  1854. valid = TRUE;
  1855. p ++;
  1856. state = skip_spaces;
  1857. next_state = num4;
  1858. }
  1859. else if (*p == ')') {
  1860. if (!rspamd_strtoul (c, p - c, &b)) {
  1861. goto stop;
  1862. }
  1863. valid = TRUE;
  1864. goto stop;
  1865. }
  1866. else if (!g_ascii_isdigit (*p)) {
  1867. goto stop;
  1868. }
  1869. else {
  1870. p ++;
  1871. }
  1872. break;
  1873. case num4:
  1874. if (*p == ',') {
  1875. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1876. goto stop;
  1877. }
  1878. valid = TRUE;
  1879. goto stop;
  1880. }
  1881. else if (*p == ')') {
  1882. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1883. goto stop;
  1884. }
  1885. valid = TRUE;
  1886. goto stop;
  1887. }
  1888. else if (!g_ascii_isdigit (*p)) {
  1889. goto stop;
  1890. }
  1891. else {
  1892. p ++;
  1893. }
  1894. break;
  1895. case skip_spaces:
  1896. if (!g_ascii_isspace (*p)) {
  1897. c = p;
  1898. state = next_state;
  1899. }
  1900. else {
  1901. p ++;
  1902. }
  1903. break;
  1904. }
  1905. }
  1906. stop:
  1907. if (valid) {
  1908. cl->d.comp.r = r;
  1909. cl->d.comp.g = g;
  1910. cl->d.comp.b = b;
  1911. cl->d.comp.alpha = opacity;
  1912. cl->valid = TRUE;
  1913. }
  1914. }
  1915. else {
  1916. khiter_t k;
  1917. /* Compare color by name */
  1918. search.begin = line;
  1919. search.len = len;
  1920. k = kh_get (color_by_name, html_color_by_name, &search);
  1921. if (k != kh_end (html_color_by_name)) {
  1922. el = &kh_val (html_color_by_name, k);
  1923. memcpy (cl, el, sizeof (*cl));
  1924. cl->d.comp.alpha = 255; /* Non transparent */
  1925. }
  1926. }
  1927. }
  1928. /*
  1929. * Target is used for in and out if this function returns TRUE
  1930. */
  1931. static gboolean
  1932. rspamd_html_process_css_size (const gchar *suffix, gsize len,
  1933. gdouble *tgt)
  1934. {
  1935. gdouble sz = *tgt;
  1936. gboolean ret = FALSE;
  1937. if (len >= 2) {
  1938. if (memcmp (suffix, "px", 2) == 0) {
  1939. sz = (guint) sz; /* Round to number */
  1940. ret = TRUE;
  1941. }
  1942. else if (memcmp (suffix, "em", 2) == 0) {
  1943. /* EM is 16 px, so multiply and round */
  1944. sz = (guint) (sz * 16.0);
  1945. ret = TRUE;
  1946. }
  1947. else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
  1948. /* equal to EM in our case */
  1949. sz = (guint) (sz * 16.0);
  1950. ret = TRUE;
  1951. }
  1952. else if (memcmp (suffix, "ex", 2) == 0) {
  1953. /*
  1954. * Represents the x-height of the element's font.
  1955. * On fonts with the "x" letter, this is generally the height
  1956. * of lowercase letters in the font; 1ex = 0.5em in many fonts.
  1957. */
  1958. sz = (guint) (sz * 8.0);
  1959. ret = TRUE;
  1960. }
  1961. else if (memcmp (suffix, "vw", 2) == 0) {
  1962. /*
  1963. * Vewport width in percentages:
  1964. * we assume 1% of viewport width as 8px
  1965. */
  1966. sz = (guint) (sz * 8.0);
  1967. ret = TRUE;
  1968. }
  1969. else if (memcmp (suffix, "vh", 2) == 0) {
  1970. /*
  1971. * Vewport height in percentages
  1972. * we assume 1% of viewport width as 6px
  1973. */
  1974. sz = (guint) (sz * 6.0);
  1975. ret = TRUE;
  1976. }
  1977. else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
  1978. /*
  1979. * Vewport width in percentages
  1980. * we assume 1% of viewport width as 6px
  1981. */
  1982. sz = (guint) (sz * 8.0);
  1983. ret = TRUE;
  1984. }
  1985. else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
  1986. /*
  1987. * Vewport height in percentages
  1988. * we assume 1% of viewport width as 6px
  1989. */
  1990. sz = (guint) (sz * 6.0);
  1991. ret = TRUE;
  1992. }
  1993. else if (memcmp (suffix, "pt", 2) == 0) {
  1994. sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
  1995. ret = TRUE;
  1996. }
  1997. else if (memcmp (suffix, "cm", 2) == 0) {
  1998. sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
  1999. ret = TRUE;
  2000. }
  2001. else if (memcmp (suffix, "mm", 2) == 0) {
  2002. sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
  2003. ret = TRUE;
  2004. }
  2005. else if (memcmp (suffix, "in", 2) == 0) {
  2006. sz = (guint) (sz * 96.0); /* 96px */
  2007. ret = TRUE;
  2008. }
  2009. else if (memcmp (suffix, "pc", 2) == 0) {
  2010. sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
  2011. ret = TRUE;
  2012. }
  2013. }
  2014. else if (suffix[0] == '%') {
  2015. /* Percentages from 16 px */
  2016. sz = (guint)(sz / 100.0 * 16.0);
  2017. ret = TRUE;
  2018. }
  2019. if (ret) {
  2020. *tgt = sz;
  2021. }
  2022. return ret;
  2023. }
  2024. static void
  2025. rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
  2026. gboolean is_css)
  2027. {
  2028. const gchar *p = line, *end = line + len;
  2029. gchar *err = NULL, numbuf[64];
  2030. gdouble sz = 0;
  2031. gboolean failsafe = FALSE;
  2032. while (p < end && g_ascii_isspace (*p)) {
  2033. p ++;
  2034. len --;
  2035. }
  2036. if (g_ascii_isdigit (*p)) {
  2037. rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
  2038. sz = strtod (numbuf, &err);
  2039. /* Now check leftover */
  2040. if (sz < 0) {
  2041. sz = 0;
  2042. }
  2043. }
  2044. else {
  2045. /* Ignore the rest */
  2046. failsafe = TRUE;
  2047. sz = is_css ? 16 : 1;
  2048. /* TODO: add textual fonts descriptions */
  2049. }
  2050. if (err && *err != '\0') {
  2051. const gchar *e = err;
  2052. gsize slen;
  2053. /* Skip spaces */
  2054. while (*e && g_ascii_isspace (*e)) {
  2055. e ++;
  2056. }
  2057. /* Lowercase */
  2058. slen = strlen (e);
  2059. rspamd_str_lc ((gchar *)e, slen);
  2060. if (!rspamd_html_process_css_size (e, slen, &sz)) {
  2061. failsafe = TRUE;
  2062. }
  2063. }
  2064. else {
  2065. /* Failsafe naked number */
  2066. failsafe = TRUE;
  2067. }
  2068. if (failsafe) {
  2069. if (is_css) {
  2070. /*
  2071. * In css mode we usually ignore sizes, but let's treat
  2072. * small sizes specially
  2073. */
  2074. if (sz < 1) {
  2075. sz = 0;
  2076. } else {
  2077. sz = 16; /* Ignore */
  2078. }
  2079. } else {
  2080. /* In non-css mode we have to check legacy size */
  2081. sz = sz >= 1 ? sz * 16 : 16;
  2082. }
  2083. }
  2084. if (sz > 32) {
  2085. sz = 32;
  2086. }
  2087. *fs = sz;
  2088. }
  2089. static void
  2090. rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
  2091. struct html_content *hc, const gchar *style, guint len)
  2092. {
  2093. const gchar *p, *c, *end, *key = NULL;
  2094. enum {
  2095. read_key,
  2096. read_colon,
  2097. read_value,
  2098. skip_spaces,
  2099. } state = skip_spaces, next_state = read_key;
  2100. guint klen = 0;
  2101. gdouble opacity = 1.0;
  2102. p = style;
  2103. c = p;
  2104. end = p + len;
  2105. while (p <= end) {
  2106. switch(state) {
  2107. case read_key:
  2108. if (p == end || *p == ':') {
  2109. key = c;
  2110. klen = p - c;
  2111. state = skip_spaces;
  2112. next_state = read_value;
  2113. }
  2114. else if (g_ascii_isspace (*p)) {
  2115. key = c;
  2116. klen = p - c;
  2117. state = skip_spaces;
  2118. next_state = read_colon;
  2119. }
  2120. p ++;
  2121. break;
  2122. case read_colon:
  2123. if (p == end || *p == ':') {
  2124. state = skip_spaces;
  2125. next_state = read_value;
  2126. }
  2127. p ++;
  2128. break;
  2129. case read_value:
  2130. if (p == end || *p == ';') {
  2131. if (key && klen && p - c > 0) {
  2132. if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
  2133. || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
  2134. rspamd_html_process_color (c, p - c, &bl->font_color);
  2135. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  2136. }
  2137. else if ((klen == 16 && g_ascii_strncasecmp (key,
  2138. "background-color", 16) == 0) ||
  2139. (klen == 10 && g_ascii_strncasecmp (key,
  2140. "background", 10) == 0)) {
  2141. rspamd_html_process_color (c, p - c, &bl->background_color);
  2142. msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
  2143. }
  2144. else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
  2145. if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
  2146. "none", 4) != -1) {
  2147. bl->visible = FALSE;
  2148. msg_debug_html ("tag is not visible");
  2149. }
  2150. }
  2151. else if (klen == 9 &&
  2152. g_ascii_strncasecmp (key, "font-size", 9) == 0) {
  2153. rspamd_html_process_font_size (c, p - c,
  2154. &bl->font_size, TRUE);
  2155. msg_debug_html ("got font size: %ud", bl->font_size);
  2156. }
  2157. else if (klen == 7 &&
  2158. g_ascii_strncasecmp (key, "opacity", 7) == 0) {
  2159. gchar numbuf[64];
  2160. rspamd_strlcpy (numbuf, c,
  2161. MIN (sizeof (numbuf), p - c + 1));
  2162. opacity = strtod (numbuf, NULL);
  2163. if (opacity > 1) {
  2164. opacity = 1;
  2165. }
  2166. else if (opacity < 0) {
  2167. opacity = 0;
  2168. }
  2169. bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
  2170. }
  2171. else if (klen == 10 &&
  2172. g_ascii_strncasecmp (key, "visibility", 10) == 0) {
  2173. if (p - c >= 6 && rspamd_substring_search_caseless (c,
  2174. p - c,
  2175. "hidden", 6) != -1) {
  2176. bl->visible = FALSE;
  2177. msg_debug_html ("tag is not visible");
  2178. }
  2179. }
  2180. }
  2181. key = NULL;
  2182. klen = 0;
  2183. state = skip_spaces;
  2184. next_state = read_key;
  2185. }
  2186. p ++;
  2187. break;
  2188. case skip_spaces:
  2189. if (p < end && !g_ascii_isspace (*p)) {
  2190. c = p;
  2191. state = next_state;
  2192. }
  2193. else {
  2194. p ++;
  2195. }
  2196. break;
  2197. }
  2198. }
  2199. }
  2200. static void
  2201. rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  2202. struct html_content *hc)
  2203. {
  2204. struct html_tag_component *comp;
  2205. struct html_block *bl;
  2206. rspamd_ftok_t fstr;
  2207. GList *cur;
  2208. cur = tag->params->head;
  2209. bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
  2210. bl->tag = tag;
  2211. bl->visible = TRUE;
  2212. bl->font_size = (guint)-1;
  2213. bl->font_color.d.comp.alpha = 255;
  2214. while (cur) {
  2215. comp = cur->data;
  2216. if (comp->len > 0) {
  2217. switch (comp->type) {
  2218. case RSPAMD_HTML_COMPONENT_COLOR:
  2219. fstr.begin = (gchar *) comp->start;
  2220. fstr.len = comp->len;
  2221. rspamd_html_process_color (comp->start, comp->len,
  2222. &bl->font_color);
  2223. msg_debug_html ("tag %*s; got color: %xd",
  2224. tag->name.len, tag->name.start, bl->font_color.d.val);
  2225. break;
  2226. case RSPAMD_HTML_COMPONENT_BGCOLOR:
  2227. fstr.begin = (gchar *) comp->start;
  2228. fstr.len = comp->len;
  2229. rspamd_html_process_color (comp->start, comp->len,
  2230. &bl->background_color);
  2231. msg_debug_html ("tag %*s; got color: %xd",
  2232. tag->name.len, tag->name.start, bl->font_color.d.val);
  2233. if (tag->id == Tag_BODY) {
  2234. /* Set global background color */
  2235. memcpy (&hc->bgcolor, &bl->background_color,
  2236. sizeof (hc->bgcolor));
  2237. }
  2238. break;
  2239. case RSPAMD_HTML_COMPONENT_STYLE:
  2240. bl->style.len = comp->len;
  2241. bl->style.start = comp->start;
  2242. msg_debug_html ("tag: %*s; got style: %*s",
  2243. tag->name.len, tag->name.start,
  2244. (gint) bl->style.len, bl->style.start);
  2245. rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
  2246. break;
  2247. case RSPAMD_HTML_COMPONENT_CLASS:
  2248. fstr.begin = (gchar *) comp->start;
  2249. fstr.len = comp->len;
  2250. bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
  2251. msg_debug_html ("tag: %*s; got class: %s",
  2252. tag->name.len, tag->name.start, bl->html_class);
  2253. break;
  2254. case RSPAMD_HTML_COMPONENT_SIZE:
  2255. /* Not supported by html5 */
  2256. /* FIXME maybe support it */
  2257. bl->font_size = 16;
  2258. msg_debug_html ("tag %*s; got size: %*s",
  2259. tag->name.len, tag->name.start,
  2260. (gint)comp->len, comp->start);
  2261. break;
  2262. default:
  2263. /* NYI */
  2264. break;
  2265. }
  2266. }
  2267. cur = g_list_next (cur);
  2268. }
  2269. if (hc->blocks == NULL) {
  2270. hc->blocks = g_ptr_array_sized_new (64);
  2271. rspamd_mempool_notify_alloc (pool, 64 * sizeof (gpointer) + sizeof (GPtrArray));
  2272. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  2273. hc->blocks);
  2274. }
  2275. g_ptr_array_add (hc->blocks, bl);
  2276. tag->extra = bl;
  2277. }
  2278. static void
  2279. rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
  2280. GList **exceptions,
  2281. khash_t (rspamd_url_hash) *url_set,
  2282. GByteArray *dest,
  2283. gint href_offset,
  2284. struct rspamd_url *url)
  2285. {
  2286. struct rspamd_url *displayed_url = NULL;
  2287. struct rspamd_url *turl;
  2288. gboolean url_found = FALSE;
  2289. struct rspamd_process_exception *ex;
  2290. enum rspamd_normalise_result norm_res;
  2291. guint saved_flags = 0;
  2292. gsize dlen;
  2293. if (href_offset < 0) {
  2294. /* No dispalyed url, just some text within <a> tag */
  2295. return;
  2296. }
  2297. url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
  2298. rspamd_strlcpy (url->visible_part, dest->data + href_offset,
  2299. dest->len - href_offset + 1);
  2300. dlen = dest->len - href_offset;
  2301. url->visible_part =
  2302. (gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n");
  2303. norm_res = rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen);
  2304. if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
  2305. saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
  2306. }
  2307. rspamd_html_url_is_phished (pool, url,
  2308. url->visible_part,
  2309. dlen,
  2310. &url_found, &displayed_url);
  2311. if (url_found) {
  2312. url->flags |= saved_flags|RSPAMD_URL_FLAG_DISPLAY_URL;
  2313. }
  2314. if (exceptions && url_found) {
  2315. ex = rspamd_mempool_alloc (pool,
  2316. sizeof (*ex));
  2317. ex->pos = href_offset;
  2318. ex->len = dest->len - href_offset;
  2319. ex->type = RSPAMD_EXCEPTION_URL;
  2320. ex->ptr = url;
  2321. *exceptions = g_list_prepend (*exceptions,
  2322. ex);
  2323. }
  2324. if (displayed_url && url_set) {
  2325. turl = rspamd_url_set_add_or_return (url_set,
  2326. displayed_url);
  2327. if (turl != NULL) {
  2328. /* Here, we assume the following:
  2329. * if we have a URL in the text part which
  2330. * is the same as displayed URL in the
  2331. * HTML part, we assume that it is also
  2332. * hint only.
  2333. */
  2334. if (turl->flags &
  2335. RSPAMD_URL_FLAG_FROM_TEXT) {
  2336. turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  2337. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  2338. }
  2339. turl->count ++;
  2340. }
  2341. else {
  2342. /* Already inserted by `rspamd_url_set_add_or_return` */
  2343. }
  2344. }
  2345. }
  2346. static gboolean
  2347. rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
  2348. {
  2349. GNode *child;
  2350. struct html_tag *tag = node->data, *cld_tag;
  2351. if (tag) {
  2352. child = node->children;
  2353. /* Summarize content length from children */
  2354. while (child) {
  2355. cld_tag = child->data;
  2356. tag->content_length += cld_tag->content_length;
  2357. child = child->next;
  2358. }
  2359. }
  2360. return FALSE;
  2361. }
  2362. static void
  2363. rspamd_html_propagate_style (struct html_content *hc,
  2364. struct html_tag *tag,
  2365. struct html_block *bl,
  2366. GQueue *blocks)
  2367. {
  2368. struct html_block *bl_parent;
  2369. gboolean push_block = FALSE;
  2370. /* Propagate from the parent if needed */
  2371. bl_parent = g_queue_peek_tail (blocks);
  2372. if (bl_parent) {
  2373. if (!bl->background_color.valid) {
  2374. /* Try to propagate background color from parent nodes */
  2375. if (bl_parent->background_color.valid) {
  2376. memcpy (&bl->background_color, &bl_parent->background_color,
  2377. sizeof (bl->background_color));
  2378. }
  2379. }
  2380. else {
  2381. push_block = TRUE;
  2382. }
  2383. if (!bl->font_color.valid) {
  2384. /* Try to propagate background color from parent nodes */
  2385. if (bl_parent->font_color.valid) {
  2386. memcpy (&bl->font_color, &bl_parent->font_color,
  2387. sizeof (bl->font_color));
  2388. }
  2389. }
  2390. else {
  2391. push_block = TRUE;
  2392. }
  2393. /* Propagate font size */
  2394. if (bl->font_size == (guint)-1) {
  2395. if (bl_parent->font_size != (guint)-1) {
  2396. bl->font_size = bl_parent->font_size;
  2397. }
  2398. }
  2399. else {
  2400. push_block = TRUE;
  2401. }
  2402. }
  2403. /* Set bgcolor to the html bgcolor and font color to black as a last resort */
  2404. if (!bl->font_color.valid) {
  2405. /* Don't touch opacity as it can be set separately */
  2406. bl->font_color.d.comp.r = 0;
  2407. bl->font_color.d.comp.g = 0;
  2408. bl->font_color.d.comp.b = 0;
  2409. bl->font_color.valid = TRUE;
  2410. }
  2411. else {
  2412. push_block = TRUE;
  2413. }
  2414. if (!bl->background_color.valid) {
  2415. memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
  2416. }
  2417. else {
  2418. push_block = TRUE;
  2419. }
  2420. if (bl->font_size == (guint)-1) {
  2421. bl->font_size = 16; /* Default for browsers */
  2422. }
  2423. else {
  2424. push_block = TRUE;
  2425. }
  2426. if (push_block && !(tag->flags & FL_CLOSED)) {
  2427. g_queue_push_tail (blocks, bl);
  2428. }
  2429. }
  2430. GByteArray*
  2431. rspamd_html_process_part_full (rspamd_mempool_t *pool,
  2432. struct html_content *hc,
  2433. GByteArray *in,
  2434. GList **exceptions,
  2435. khash_t (rspamd_url_hash) *url_set,
  2436. GPtrArray *part_urls,
  2437. bool allow_css)
  2438. {
  2439. const guchar *p, *c, *end, *savep = NULL;
  2440. guchar t;
  2441. gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
  2442. balanced;
  2443. GByteArray *dest;
  2444. guint obrace = 0, ebrace = 0;
  2445. GNode *cur_level = NULL;
  2446. gint substate = 0, len, href_offset = -1;
  2447. struct html_tag *cur_tag = NULL, *content_tag = NULL;
  2448. struct rspamd_url *url = NULL;
  2449. GQueue *styles_blocks;
  2450. enum {
  2451. parse_start = 0,
  2452. tag_begin,
  2453. sgml_tag,
  2454. xml_tag,
  2455. compound_tag,
  2456. comment_tag,
  2457. comment_content,
  2458. sgml_content,
  2459. tag_content,
  2460. tag_end,
  2461. xml_tag_end,
  2462. content_ignore,
  2463. content_write,
  2464. content_style,
  2465. content_ignore_sp
  2466. } state = parse_start;
  2467. g_assert (in != NULL);
  2468. g_assert (hc != NULL);
  2469. g_assert (pool != NULL);
  2470. rspamd_html_library_init ();
  2471. hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS));
  2472. /* Set white background color by default */
  2473. hc->bgcolor.d.comp.alpha = 0;
  2474. hc->bgcolor.d.comp.r = 255;
  2475. hc->bgcolor.d.comp.g = 255;
  2476. hc->bgcolor.d.comp.b = 255;
  2477. hc->bgcolor.valid = TRUE;
  2478. dest = g_byte_array_sized_new (in->len / 3 * 2);
  2479. styles_blocks = g_queue_new ();
  2480. p = in->data;
  2481. c = p;
  2482. end = p + in->len;
  2483. while (p < end) {
  2484. t = *p;
  2485. switch (state) {
  2486. case parse_start:
  2487. if (t == '<') {
  2488. state = tag_begin;
  2489. }
  2490. else {
  2491. /* We have no starting tag, so assume that it's content */
  2492. hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
  2493. state = content_write;
  2494. }
  2495. break;
  2496. case tag_begin:
  2497. switch (t) {
  2498. case '<':
  2499. p ++;
  2500. closing = FALSE;
  2501. break;
  2502. case '!':
  2503. state = sgml_tag;
  2504. p ++;
  2505. break;
  2506. case '?':
  2507. state = xml_tag;
  2508. hc->flags |= RSPAMD_HTML_FLAG_XML;
  2509. p ++;
  2510. break;
  2511. case '/':
  2512. closing = TRUE;
  2513. p ++;
  2514. break;
  2515. case '>':
  2516. /* Empty tag */
  2517. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2518. state = tag_end;
  2519. continue;
  2520. default:
  2521. state = tag_content;
  2522. substate = 0;
  2523. savep = NULL;
  2524. cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
  2525. cur_tag->params = g_queue_new ();
  2526. rspamd_mempool_add_destructor (pool,
  2527. (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
  2528. break;
  2529. }
  2530. break;
  2531. case sgml_tag:
  2532. switch (t) {
  2533. case '[':
  2534. state = compound_tag;
  2535. obrace = 1;
  2536. ebrace = 0;
  2537. p ++;
  2538. break;
  2539. case '-':
  2540. state = comment_tag;
  2541. p ++;
  2542. break;
  2543. default:
  2544. state = sgml_content;
  2545. break;
  2546. }
  2547. break;
  2548. case xml_tag:
  2549. if (t == '?') {
  2550. state = xml_tag_end;
  2551. }
  2552. else if (t == '>') {
  2553. /* Misformed xml tag */
  2554. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2555. state = tag_end;
  2556. continue;
  2557. }
  2558. /* We efficiently ignore xml tags */
  2559. p ++;
  2560. break;
  2561. case xml_tag_end:
  2562. if (t == '>') {
  2563. state = tag_end;
  2564. continue;
  2565. }
  2566. else {
  2567. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2568. p ++;
  2569. }
  2570. break;
  2571. case compound_tag:
  2572. if (t == '[') {
  2573. obrace ++;
  2574. }
  2575. else if (t == ']') {
  2576. ebrace ++;
  2577. }
  2578. else if (t == '>' && obrace == ebrace) {
  2579. state = tag_end;
  2580. continue;
  2581. }
  2582. p ++;
  2583. break;
  2584. case comment_tag:
  2585. if (t != '-') {
  2586. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2587. state = tag_end;
  2588. }
  2589. else {
  2590. p++;
  2591. ebrace = 0;
  2592. /*
  2593. * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
  2594. * ... the text must not start with a single
  2595. * U+003E GREATER-THAN SIGN character (>),
  2596. * nor start with a "-" (U+002D) character followed by
  2597. * a U+003E GREATER-THAN SIGN (>) character,
  2598. * nor contain two consecutive U+002D HYPHEN-MINUS
  2599. * characters (--), nor end with a "-" (U+002D) character.
  2600. */
  2601. if (p[0] == '-' && p + 1 < end && p[1] == '>') {
  2602. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2603. p ++;
  2604. state = tag_end;
  2605. }
  2606. else if (*p == '>') {
  2607. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2608. state = tag_end;
  2609. }
  2610. else {
  2611. state = comment_content;
  2612. }
  2613. }
  2614. break;
  2615. case comment_content:
  2616. if (t == '-') {
  2617. ebrace ++;
  2618. }
  2619. else if (t == '>' && ebrace >= 2) {
  2620. state = tag_end;
  2621. continue;
  2622. }
  2623. else {
  2624. ebrace = 0;
  2625. }
  2626. p ++;
  2627. break;
  2628. case content_ignore:
  2629. if (t != '<') {
  2630. p ++;
  2631. }
  2632. else {
  2633. state = tag_begin;
  2634. }
  2635. break;
  2636. case content_write:
  2637. if (t != '<') {
  2638. if (t == '&') {
  2639. need_decode = TRUE;
  2640. }
  2641. else if (g_ascii_isspace (t)) {
  2642. save_space = TRUE;
  2643. if (p > c) {
  2644. if (need_decode) {
  2645. goffset old_offset = dest->len;
  2646. if (content_tag) {
  2647. if (content_tag->content_length == 0) {
  2648. content_tag->content_offset = old_offset;
  2649. }
  2650. }
  2651. g_byte_array_append (dest, c, (p - c));
  2652. len = rspamd_html_decode_entitles_inplace (
  2653. dest->data + old_offset,
  2654. p - c);
  2655. dest->len = dest->len + len - (p - c);
  2656. if (content_tag) {
  2657. content_tag->content_length += len;
  2658. }
  2659. }
  2660. else {
  2661. len = p - c;
  2662. if (content_tag) {
  2663. if (content_tag->content_length == 0) {
  2664. content_tag->content_offset = dest->len;
  2665. }
  2666. content_tag->content_length += len;
  2667. }
  2668. g_byte_array_append (dest, c, len);
  2669. }
  2670. }
  2671. c = p;
  2672. state = content_ignore_sp;
  2673. }
  2674. else {
  2675. if (save_space) {
  2676. /* Append one space if needed */
  2677. if (dest->len > 0 &&
  2678. !g_ascii_isspace (dest->data[dest->len - 1])) {
  2679. g_byte_array_append (dest, " ", 1);
  2680. if (content_tag) {
  2681. if (content_tag->content_length == 0) {
  2682. /*
  2683. * Special case
  2684. * we have a space at the beginning but
  2685. * we have no set content_offset
  2686. * so we need to do it here
  2687. */
  2688. content_tag->content_offset = dest->len;
  2689. }
  2690. else {
  2691. content_tag->content_length++;
  2692. }
  2693. }
  2694. }
  2695. save_space = FALSE;
  2696. }
  2697. }
  2698. }
  2699. else {
  2700. if (c != p) {
  2701. if (need_decode) {
  2702. goffset old_offset = dest->len;
  2703. if (content_tag) {
  2704. if (content_tag->content_length == 0) {
  2705. content_tag->content_offset = dest->len;
  2706. }
  2707. }
  2708. g_byte_array_append (dest, c, (p - c));
  2709. len = rspamd_html_decode_entitles_inplace (
  2710. dest->data + old_offset,
  2711. p - c);
  2712. dest->len = dest->len + len - (p - c);
  2713. if (content_tag) {
  2714. content_tag->content_length += len;
  2715. }
  2716. }
  2717. else {
  2718. len = p - c;
  2719. if (content_tag) {
  2720. if (content_tag->content_length == 0) {
  2721. content_tag->content_offset = dest->len;
  2722. }
  2723. content_tag->content_length += len;
  2724. }
  2725. g_byte_array_append (dest, c, len);
  2726. }
  2727. }
  2728. content_tag = NULL;
  2729. state = tag_begin;
  2730. continue;
  2731. }
  2732. p ++;
  2733. break;
  2734. case content_style: {
  2735. /*
  2736. * We just search for the first </s substring and then pass
  2737. * the content to the parser (if needed)
  2738. */
  2739. goffset end_style = rspamd_substring_search (p, end - p,
  2740. "</", 2);
  2741. if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
  2742. /* Invalid style */
  2743. state = content_ignore;
  2744. }
  2745. else {
  2746. if (allow_css) {
  2747. GError *err = NULL;
  2748. (void)rspamd_css_parse_style (pool, p, end_style, &err);
  2749. if (err) {
  2750. msg_info_pool ("cannot parse css: %e", err);
  2751. g_error_free (err);
  2752. }
  2753. }
  2754. p += end_style;
  2755. state = tag_begin;
  2756. }
  2757. break;
  2758. }
  2759. case content_ignore_sp:
  2760. if (!g_ascii_isspace (t)) {
  2761. c = p;
  2762. state = content_write;
  2763. continue;
  2764. }
  2765. p ++;
  2766. break;
  2767. case sgml_content:
  2768. /* TODO: parse DOCTYPE here */
  2769. if (t == '>') {
  2770. state = tag_end;
  2771. /* We don't know a lot about sgml tags, ignore them */
  2772. cur_tag = NULL;
  2773. continue;
  2774. }
  2775. p ++;
  2776. break;
  2777. case tag_content:
  2778. rspamd_html_parse_tag_content (pool, hc, cur_tag,
  2779. p, &substate, &savep);
  2780. if (t == '>') {
  2781. if (closing) {
  2782. cur_tag->flags |= FL_CLOSING;
  2783. if (cur_tag->flags & FL_CLOSED) {
  2784. /* Bad mix of closed and closing */
  2785. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2786. }
  2787. closing = FALSE;
  2788. }
  2789. state = tag_end;
  2790. continue;
  2791. }
  2792. p ++;
  2793. break;
  2794. case tag_end:
  2795. substate = 0;
  2796. savep = NULL;
  2797. if (cur_tag != NULL) {
  2798. balanced = TRUE;
  2799. if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
  2800. &balanced)) {
  2801. state = content_write;
  2802. need_decode = FALSE;
  2803. }
  2804. else {
  2805. if (cur_tag->id == Tag_STYLE) {
  2806. state = content_style;
  2807. }
  2808. else {
  2809. state = content_ignore;
  2810. }
  2811. }
  2812. if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
  2813. if (cur_tag->flags & CM_UNIQUE) {
  2814. if (isset (hc->tags_seen, cur_tag->id)) {
  2815. /* Duplicate tag has been found */
  2816. hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
  2817. }
  2818. }
  2819. setbit (hc->tags_seen, cur_tag->id);
  2820. }
  2821. if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
  2822. content_tag = cur_tag;
  2823. }
  2824. /* Handle newlines */
  2825. if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
  2826. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2827. g_byte_array_append (dest, "\r\n", 2);
  2828. if (content_tag) {
  2829. if (content_tag->content_length == 0) {
  2830. /*
  2831. * Special case
  2832. * we have a \r\n at the beginning but
  2833. * we have no set content_offset
  2834. * so we need to do it here
  2835. */
  2836. content_tag->content_offset = dest->len;
  2837. }
  2838. else {
  2839. content_tag->content_length += 2;
  2840. }
  2841. }
  2842. }
  2843. save_space = FALSE;
  2844. }
  2845. if ((cur_tag->id == Tag_P ||
  2846. cur_tag->id == Tag_TR ||
  2847. cur_tag->id == Tag_DIV)) {
  2848. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2849. g_byte_array_append (dest, "\r\n", 2);
  2850. if (content_tag) {
  2851. if (content_tag->content_length == 0) {
  2852. /*
  2853. * Special case
  2854. * we have a \r\n at the beginning but
  2855. * we have no set content_offset
  2856. * so we need to get it here
  2857. */
  2858. content_tag->content_offset = dest->len;
  2859. }
  2860. else {
  2861. content_tag->content_length += 2;
  2862. }
  2863. }
  2864. }
  2865. save_space = FALSE;
  2866. }
  2867. /* XXX: uncomment when styles parsing is not so broken */
  2868. if (cur_tag->flags & FL_HREF /* && !(cur_tag->flags & FL_IGNORE) */) {
  2869. if (!(cur_tag->flags & (FL_CLOSING))) {
  2870. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2871. if (url != NULL) {
  2872. if (url_set != NULL) {
  2873. struct rspamd_url *maybe_existing =
  2874. rspamd_url_set_add_or_return (url_set, url);
  2875. if (maybe_existing == url) {
  2876. rspamd_process_html_url (pool, url, url_set,
  2877. part_urls);
  2878. }
  2879. else {
  2880. url = maybe_existing;
  2881. /* Increase count to avoid odd checks failure */
  2882. url->count ++;
  2883. }
  2884. }
  2885. href_offset = dest->len;
  2886. }
  2887. }
  2888. if (cur_tag->id == Tag_A) {
  2889. if (!balanced && cur_level && cur_level->prev) {
  2890. struct html_tag *prev_tag;
  2891. struct rspamd_url *prev_url;
  2892. prev_tag = cur_level->prev->data;
  2893. if (prev_tag->id == Tag_A &&
  2894. !(prev_tag->flags & (FL_CLOSING)) &&
  2895. prev_tag->extra) {
  2896. prev_url = prev_tag->extra;
  2897. rspamd_html_check_displayed_url (pool,
  2898. exceptions, url_set,
  2899. dest, href_offset,
  2900. prev_url);
  2901. }
  2902. }
  2903. if (cur_tag->flags & (FL_CLOSING)) {
  2904. /* Insert exception */
  2905. if (url != NULL && (gint) dest->len > href_offset) {
  2906. rspamd_html_check_displayed_url (pool,
  2907. exceptions, url_set,
  2908. dest, href_offset,
  2909. url);
  2910. }
  2911. href_offset = -1;
  2912. url = NULL;
  2913. }
  2914. }
  2915. }
  2916. else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
  2917. /*
  2918. * Base is allowed only within head tag but HTML is retarded
  2919. */
  2920. if (hc->base_url == NULL) {
  2921. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2922. if (url != NULL) {
  2923. msg_debug_html ("got valid base tag");
  2924. hc->base_url = url;
  2925. cur_tag->extra = url;
  2926. cur_tag->flags |= FL_HREF;
  2927. }
  2928. else {
  2929. msg_debug_html ("got invalid base tag!");
  2930. }
  2931. }
  2932. }
  2933. if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
  2934. rspamd_html_process_img_tag (pool, cur_tag, hc, url_set,
  2935. part_urls, dest);
  2936. }
  2937. else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
  2938. rspamd_html_process_link_tag (pool, cur_tag, hc, url_set,
  2939. part_urls);
  2940. }
  2941. else if (cur_tag->flags & FL_BLOCK) {
  2942. struct html_block *bl;
  2943. if (cur_tag->flags & FL_CLOSING) {
  2944. /* Just remove block element from the queue if any */
  2945. if (styles_blocks->length > 0) {
  2946. g_queue_pop_tail (styles_blocks);
  2947. }
  2948. }
  2949. else {
  2950. rspamd_html_process_block_tag (pool, cur_tag, hc);
  2951. bl = cur_tag->extra;
  2952. if (bl) {
  2953. rspamd_html_propagate_style (hc, cur_tag,
  2954. cur_tag->extra, styles_blocks);
  2955. /* Check visibility */
  2956. if (bl->font_size < 3 ||
  2957. bl->font_color.d.comp.alpha < 10) {
  2958. bl->visible = FALSE;
  2959. msg_debug_html ("tag is not visible: font size: "
  2960. "%d, alpha: %d",
  2961. (int)bl->font_size,
  2962. (int)bl->font_color.d.comp.alpha);
  2963. }
  2964. if (!bl->visible) {
  2965. state = content_ignore;
  2966. }
  2967. }
  2968. }
  2969. }
  2970. }
  2971. else {
  2972. state = content_write;
  2973. }
  2974. p++;
  2975. c = p;
  2976. cur_tag = NULL;
  2977. break;
  2978. }
  2979. }
  2980. if (hc->html_tags) {
  2981. g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
  2982. rspamd_html_propagate_lengths, NULL);
  2983. }
  2984. g_queue_free (styles_blocks);
  2985. hc->parsed = dest;
  2986. return dest;
  2987. }
  2988. GByteArray*
  2989. rspamd_html_process_part (rspamd_mempool_t *pool,
  2990. struct html_content *hc,
  2991. GByteArray *in)
  2992. {
  2993. return rspamd_html_process_part_full (pool, hc, in, NULL,
  2994. NULL, NULL, FALSE);
  2995. }