You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.c 71KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "util.h"
  18. #include "rspamd.h"
  19. #include "message.h"
  20. #include "html.h"
  21. #include "html_tags.h"
  22. #include "html_colors.h"
  23. #include "html_entities.h"
  24. #include "url.h"
  25. #include "contrib/libucl/khash.h"
  26. #include "libmime/images.h"
  27. #include <unicode/uversion.h>
  28. #include <unicode/ucnv.h>
  29. #if U_ICU_VERSION_MAJOR_NUM >= 46
  30. #include <unicode/uidna.h>
  31. #endif
  32. static sig_atomic_t tags_sorted = 0;
  33. static sig_atomic_t entities_sorted = 0;
  34. static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
  35. struct html_tag_def {
  36. const gchar *name;
  37. gint16 id;
  38. guint16 len;
  39. guint flags;
  40. };
  41. #define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
  42. rspamd_html_log_id, "html", pool->tag.uid, \
  43. G_STRFUNC, \
  44. __VA_ARGS__)
  45. INIT_LOG_MODULE(html)
  46. #define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
  47. static struct html_tag_def tag_defs[] = {
  48. /* W3C defined elements */
  49. TAG_DEF(Tag_A, "a", FL_HREF),
  50. TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
  51. TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
  52. TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
  53. TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  54. TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
  55. TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
  56. TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY | FL_HREF)),
  57. TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
  58. TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
  59. TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
  60. TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
  61. TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
  62. TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
  63. TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
  64. TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
  65. TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
  66. TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
  67. TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
  68. TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
  69. TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
  70. TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  71. TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  72. TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
  73. TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
  74. TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
  75. TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
  76. TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  77. TAG_DEF(Tag_EM, "em", (CM_INLINE)),
  78. TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
  79. TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
  80. TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
  81. TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
  82. TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
  83. TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
  84. TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
  85. TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
  86. TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
  87. TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
  88. TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
  89. TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  90. TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
  91. TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  92. TAG_DEF(Tag_I, "i", (CM_INLINE)),
  93. TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
  94. TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
  95. TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
  96. TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
  97. TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
  98. TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
  99. TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
  100. TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
  101. TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  102. TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY|FL_HREF)),
  103. TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
  104. TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
  105. TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
  106. TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
  107. TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
  108. TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  109. TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
  110. TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
  111. TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
  112. TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
  113. TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
  114. TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
  115. TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
  116. TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
  117. TAG_DEF(Tag_Q, "q", (CM_INLINE)),
  118. TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
  119. TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
  120. TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
  121. TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
  122. TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
  123. TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
  124. TAG_DEF(Tag_S, "s", (CM_INLINE)),
  125. TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
  126. TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
  127. TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
  128. TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
  129. TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
  130. TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
  131. TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
  132. TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
  133. TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
  134. TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
  135. TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
  136. TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
  137. TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  138. TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
  139. TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  140. TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  141. TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  142. TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
  143. TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
  144. TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
  145. TAG_DEF(Tag_U, "u", (CM_INLINE)),
  146. TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
  147. TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
  148. TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
  149. TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
  150. /* proprietary elements */
  151. TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
  152. TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
  153. TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
  154. TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
  155. TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
  156. TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
  157. TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
  158. TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
  159. TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
  160. TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
  161. TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
  162. TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
  163. TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
  164. TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
  165. TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
  166. TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
  167. TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
  168. TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
  169. };
  170. KHASH_MAP_INIT_INT (entity_by_number, const char *);
  171. KHASH_MAP_INIT_STR (entity_by_name, const char *);
  172. KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
  173. KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
  174. KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
  175. rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
  176. khash_t(entity_by_number) *html_entity_by_number;
  177. khash_t(entity_by_name) *html_entity_by_name;
  178. khash_t(tag_by_name) *html_tag_by_name;
  179. khash_t(tag_by_id) *html_tag_by_id;
  180. khash_t(color_by_name) *html_color_by_name;
  181. static void
  182. rspamd_html_library_init (void)
  183. {
  184. guint i;
  185. khiter_t k;
  186. gint rc;
  187. if (!tags_sorted) {
  188. html_tag_by_id = kh_init (tag_by_id);
  189. html_tag_by_name = kh_init (tag_by_name);
  190. kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
  191. kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
  192. for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
  193. k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
  194. kh_val (html_tag_by_id, k) = tag_defs[i];
  195. k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
  196. kh_val (html_tag_by_name, k) = tag_defs[i];
  197. }
  198. tags_sorted = 1;
  199. }
  200. if (!entities_sorted) {
  201. html_entity_by_number = kh_init (entity_by_number);
  202. html_entity_by_name = kh_init (entity_by_name);
  203. kh_resize (entity_by_number, html_entity_by_number,
  204. G_N_ELEMENTS (entities_defs));
  205. kh_resize (entity_by_name, html_entity_by_name,
  206. G_N_ELEMENTS (entities_defs));
  207. for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
  208. if (entities_defs[i].code != 0) {
  209. k = kh_put (entity_by_number, html_entity_by_number,
  210. entities_defs[i].code, &rc);
  211. kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
  212. }
  213. k = kh_put (entity_by_name, html_entity_by_name,
  214. entities_defs[i].name, &rc);
  215. kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
  216. }
  217. html_color_by_name = kh_init (color_by_name);
  218. kh_resize (color_by_name, html_color_by_name,
  219. G_N_ELEMENTS (html_colornames));
  220. rspamd_ftok_t *keys;
  221. keys = g_malloc0 (sizeof (rspamd_ftok_t) *
  222. G_N_ELEMENTS (html_colornames));
  223. for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
  224. struct html_color c;
  225. keys[i].begin = html_colornames[i].name;
  226. keys[i].len = strlen (html_colornames[i].name);
  227. k = kh_put (color_by_name, html_color_by_name,
  228. &keys[i], &rc);
  229. c.valid = true;
  230. c.d.comp.r = html_colornames[i].rgb.r;
  231. c.d.comp.g = html_colornames[i].rgb.g;
  232. c.d.comp.b = html_colornames[i].rgb.b;
  233. c.d.comp.alpha = 255;
  234. kh_val (html_color_by_name, k) = c;
  235. }
  236. entities_sorted = 1;
  237. }
  238. }
  239. static gboolean
  240. rspamd_html_check_balance (GNode * node, GNode ** cur_level)
  241. {
  242. struct html_tag *arg = node->data, *tmp;
  243. GNode *cur;
  244. if (arg->flags & FL_CLOSING) {
  245. /* First of all check whether this tag is closing tag for parent node */
  246. cur = node->parent;
  247. while (cur && cur->data) {
  248. tmp = cur->data;
  249. if (tmp->id == arg->id &&
  250. (tmp->flags & FL_CLOSED) == 0) {
  251. tmp->flags |= FL_CLOSED;
  252. /* Destroy current node as we find corresponding parent node */
  253. g_node_destroy (node);
  254. /* Change level */
  255. *cur_level = cur->parent;
  256. return TRUE;
  257. }
  258. cur = cur->parent;
  259. }
  260. }
  261. else {
  262. return TRUE;
  263. }
  264. return FALSE;
  265. }
  266. gint
  267. rspamd_html_tag_by_name (const gchar *name)
  268. {
  269. khiter_t k;
  270. k = kh_get (tag_by_name, html_tag_by_name, name);
  271. if (k != kh_end (html_tag_by_name)) {
  272. return kh_val (html_tag_by_name, k).id;
  273. }
  274. return -1;
  275. }
  276. gboolean
  277. rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
  278. {
  279. gint id;
  280. g_assert (hc != NULL);
  281. g_assert (hc->tags_seen != NULL);
  282. id = rspamd_html_tag_by_name (tagname);
  283. if (id != -1) {
  284. return isset (hc->tags_seen, id);
  285. }
  286. return FALSE;
  287. }
  288. const gchar *
  289. rspamd_html_tag_by_id (gint id)
  290. {
  291. khiter_t k;
  292. k = kh_get (tag_by_id, html_tag_by_id, id);
  293. if (k != kh_end (html_tag_by_id)) {
  294. return kh_val (html_tag_by_id, k).name;
  295. }
  296. return NULL;
  297. }
  298. /* Decode HTML entitles in text */
  299. guint
  300. rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
  301. {
  302. goffset l, rep_len;
  303. gchar *t = s, *h = s, *e = s, *end_ptr;
  304. const gchar *end;
  305. const gchar *entity;
  306. gint state = 0, base;
  307. UChar32 uc;
  308. khiter_t k;
  309. if (len == 0) {
  310. l = strlen (s);
  311. }
  312. else {
  313. l = len;
  314. }
  315. end = s + l;
  316. while (h - s < l) {
  317. switch (state) {
  318. /* Out of entity */
  319. case 0:
  320. if (*h == '&') {
  321. state = 1;
  322. e = h;
  323. h++;
  324. continue;
  325. }
  326. else {
  327. *t = *h;
  328. h++;
  329. t++;
  330. }
  331. break;
  332. case 1:
  333. if (*h == ';' && h > e) {
  334. /* Determine base */
  335. /* First find in entities table */
  336. *h = '\0';
  337. entity = e + 1;
  338. uc = 0;
  339. if (*entity != '#') {
  340. k = kh_get (entity_by_name, html_entity_by_name, entity);
  341. *h = ';';
  342. if (k != kh_end (html_entity_by_name)) {
  343. if (kh_val (html_entity_by_name, k)) {
  344. rep_len = strlen (kh_val (html_entity_by_name, k));
  345. if (end - t >= rep_len) {
  346. memcpy (t, kh_val (html_entity_by_name, k),
  347. rep_len);
  348. t += rep_len;
  349. }
  350. } else {
  351. if (end - t > h - e + 1) {
  352. memmove (t, e, h - e + 1);
  353. t += h - e + 1;
  354. }
  355. }
  356. }
  357. else {
  358. if (end - t > h - e + 1) {
  359. memmove (t, e, h - e + 1);
  360. t += h - e + 1;
  361. }
  362. }
  363. }
  364. else if (e + 2 < h) {
  365. if (*(e + 2) == 'x' || *(e + 2) == 'X') {
  366. base = 16;
  367. }
  368. else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
  369. base = 8;
  370. }
  371. else {
  372. base = 10;
  373. }
  374. if (base == 10) {
  375. uc = strtoul ((e + 2), &end_ptr, base);
  376. }
  377. else {
  378. uc = strtoul ((e + 3), &end_ptr, base);
  379. }
  380. if (end_ptr != NULL && *end_ptr != '\0') {
  381. /* Skip undecoded */
  382. *h = ';';
  383. if (end - t > h - e + 1) {
  384. memmove (t, e, h - e + 1);
  385. t += h - e + 1;
  386. }
  387. }
  388. else {
  389. /* Search for a replacement */
  390. *h = ';';
  391. k = kh_get (entity_by_number, html_entity_by_number, uc);
  392. if (k != kh_end (html_entity_by_number)) {
  393. if (kh_val (html_entity_by_number, k)) {
  394. rep_len = strlen (kh_val (html_entity_by_number, k));
  395. if (end - t >= rep_len) {
  396. memcpy (t, kh_val (html_entity_by_number, k),
  397. rep_len);
  398. t += rep_len;
  399. }
  400. } else {
  401. if (end - t > h - e + 1) {
  402. memmove (t, e, h - e + 1);
  403. t += h - e + 1;
  404. }
  405. }
  406. }
  407. else {
  408. /* Unicode point */
  409. goffset off = t - s;
  410. UBool is_error = 0;
  411. if (uc > 0) {
  412. U8_APPEND (s, off, len, uc, is_error);
  413. if (!is_error) {
  414. t = s + off;
  415. }
  416. else {
  417. /* Leave invalid entities as is */
  418. if (end - t > h - e + 1) {
  419. memmove (t, e, h - e + 1);
  420. t += h - e + 1;
  421. }
  422. }
  423. }
  424. else if (end - t > h - e + 1) {
  425. memmove (t, e, h - e + 1);
  426. t += h - e + 1;
  427. }
  428. }
  429. }
  430. }
  431. state = 0;
  432. }
  433. else if (*h == '&') {
  434. /* Previous `&` was bogus */
  435. state = 1;
  436. if (end - t > h - e) {
  437. memmove (t, e, h - e);
  438. t += h - e;
  439. }
  440. e = h;
  441. }
  442. h++;
  443. break;
  444. }
  445. }
  446. /* Leftover */
  447. if (state == 1 && h > e) {
  448. /* Unfinished entity, copy as is */
  449. if (end - t > h - e) {
  450. memmove (t, e, h - e);
  451. t += h - e;
  452. }
  453. }
  454. return (t - s);
  455. }
  456. static gboolean
  457. rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
  458. {
  459. const gchar *p1, *p2;
  460. p1 = t1->begin + t1->len - 1;
  461. p2 = t2->begin + t2->len - 1;
  462. /* Skip trailing dots */
  463. while (p1 > t1->begin) {
  464. if (*p1 != '.') {
  465. break;
  466. }
  467. p1 --;
  468. }
  469. while (p2 > t2->begin) {
  470. if (*p2 != '.') {
  471. break;
  472. }
  473. p2 --;
  474. }
  475. while (p1 > t1->begin && p2 > t2->begin) {
  476. if (*p1 != *p2) {
  477. break;
  478. }
  479. p1 --;
  480. p2 --;
  481. }
  482. if (p2 == t2->begin) {
  483. /* p2 can be subdomain of p1 if *p1 is '.' */
  484. if (p1 != t1->begin && *(p1 - 1) == '.') {
  485. return TRUE;
  486. }
  487. }
  488. else if (p1 == t1->begin) {
  489. if (p2 != t2->begin && *(p2 - 1) == '.') {
  490. return TRUE;
  491. }
  492. }
  493. return FALSE;
  494. }
  495. static void
  496. rspamd_html_url_is_phished (rspamd_mempool_t *pool,
  497. struct rspamd_url *href_url,
  498. const guchar *url_text,
  499. gsize len,
  500. gboolean *url_found,
  501. struct rspamd_url **ptext_url)
  502. {
  503. struct rspamd_url *text_url;
  504. rspamd_ftok_t phished_tld, disp_tok, href_tok;
  505. gint rc;
  506. goffset url_pos;
  507. gchar *url_str = NULL, *idn_hbuf;
  508. const guchar *end = url_text + len, *p;
  509. #if U_ICU_VERSION_MAJOR_NUM >= 46
  510. static UIDNA *udn;
  511. UErrorCode uc_err = U_ZERO_ERROR;
  512. UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
  513. #endif
  514. *url_found = FALSE;
  515. #if U_ICU_VERSION_MAJOR_NUM >= 46
  516. if (udn == NULL) {
  517. udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
  518. if (uc_err != U_ZERO_ERROR) {
  519. msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
  520. }
  521. }
  522. #endif
  523. while (url_text < end && g_ascii_isspace (*url_text)) {
  524. url_text ++;
  525. }
  526. if (end > url_text + 4 &&
  527. rspamd_url_find (pool, url_text, end - url_text, &url_str,
  528. RSPAMD_URL_FIND_ALL,
  529. &url_pos, NULL) &&
  530. url_str != NULL) {
  531. if (url_pos > 0) {
  532. /*
  533. * We have some url at some offset, so we need to check what is
  534. * at the start of the text
  535. */
  536. p = url_text;
  537. while (p < url_text + url_pos) {
  538. if (!g_ascii_isspace (*p)) {
  539. *url_found = FALSE;
  540. return;
  541. }
  542. p++;
  543. }
  544. }
  545. text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  546. rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
  547. RSPAMD_URL_PARSE_TEXT);
  548. if (rc == URI_ERRNO_OK) {
  549. disp_tok.len = text_url->hostlen;
  550. disp_tok.begin = text_url->host;
  551. #if U_ICU_VERSION_MAJOR_NUM >= 46
  552. if (rspamd_substring_search_caseless (text_url->host,
  553. text_url->hostlen, "xn--", 4) != -1) {
  554. idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
  555. /* We need to convert it to the normal value first */
  556. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  557. text_url->host, text_url->hostlen,
  558. idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
  559. if (uc_err != U_ZERO_ERROR) {
  560. msg_err_pool ("cannot convert to IDN: %s",
  561. u_errorName (uc_err));
  562. disp_tok.len = text_url->hostlen;
  563. }
  564. else {
  565. disp_tok.begin = idn_hbuf;
  566. }
  567. }
  568. #endif
  569. href_tok.len = href_url->hostlen;
  570. href_tok.begin = href_url->host;
  571. #if U_ICU_VERSION_MAJOR_NUM >= 46
  572. if (rspamd_substring_search_caseless (href_url->host,
  573. href_url->hostlen, "xn--", 4) != -1) {
  574. idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
  575. /* We need to convert it to the normal value first */
  576. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  577. href_url->host, href_url->hostlen,
  578. idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
  579. if (uc_err != U_ZERO_ERROR) {
  580. msg_err_pool ("cannot convert to IDN: %s",
  581. u_errorName (uc_err));
  582. href_tok.len = href_url->hostlen;
  583. }
  584. else {
  585. href_tok.begin = idn_hbuf;
  586. }
  587. }
  588. #endif
  589. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
  590. text_url->tldlen > 0 && href_url->tldlen > 0) {
  591. /* Apply the same logic for TLD */
  592. disp_tok.len = text_url->tldlen;
  593. disp_tok.begin = text_url->tld;
  594. #if U_ICU_VERSION_MAJOR_NUM >= 46
  595. if (rspamd_substring_search_caseless (text_url->tld,
  596. text_url->tldlen, "xn--", 4) != -1) {
  597. idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
  598. /* We need to convert it to the normal value first */
  599. disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
  600. text_url->tld, text_url->tldlen,
  601. idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
  602. if (uc_err != U_ZERO_ERROR) {
  603. msg_err_pool ("cannot convert to IDN: %s",
  604. u_errorName (uc_err));
  605. disp_tok.len = text_url->tldlen;
  606. }
  607. else {
  608. disp_tok.begin = idn_hbuf;
  609. }
  610. }
  611. #endif
  612. href_tok.len = href_url->tldlen;
  613. href_tok.begin = href_url->tld;
  614. #if U_ICU_VERSION_MAJOR_NUM >= 46
  615. if (rspamd_substring_search_caseless (href_url->tld,
  616. href_url->tldlen, "xn--", 4) != -1) {
  617. idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
  618. /* We need to convert it to the normal value first */
  619. href_tok.len = uidna_nameToUnicodeUTF8 (udn,
  620. href_url->tld, href_url->tldlen,
  621. idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
  622. if (uc_err != U_ZERO_ERROR) {
  623. msg_err_pool ("cannot convert to IDN: %s",
  624. u_errorName (uc_err));
  625. href_tok.len = href_url->tldlen;
  626. }
  627. else {
  628. href_tok.begin = idn_hbuf;
  629. }
  630. }
  631. #endif
  632. if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0) {
  633. /* Check if one url is a subdomain for another */
  634. if (!rspamd_url_is_subdomain (&disp_tok, &href_tok)) {
  635. href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
  636. href_url->phished_url = text_url;
  637. phished_tld.begin = href_tok.begin;
  638. phished_tld.len = href_tok.len;
  639. text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  640. }
  641. }
  642. }
  643. *ptext_url = text_url;
  644. *url_found = TRUE;
  645. }
  646. else {
  647. msg_info_pool ("extract of url '%s' failed: %s",
  648. url_str,
  649. rspamd_url_strerror (rc));
  650. }
  651. }
  652. }
  653. static gboolean
  654. rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
  655. struct html_tag *tag, GNode **cur_level, gboolean *balanced)
  656. {
  657. GNode *nnode;
  658. struct html_tag *parent;
  659. if (hc->html_tags == NULL) {
  660. nnode = g_node_new (NULL);
  661. *cur_level = nnode;
  662. hc->html_tags = nnode;
  663. rspamd_mempool_add_destructor (pool,
  664. (rspamd_mempool_destruct_t) g_node_destroy,
  665. nnode);
  666. }
  667. if (hc->total_tags > max_tags) {
  668. hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
  669. }
  670. if (tag->id == -1) {
  671. /* Ignore unknown tags */
  672. hc->total_tags ++;
  673. return FALSE;
  674. }
  675. tag->parent = *cur_level;
  676. if (!(tag->flags & CM_INLINE)) {
  677. /* Block tag */
  678. if (tag->flags & (FL_CLOSING|FL_CLOSED)) {
  679. if (!*cur_level) {
  680. msg_debug_html ("bad parent node");
  681. return FALSE;
  682. }
  683. if (hc->total_tags < max_tags) {
  684. nnode = g_node_new (tag);
  685. g_node_append (*cur_level, nnode);
  686. if (!rspamd_html_check_balance (nnode, cur_level)) {
  687. msg_debug_html (
  688. "mark part as unbalanced as it has not pairable closing tags");
  689. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  690. *balanced = FALSE;
  691. } else {
  692. *balanced = TRUE;
  693. }
  694. hc->total_tags ++;
  695. }
  696. }
  697. else {
  698. parent = (*cur_level)->data;
  699. if (parent) {
  700. if ((parent->flags & FL_IGNORE)) {
  701. tag->flags |= FL_IGNORE;
  702. }
  703. if (!(tag->flags & FL_CLOSED) &&
  704. !(parent->flags & FL_BLOCK)) {
  705. /* We likely have some bad nesting */
  706. if (parent->id == tag->id) {
  707. /* Something like <a>bla<a>foo... */
  708. hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
  709. *balanced = FALSE;
  710. tag->parent = parent->parent;
  711. if (hc->total_tags < max_tags) {
  712. nnode = g_node_new (tag);
  713. g_node_append (parent->parent, nnode);
  714. *cur_level = nnode;
  715. hc->total_tags ++;
  716. }
  717. return TRUE;
  718. }
  719. }
  720. parent->content_length += tag->content_length;
  721. }
  722. if (hc->total_tags < max_tags) {
  723. nnode = g_node_new (tag);
  724. g_node_append (*cur_level, nnode);
  725. if ((tag->flags & FL_CLOSED) == 0) {
  726. *cur_level = nnode;
  727. }
  728. hc->total_tags ++;
  729. }
  730. if (tag->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE)) {
  731. tag->flags |= FL_IGNORE;
  732. return FALSE;
  733. }
  734. }
  735. }
  736. else {
  737. /* Inline tag */
  738. parent = (*cur_level)->data;
  739. if (parent && (parent->flags & (CM_HEAD|CM_UNKNOWN|FL_IGNORE))) {
  740. tag->flags |= FL_IGNORE;
  741. return FALSE;
  742. }
  743. }
  744. return TRUE;
  745. }
  746. #define NEW_COMPONENT(comp_type) do { \
  747. comp = rspamd_mempool_alloc (pool, sizeof (*comp)); \
  748. comp->type = (comp_type); \
  749. comp->start = NULL; \
  750. comp->len = 0; \
  751. g_queue_push_tail (tag->params, comp); \
  752. ret = TRUE; \
  753. } while(0)
  754. static gboolean
  755. rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
  756. const guchar *begin, const guchar *end,
  757. struct html_tag *tag)
  758. {
  759. struct html_tag_component *comp;
  760. gint len;
  761. gboolean ret = FALSE;
  762. gchar *p;
  763. g_assert (end >= begin);
  764. p = rspamd_mempool_alloc (pool, end - begin);
  765. memcpy (p, begin, end - begin);
  766. len = rspamd_html_decode_entitles_inplace (p, end - begin);
  767. if (len == 3) {
  768. if (g_ascii_strncasecmp (p, "src", len) == 0) {
  769. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  770. }
  771. }
  772. else if (len == 4) {
  773. if (g_ascii_strncasecmp (p, "href", len) == 0) {
  774. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
  775. }
  776. }
  777. if (tag->id == Tag_IMG) {
  778. /* Check width and height if presented */
  779. if (len == 5 && g_ascii_strncasecmp (p, "width", len) == 0) {
  780. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
  781. }
  782. else if (len == 6 && g_ascii_strncasecmp (p, "height", len) == 0) {
  783. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
  784. }
  785. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  786. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  787. }
  788. }
  789. else if (tag->id == Tag_FONT) {
  790. if (len == 5){
  791. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  792. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  793. }
  794. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  795. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  796. }
  797. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  798. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  799. }
  800. }
  801. else if (len == 7) {
  802. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  803. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  804. }
  805. }
  806. else if (len == 4) {
  807. if (g_ascii_strncasecmp (p, "size", len) == 0) {
  808. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
  809. }
  810. }
  811. }
  812. else if (tag->flags & FL_BLOCK) {
  813. if (len == 5){
  814. if (g_ascii_strncasecmp (p, "color", len) == 0) {
  815. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
  816. }
  817. else if (g_ascii_strncasecmp (p, "style", len) == 0) {
  818. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
  819. }
  820. else if (g_ascii_strncasecmp (p, "class", len) == 0) {
  821. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
  822. }
  823. }
  824. else if (len == 7) {
  825. if (g_ascii_strncasecmp (p, "bgcolor", len) == 0) {
  826. NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
  827. }
  828. }
  829. }
  830. return ret;
  831. }
  832. static inline void
  833. rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
  834. struct html_content *hc, struct html_tag *tag, const guchar *in,
  835. gint *statep, guchar const **savep)
  836. {
  837. enum {
  838. parse_start = 0,
  839. parse_name,
  840. parse_attr_name,
  841. parse_equal,
  842. parse_start_dquote,
  843. parse_dqvalue,
  844. parse_end_dquote,
  845. parse_start_squote,
  846. parse_sqvalue,
  847. parse_end_squote,
  848. parse_value,
  849. spaces_after_name,
  850. spaces_before_eq,
  851. spaces_after_eq,
  852. spaces_after_param,
  853. ignore_bad_tag
  854. } state;
  855. struct html_tag_def *found;
  856. gboolean store = FALSE;
  857. struct html_tag_component *comp;
  858. state = *statep;
  859. switch (state) {
  860. case parse_start:
  861. if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
  862. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  863. state = ignore_bad_tag;
  864. tag->id = -1;
  865. tag->flags |= FL_BROKEN;
  866. }
  867. else if (g_ascii_isalpha (*in)) {
  868. state = parse_name;
  869. tag->name.start = in;
  870. }
  871. break;
  872. case parse_name:
  873. if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
  874. g_assert (in >= tag->name.start);
  875. if (*in == '/') {
  876. tag->flags |= FL_CLOSED;
  877. }
  878. tag->name.len = in - tag->name.start;
  879. if (tag->name.len == 0) {
  880. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  881. tag->id = -1;
  882. tag->flags |= FL_BROKEN;
  883. state = ignore_bad_tag;
  884. }
  885. else {
  886. gchar *s;
  887. khiter_t k;
  888. /* We CANNOT safely modify tag's name here, as it is already parsed */
  889. s = rspamd_mempool_alloc (pool, tag->name.len + 1);
  890. memcpy (s, tag->name.start, tag->name.len);
  891. tag->name.len = rspamd_html_decode_entitles_inplace (s,
  892. tag->name.len);
  893. tag->name.start = s;
  894. tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
  895. s[tag->name.len] = '\0';
  896. k = kh_get (tag_by_name, html_tag_by_name, s);
  897. if (k == kh_end (html_tag_by_name)) {
  898. hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
  899. tag->id = -1;
  900. }
  901. else {
  902. found = &kh_val (html_tag_by_name, k);
  903. tag->id = found->id;
  904. tag->flags = found->flags;
  905. }
  906. state = spaces_after_name;
  907. }
  908. }
  909. break;
  910. case parse_attr_name:
  911. if (*savep == NULL) {
  912. state = ignore_bad_tag;
  913. }
  914. else {
  915. const guchar *attr_name_end = in;
  916. if (*in == '=') {
  917. state = parse_equal;
  918. }
  919. else if (*in == '"') {
  920. /* No equal or something sane but we have quote character */
  921. state = parse_start_dquote;
  922. attr_name_end = in - 1;
  923. while (attr_name_end > *savep) {
  924. if (!g_ascii_isalnum (*attr_name_end)) {
  925. attr_name_end --;
  926. }
  927. else {
  928. break;
  929. }
  930. }
  931. /* One character forward to obtain length */
  932. attr_name_end ++;
  933. }
  934. else if (g_ascii_isspace (*in)) {
  935. state = spaces_before_eq;
  936. }
  937. else if (*in == '/') {
  938. tag->flags |= FL_CLOSED;
  939. }
  940. else if (!g_ascii_isgraph (*in)) {
  941. state = parse_value;
  942. attr_name_end = in - 1;
  943. while (attr_name_end > *savep) {
  944. if (!g_ascii_isalnum (*attr_name_end)) {
  945. attr_name_end --;
  946. }
  947. else {
  948. break;
  949. }
  950. }
  951. /* One character forward to obtain length */
  952. attr_name_end ++;
  953. }
  954. else {
  955. return;
  956. }
  957. if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
  958. /* Ignore unknown params */
  959. *savep = NULL;
  960. }
  961. else if (state == parse_value) {
  962. *savep = in + 1;
  963. }
  964. }
  965. break;
  966. case spaces_after_name:
  967. if (!g_ascii_isspace (*in)) {
  968. *savep = in;
  969. if (*in == '/') {
  970. tag->flags |= FL_CLOSED;
  971. }
  972. else if (*in != '>') {
  973. state = parse_attr_name;
  974. }
  975. }
  976. break;
  977. case spaces_before_eq:
  978. if (*in == '=') {
  979. state = parse_equal;
  980. }
  981. else if (!g_ascii_isspace (*in)) {
  982. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  983. tag->flags |= FL_BROKEN;
  984. state = ignore_bad_tag;
  985. }
  986. break;
  987. case spaces_after_eq:
  988. if (*in == '"') {
  989. state = parse_start_dquote;
  990. }
  991. else if (*in == '\'') {
  992. state = parse_start_squote;
  993. }
  994. else if (!g_ascii_isspace (*in)) {
  995. if (*savep != NULL) {
  996. /* We need to save this param */
  997. *savep = in;
  998. }
  999. state = parse_value;
  1000. }
  1001. break;
  1002. case parse_equal:
  1003. if (g_ascii_isspace (*in)) {
  1004. state = spaces_after_eq;
  1005. }
  1006. else if (*in == '"') {
  1007. state = parse_start_dquote;
  1008. }
  1009. else if (*in == '\'') {
  1010. state = parse_start_squote;
  1011. }
  1012. else {
  1013. if (*savep != NULL) {
  1014. /* We need to save this param */
  1015. *savep = in;
  1016. }
  1017. state = parse_value;
  1018. }
  1019. break;
  1020. case parse_start_dquote:
  1021. if (*in == '"') {
  1022. if (*savep != NULL) {
  1023. /* We have an empty attribute value */
  1024. savep = NULL;
  1025. }
  1026. state = spaces_after_param;
  1027. }
  1028. else {
  1029. if (*savep != NULL) {
  1030. /* We need to save this param */
  1031. *savep = in;
  1032. }
  1033. state = parse_dqvalue;
  1034. }
  1035. break;
  1036. case parse_start_squote:
  1037. if (*in == '\'') {
  1038. if (*savep != NULL) {
  1039. /* We have an empty attribute value */
  1040. savep = NULL;
  1041. }
  1042. state = spaces_after_param;
  1043. }
  1044. else {
  1045. if (*savep != NULL) {
  1046. /* We need to save this param */
  1047. *savep = in;
  1048. }
  1049. state = parse_sqvalue;
  1050. }
  1051. break;
  1052. case parse_dqvalue:
  1053. if (*in == '"') {
  1054. store = TRUE;
  1055. state = parse_end_dquote;
  1056. }
  1057. if (store) {
  1058. if (*savep != NULL) {
  1059. gchar *s;
  1060. g_assert (tag->params != NULL);
  1061. comp = g_queue_peek_tail (tag->params);
  1062. g_assert (comp != NULL);
  1063. comp->len = in - *savep;
  1064. s = rspamd_mempool_alloc (pool, comp->len);
  1065. memcpy (s, *savep, comp->len);
  1066. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1067. comp->start = s;
  1068. *savep = NULL;
  1069. }
  1070. }
  1071. break;
  1072. case parse_sqvalue:
  1073. if (*in == '\'') {
  1074. store = TRUE;
  1075. state = parse_end_squote;
  1076. }
  1077. if (store) {
  1078. if (*savep != NULL) {
  1079. gchar *s;
  1080. g_assert (tag->params != NULL);
  1081. comp = g_queue_peek_tail (tag->params);
  1082. g_assert (comp != NULL);
  1083. comp->len = in - *savep;
  1084. s = rspamd_mempool_alloc (pool, comp->len);
  1085. memcpy (s, *savep, comp->len);
  1086. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1087. comp->start = s;
  1088. *savep = NULL;
  1089. }
  1090. }
  1091. break;
  1092. case parse_value:
  1093. if (*in == '/' && *(in + 1) == '>') {
  1094. tag->flags |= FL_CLOSED;
  1095. store = TRUE;
  1096. }
  1097. else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
  1098. store = TRUE;
  1099. state = spaces_after_param;
  1100. }
  1101. if (store) {
  1102. if (*savep != NULL) {
  1103. gchar *s;
  1104. g_assert (tag->params != NULL);
  1105. comp = g_queue_peek_tail (tag->params);
  1106. g_assert (comp != NULL);
  1107. comp->len = in - *savep;
  1108. s = rspamd_mempool_alloc (pool, comp->len);
  1109. memcpy (s, *savep, comp->len);
  1110. comp->len = rspamd_html_decode_entitles_inplace (s, comp->len);
  1111. comp->start = s;
  1112. *savep = NULL;
  1113. }
  1114. }
  1115. break;
  1116. case parse_end_dquote:
  1117. case parse_end_squote:
  1118. if (g_ascii_isspace (*in)) {
  1119. state = spaces_after_param;
  1120. }
  1121. else if (*in == '/' && *(in + 1) == '>') {
  1122. tag->flags |= FL_CLOSED;
  1123. }
  1124. break;
  1125. case spaces_after_param:
  1126. if (!g_ascii_isspace (*in)) {
  1127. if (*in == '/' && *(in + 1) == '>') {
  1128. tag->flags |= FL_CLOSED;
  1129. }
  1130. state = parse_attr_name;
  1131. *savep = in;
  1132. }
  1133. break;
  1134. case ignore_bad_tag:
  1135. break;
  1136. }
  1137. *statep = state;
  1138. }
  1139. struct rspamd_url *
  1140. rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
  1141. struct html_tag_component *comp)
  1142. {
  1143. struct rspamd_url *url;
  1144. guint saved_flags = 0;
  1145. gchar *decoded;
  1146. gint rc;
  1147. gsize decoded_len;
  1148. const gchar *p, *s, *prefix = "http://";
  1149. gchar *d;
  1150. guint i, dlen;
  1151. gboolean has_bad_chars = FALSE, no_prefix = FALSE;
  1152. static const gchar hexdigests[16] = "0123456789abcdef";
  1153. p = start;
  1154. /* Strip spaces from the url */
  1155. /* Head spaces */
  1156. while (p < start + len && g_ascii_isspace (*p)) {
  1157. p ++;
  1158. start ++;
  1159. len --;
  1160. }
  1161. if (comp) {
  1162. comp->start = p;
  1163. comp->len = len;
  1164. }
  1165. /* Trailing spaces */
  1166. p = start + len - 1;
  1167. while (p >= start && g_ascii_isspace (*p)) {
  1168. p --;
  1169. len --;
  1170. if (comp) {
  1171. comp->len --;
  1172. }
  1173. }
  1174. s = start;
  1175. dlen = 0;
  1176. for (i = 0; i < len; i ++) {
  1177. if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1178. dlen += 3;
  1179. }
  1180. else {
  1181. dlen ++;
  1182. }
  1183. }
  1184. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1185. if (len >= sizeof ("mailto:") &&
  1186. (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
  1187. memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
  1188. memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
  1189. /* Exclusion, has valid but 'strange' prefix */
  1190. }
  1191. else {
  1192. for (i = 0; i < len; i ++) {
  1193. if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
  1194. if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
  1195. prefix = "http:";
  1196. dlen += sizeof ("http:") - 1;
  1197. no_prefix = TRUE;
  1198. }
  1199. else if (s[i] == '@') {
  1200. /* Likely email prefix */
  1201. prefix = "mailto://";
  1202. dlen += sizeof ("mailto://") - 1;
  1203. no_prefix = TRUE;
  1204. }
  1205. else if (s[i] == ':' && i != 0) {
  1206. /* Special case */
  1207. no_prefix = FALSE;
  1208. }
  1209. else {
  1210. if (i == 0) {
  1211. /* No valid data */
  1212. return NULL;
  1213. }
  1214. else {
  1215. no_prefix = TRUE;
  1216. dlen += strlen (prefix);
  1217. }
  1218. }
  1219. break;
  1220. }
  1221. }
  1222. }
  1223. }
  1224. decoded = rspamd_mempool_alloc (pool, dlen + 1);
  1225. d = decoded;
  1226. if (no_prefix) {
  1227. gsize plen = strlen (prefix);
  1228. memcpy (d, prefix, plen);
  1229. d += plen;
  1230. }
  1231. /*
  1232. * We also need to remove all internal newlines, spaces
  1233. * and encode unsafe characters
  1234. */
  1235. for (i = 0; i < len; i ++) {
  1236. if (G_UNLIKELY (g_ascii_isspace (s[i]))) {
  1237. continue;
  1238. }
  1239. else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) {
  1240. /* URL encode */
  1241. *d++ = '%';
  1242. *d++ = hexdigests[(s[i] >> 4) & 0xf];
  1243. *d++ = hexdigests[s[i] & 0xf];
  1244. has_bad_chars = TRUE;
  1245. }
  1246. else {
  1247. *d++ = s[i];
  1248. }
  1249. }
  1250. *d = '\0';
  1251. dlen = d - decoded;
  1252. url = rspamd_mempool_alloc0 (pool, sizeof (*url));
  1253. enum rspamd_normalise_result norm_res;
  1254. norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
  1255. if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
  1256. saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
  1257. }
  1258. if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
  1259. saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
  1260. if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
  1261. saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
  1262. }
  1263. }
  1264. rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
  1265. /* Filter some completely damaged urls */
  1266. if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
  1267. !((url->flags & RSPAMD_URL_FLAG_OBSCURED) && (url->protocol & PROTOCOL_UNKNOWN))) {
  1268. url->flags |= saved_flags;
  1269. if (has_bad_chars) {
  1270. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1271. }
  1272. if (no_prefix) {
  1273. url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1274. }
  1275. decoded = url->string;
  1276. decoded_len = url->urllen;
  1277. if (comp) {
  1278. comp->start = decoded;
  1279. comp->len = decoded_len;
  1280. }
  1281. /* Spaces in href usually mean an attempt to obfuscate URL */
  1282. /* See https://github.com/vstakhov/rspamd/issues/593 */
  1283. #if 0
  1284. if (has_spaces) {
  1285. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1286. }
  1287. #endif
  1288. return url;
  1289. }
  1290. return NULL;
  1291. }
  1292. static struct rspamd_url *
  1293. rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1294. struct html_content *hc)
  1295. {
  1296. struct html_tag_component *comp;
  1297. GList *cur;
  1298. struct rspamd_url *url;
  1299. const gchar *start;
  1300. gsize len;
  1301. cur = tag->params->head;
  1302. while (cur) {
  1303. comp = cur->data;
  1304. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1305. start = comp->start;
  1306. len = comp->len;
  1307. /* Check base url */
  1308. if (hc && hc->base_url && comp->len > 2) {
  1309. /*
  1310. * Relative url canot start from the following:
  1311. * schema://
  1312. * slash
  1313. */
  1314. gchar *buf;
  1315. gsize orig_len;
  1316. if (rspamd_substring_search (start, len, "://", 3) == -1) {
  1317. /* Assume relative url */
  1318. gboolean need_slash = FALSE;
  1319. orig_len = len;
  1320. len += hc->base_url->urllen;
  1321. if (hc->base_url->string[hc->base_url->urllen - 1] != '/') {
  1322. need_slash = TRUE;
  1323. len ++;
  1324. }
  1325. buf = rspamd_mempool_alloc (pool, len + 1);
  1326. rspamd_snprintf (buf, len + 1, "%*s%s%*s",
  1327. hc->base_url->urllen, hc->base_url->string,
  1328. need_slash ? "/" : "",
  1329. (gint)orig_len, start);
  1330. start = buf;
  1331. }
  1332. else if (start[0] == '/' && start[1] != '/') {
  1333. /* Relative to the hostname */
  1334. orig_len = len;
  1335. len += hc->base_url->hostlen + hc->base_url->protocollen +
  1336. 3 /* for :// */;
  1337. buf = rspamd_mempool_alloc (pool, len + 1);
  1338. rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
  1339. hc->base_url->protocollen, hc->base_url->string,
  1340. hc->base_url->hostlen, hc->base_url->host,
  1341. (gint)orig_len, start);
  1342. start = buf;
  1343. }
  1344. }
  1345. url = rspamd_html_process_url (pool, start, len, comp);
  1346. if (url && tag->extra == NULL) {
  1347. tag->extra = url;
  1348. }
  1349. return url;
  1350. }
  1351. cur = g_list_next (cur);
  1352. }
  1353. return NULL;
  1354. }
  1355. static void
  1356. rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
  1357. GHashTable *tbl_urls, GHashTable *tbl_emails)
  1358. {
  1359. GHashTable *target_tbl;
  1360. struct rspamd_url *query_url, *existing;
  1361. gchar *url_str;
  1362. gint rc;
  1363. gboolean prefix_added;
  1364. if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
  1365. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1366. }
  1367. if (url->querylen > 0) {
  1368. if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
  1369. RSPAMD_URL_FIND_ALL,
  1370. NULL, &prefix_added)) {
  1371. query_url = rspamd_mempool_alloc0 (pool,
  1372. sizeof (struct rspamd_url));
  1373. rc = rspamd_url_parse (query_url,
  1374. url_str,
  1375. strlen (url_str),
  1376. pool,
  1377. RSPAMD_URL_PARSE_TEXT);
  1378. if (rc == URI_ERRNO_OK &&
  1379. query_url->hostlen > 0) {
  1380. msg_debug_html ("found url %s in query of url"
  1381. " %*s", url_str, url->querylen, url->query);
  1382. if (query_url->protocol == PROTOCOL_MAILTO) {
  1383. target_tbl = tbl_emails;
  1384. }
  1385. else {
  1386. target_tbl = tbl_urls;
  1387. }
  1388. if (prefix_added) {
  1389. query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
  1390. }
  1391. if (query_url->flags
  1392. & (RSPAMD_URL_FLAG_UNNORMALISED|RSPAMD_URL_FLAG_OBSCURED|
  1393. RSPAMD_URL_FLAG_NUMERIC)) {
  1394. /* Set obscured flag if query url is bad */
  1395. url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1396. }
  1397. /* And vice-versa */
  1398. if (url->flags & RSPAMD_URL_FLAG_OBSCURED) {
  1399. query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
  1400. }
  1401. if ((existing = g_hash_table_lookup (target_tbl,
  1402. query_url)) == NULL) {
  1403. g_hash_table_insert (target_tbl,
  1404. query_url,
  1405. query_url);
  1406. }
  1407. else {
  1408. existing->count ++;
  1409. }
  1410. }
  1411. }
  1412. }
  1413. }
  1414. static void
  1415. rspamd_html_process_data_image (rspamd_mempool_t *pool,
  1416. struct html_image *img,
  1417. struct html_tag_component *src)
  1418. {
  1419. /*
  1420. * Here, we do very basic processing of the data:
  1421. * detect if we have something like: ``
  1422. * We only parse base64 encoded data.
  1423. * We ignore content type so far
  1424. */
  1425. struct rspamd_image *parsed_image;
  1426. const gchar *semicolon_pos = NULL, *end = src->start + src->len;
  1427. semicolon_pos = src->start;
  1428. while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
  1429. if (end - semicolon_pos > sizeof ("base64,")) {
  1430. if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
  1431. const gchar *data_pos = semicolon_pos + sizeof ("base64,");
  1432. gchar *decoded;
  1433. gsize encoded_len = end - data_pos, decoded_len;
  1434. rspamd_ftok_t inp;
  1435. decoded_len = (encoded_len / 4 * 3) + 12;
  1436. decoded = rspamd_mempool_alloc (pool, decoded_len);
  1437. rspamd_cryptobox_base64_decode (data_pos, encoded_len,
  1438. decoded, &decoded_len);
  1439. inp.begin = decoded;
  1440. inp.len = decoded_len;
  1441. parsed_image = rspamd_maybe_process_image (pool, &inp);
  1442. if (parsed_image) {
  1443. msg_debug_html ("detected %s image of size %ud x %ud in data url",
  1444. rspamd_image_type_str (parsed_image->type),
  1445. parsed_image->width, parsed_image->height);
  1446. img->embedded_image = parsed_image;
  1447. }
  1448. }
  1449. break;
  1450. }
  1451. else {
  1452. /* Nothing useful */
  1453. return;
  1454. }
  1455. semicolon_pos ++;
  1456. }
  1457. }
  1458. static void
  1459. rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  1460. struct html_content *hc)
  1461. {
  1462. struct html_tag_component *comp;
  1463. struct html_image *img;
  1464. rspamd_ftok_t fstr;
  1465. const guchar *p;
  1466. GList *cur;
  1467. gulong val;
  1468. gboolean seen_width = FALSE, seen_height = FALSE;
  1469. goffset pos;
  1470. cur = tag->params->head;
  1471. img = rspamd_mempool_alloc0 (pool, sizeof (*img));
  1472. img->tag = tag;
  1473. while (cur) {
  1474. comp = cur->data;
  1475. if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
  1476. fstr.begin = (gchar *)comp->start;
  1477. fstr.len = comp->len;
  1478. img->src = rspamd_mempool_ftokdup (pool, &fstr);
  1479. if (comp->len > sizeof ("cid:") - 1 && memcmp (comp->start,
  1480. "cid:", sizeof ("cid:") - 1) == 0) {
  1481. /* We have an embedded image */
  1482. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
  1483. }
  1484. else {
  1485. if (comp->len > sizeof ("data:") - 1 && memcmp (comp->start,
  1486. "data:", sizeof ("data:") - 1) == 0) {
  1487. /* We have an embedded image in HTML tag */
  1488. img->flags |=
  1489. (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
  1490. rspamd_html_process_data_image (pool, img, comp);
  1491. hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
  1492. }
  1493. else {
  1494. img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
  1495. if (img->src) {
  1496. img->url = rspamd_html_process_url (pool,
  1497. img->src, fstr.len, NULL);
  1498. }
  1499. }
  1500. }
  1501. }
  1502. else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
  1503. rspamd_strtoul (comp->start, comp->len, &val);
  1504. img->height = val;
  1505. seen_height = TRUE;
  1506. }
  1507. else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
  1508. rspamd_strtoul (comp->start, comp->len, &val);
  1509. img->width = val;
  1510. seen_width = TRUE;
  1511. }
  1512. else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
  1513. /* Try to search for height= or width= in style tag */
  1514. if (!seen_height && comp->len > 0) {
  1515. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1516. "height", sizeof ("height") - 1);
  1517. if (pos != -1) {
  1518. p = comp->start + pos + sizeof ("height") - 1;
  1519. while (p < comp->start + comp->len) {
  1520. if (g_ascii_isdigit (*p)) {
  1521. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1522. img->height = val;
  1523. break;
  1524. }
  1525. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1526. /* Fallback */
  1527. break;
  1528. }
  1529. p ++;
  1530. }
  1531. }
  1532. }
  1533. if (!seen_width && comp->len > 0) {
  1534. pos = rspamd_substring_search_caseless (comp->start, comp->len,
  1535. "width", sizeof ("width") - 1);
  1536. if (pos != -1) {
  1537. p = comp->start + pos + sizeof ("width") - 1;
  1538. while (p < comp->start + comp->len) {
  1539. if (g_ascii_isdigit (*p)) {
  1540. rspamd_strtoul (p, comp->len - (p - comp->start), &val);
  1541. img->width = val;
  1542. break;
  1543. }
  1544. else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
  1545. /* Fallback */
  1546. break;
  1547. }
  1548. p ++;
  1549. }
  1550. }
  1551. }
  1552. }
  1553. cur = g_list_next (cur);
  1554. }
  1555. if (hc->images == NULL) {
  1556. hc->images = g_ptr_array_sized_new (4);
  1557. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  1558. hc->images);
  1559. }
  1560. if (img->embedded_image) {
  1561. if (!seen_height) {
  1562. img->height = img->embedded_image->height;
  1563. }
  1564. if (!seen_width) {
  1565. img->width = img->embedded_image->width;
  1566. }
  1567. }
  1568. g_ptr_array_add (hc->images, img);
  1569. tag->extra = img;
  1570. }
  1571. static void
  1572. rspamd_html_process_color (const gchar *line, guint len, struct html_color *cl)
  1573. {
  1574. const gchar *p = line, *end = line + len;
  1575. char hexbuf[7];
  1576. rspamd_ftok_t search;
  1577. struct html_color *el;
  1578. memset (cl, 0, sizeof (*cl));
  1579. if (*p == '#') {
  1580. /* HEX color */
  1581. p ++;
  1582. rspamd_strlcpy (hexbuf, p, MIN ((gint)sizeof(hexbuf), end - p + 1));
  1583. cl->d.val = strtoul (hexbuf, NULL, 16);
  1584. cl->d.comp.alpha = 255;
  1585. cl->valid = TRUE;
  1586. }
  1587. else if (len > 4 && rspamd_lc_cmp (p, "rgb", 3) == 0) {
  1588. /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
  1589. enum {
  1590. obrace,
  1591. num1,
  1592. num2,
  1593. num3,
  1594. num4,
  1595. skip_spaces
  1596. } state = skip_spaces, next_state = obrace;
  1597. gulong r = 0, g = 0, b = 0, opacity = 255;
  1598. const gchar *c;
  1599. gboolean valid = FALSE;
  1600. p += 3;
  1601. if (*p == 'a') {
  1602. p ++;
  1603. }
  1604. c = p;
  1605. while (p < end) {
  1606. switch (state) {
  1607. case obrace:
  1608. if (*p == '(') {
  1609. p ++;
  1610. state = skip_spaces;
  1611. next_state = num1;
  1612. }
  1613. else if (g_ascii_isspace (*p)) {
  1614. state = skip_spaces;
  1615. next_state = obrace;
  1616. }
  1617. else {
  1618. goto stop;
  1619. }
  1620. break;
  1621. case num1:
  1622. if (*p == ',') {
  1623. if (!rspamd_strtoul (c, p - c, &r)) {
  1624. goto stop;
  1625. }
  1626. p ++;
  1627. state = skip_spaces;
  1628. next_state = num2;
  1629. }
  1630. else if (!g_ascii_isdigit (*p)) {
  1631. goto stop;
  1632. }
  1633. else {
  1634. p ++;
  1635. }
  1636. break;
  1637. case num2:
  1638. if (*p == ',') {
  1639. if (!rspamd_strtoul (c, p - c, &g)) {
  1640. goto stop;
  1641. }
  1642. p ++;
  1643. state = skip_spaces;
  1644. next_state = num3;
  1645. }
  1646. else if (!g_ascii_isdigit (*p)) {
  1647. goto stop;
  1648. }
  1649. else {
  1650. p ++;
  1651. }
  1652. break;
  1653. case num3:
  1654. if (*p == ',') {
  1655. if (!rspamd_strtoul (c, p - c, &b)) {
  1656. goto stop;
  1657. }
  1658. valid = TRUE;
  1659. p ++;
  1660. state = skip_spaces;
  1661. next_state = num4;
  1662. }
  1663. else if (*p == ')') {
  1664. if (!rspamd_strtoul (c, p - c, &b)) {
  1665. goto stop;
  1666. }
  1667. valid = TRUE;
  1668. goto stop;
  1669. }
  1670. else if (!g_ascii_isdigit (*p)) {
  1671. goto stop;
  1672. }
  1673. else {
  1674. p ++;
  1675. }
  1676. break;
  1677. case num4:
  1678. if (*p == ',') {
  1679. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1680. goto stop;
  1681. }
  1682. valid = TRUE;
  1683. goto stop;
  1684. }
  1685. else if (*p == ')') {
  1686. if (!rspamd_strtoul (c, p - c, &opacity)) {
  1687. goto stop;
  1688. }
  1689. valid = TRUE;
  1690. goto stop;
  1691. }
  1692. else if (!g_ascii_isdigit (*p)) {
  1693. goto stop;
  1694. }
  1695. else {
  1696. p ++;
  1697. }
  1698. break;
  1699. case skip_spaces:
  1700. if (!g_ascii_isspace (*p)) {
  1701. c = p;
  1702. state = next_state;
  1703. }
  1704. else {
  1705. p ++;
  1706. }
  1707. break;
  1708. }
  1709. }
  1710. stop:
  1711. if (valid) {
  1712. cl->d.comp.r = r;
  1713. cl->d.comp.g = g;
  1714. cl->d.comp.b = b;
  1715. cl->d.comp.alpha = opacity;
  1716. cl->valid = TRUE;
  1717. }
  1718. }
  1719. else {
  1720. khiter_t k;
  1721. /* Compare color by name */
  1722. search.begin = line;
  1723. search.len = len;
  1724. k = kh_get (color_by_name, html_color_by_name, &search);
  1725. if (k != kh_end (html_color_by_name)) {
  1726. el = &kh_val (html_color_by_name, k);
  1727. memcpy (cl, el, sizeof (*cl));
  1728. cl->d.comp.alpha = 255; /* Non transparent */
  1729. }
  1730. }
  1731. }
  1732. /*
  1733. * Target is used for in and out if this function returns TRUE
  1734. */
  1735. static gboolean
  1736. rspamd_html_process_css_size (const gchar *suffix, gsize len,
  1737. gdouble *tgt)
  1738. {
  1739. gdouble sz = *tgt;
  1740. gboolean ret = FALSE;
  1741. if (len >= 2) {
  1742. if (memcmp (suffix, "px", 2) == 0) {
  1743. sz = (guint) sz; /* Round to number */
  1744. ret = TRUE;
  1745. }
  1746. else if (memcmp (suffix, "em", 2) == 0) {
  1747. /* EM is 16 px, so multiply and round */
  1748. sz = (guint) (sz * 16.0);
  1749. ret = TRUE;
  1750. }
  1751. else if (len >= 3 && memcmp (suffix, "rem", 3) == 0) {
  1752. /* equal to EM in our case */
  1753. sz = (guint) (sz * 16.0);
  1754. ret = TRUE;
  1755. }
  1756. else if (memcmp (suffix, "ex", 2) == 0) {
  1757. /*
  1758. * Represents the x-height of the element's font.
  1759. * On fonts with the "x" letter, this is generally the height
  1760. * of lowercase letters in the font; 1ex = 0.5em in many fonts.
  1761. */
  1762. sz = (guint) (sz * 8.0);
  1763. ret = TRUE;
  1764. }
  1765. else if (memcmp (suffix, "vw", 2) == 0) {
  1766. /*
  1767. * Vewport width in percentages:
  1768. * we assume 1% of viewport width as 8px
  1769. */
  1770. sz = (guint) (sz * 8.0);
  1771. ret = TRUE;
  1772. }
  1773. else if (memcmp (suffix, "vh", 2) == 0) {
  1774. /*
  1775. * Vewport height in percentages
  1776. * we assume 1% of viewport width as 6px
  1777. */
  1778. sz = (guint) (sz * 6.0);
  1779. ret = TRUE;
  1780. }
  1781. else if (len >= 4 && memcmp (suffix, "vmax", 4) == 0) {
  1782. /*
  1783. * Vewport width in percentages
  1784. * we assume 1% of viewport width as 6px
  1785. */
  1786. sz = (guint) (sz * 8.0);
  1787. ret = TRUE;
  1788. }
  1789. else if (len >= 4 && memcmp (suffix, "vmin", 4) == 0) {
  1790. /*
  1791. * Vewport height in percentages
  1792. * we assume 1% of viewport width as 6px
  1793. */
  1794. sz = (guint) (sz * 6.0);
  1795. ret = TRUE;
  1796. }
  1797. else if (memcmp (suffix, "pt", 2) == 0) {
  1798. sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
  1799. ret = TRUE;
  1800. }
  1801. else if (memcmp (suffix, "cm", 2) == 0) {
  1802. sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
  1803. ret = TRUE;
  1804. }
  1805. else if (memcmp (suffix, "mm", 2) == 0) {
  1806. sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
  1807. ret = TRUE;
  1808. }
  1809. else if (memcmp (suffix, "in", 2) == 0) {
  1810. sz = (guint) (sz * 96.0); /* 96px */
  1811. ret = TRUE;
  1812. }
  1813. else if (memcmp (suffix, "pc", 2) == 0) {
  1814. sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
  1815. ret = TRUE;
  1816. }
  1817. }
  1818. else if (suffix[0] == '%') {
  1819. /* Percentages from 16 px */
  1820. sz = (guint)(sz / 100.0 * 16.0);
  1821. ret = TRUE;
  1822. }
  1823. if (ret) {
  1824. *tgt = sz;
  1825. }
  1826. return ret;
  1827. }
  1828. static void
  1829. rspamd_html_process_font_size (const gchar *line, guint len, guint *fs,
  1830. gboolean is_css)
  1831. {
  1832. const gchar *p = line, *end = line + len;
  1833. gchar *err = NULL, numbuf[64];
  1834. gdouble sz = 0;
  1835. gboolean failsafe = FALSE;
  1836. while (p < end && g_ascii_isspace (*p)) {
  1837. p ++;
  1838. len --;
  1839. }
  1840. if (g_ascii_isdigit (*p)) {
  1841. rspamd_strlcpy (numbuf, p, MIN (sizeof (numbuf), len + 1));
  1842. sz = strtod (numbuf, &err);
  1843. /* Now check leftover */
  1844. if (sz < 0) {
  1845. sz = 0;
  1846. }
  1847. }
  1848. else {
  1849. /* Ignore the rest */
  1850. failsafe = TRUE;
  1851. sz = is_css ? 16 : 1;
  1852. /* TODO: add textual fonts descriptions */
  1853. }
  1854. if (err && *err != '\0') {
  1855. const gchar *e = err;
  1856. gsize slen;
  1857. /* Skip spaces */
  1858. while (*e && g_ascii_isspace (*e)) {
  1859. e ++;
  1860. }
  1861. /* Lowercase */
  1862. slen = strlen (e);
  1863. rspamd_str_lc ((gchar *)e, slen);
  1864. if (!rspamd_html_process_css_size (e, slen, &sz)) {
  1865. failsafe = TRUE;
  1866. }
  1867. }
  1868. else {
  1869. /* Failsafe naked number */
  1870. failsafe = TRUE;
  1871. }
  1872. if (failsafe) {
  1873. if (is_css) {
  1874. /*
  1875. * In css mode we usually ignore sizes, but let's treat
  1876. * small sizes specially
  1877. */
  1878. if (sz < 1) {
  1879. sz = 0;
  1880. } else {
  1881. sz = 16; /* Ignore */
  1882. }
  1883. } else {
  1884. /* In non-css mode we have to check legacy size */
  1885. sz = sz >= 1 ? sz * 16 : 16;
  1886. }
  1887. }
  1888. if (sz > 32) {
  1889. sz = 32;
  1890. }
  1891. *fs = sz;
  1892. }
  1893. static void
  1894. rspamd_html_process_style (rspamd_mempool_t *pool, struct html_block *bl,
  1895. struct html_content *hc, const gchar *style, guint len)
  1896. {
  1897. const gchar *p, *c, *end, *key = NULL;
  1898. enum {
  1899. read_key,
  1900. read_colon,
  1901. read_value,
  1902. skip_spaces,
  1903. } state = skip_spaces, next_state = read_key;
  1904. guint klen = 0;
  1905. gdouble opacity = 1.0;
  1906. p = style;
  1907. c = p;
  1908. end = p + len;
  1909. while (p <= end) {
  1910. switch(state) {
  1911. case read_key:
  1912. if (p == end || *p == ':') {
  1913. key = c;
  1914. klen = p - c;
  1915. state = skip_spaces;
  1916. next_state = read_value;
  1917. }
  1918. else if (g_ascii_isspace (*p)) {
  1919. key = c;
  1920. klen = p - c;
  1921. state = skip_spaces;
  1922. next_state = read_colon;
  1923. }
  1924. p ++;
  1925. break;
  1926. case read_colon:
  1927. if (p == end || *p == ':') {
  1928. state = skip_spaces;
  1929. next_state = read_value;
  1930. }
  1931. p ++;
  1932. break;
  1933. case read_value:
  1934. if (p == end || *p == ';') {
  1935. if (key && klen && p - c > 0) {
  1936. if ((klen == 5 && g_ascii_strncasecmp (key, "color", 5) == 0)
  1937. || (klen == 10 && g_ascii_strncasecmp (key, "font-color", 10) == 0)) {
  1938. rspamd_html_process_color (c, p - c, &bl->font_color);
  1939. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  1940. }
  1941. else if ((klen == 16 && g_ascii_strncasecmp (key,
  1942. "background-color", 16) == 0) ||
  1943. (klen == 10 && g_ascii_strncasecmp (key,
  1944. "background", 10) == 0)) {
  1945. rspamd_html_process_color (c, p - c, &bl->background_color);
  1946. msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
  1947. }
  1948. else if (klen == 7 && g_ascii_strncasecmp (key, "display", 7) == 0) {
  1949. if (p - c >= 4 && rspamd_substring_search_caseless (c, p - c,
  1950. "none", 4) != -1) {
  1951. bl->visible = FALSE;
  1952. msg_debug_html ("tag is not visible");
  1953. }
  1954. }
  1955. else if (klen == 9 &&
  1956. g_ascii_strncasecmp (key, "font-size", 9) == 0) {
  1957. rspamd_html_process_font_size (c, p - c,
  1958. &bl->font_size, TRUE);
  1959. msg_debug_html ("got font size: %ud", bl->font_size);
  1960. }
  1961. else if (klen == 7 &&
  1962. g_ascii_strncasecmp (key, "opacity", 7) == 0) {
  1963. gchar numbuf[64];
  1964. rspamd_strlcpy (numbuf, c,
  1965. MIN (sizeof (numbuf), p - c + 1));
  1966. opacity = strtod (numbuf, NULL);
  1967. if (opacity > 1) {
  1968. opacity = 1;
  1969. }
  1970. else if (opacity < 0) {
  1971. opacity = 0;
  1972. }
  1973. bl->font_color.d.comp.alpha = (guint8)(opacity * 255.0);
  1974. }
  1975. else if (klen == 10 &&
  1976. g_ascii_strncasecmp (key, "visibility", 10) == 0) {
  1977. if (p - c >= 6 && rspamd_substring_search_caseless (c,
  1978. p - c,
  1979. "hidden", 6) != -1) {
  1980. bl->visible = FALSE;
  1981. msg_debug_html ("tag is not visible");
  1982. }
  1983. }
  1984. }
  1985. key = NULL;
  1986. klen = 0;
  1987. state = skip_spaces;
  1988. next_state = read_key;
  1989. }
  1990. p ++;
  1991. break;
  1992. case skip_spaces:
  1993. if (p < end && !g_ascii_isspace (*p)) {
  1994. c = p;
  1995. state = next_state;
  1996. }
  1997. else {
  1998. p ++;
  1999. }
  2000. break;
  2001. }
  2002. }
  2003. }
  2004. static void
  2005. rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
  2006. struct html_content *hc)
  2007. {
  2008. struct html_tag_component *comp;
  2009. struct html_block *bl;
  2010. rspamd_ftok_t fstr;
  2011. GList *cur;
  2012. cur = tag->params->head;
  2013. bl = rspamd_mempool_alloc0 (pool, sizeof (*bl));
  2014. bl->tag = tag;
  2015. bl->visible = TRUE;
  2016. bl->font_size = (guint)-1;
  2017. bl->font_color.d.comp.alpha = 255;
  2018. while (cur) {
  2019. comp = cur->data;
  2020. if (comp->len > 0) {
  2021. switch (comp->type) {
  2022. case RSPAMD_HTML_COMPONENT_COLOR:
  2023. fstr.begin = (gchar *) comp->start;
  2024. fstr.len = comp->len;
  2025. rspamd_html_process_color (comp->start, comp->len,
  2026. &bl->font_color);
  2027. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  2028. break;
  2029. case RSPAMD_HTML_COMPONENT_BGCOLOR:
  2030. fstr.begin = (gchar *) comp->start;
  2031. fstr.len = comp->len;
  2032. rspamd_html_process_color (comp->start, comp->len,
  2033. &bl->background_color);
  2034. msg_debug_html ("got color: %xd", bl->font_color.d.val);
  2035. if (tag->id == Tag_BODY) {
  2036. /* Set global background color */
  2037. memcpy (&hc->bgcolor, &bl->background_color,
  2038. sizeof (hc->bgcolor));
  2039. }
  2040. break;
  2041. case RSPAMD_HTML_COMPONENT_STYLE:
  2042. bl->style.len = comp->len;
  2043. bl->style.start = comp->start;
  2044. msg_debug_html ("got style: %*s", (gint) bl->style.len,
  2045. bl->style.start);
  2046. rspamd_html_process_style (pool, bl, hc, comp->start, comp->len);
  2047. break;
  2048. case RSPAMD_HTML_COMPONENT_CLASS:
  2049. fstr.begin = (gchar *) comp->start;
  2050. fstr.len = comp->len;
  2051. bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
  2052. msg_debug_html ("got class: %s", bl->html_class);
  2053. break;
  2054. case RSPAMD_HTML_COMPONENT_SIZE:
  2055. /* Not supported by html5 */
  2056. /* FIXME maybe support it */
  2057. bl->font_size = 16;
  2058. msg_debug_html ("got size: %*s", (gint)comp->len, comp->start);
  2059. break;
  2060. default:
  2061. /* NYI */
  2062. break;
  2063. }
  2064. }
  2065. cur = g_list_next (cur);
  2066. }
  2067. if (hc->blocks == NULL) {
  2068. hc->blocks = g_ptr_array_sized_new (64);
  2069. rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
  2070. hc->blocks);
  2071. }
  2072. g_ptr_array_add (hc->blocks, bl);
  2073. tag->extra = bl;
  2074. }
  2075. static void
  2076. rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
  2077. GList **exceptions, GHashTable *urls, GHashTable *emails,
  2078. GByteArray *dest, GHashTable *target_tbl,
  2079. gint href_offset,
  2080. struct rspamd_url *url)
  2081. {
  2082. struct rspamd_url *displayed_url = NULL;
  2083. struct rspamd_url *turl;
  2084. gboolean url_found = FALSE;
  2085. struct rspamd_process_exception *ex;
  2086. if (href_offset <= 0) {
  2087. /* No dispalyed url, just some text within <a> tag */
  2088. return;
  2089. }
  2090. url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
  2091. rspamd_strlcpy (url->visible_part, dest->data + href_offset,
  2092. dest->len - href_offset + 1);
  2093. g_strstrip (url->visible_part);
  2094. rspamd_html_url_is_phished (pool, url,
  2095. dest->data + href_offset,
  2096. dest->len - href_offset,
  2097. &url_found, &displayed_url);
  2098. if (url_found) {
  2099. url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
  2100. }
  2101. if (exceptions && url_found) {
  2102. ex = rspamd_mempool_alloc (pool,
  2103. sizeof (*ex));
  2104. ex->pos = href_offset;
  2105. ex->len = dest->len - href_offset;
  2106. ex->type = RSPAMD_EXCEPTION_URL;
  2107. ex->ptr = url;
  2108. *exceptions = g_list_prepend (*exceptions,
  2109. ex);
  2110. }
  2111. if (displayed_url) {
  2112. if (displayed_url->protocol ==
  2113. PROTOCOL_MAILTO) {
  2114. target_tbl = emails;
  2115. }
  2116. else {
  2117. target_tbl = urls;
  2118. }
  2119. if (target_tbl != NULL) {
  2120. turl = g_hash_table_lookup (target_tbl,
  2121. displayed_url);
  2122. if (turl != NULL) {
  2123. /* Here, we assume the following:
  2124. * if we have a URL in the text part which
  2125. * is the same as displayed URL in the
  2126. * HTML part, we assume that it is also
  2127. * hint only.
  2128. */
  2129. if (turl->flags &
  2130. RSPAMD_URL_FLAG_FROM_TEXT) {
  2131. turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
  2132. turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
  2133. }
  2134. turl->count ++;
  2135. }
  2136. else {
  2137. g_hash_table_insert (target_tbl,
  2138. displayed_url,
  2139. displayed_url);
  2140. }
  2141. }
  2142. }
  2143. }
  2144. static gboolean
  2145. rspamd_html_propagate_lengths (GNode *node, gpointer _unused)
  2146. {
  2147. GNode *child;
  2148. struct html_tag *tag = node->data, *cld_tag;
  2149. if (tag) {
  2150. child = node->children;
  2151. /* Summarize content length from children */
  2152. while (child) {
  2153. cld_tag = child->data;
  2154. tag->content_length += cld_tag->content_length;
  2155. child = child->next;
  2156. }
  2157. }
  2158. return FALSE;
  2159. }
  2160. static void
  2161. rspamd_html_propagate_style (struct html_content *hc,
  2162. struct html_tag *tag,
  2163. struct html_block *bl,
  2164. GQueue *blocks)
  2165. {
  2166. struct html_block *bl_parent;
  2167. gboolean push_block = FALSE;
  2168. /* Propagate from the parent if needed */
  2169. bl_parent = g_queue_peek_tail (blocks);
  2170. if (bl_parent) {
  2171. if (!bl->background_color.valid) {
  2172. /* Try to propagate background color from parent nodes */
  2173. if (bl_parent->background_color.valid) {
  2174. memcpy (&bl->background_color, &bl_parent->background_color,
  2175. sizeof (bl->background_color));
  2176. }
  2177. }
  2178. else {
  2179. push_block = TRUE;
  2180. }
  2181. if (!bl->font_color.valid) {
  2182. /* Try to propagate background color from parent nodes */
  2183. if (bl_parent->font_color.valid) {
  2184. memcpy (&bl->font_color, &bl_parent->font_color,
  2185. sizeof (bl->font_color));
  2186. }
  2187. }
  2188. else {
  2189. push_block = TRUE;
  2190. }
  2191. /* Propagate font size */
  2192. if (bl->font_size == (guint)-1) {
  2193. if (bl_parent->font_size != (guint)-1) {
  2194. bl->font_size = bl_parent->font_size;
  2195. }
  2196. }
  2197. else {
  2198. push_block = TRUE;
  2199. }
  2200. }
  2201. /* Set bgcolor to the html bgcolor and font color to black as a last resort */
  2202. if (!bl->font_color.valid) {
  2203. /* Don't touch opacity as it can be set separately */
  2204. bl->font_color.d.comp.r = 0;
  2205. bl->font_color.d.comp.g = 0;
  2206. bl->font_color.d.comp.b = 0;
  2207. bl->font_color.valid = TRUE;
  2208. }
  2209. else {
  2210. push_block = TRUE;
  2211. }
  2212. if (!bl->background_color.valid) {
  2213. memcpy (&bl->background_color, &hc->bgcolor, sizeof (hc->bgcolor));
  2214. }
  2215. else {
  2216. push_block = TRUE;
  2217. }
  2218. if (bl->font_size == (guint)-1) {
  2219. bl->font_size = 16; /* Default for browsers */
  2220. }
  2221. else {
  2222. push_block = TRUE;
  2223. }
  2224. if (push_block && !(tag->flags & FL_CLOSED)) {
  2225. g_queue_push_tail (blocks, bl);
  2226. }
  2227. }
  2228. GByteArray*
  2229. rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
  2230. GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
  2231. {
  2232. const guchar *p, *c, *end, *savep = NULL;
  2233. guchar t;
  2234. gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
  2235. balanced;
  2236. GByteArray *dest;
  2237. GHashTable *target_tbl;
  2238. guint obrace = 0, ebrace = 0;
  2239. GNode *cur_level = NULL;
  2240. gint substate = 0, len, href_offset = -1;
  2241. struct html_tag *cur_tag = NULL, *content_tag = NULL;
  2242. struct rspamd_url *url = NULL, *turl;
  2243. GQueue *styles_blocks;
  2244. enum {
  2245. parse_start = 0,
  2246. tag_begin,
  2247. sgml_tag,
  2248. xml_tag,
  2249. compound_tag,
  2250. comment_tag,
  2251. comment_content,
  2252. sgml_content,
  2253. tag_content,
  2254. tag_end,
  2255. xml_tag_end,
  2256. content_ignore,
  2257. content_write,
  2258. content_ignore_sp
  2259. } state = parse_start;
  2260. g_assert (in != NULL);
  2261. g_assert (hc != NULL);
  2262. g_assert (pool != NULL);
  2263. rspamd_html_library_init ();
  2264. hc->tags_seen = rspamd_mempool_alloc0 (pool, NBYTES (G_N_ELEMENTS (tag_defs)));
  2265. /* Set white background color by default */
  2266. hc->bgcolor.d.comp.alpha = 0;
  2267. hc->bgcolor.d.comp.r = 255;
  2268. hc->bgcolor.d.comp.g = 255;
  2269. hc->bgcolor.d.comp.b = 255;
  2270. hc->bgcolor.valid = TRUE;
  2271. dest = g_byte_array_sized_new (in->len / 3 * 2);
  2272. styles_blocks = g_queue_new ();
  2273. p = in->data;
  2274. c = p;
  2275. end = p + in->len;
  2276. while (p < end) {
  2277. t = *p;
  2278. switch (state) {
  2279. case parse_start:
  2280. if (t == '<') {
  2281. state = tag_begin;
  2282. }
  2283. else {
  2284. /* We have no starting tag, so assume that it's content */
  2285. hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
  2286. state = content_write;
  2287. }
  2288. break;
  2289. case tag_begin:
  2290. switch (t) {
  2291. case '<':
  2292. p ++;
  2293. closing = FALSE;
  2294. break;
  2295. case '!':
  2296. state = sgml_tag;
  2297. p ++;
  2298. break;
  2299. case '?':
  2300. state = xml_tag;
  2301. hc->flags |= RSPAMD_HTML_FLAG_XML;
  2302. p ++;
  2303. break;
  2304. case '/':
  2305. closing = TRUE;
  2306. p ++;
  2307. break;
  2308. case '>':
  2309. /* Empty tag */
  2310. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2311. state = tag_end;
  2312. continue;
  2313. default:
  2314. state = tag_content;
  2315. substate = 0;
  2316. savep = NULL;
  2317. cur_tag = rspamd_mempool_alloc0 (pool, sizeof (*cur_tag));
  2318. cur_tag->params = g_queue_new ();
  2319. rspamd_mempool_add_destructor (pool,
  2320. (rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
  2321. break;
  2322. }
  2323. break;
  2324. case sgml_tag:
  2325. switch (t) {
  2326. case '[':
  2327. state = compound_tag;
  2328. obrace = 1;
  2329. ebrace = 0;
  2330. p ++;
  2331. break;
  2332. case '-':
  2333. state = comment_tag;
  2334. p ++;
  2335. break;
  2336. default:
  2337. state = sgml_content;
  2338. break;
  2339. }
  2340. break;
  2341. case xml_tag:
  2342. if (t == '?') {
  2343. state = xml_tag_end;
  2344. }
  2345. else if (t == '>') {
  2346. /* Misformed xml tag */
  2347. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2348. state = tag_end;
  2349. continue;
  2350. }
  2351. /* We efficiently ignore xml tags */
  2352. p ++;
  2353. break;
  2354. case xml_tag_end:
  2355. if (t == '>') {
  2356. state = tag_end;
  2357. continue;
  2358. }
  2359. else {
  2360. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2361. p ++;
  2362. }
  2363. break;
  2364. case compound_tag:
  2365. if (t == '[') {
  2366. obrace ++;
  2367. }
  2368. else if (t == ']') {
  2369. ebrace ++;
  2370. }
  2371. else if (t == '>' && obrace == ebrace) {
  2372. state = tag_end;
  2373. continue;
  2374. }
  2375. p ++;
  2376. break;
  2377. case comment_tag:
  2378. if (t != '-') {
  2379. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2380. state = tag_end;
  2381. }
  2382. else {
  2383. p++;
  2384. ebrace = 0;
  2385. /*
  2386. * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
  2387. * ... the text must not start with a single
  2388. * U+003E GREATER-THAN SIGN character (>),
  2389. * nor start with a "-" (U+002D) character followed by
  2390. * a U+003E GREATER-THAN SIGN (>) character,
  2391. * nor contain two consecutive U+002D HYPHEN-MINUS
  2392. * characters (--), nor end with a "-" (U+002D) character.
  2393. */
  2394. if (p[0] == '-' && p + 1 < end && p[1] == '>') {
  2395. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2396. p ++;
  2397. state = tag_end;
  2398. }
  2399. else if (*p == '>') {
  2400. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2401. state = tag_end;
  2402. }
  2403. else {
  2404. state = comment_content;
  2405. }
  2406. }
  2407. break;
  2408. case comment_content:
  2409. if (t == '-') {
  2410. ebrace ++;
  2411. }
  2412. else if (t == '>' && ebrace >= 2) {
  2413. state = tag_end;
  2414. continue;
  2415. }
  2416. else {
  2417. ebrace = 0;
  2418. }
  2419. p ++;
  2420. break;
  2421. case content_ignore:
  2422. if (t != '<') {
  2423. p ++;
  2424. }
  2425. else {
  2426. if (content_tag) {
  2427. if (content_tag->content == NULL) {
  2428. content_tag->content = c;
  2429. }
  2430. content_tag->content_length += p - c;
  2431. }
  2432. state = tag_begin;
  2433. }
  2434. break;
  2435. case content_write:
  2436. if (t != '<') {
  2437. if (t == '&') {
  2438. need_decode = TRUE;
  2439. }
  2440. else if (g_ascii_isspace (t)) {
  2441. save_space = TRUE;
  2442. if (p > c) {
  2443. if (need_decode) {
  2444. goffset old_offset = dest->len;
  2445. g_byte_array_append (dest, c, (p - c));
  2446. len = rspamd_html_decode_entitles_inplace (
  2447. dest->data + old_offset,
  2448. p - c);
  2449. dest->len = dest->len + len - (p - c);
  2450. }
  2451. else {
  2452. len = p - c;
  2453. g_byte_array_append (dest, c, len);
  2454. }
  2455. if (content_tag) {
  2456. if (content_tag->content == NULL) {
  2457. content_tag->content = c;
  2458. }
  2459. content_tag->content_length += p - c + 1;
  2460. }
  2461. }
  2462. c = p;
  2463. state = content_ignore_sp;
  2464. }
  2465. else {
  2466. if (save_space) {
  2467. /* Append one space if needed */
  2468. if (dest->len > 0 &&
  2469. !g_ascii_isspace (dest->data[dest->len - 1])) {
  2470. g_byte_array_append (dest, " ", 1);
  2471. }
  2472. save_space = FALSE;
  2473. }
  2474. }
  2475. }
  2476. else {
  2477. if (c != p) {
  2478. if (need_decode) {
  2479. goffset old_offset = dest->len;
  2480. g_byte_array_append (dest, c, (p - c));
  2481. len = rspamd_html_decode_entitles_inplace (
  2482. dest->data + old_offset,
  2483. p - c);
  2484. dest->len = dest->len + len - (p - c);
  2485. }
  2486. else {
  2487. len = p - c;
  2488. g_byte_array_append (dest, c, len);
  2489. }
  2490. if (content_tag) {
  2491. if (content_tag->content == NULL) {
  2492. content_tag->content = c;
  2493. }
  2494. content_tag->content_length += p - c;
  2495. }
  2496. }
  2497. content_tag = NULL;
  2498. state = tag_begin;
  2499. continue;
  2500. }
  2501. p ++;
  2502. break;
  2503. case content_ignore_sp:
  2504. if (!g_ascii_isspace (t)) {
  2505. c = p;
  2506. state = content_write;
  2507. continue;
  2508. }
  2509. if (content_tag) {
  2510. content_tag->content_length ++;
  2511. }
  2512. p ++;
  2513. break;
  2514. case sgml_content:
  2515. /* TODO: parse DOCTYPE here */
  2516. if (t == '>') {
  2517. state = tag_end;
  2518. /* We don't know a lot about sgml tags, ignore them */
  2519. cur_tag = NULL;
  2520. continue;
  2521. }
  2522. p ++;
  2523. break;
  2524. case tag_content:
  2525. rspamd_html_parse_tag_content (pool, hc, cur_tag,
  2526. p, &substate, &savep);
  2527. if (t == '>') {
  2528. if (closing) {
  2529. cur_tag->flags |= FL_CLOSING;
  2530. if (cur_tag->flags & FL_CLOSED) {
  2531. /* Bad mix of closed and closing */
  2532. hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
  2533. }
  2534. closing = FALSE;
  2535. }
  2536. state = tag_end;
  2537. continue;
  2538. }
  2539. p ++;
  2540. break;
  2541. case tag_end:
  2542. substate = 0;
  2543. savep = NULL;
  2544. if (cur_tag != NULL) {
  2545. balanced = TRUE;
  2546. if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
  2547. &balanced)) {
  2548. state = content_write;
  2549. need_decode = FALSE;
  2550. }
  2551. else {
  2552. state = content_ignore;
  2553. }
  2554. if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
  2555. if (cur_tag->flags & CM_UNIQUE) {
  2556. if (isset (hc->tags_seen, cur_tag->id)) {
  2557. /* Duplicate tag has been found */
  2558. hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
  2559. }
  2560. }
  2561. setbit (hc->tags_seen, cur_tag->id);
  2562. }
  2563. if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
  2564. content_tag = cur_tag;
  2565. }
  2566. /* Handle newlines */
  2567. if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
  2568. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2569. g_byte_array_append (dest, "\r\n", 2);
  2570. }
  2571. save_space = FALSE;
  2572. }
  2573. if ((cur_tag->id == Tag_P ||
  2574. cur_tag->id == Tag_TR ||
  2575. cur_tag->id == Tag_DIV)) {
  2576. if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
  2577. g_byte_array_append (dest, "\r\n", 2);
  2578. }
  2579. save_space = FALSE;
  2580. }
  2581. if (cur_tag->flags & FL_HREF) {
  2582. if (!(cur_tag->flags & (FL_CLOSING))) {
  2583. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2584. if (url != NULL) {
  2585. if (url->protocol == PROTOCOL_MAILTO) {
  2586. target_tbl = emails;
  2587. }
  2588. else {
  2589. target_tbl = urls;
  2590. }
  2591. if (target_tbl != NULL) {
  2592. turl = g_hash_table_lookup (target_tbl, url);
  2593. if (turl == NULL) {
  2594. g_hash_table_insert (target_tbl, url, url);
  2595. }
  2596. else {
  2597. turl->count ++;
  2598. url = NULL;
  2599. }
  2600. if (turl == NULL && url != NULL) {
  2601. rspamd_process_html_url (pool,
  2602. url,
  2603. urls, emails);
  2604. }
  2605. }
  2606. href_offset = dest->len;
  2607. }
  2608. }
  2609. if (cur_tag->id == Tag_A) {
  2610. if (!balanced && cur_level && cur_level->prev) {
  2611. struct html_tag *prev_tag;
  2612. struct rspamd_url *prev_url;
  2613. prev_tag = cur_level->prev->data;
  2614. if (prev_tag->id == Tag_A &&
  2615. !(prev_tag->flags & (FL_CLOSING)) &&
  2616. prev_tag->extra) {
  2617. prev_url = prev_tag->extra;
  2618. rspamd_html_check_displayed_url (pool,
  2619. exceptions, urls, emails,
  2620. dest, target_tbl, href_offset,
  2621. prev_url);
  2622. }
  2623. }
  2624. if (cur_tag->flags & (FL_CLOSING)) {
  2625. /* Insert exception */
  2626. if (url != NULL && (gint) dest->len > href_offset) {
  2627. rspamd_html_check_displayed_url (pool,
  2628. exceptions, urls, emails,
  2629. dest, target_tbl, href_offset,
  2630. url);
  2631. }
  2632. href_offset = -1;
  2633. url = NULL;
  2634. }
  2635. }
  2636. }
  2637. else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
  2638. struct html_tag *prev_tag = NULL;
  2639. if (cur_level && cur_level->parent) {
  2640. prev_tag = cur_level->parent->data;
  2641. }
  2642. /*
  2643. * Base is allowed only within head tag but we slightly
  2644. * relax that
  2645. */
  2646. if (!prev_tag || prev_tag->id == Tag_HEAD ||
  2647. prev_tag->id == Tag_HTML) {
  2648. url = rspamd_html_process_url_tag (pool, cur_tag, hc);
  2649. if (url != NULL) {
  2650. if (hc->base_url == NULL) {
  2651. /* We have a base tag available */
  2652. hc->base_url = url;
  2653. }
  2654. cur_tag->extra = url;
  2655. }
  2656. }
  2657. }
  2658. if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
  2659. rspamd_html_process_img_tag (pool, cur_tag, hc);
  2660. }
  2661. else if (cur_tag->flags & FL_BLOCK) {
  2662. struct html_block *bl;
  2663. if (cur_tag->flags & FL_CLOSING) {
  2664. /* Just remove block element from the queue if any */
  2665. if (styles_blocks->length > 0) {
  2666. g_queue_pop_tail (styles_blocks);
  2667. }
  2668. }
  2669. else {
  2670. rspamd_html_process_block_tag (pool, cur_tag, hc);
  2671. bl = cur_tag->extra;
  2672. if (bl) {
  2673. rspamd_html_propagate_style (hc, cur_tag,
  2674. cur_tag->extra, styles_blocks);
  2675. /* Check visibility */
  2676. if (bl->font_size < 3 ||
  2677. bl->font_color.d.comp.alpha < 10) {
  2678. bl->visible = FALSE;
  2679. msg_debug_html ("tag is not visible");
  2680. }
  2681. if (!bl->visible) {
  2682. state = content_ignore;
  2683. }
  2684. }
  2685. }
  2686. }
  2687. }
  2688. else {
  2689. state = content_write;
  2690. }
  2691. p++;
  2692. c = p;
  2693. cur_tag = NULL;
  2694. break;
  2695. }
  2696. }
  2697. if (hc->html_tags) {
  2698. g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
  2699. rspamd_html_propagate_lengths, NULL);
  2700. }
  2701. g_queue_free (styles_blocks);
  2702. return dest;
  2703. }
  2704. GByteArray*
  2705. rspamd_html_process_part (rspamd_mempool_t *pool,
  2706. struct html_content *hc,
  2707. GByteArray *in)
  2708. {
  2709. return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
  2710. }