You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "libmime/message.h"
  17. #include "re_cache.h"
  18. #include "cryptobox.h"
  19. #include "ref.h"
  20. #include "libserver/url.h"
  21. #include "libserver/task.h"
  22. #include "libserver/cfg_file.h"
  23. #include "libutil/util.h"
  24. #include "libutil/regexp.h"
  25. #include "lua/lua_common.h"
  26. #include "libstat/stat_api.h"
  27. #include "contrib/uthash/utlist.h"
  28. #include "lua/lua_classnames.h"
  29. #include "khash.h"
  30. #ifdef WITH_HYPERSCAN
  31. #include "hs.h"
  32. #include "hyperscan_tools.h"
  33. #endif
  34. #include "unix-std.h"
  35. #include <signal.h>
  36. #include <stdalign.h>
  37. #include <math.h>
  38. #include "contrib/libev/ev.h"
  39. #ifndef WITH_PCRE2
  40. #include <pcre.h>
  41. #else
  42. #include <pcre2.h>
  43. #endif
  44. #include "contrib/fastutf8/fastutf8.h"
  45. #ifdef HAVE_SYS_WAIT_H
  46. #include <sys/wait.h>
  47. #endif
  48. #define msg_err_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
  49. "re_cache", cache->hash, \
  50. RSPAMD_LOG_FUNC, \
  51. __VA_ARGS__)
  52. #define msg_warn_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
  53. "re_cache", cache->hash, \
  54. RSPAMD_LOG_FUNC, \
  55. __VA_ARGS__)
  56. #define msg_info_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
  57. "re_cache", cache->hash, \
  58. RSPAMD_LOG_FUNC, \
  59. __VA_ARGS__)
  60. #define msg_debug_re_task(...) rspamd_conditional_debug_fast(NULL, NULL, \
  61. rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \
  62. RSPAMD_LOG_FUNC, \
  63. __VA_ARGS__)
  64. #define msg_debug_re_cache(...) rspamd_conditional_debug_fast(NULL, NULL, \
  65. rspamd_re_cache_log_id, "re_cache", cache->hash, \
  66. RSPAMD_LOG_FUNC, \
  67. __VA_ARGS__)
  68. INIT_LOG_MODULE(re_cache)
  69. #ifdef WITH_HYPERSCAN
  70. #define RSPAMD_HS_MAGIC_LEN (sizeof(rspamd_hs_magic))
  71. static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'},
  72. rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'};
  73. #endif
  74. struct rspamd_re_class {
  75. guint64 id;
  76. enum rspamd_re_type type;
  77. gboolean has_utf8; /* if there are any utf8 regexps */
  78. gpointer type_data;
  79. gsize type_len;
  80. GHashTable *re;
  81. rspamd_cryptobox_hash_state_t *st;
  82. gchar hash[rspamd_cryptobox_HASHBYTES + 1];
  83. #ifdef WITH_HYPERSCAN
  84. rspamd_hyperscan_t *hs_db;
  85. hs_scratch_t *hs_scratch;
  86. gint *hs_ids;
  87. guint nhs;
  88. #endif
  89. };
  90. enum rspamd_re_cache_elt_match_type {
  91. RSPAMD_RE_CACHE_PCRE = 0,
  92. RSPAMD_RE_CACHE_HYPERSCAN,
  93. RSPAMD_RE_CACHE_HYPERSCAN_PRE
  94. };
  95. struct rspamd_re_cache_elt {
  96. rspamd_regexp_t *re;
  97. gint lua_cbref;
  98. enum rspamd_re_cache_elt_match_type match_type;
  99. };
  100. KHASH_INIT(lua_selectors_hash, gchar *, int, 1, kh_str_hash_func, kh_str_hash_equal);
  101. struct rspamd_re_cache {
  102. GHashTable *re_classes;
  103. GPtrArray *re;
  104. khash_t(lua_selectors_hash) * selectors;
  105. ref_entry_t ref;
  106. guint nre;
  107. guint max_re_data;
  108. gchar hash[rspamd_cryptobox_HASHBYTES + 1];
  109. lua_State *L;
  110. #ifdef WITH_HYPERSCAN
  111. enum rspamd_hyperscan_status hyperscan_loaded;
  112. gboolean disable_hyperscan;
  113. hs_platform_info_t plt;
  114. #endif
  115. };
  116. struct rspamd_re_selector_result {
  117. guchar **scvec;
  118. guint *lenvec;
  119. guint cnt;
  120. };
  121. KHASH_INIT(selectors_results_hash, int, struct rspamd_re_selector_result, 1,
  122. kh_int_hash_func, kh_int_hash_equal);
  123. struct rspamd_re_runtime {
  124. guchar *checked;
  125. guchar *results;
  126. khash_t(selectors_results_hash) * sel_cache;
  127. struct rspamd_re_cache *cache;
  128. struct rspamd_re_cache_stat stat;
  129. gboolean has_hs;
  130. };
  131. static GQuark
  132. rspamd_re_cache_quark(void)
  133. {
  134. return g_quark_from_static_string("re_cache");
  135. }
  136. static guint64
  137. rspamd_re_cache_class_id(enum rspamd_re_type type,
  138. gconstpointer type_data,
  139. gsize datalen)
  140. {
  141. rspamd_cryptobox_fast_hash_state_t st;
  142. rspamd_cryptobox_fast_hash_init(&st, 0xdeadbabe);
  143. rspamd_cryptobox_fast_hash_update(&st, &type, sizeof(type));
  144. if (datalen > 0) {
  145. rspamd_cryptobox_fast_hash_update(&st, type_data, datalen);
  146. }
  147. return rspamd_cryptobox_fast_hash_final(&st);
  148. }
  149. static void
  150. rspamd_re_cache_destroy(struct rspamd_re_cache *cache)
  151. {
  152. GHashTableIter it;
  153. gpointer k, v;
  154. struct rspamd_re_class *re_class;
  155. gchar *skey;
  156. gint sref;
  157. g_assert(cache != NULL);
  158. g_hash_table_iter_init(&it, cache->re_classes);
  159. while (g_hash_table_iter_next(&it, &k, &v)) {
  160. re_class = v;
  161. g_hash_table_iter_steal(&it);
  162. g_hash_table_unref(re_class->re);
  163. if (re_class->type_data) {
  164. g_free(re_class->type_data);
  165. }
  166. #ifdef WITH_HYPERSCAN
  167. if (re_class->hs_db) {
  168. rspamd_hyperscan_free(re_class->hs_db, false);
  169. }
  170. if (re_class->hs_scratch) {
  171. hs_free_scratch(re_class->hs_scratch);
  172. }
  173. if (re_class->hs_ids) {
  174. g_free(re_class->hs_ids);
  175. }
  176. #endif
  177. g_free(re_class);
  178. }
  179. if (cache->L) {
  180. kh_foreach(cache->selectors, skey, sref, {
  181. luaL_unref(cache->L, LUA_REGISTRYINDEX, sref);
  182. g_free(skey);
  183. });
  184. struct rspamd_re_cache_elt *elt;
  185. guint i;
  186. PTR_ARRAY_FOREACH(cache->re, i, elt)
  187. {
  188. if (elt->lua_cbref != -1) {
  189. luaL_unref(cache->L, LUA_REGISTRYINDEX, elt->lua_cbref);
  190. }
  191. }
  192. }
  193. kh_destroy(lua_selectors_hash, cache->selectors);
  194. g_hash_table_unref(cache->re_classes);
  195. g_ptr_array_free(cache->re, TRUE);
  196. g_free(cache);
  197. }
  198. static void
  199. rspamd_re_cache_elt_dtor(gpointer e)
  200. {
  201. struct rspamd_re_cache_elt *elt = e;
  202. rspamd_regexp_unref(elt->re);
  203. g_free(elt);
  204. }
  205. struct rspamd_re_cache *
  206. rspamd_re_cache_new(void)
  207. {
  208. struct rspamd_re_cache *cache;
  209. cache = g_malloc0(sizeof(*cache));
  210. cache->re_classes = g_hash_table_new(g_int64_hash, g_int64_equal);
  211. cache->nre = 0;
  212. cache->re = g_ptr_array_new_full(256, rspamd_re_cache_elt_dtor);
  213. cache->selectors = kh_init(lua_selectors_hash);
  214. #ifdef WITH_HYPERSCAN
  215. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN;
  216. #endif
  217. REF_INIT_RETAIN(cache, rspamd_re_cache_destroy);
  218. return cache;
  219. }
  220. enum rspamd_hyperscan_status
  221. rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache)
  222. {
  223. g_assert(cache != NULL);
  224. #ifdef WITH_HYPERSCAN
  225. return cache->hyperscan_loaded;
  226. #else
  227. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  228. #endif
  229. }
  230. rspamd_regexp_t *
  231. rspamd_re_cache_add(struct rspamd_re_cache *cache,
  232. rspamd_regexp_t *re,
  233. enum rspamd_re_type type,
  234. gconstpointer type_data, gsize datalen,
  235. gint lua_cbref)
  236. {
  237. guint64 class_id;
  238. struct rspamd_re_class *re_class;
  239. rspamd_regexp_t *nre;
  240. struct rspamd_re_cache_elt *elt;
  241. g_assert(cache != NULL);
  242. g_assert(re != NULL);
  243. class_id = rspamd_re_cache_class_id(type, type_data, datalen);
  244. re_class = g_hash_table_lookup(cache->re_classes, &class_id);
  245. if (re_class == NULL) {
  246. re_class = g_malloc0(sizeof(*re_class));
  247. re_class->id = class_id;
  248. re_class->type_len = datalen;
  249. re_class->type = type;
  250. re_class->re = g_hash_table_new_full(rspamd_regexp_hash,
  251. rspamd_regexp_equal, NULL, (GDestroyNotify) rspamd_regexp_unref);
  252. if (datalen > 0) {
  253. re_class->type_data = g_malloc0(datalen);
  254. memcpy(re_class->type_data, type_data, datalen);
  255. }
  256. g_hash_table_insert(cache->re_classes, &re_class->id, re_class);
  257. }
  258. if ((nre = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(re))) == NULL) {
  259. /*
  260. * We set re id based on the global position in the cache
  261. */
  262. elt = g_malloc0(sizeof(*elt));
  263. /* One ref for re_class */
  264. nre = rspamd_regexp_ref(re);
  265. rspamd_regexp_set_cache_id(re, cache->nre++);
  266. /* One ref for cache */
  267. elt->re = rspamd_regexp_ref(re);
  268. g_ptr_array_add(cache->re, elt);
  269. rspamd_regexp_set_class(re, re_class);
  270. elt->lua_cbref = lua_cbref;
  271. g_hash_table_insert(re_class->re, rspamd_regexp_get_id(nre), nre);
  272. }
  273. if (rspamd_regexp_get_flags(re) & RSPAMD_REGEXP_FLAG_UTF) {
  274. re_class->has_utf8 = TRUE;
  275. }
  276. return nre;
  277. }
  278. void rspamd_re_cache_replace(struct rspamd_re_cache *cache,
  279. rspamd_regexp_t *what,
  280. rspamd_regexp_t *with)
  281. {
  282. guint64 re_id;
  283. struct rspamd_re_class *re_class;
  284. rspamd_regexp_t *src;
  285. struct rspamd_re_cache_elt *elt;
  286. g_assert(cache != NULL);
  287. g_assert(what != NULL);
  288. g_assert(with != NULL);
  289. re_class = rspamd_regexp_get_class(what);
  290. if (re_class != NULL) {
  291. re_id = rspamd_regexp_get_cache_id(what);
  292. g_assert(re_id != RSPAMD_INVALID_ID);
  293. src = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(what));
  294. elt = g_ptr_array_index(cache->re, re_id);
  295. g_assert(elt != NULL);
  296. g_assert(src != NULL);
  297. rspamd_regexp_set_cache_id(what, RSPAMD_INVALID_ID);
  298. rspamd_regexp_set_class(what, NULL);
  299. rspamd_regexp_set_cache_id(with, re_id);
  300. rspamd_regexp_set_class(with, re_class);
  301. /*
  302. * On calling of this function, we actually unref old re (what)
  303. */
  304. g_hash_table_insert(re_class->re,
  305. rspamd_regexp_get_id(what),
  306. rspamd_regexp_ref(with));
  307. rspamd_regexp_unref(elt->re);
  308. elt->re = rspamd_regexp_ref(with);
  309. /* XXX: do not touch match type here */
  310. }
  311. }
  312. static gint
  313. rspamd_re_cache_sort_func(gconstpointer a, gconstpointer b)
  314. {
  315. struct rspamd_re_cache_elt *const *re1 = a, *const *re2 = b;
  316. return rspamd_regexp_cmp(rspamd_regexp_get_id((*re1)->re),
  317. rspamd_regexp_get_id((*re2)->re));
  318. }
  319. void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *cfg)
  320. {
  321. guint i, fl;
  322. GHashTableIter it;
  323. gpointer k, v;
  324. struct rspamd_re_class *re_class;
  325. rspamd_cryptobox_hash_state_t st_global;
  326. rspamd_regexp_t *re;
  327. struct rspamd_re_cache_elt *elt;
  328. guchar hash_out[rspamd_cryptobox_HASHBYTES];
  329. g_assert(cache != NULL);
  330. rspamd_cryptobox_hash_init(&st_global, NULL, 0);
  331. /* Resort all regexps */
  332. g_ptr_array_sort(cache->re, rspamd_re_cache_sort_func);
  333. for (i = 0; i < cache->re->len; i++) {
  334. elt = g_ptr_array_index(cache->re, i);
  335. re = elt->re;
  336. re_class = rspamd_regexp_get_class(re);
  337. g_assert(re_class != NULL);
  338. rspamd_regexp_set_cache_id(re, i);
  339. if (re_class->st == NULL) {
  340. (void) !posix_memalign((void **) &re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t),
  341. sizeof(*re_class->st));
  342. g_assert(re_class->st != NULL);
  343. rspamd_cryptobox_hash_init(re_class->st, NULL, 0);
  344. }
  345. /* Update hashes */
  346. /* Id of re class */
  347. rspamd_cryptobox_hash_update(re_class->st, (gpointer) &re_class->id,
  348. sizeof(re_class->id));
  349. rspamd_cryptobox_hash_update(&st_global, (gpointer) &re_class->id,
  350. sizeof(re_class->id));
  351. /* Id of re expression */
  352. rspamd_cryptobox_hash_update(re_class->st, rspamd_regexp_get_id(re),
  353. rspamd_cryptobox_HASHBYTES);
  354. rspamd_cryptobox_hash_update(&st_global, rspamd_regexp_get_id(re),
  355. rspamd_cryptobox_HASHBYTES);
  356. /* PCRE flags */
  357. fl = rspamd_regexp_get_pcre_flags(re);
  358. rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl,
  359. sizeof(fl));
  360. rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl,
  361. sizeof(fl));
  362. /* Rspamd flags */
  363. fl = rspamd_regexp_get_flags(re);
  364. rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl,
  365. sizeof(fl));
  366. rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl,
  367. sizeof(fl));
  368. /* Limit of hits */
  369. fl = rspamd_regexp_get_maxhits(re);
  370. rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl,
  371. sizeof(fl));
  372. rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl,
  373. sizeof(fl));
  374. /* Numeric order */
  375. rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &i,
  376. sizeof(i));
  377. rspamd_cryptobox_hash_update(&st_global, (const guchar *) &i,
  378. sizeof(i));
  379. }
  380. rspamd_cryptobox_hash_final(&st_global, hash_out);
  381. rspamd_snprintf(cache->hash, sizeof(cache->hash), "%*xs",
  382. (gint) rspamd_cryptobox_HASHBYTES, hash_out);
  383. /* Now finalize all classes */
  384. g_hash_table_iter_init(&it, cache->re_classes);
  385. while (g_hash_table_iter_next(&it, &k, &v)) {
  386. re_class = v;
  387. if (re_class->st) {
  388. /*
  389. * We finally update all classes with the number of expressions
  390. * in the cache to ensure that if even a single re has been changed
  391. * we won't be broken due to id mismatch
  392. */
  393. rspamd_cryptobox_hash_update(re_class->st,
  394. (gpointer) &cache->re->len,
  395. sizeof(cache->re->len));
  396. rspamd_cryptobox_hash_final(re_class->st, hash_out);
  397. rspamd_snprintf(re_class->hash, sizeof(re_class->hash), "%*xs",
  398. (gint) rspamd_cryptobox_HASHBYTES, hash_out);
  399. free(re_class->st); /* Due to posix_memalign */
  400. re_class->st = NULL;
  401. }
  402. }
  403. cache->L = cfg->lua_state;
  404. #ifdef WITH_HYPERSCAN
  405. const gchar *platform = "generic";
  406. rspamd_fstring_t *features = rspamd_fstring_new();
  407. cache->disable_hyperscan = cfg->disable_hyperscan;
  408. g_assert(hs_populate_platform(&cache->plt) == HS_SUCCESS);
  409. /* Now decode what we do have */
  410. switch (cache->plt.tune) {
  411. case HS_TUNE_FAMILY_HSW:
  412. platform = "haswell";
  413. break;
  414. case HS_TUNE_FAMILY_SNB:
  415. platform = "sandy";
  416. break;
  417. case HS_TUNE_FAMILY_BDW:
  418. platform = "broadwell";
  419. break;
  420. case HS_TUNE_FAMILY_IVB:
  421. platform = "ivy";
  422. break;
  423. default:
  424. break;
  425. }
  426. if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) {
  427. features = rspamd_fstring_append(features, "AVX2", 4);
  428. }
  429. hs_set_allocator(g_malloc, g_free);
  430. msg_info_re_cache("loaded hyperscan engine with cpu tune '%s' and features '%V'",
  431. platform, features);
  432. rspamd_fstring_free(features);
  433. #endif
  434. }
  435. struct rspamd_re_runtime *
  436. rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache)
  437. {
  438. struct rspamd_re_runtime *rt;
  439. g_assert(cache != NULL);
  440. rt = g_malloc0(sizeof(*rt) + NBYTES(cache->nre) + cache->nre);
  441. rt->cache = cache;
  442. REF_RETAIN(cache);
  443. rt->checked = ((guchar *) rt) + sizeof(*rt);
  444. rt->results = rt->checked + NBYTES(cache->nre);
  445. rt->stat.regexp_total = cache->nre;
  446. #ifdef WITH_HYPERSCAN
  447. rt->has_hs = cache->hyperscan_loaded;
  448. #endif
  449. return rt;
  450. }
  451. const struct rspamd_re_cache_stat *
  452. rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt)
  453. {
  454. g_assert(rt != NULL);
  455. return &rt->stat;
  456. }
  457. static gboolean
  458. rspamd_re_cache_check_lua_condition(struct rspamd_task *task,
  459. rspamd_regexp_t *re,
  460. const guchar *in, gsize len,
  461. goffset start, goffset end,
  462. gint lua_cbref)
  463. {
  464. lua_State *L = (lua_State *) task->cfg->lua_state;
  465. GError *err = NULL;
  466. struct rspamd_lua_text __attribute__((unused)) * t;
  467. gint text_pos;
  468. if (G_LIKELY(lua_cbref == -1)) {
  469. return TRUE;
  470. }
  471. t = lua_new_text(L, in, len, FALSE);
  472. text_pos = lua_gettop(L);
  473. if (!rspamd_lua_universal_pcall(L, lua_cbref,
  474. G_STRLOC, 1, "utii", &err,
  475. rspamd_task_classname, task,
  476. text_pos, start, end)) {
  477. msg_warn_task("cannot call for re_cache_check_lua_condition for re %s: %e",
  478. rspamd_regexp_get_pattern(re), err);
  479. g_error_free(err);
  480. lua_settop(L, text_pos - 1);
  481. return TRUE;
  482. }
  483. gboolean res = lua_toboolean(L, -1);
  484. lua_settop(L, text_pos - 1);
  485. return res;
  486. }
  487. static guint
  488. rspamd_re_cache_process_pcre(struct rspamd_re_runtime *rt,
  489. rspamd_regexp_t *re, struct rspamd_task *task,
  490. const guchar *in, gsize len,
  491. gboolean is_raw,
  492. gint lua_cbref)
  493. {
  494. guint r = 0;
  495. const gchar *start = NULL, *end = NULL;
  496. guint max_hits = rspamd_regexp_get_maxhits(re);
  497. guint64 id = rspamd_regexp_get_cache_id(re);
  498. gdouble t1 = NAN, t2, pr;
  499. const gdouble slow_time = 1e8;
  500. if (in == NULL) {
  501. return rt->results[id];
  502. }
  503. if (len == 0) {
  504. return rt->results[id];
  505. }
  506. if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) {
  507. len = rt->cache->max_re_data;
  508. }
  509. r = rt->results[id];
  510. if (max_hits == 0 || r < max_hits) {
  511. pr = rspamd_random_double_fast();
  512. if (pr > 0.9) {
  513. t1 = rspamd_get_ticks(TRUE);
  514. }
  515. while (rspamd_regexp_search(re,
  516. in,
  517. len,
  518. &start,
  519. &end,
  520. is_raw,
  521. NULL)) {
  522. if (rspamd_re_cache_check_lua_condition(task, re, in, len,
  523. start - (const gchar *) in, end - (const gchar *) in, lua_cbref)) {
  524. r++;
  525. msg_debug_re_task("found regexp /%s/, total hits: %d",
  526. rspamd_regexp_get_pattern(re), r);
  527. }
  528. if (max_hits > 0 && r >= max_hits) {
  529. break;
  530. }
  531. }
  532. rt->results[id] += r;
  533. rt->stat.regexp_checked++;
  534. rt->stat.bytes_scanned_pcre += len;
  535. rt->stat.bytes_scanned += len;
  536. if (r > 0) {
  537. rt->stat.regexp_matched += r;
  538. }
  539. if (!isnan(t1)) {
  540. t2 = rspamd_get_ticks(TRUE);
  541. if (t2 - t1 > slow_time) {
  542. rspamd_symcache_enable_profile(task);
  543. msg_info_task("regexp '%16s' took %.0f ticks to execute",
  544. rspamd_regexp_get_pattern(re), t2 - t1);
  545. }
  546. }
  547. }
  548. return r;
  549. }
  550. #ifdef WITH_HYPERSCAN
  551. struct rspamd_re_hyperscan_cbdata {
  552. struct rspamd_re_runtime *rt;
  553. const guchar **ins;
  554. const guint *lens;
  555. guint count;
  556. rspamd_regexp_t *re;
  557. struct rspamd_task *task;
  558. };
  559. static gint
  560. rspamd_re_cache_hyperscan_cb(unsigned int id,
  561. unsigned long long from,
  562. unsigned long long to,
  563. unsigned int flags,
  564. void *ud)
  565. {
  566. struct rspamd_re_hyperscan_cbdata *cbdata = ud;
  567. struct rspamd_re_runtime *rt;
  568. struct rspamd_re_cache_elt *cache_elt;
  569. guint ret, maxhits, i, processed;
  570. struct rspamd_task *task;
  571. rt = cbdata->rt;
  572. task = cbdata->task;
  573. cache_elt = g_ptr_array_index(rt->cache->re, id);
  574. maxhits = rspamd_regexp_get_maxhits(cache_elt->re);
  575. if (cache_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) {
  576. if (rspamd_re_cache_check_lua_condition(task, cache_elt->re,
  577. cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) {
  578. ret = 1;
  579. setbit(rt->checked, id);
  580. if (maxhits == 0 || rt->results[id] < maxhits) {
  581. rt->results[id] += ret;
  582. rt->stat.regexp_matched++;
  583. }
  584. msg_debug_re_task("found regexp /%s/ using hyperscan only, total hits: %d",
  585. rspamd_regexp_get_pattern(cache_elt->re), rt->results[id]);
  586. }
  587. }
  588. else {
  589. if (!isset(rt->checked, id)) {
  590. processed = 0;
  591. for (i = 0; i < cbdata->count; i++) {
  592. rspamd_re_cache_process_pcre(rt,
  593. cache_elt->re,
  594. cbdata->task,
  595. cbdata->ins[i],
  596. cbdata->lens[i],
  597. FALSE,
  598. cache_elt->lua_cbref);
  599. setbit(rt->checked, id);
  600. processed += cbdata->lens[i];
  601. if (processed >= to) {
  602. break;
  603. }
  604. }
  605. }
  606. }
  607. return 0;
  608. }
  609. #endif
  610. static guint
  611. rspamd_re_cache_process_regexp_data(struct rspamd_re_runtime *rt,
  612. rspamd_regexp_t *re, struct rspamd_task *task,
  613. const guchar **in, guint *lens,
  614. guint count,
  615. gboolean is_raw,
  616. gboolean *processed_hyperscan)
  617. {
  618. guint64 re_id;
  619. guint ret = 0;
  620. guint i;
  621. struct rspamd_re_cache_elt *cache_elt;
  622. re_id = rspamd_regexp_get_cache_id(re);
  623. if (count == 0 || in == NULL) {
  624. /* We assume this as absence of the specified data */
  625. setbit(rt->checked, re_id);
  626. rt->results[re_id] = ret;
  627. return ret;
  628. }
  629. cache_elt = (struct rspamd_re_cache_elt *) g_ptr_array_index(rt->cache->re, re_id);
  630. #ifndef WITH_HYPERSCAN
  631. for (i = 0; i < count; i++) {
  632. ret = rspamd_re_cache_process_pcre(rt,
  633. re,
  634. task,
  635. in[i],
  636. lens[i],
  637. is_raw,
  638. cache_elt->lua_cbref);
  639. rt->results[re_id] = ret;
  640. }
  641. setbit(rt->checked, re_id);
  642. #else
  643. struct rspamd_re_class *re_class;
  644. struct rspamd_re_hyperscan_cbdata cbdata;
  645. cache_elt = g_ptr_array_index(rt->cache->re, re_id);
  646. re_class = rspamd_regexp_get_class(re);
  647. if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE ||
  648. !rt->has_hs || (is_raw && re_class->has_utf8)) {
  649. for (i = 0; i < count; i++) {
  650. ret = rspamd_re_cache_process_pcre(rt,
  651. re,
  652. task,
  653. in[i],
  654. lens[i],
  655. is_raw,
  656. cache_elt->lua_cbref);
  657. }
  658. setbit(rt->checked, re_id);
  659. }
  660. else {
  661. for (i = 0; i < count; i++) {
  662. /* For Hyperscan we can probably safely disable all those limits */
  663. #if 0
  664. if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) {
  665. lens[i] = rt->cache->max_re_data;
  666. }
  667. #endif
  668. rt->stat.bytes_scanned += lens[i];
  669. }
  670. g_assert(re_class->hs_scratch != NULL);
  671. g_assert(re_class->hs_db != NULL);
  672. /* Go through hyperscan API */
  673. for (i = 0; i < count; i++) {
  674. cbdata.ins = &in[i];
  675. cbdata.re = re;
  676. cbdata.rt = rt;
  677. cbdata.lens = &lens[i];
  678. cbdata.count = 1;
  679. cbdata.task = task;
  680. if ((hs_scan(rspamd_hyperscan_get_database(re_class->hs_db),
  681. in[i], lens[i], 0,
  682. re_class->hs_scratch,
  683. rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
  684. ret = 0;
  685. }
  686. else {
  687. ret = rt->results[re_id];
  688. *processed_hyperscan = TRUE;
  689. }
  690. }
  691. }
  692. #endif
  693. return ret;
  694. }
  695. static void
  696. rspamd_re_cache_finish_class(struct rspamd_task *task,
  697. struct rspamd_re_runtime *rt,
  698. struct rspamd_re_class *re_class,
  699. const gchar *class_name)
  700. {
  701. #ifdef WITH_HYPERSCAN
  702. guint i;
  703. guint64 re_id;
  704. guint found = 0;
  705. /* Set all bits that are not checked and included in hyperscan to 1 */
  706. for (i = 0; i < re_class->nhs; i++) {
  707. re_id = re_class->hs_ids[i];
  708. if (!isset(rt->checked, re_id)) {
  709. g_assert(rt->results[re_id] == 0);
  710. rt->results[re_id] = 0;
  711. setbit(rt->checked, re_id);
  712. }
  713. else {
  714. found++;
  715. }
  716. }
  717. msg_debug_re_task("finished hyperscan for class %s; %d "
  718. "matches found; %d hyperscan supported regexps; %d total regexps",
  719. class_name, found, re_class->nhs, (gint) g_hash_table_size(re_class->re));
  720. #endif
  721. }
  722. static gboolean
  723. rspamd_re_cache_process_selector(struct rspamd_task *task,
  724. struct rspamd_re_runtime *rt,
  725. const gchar *name,
  726. guchar ***svec,
  727. guint **lenvec,
  728. guint *n)
  729. {
  730. gint ref;
  731. khiter_t k;
  732. lua_State *L;
  733. gint err_idx, ret;
  734. struct rspamd_task **ptask;
  735. gboolean result = FALSE;
  736. struct rspamd_re_cache *cache = rt->cache;
  737. struct rspamd_re_selector_result *sr;
  738. L = cache->L;
  739. k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) name);
  740. if (k == kh_end(cache->selectors)) {
  741. msg_err_task("cannot find selector %s, not registered", name);
  742. return FALSE;
  743. }
  744. ref = kh_value(cache->selectors, k);
  745. /* First, search for the cached result */
  746. if (rt->sel_cache) {
  747. k = kh_get(selectors_results_hash, rt->sel_cache, ref);
  748. if (k != kh_end(rt->sel_cache)) {
  749. sr = &kh_value(rt->sel_cache, k);
  750. *svec = sr->scvec;
  751. *lenvec = sr->lenvec;
  752. *n = sr->cnt;
  753. return TRUE;
  754. }
  755. }
  756. else {
  757. rt->sel_cache = kh_init(selectors_results_hash);
  758. }
  759. lua_pushcfunction(L, &rspamd_lua_traceback);
  760. err_idx = lua_gettop(L);
  761. lua_rawgeti(L, LUA_REGISTRYINDEX, ref);
  762. ptask = lua_newuserdata(L, sizeof(*ptask));
  763. *ptask = task;
  764. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  765. if ((ret = lua_pcall(L, 1, 1, err_idx)) != 0) {
  766. msg_err_task("call to selector %s "
  767. "failed (%d): %s",
  768. name, ret,
  769. lua_tostring(L, -1));
  770. }
  771. else {
  772. struct rspamd_lua_text *txt;
  773. gsize slen;
  774. const gchar *sel_data;
  775. if (lua_type(L, -1) != LUA_TTABLE) {
  776. txt = lua_check_text_or_string(L, -1);
  777. if (txt) {
  778. msg_debug_re_cache("re selector %s returned 1 element", name);
  779. sel_data = txt->start;
  780. slen = txt->len;
  781. *n = 1;
  782. *svec = g_malloc(sizeof(guchar *));
  783. *lenvec = g_malloc(sizeof(guint));
  784. (*svec)[0] = g_malloc(slen);
  785. memcpy((*svec)[0], sel_data, slen);
  786. (*lenvec)[0] = slen;
  787. result = TRUE;
  788. }
  789. else {
  790. msg_debug_re_cache("re selector %s returned NULL", name);
  791. }
  792. }
  793. else {
  794. *n = rspamd_lua_table_size(L, -1);
  795. msg_debug_re_cache("re selector %s returned %d elements", name, *n);
  796. if (*n > 0) {
  797. *svec = g_malloc(sizeof(guchar *) * (*n));
  798. *lenvec = g_malloc(sizeof(guint) * (*n));
  799. for (int i = 0; i < *n; i++) {
  800. lua_rawgeti(L, -1, i + 1);
  801. txt = lua_check_text_or_string(L, -1);
  802. if (txt && txt->len > 0) {
  803. sel_data = txt->start;
  804. slen = txt->len;
  805. (*svec)[i] = g_malloc(slen);
  806. memcpy((*svec)[i], sel_data, slen);
  807. }
  808. else {
  809. /* A hack to avoid malloc(0) */
  810. sel_data = "";
  811. slen = 0;
  812. (*svec)[i] = g_malloc(1);
  813. memcpy((*svec)[i], sel_data, 1);
  814. }
  815. (*lenvec)[i] = slen;
  816. lua_pop(L, 1);
  817. }
  818. }
  819. /* Empty table is also a valid result */
  820. result = TRUE;
  821. }
  822. }
  823. lua_settop(L, err_idx - 1);
  824. if (result) {
  825. k = kh_put(selectors_results_hash, rt->sel_cache, ref, &ret);
  826. sr = &kh_value(rt->sel_cache, k);
  827. sr->cnt = *n;
  828. sr->scvec = *svec;
  829. sr->lenvec = *lenvec;
  830. }
  831. return result;
  832. }
  833. static inline guint
  834. rspamd_process_words_vector(GArray *words,
  835. const guchar **scvec,
  836. guint *lenvec,
  837. struct rspamd_re_class *re_class,
  838. guint cnt,
  839. gboolean *raw)
  840. {
  841. guint j;
  842. rspamd_stat_token_t *tok;
  843. if (words) {
  844. for (j = 0; j < words->len; j++) {
  845. tok = &g_array_index(words, rspamd_stat_token_t, j);
  846. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
  847. if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
  848. if (!re_class->has_utf8) {
  849. *raw = TRUE;
  850. }
  851. else {
  852. continue; /* Skip */
  853. }
  854. }
  855. }
  856. else {
  857. continue; /* Skip non text */
  858. }
  859. if (re_class->type == RSPAMD_RE_RAWWORDS) {
  860. if (tok->original.len > 0) {
  861. scvec[cnt] = tok->original.begin;
  862. lenvec[cnt++] = tok->original.len;
  863. }
  864. }
  865. else if (re_class->type == RSPAMD_RE_WORDS) {
  866. if (tok->normalized.len > 0) {
  867. scvec[cnt] = tok->normalized.begin;
  868. lenvec[cnt++] = tok->normalized.len;
  869. }
  870. }
  871. else {
  872. /* Stemmed words */
  873. if (tok->stemmed.len > 0) {
  874. scvec[cnt] = tok->stemmed.begin;
  875. lenvec[cnt++] = tok->stemmed.len;
  876. }
  877. }
  878. }
  879. }
  880. return cnt;
  881. }
  882. static guint
  883. rspamd_re_cache_process_headers_list(struct rspamd_task *task,
  884. struct rspamd_re_runtime *rt,
  885. rspamd_regexp_t *re,
  886. struct rspamd_re_class *re_class,
  887. struct rspamd_mime_header *rh,
  888. gboolean is_strong,
  889. gboolean *processed_hyperscan)
  890. {
  891. const guchar **scvec, *in;
  892. gboolean raw = FALSE;
  893. guint *lenvec;
  894. struct rspamd_mime_header *cur;
  895. guint cnt = 0, i = 0, ret = 0;
  896. DL_COUNT(rh, cur, cnt);
  897. scvec = g_malloc(sizeof(*scvec) * cnt);
  898. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  899. DL_FOREACH(rh, cur)
  900. {
  901. if (is_strong && strcmp(cur->name, re_class->type_data) != 0) {
  902. /* Skip a different case */
  903. continue;
  904. }
  905. if (re_class->type == RSPAMD_RE_RAWHEADER) {
  906. in = (const guchar *) cur->value;
  907. lenvec[i] = strlen(cur->value);
  908. if (rspamd_fast_utf8_validate(in, lenvec[i]) != 0) {
  909. raw = TRUE;
  910. }
  911. }
  912. else {
  913. in = (const guchar *) cur->decoded;
  914. /* Validate input^W^WNo need to validate as it is already valid */
  915. if (!in) {
  916. lenvec[i] = 0;
  917. scvec[i] = (guchar *) "";
  918. continue;
  919. }
  920. lenvec[i] = strlen(in);
  921. }
  922. scvec[i] = in;
  923. i++;
  924. }
  925. if (i > 0) {
  926. ret = rspamd_re_cache_process_regexp_data(rt, re,
  927. task, scvec, lenvec, i, raw, processed_hyperscan);
  928. msg_debug_re_task("checking header %s regexp: %s=%*s -> %d",
  929. re_class->type_data,
  930. rspamd_regexp_get_pattern(re),
  931. (int) lenvec[0], scvec[0], ret);
  932. }
  933. g_free(scvec);
  934. g_free(lenvec);
  935. return ret;
  936. }
  937. /*
  938. * Calculates the specified regexp for the specified class if it's not calculated
  939. */
  940. static guint
  941. rspamd_re_cache_exec_re(struct rspamd_task *task,
  942. struct rspamd_re_runtime *rt,
  943. rspamd_regexp_t *re,
  944. struct rspamd_re_class *re_class,
  945. gboolean is_strong)
  946. {
  947. guint ret = 0, i, re_id;
  948. struct rspamd_mime_header *rh;
  949. const gchar *in;
  950. const guchar **scvec = NULL;
  951. guint *lenvec = NULL;
  952. gboolean raw = FALSE, processed_hyperscan = FALSE;
  953. struct rspamd_mime_text_part *text_part;
  954. struct rspamd_mime_part *mime_part;
  955. struct rspamd_url *url;
  956. guint len = 0, cnt = 0;
  957. const gchar *class_name;
  958. class_name = rspamd_re_cache_type_to_string(re_class->type);
  959. msg_debug_re_task("start check re type: %s: /%s/",
  960. class_name,
  961. rspamd_regexp_get_pattern(re));
  962. re_id = rspamd_regexp_get_cache_id(re);
  963. switch (re_class->type) {
  964. case RSPAMD_RE_HEADER:
  965. case RSPAMD_RE_RAWHEADER:
  966. /* Get list of specified headers */
  967. rh = rspamd_message_get_header_array(task,
  968. re_class->type_data, FALSE);
  969. if (rh) {
  970. ret = rspamd_re_cache_process_headers_list(task, rt, re,
  971. re_class, rh, is_strong, &processed_hyperscan);
  972. msg_debug_re_task("checked header(%s) regexp: %s -> %d",
  973. (const char *) re_class->type_data,
  974. rspamd_regexp_get_pattern(re),
  975. ret);
  976. }
  977. break;
  978. case RSPAMD_RE_ALLHEADER:
  979. raw = TRUE;
  980. in = MESSAGE_FIELD(task, raw_headers_content).begin;
  981. len = MESSAGE_FIELD(task, raw_headers_content).len;
  982. ret = rspamd_re_cache_process_regexp_data(rt, re,
  983. task, (const guchar **) &in, &len, 1, raw, &processed_hyperscan);
  984. msg_debug_re_task("checked allheader regexp: %s -> %d",
  985. rspamd_regexp_get_pattern(re), ret);
  986. break;
  987. case RSPAMD_RE_MIMEHEADER:
  988. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, mime_part)
  989. {
  990. if (mime_part->parent_part == NULL ||
  991. !IS_PART_MULTIPART(mime_part->parent_part) ||
  992. IS_PART_MESSAGE(mime_part)) {
  993. /* We filter parts that have no multipart parent or are a messages here */
  994. continue;
  995. }
  996. rh = rspamd_message_get_header_from_hash(mime_part->raw_headers,
  997. re_class->type_data, FALSE);
  998. if (rh) {
  999. ret += rspamd_re_cache_process_headers_list(task, rt, re,
  1000. re_class, rh, is_strong, &processed_hyperscan);
  1001. }
  1002. msg_debug_re_task("checked mime header(%s) regexp: %s -> %d",
  1003. (const char *) re_class->type_data,
  1004. rspamd_regexp_get_pattern(re),
  1005. ret);
  1006. }
  1007. break;
  1008. case RSPAMD_RE_MIME:
  1009. case RSPAMD_RE_RAWMIME:
  1010. /* Iterate through text parts */
  1011. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1012. cnt = MESSAGE_FIELD(task, text_parts)->len;
  1013. scvec = g_malloc(sizeof(*scvec) * cnt);
  1014. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1015. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1016. {
  1017. /* Select data for regexp */
  1018. if (re_class->type == RSPAMD_RE_RAWMIME) {
  1019. if (text_part->raw.len == 0) {
  1020. len = 0;
  1021. in = "";
  1022. }
  1023. else {
  1024. in = text_part->raw.begin;
  1025. len = text_part->raw.len;
  1026. }
  1027. raw = TRUE;
  1028. }
  1029. else {
  1030. /* Skip empty parts */
  1031. if (IS_TEXT_PART_EMPTY(text_part)) {
  1032. len = 0;
  1033. in = "";
  1034. }
  1035. else {
  1036. /* Check raw flags */
  1037. if (!IS_TEXT_PART_UTF(text_part)) {
  1038. raw = TRUE;
  1039. }
  1040. in = text_part->utf_content.begin;
  1041. len = text_part->utf_content.len;
  1042. }
  1043. }
  1044. scvec[i] = (guchar *) in;
  1045. lenvec[i] = len;
  1046. }
  1047. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1048. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1049. msg_debug_re_task("checked mime regexp: %s -> %d",
  1050. rspamd_regexp_get_pattern(re), ret);
  1051. g_free(scvec);
  1052. g_free(lenvec);
  1053. }
  1054. break;
  1055. case RSPAMD_RE_URL:
  1056. cnt = kh_size(MESSAGE_FIELD(task, urls));
  1057. if (cnt > 0) {
  1058. scvec = g_malloc(sizeof(*scvec) * cnt);
  1059. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1060. i = 0;
  1061. raw = FALSE;
  1062. kh_foreach_key(MESSAGE_FIELD(task, urls), url, {
  1063. if ((url->protocol & PROTOCOL_MAILTO)) {
  1064. continue;
  1065. }
  1066. in = url->string;
  1067. len = url->urllen;
  1068. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1069. scvec[i] = (guchar *) in;
  1070. lenvec[i++] = len;
  1071. }
  1072. });
  1073. /* URL regexps do not include emails, that's why the code below is commented */
  1074. #if 0
  1075. g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
  1076. while (g_hash_table_iter_next (&it, &k, &v)) {
  1077. url = v;
  1078. in = url->string;
  1079. len = url->urllen;
  1080. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1081. scvec[i] = (guchar *) in;
  1082. lenvec[i++] = len;
  1083. }
  1084. }
  1085. #endif
  1086. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1087. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1088. msg_debug_re_task("checked url regexp: %s -> %d",
  1089. rspamd_regexp_get_pattern(re), ret);
  1090. g_free(scvec);
  1091. g_free(lenvec);
  1092. }
  1093. break;
  1094. case RSPAMD_RE_EMAIL:
  1095. cnt = kh_size(MESSAGE_FIELD(task, urls));
  1096. if (cnt > 0) {
  1097. scvec = g_malloc(sizeof(*scvec) * cnt);
  1098. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1099. i = 0;
  1100. raw = FALSE;
  1101. kh_foreach_key(MESSAGE_FIELD(task, urls), url, {
  1102. if (!(url->protocol & PROTOCOL_MAILTO)) {
  1103. continue;
  1104. }
  1105. if (url->userlen == 0 || url->hostlen == 0) {
  1106. continue;
  1107. }
  1108. in = rspamd_url_user_unsafe(url);
  1109. len = url->userlen + 1 + url->hostlen;
  1110. scvec[i] = (guchar *) in;
  1111. lenvec[i++] = len;
  1112. });
  1113. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1114. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1115. msg_debug_re_task("checked email regexp: %s -> %d",
  1116. rspamd_regexp_get_pattern(re), ret);
  1117. g_free(scvec);
  1118. g_free(lenvec);
  1119. }
  1120. break;
  1121. case RSPAMD_RE_BODY:
  1122. raw = TRUE;
  1123. in = task->msg.begin;
  1124. len = task->msg.len;
  1125. ret = rspamd_re_cache_process_regexp_data(rt, re, task,
  1126. (const guchar **) &in, &len, 1, raw, &processed_hyperscan);
  1127. msg_debug_re_task("checked rawbody regexp: %s -> %d",
  1128. rspamd_regexp_get_pattern(re), ret);
  1129. break;
  1130. case RSPAMD_RE_SABODY:
  1131. /* According to SA docs:
  1132. * The 'body' in this case is the textual parts of the message body;
  1133. * any non-text MIME parts are stripped, and the message decoded from
  1134. * Quoted-Printable or Base-64-encoded format if necessary. The message
  1135. * Subject header is considered part of the body and becomes the first
  1136. * paragraph when running the rules. All HTML tags and line breaks will
  1137. * be removed before matching.
  1138. */
  1139. cnt = MESSAGE_FIELD(task, text_parts)->len + 1;
  1140. scvec = g_malloc(sizeof(*scvec) * cnt);
  1141. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1142. /*
  1143. * Body rules also include the Subject as the first line
  1144. * of the body content.
  1145. */
  1146. rh = rspamd_message_get_header_array(task, "Subject", FALSE);
  1147. if (rh) {
  1148. scvec[0] = (guchar *) rh->decoded;
  1149. lenvec[0] = strlen(rh->decoded);
  1150. }
  1151. else {
  1152. scvec[0] = (guchar *) "";
  1153. lenvec[0] = 0;
  1154. }
  1155. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1156. {
  1157. if (text_part->utf_stripped_content) {
  1158. scvec[i + 1] = (guchar *) text_part->utf_stripped_content->data;
  1159. lenvec[i + 1] = text_part->utf_stripped_content->len;
  1160. if (!IS_TEXT_PART_UTF(text_part)) {
  1161. raw = TRUE;
  1162. }
  1163. }
  1164. else {
  1165. scvec[i + 1] = (guchar *) "";
  1166. lenvec[i + 1] = 0;
  1167. }
  1168. }
  1169. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1170. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1171. msg_debug_re_task("checked sa body regexp: %s -> %d",
  1172. rspamd_regexp_get_pattern(re), ret);
  1173. g_free(scvec);
  1174. g_free(lenvec);
  1175. break;
  1176. case RSPAMD_RE_SARAWBODY:
  1177. /* According to SA docs:
  1178. * The 'raw body' of a message is the raw data inside all textual
  1179. * parts. The text will be decoded from base64 or quoted-printable
  1180. * encoding, but HTML tags and line breaks will still be present.
  1181. * Multiline expressions will need to be used to match strings that are
  1182. * broken by line breaks.
  1183. */
  1184. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1185. cnt = MESSAGE_FIELD(task, text_parts)->len;
  1186. scvec = g_malloc(sizeof(*scvec) * cnt);
  1187. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1188. for (i = 0; i < cnt; i++) {
  1189. text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i);
  1190. if (text_part->parsed.len > 0) {
  1191. scvec[i] = (guchar *) text_part->parsed.begin;
  1192. lenvec[i] = text_part->parsed.len;
  1193. if (!IS_TEXT_PART_UTF(text_part)) {
  1194. raw = TRUE;
  1195. }
  1196. }
  1197. else {
  1198. scvec[i] = (guchar *) "";
  1199. lenvec[i] = 0;
  1200. }
  1201. }
  1202. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1203. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1204. msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
  1205. rspamd_regexp_get_pattern(re), ret);
  1206. g_free(scvec);
  1207. g_free(lenvec);
  1208. }
  1209. break;
  1210. case RSPAMD_RE_WORDS:
  1211. case RSPAMD_RE_STEMWORDS:
  1212. case RSPAMD_RE_RAWWORDS:
  1213. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1214. cnt = 0;
  1215. raw = FALSE;
  1216. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1217. {
  1218. if (text_part->utf_words) {
  1219. cnt += text_part->utf_words->len;
  1220. }
  1221. }
  1222. if (task->meta_words && task->meta_words->len > 0) {
  1223. cnt += task->meta_words->len;
  1224. }
  1225. if (cnt > 0) {
  1226. scvec = g_malloc(sizeof(*scvec) * cnt);
  1227. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1228. cnt = 0;
  1229. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1230. {
  1231. if (text_part->utf_words) {
  1232. cnt = rspamd_process_words_vector(text_part->utf_words,
  1233. scvec, lenvec, re_class, cnt, &raw);
  1234. }
  1235. }
  1236. if (task->meta_words) {
  1237. cnt = rspamd_process_words_vector(task->meta_words,
  1238. scvec, lenvec, re_class, cnt, &raw);
  1239. }
  1240. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1241. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1242. msg_debug_re_task("checked sa words regexp: %s -> %d",
  1243. rspamd_regexp_get_pattern(re), ret);
  1244. g_free(scvec);
  1245. g_free(lenvec);
  1246. }
  1247. }
  1248. break;
  1249. case RSPAMD_RE_SELECTOR:
  1250. if (rspamd_re_cache_process_selector(task, rt,
  1251. re_class->type_data,
  1252. (guchar ***) &scvec,
  1253. &lenvec, &cnt)) {
  1254. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1255. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1256. msg_debug_re_task("checked selector(%s) regexp: %s -> %d",
  1257. re_class->type_data,
  1258. rspamd_regexp_get_pattern(re), ret);
  1259. /* Do not free vectors as they are managed by rt->sel_cache */
  1260. }
  1261. break;
  1262. case RSPAMD_RE_MAX:
  1263. msg_err_task("regexp of class invalid has been called: %s",
  1264. rspamd_regexp_get_pattern(re));
  1265. break;
  1266. }
  1267. #if WITH_HYPERSCAN
  1268. if (processed_hyperscan) {
  1269. rspamd_re_cache_finish_class(task, rt, re_class, class_name);
  1270. }
  1271. #endif
  1272. setbit(rt->checked, re_id);
  1273. return rt->results[re_id];
  1274. }
  1275. gint rspamd_re_cache_process(struct rspamd_task *task,
  1276. rspamd_regexp_t *re,
  1277. enum rspamd_re_type type,
  1278. gconstpointer type_data,
  1279. gsize datalen,
  1280. gboolean is_strong)
  1281. {
  1282. guint64 re_id;
  1283. struct rspamd_re_class *re_class;
  1284. struct rspamd_re_cache *cache;
  1285. struct rspamd_re_runtime *rt;
  1286. g_assert(task != NULL);
  1287. rt = task->re_rt;
  1288. g_assert(rt != NULL);
  1289. g_assert(re != NULL);
  1290. cache = rt->cache;
  1291. re_id = rspamd_regexp_get_cache_id(re);
  1292. if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) {
  1293. msg_err_task("re '%s' has no valid id for the cache",
  1294. rspamd_regexp_get_pattern(re));
  1295. return 0;
  1296. }
  1297. if (isset(rt->checked, re_id)) {
  1298. /* Fast path */
  1299. rt->stat.regexp_fast_cached++;
  1300. return rt->results[re_id];
  1301. }
  1302. else {
  1303. /* Slow path */
  1304. re_class = rspamd_regexp_get_class(re);
  1305. if (re_class == NULL) {
  1306. msg_err_task("cannot find re class for regexp '%s'",
  1307. rspamd_regexp_get_pattern(re));
  1308. return 0;
  1309. }
  1310. return rspamd_re_cache_exec_re(task, rt, re, re_class,
  1311. is_strong);
  1312. }
  1313. return 0;
  1314. }
  1315. int rspamd_re_cache_process_ffi(void *ptask,
  1316. void *pre,
  1317. int type,
  1318. void *type_data,
  1319. int is_strong)
  1320. {
  1321. struct rspamd_lua_regexp **lua_re = pre;
  1322. struct rspamd_task **real_task = ptask;
  1323. gsize typelen = 0;
  1324. if (type_data) {
  1325. typelen = strlen(type_data);
  1326. }
  1327. return rspamd_re_cache_process(*real_task, (*lua_re)->re,
  1328. type, type_data, typelen, is_strong);
  1329. }
  1330. void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt)
  1331. {
  1332. g_assert(rt != NULL);
  1333. if (rt->sel_cache) {
  1334. struct rspamd_re_selector_result sr;
  1335. kh_foreach_value(rt->sel_cache, sr, {
  1336. for (guint i = 0; i < sr.cnt; i++) {
  1337. g_free((gpointer) sr.scvec[i]);
  1338. }
  1339. g_free(sr.scvec);
  1340. g_free(sr.lenvec);
  1341. });
  1342. kh_destroy(selectors_results_hash, rt->sel_cache);
  1343. }
  1344. REF_RELEASE(rt->cache);
  1345. g_free(rt);
  1346. }
  1347. void rspamd_re_cache_unref(struct rspamd_re_cache *cache)
  1348. {
  1349. if (cache) {
  1350. REF_RELEASE(cache);
  1351. }
  1352. }
  1353. struct rspamd_re_cache *
  1354. rspamd_re_cache_ref(struct rspamd_re_cache *cache)
  1355. {
  1356. if (cache) {
  1357. REF_RETAIN(cache);
  1358. }
  1359. return cache;
  1360. }
  1361. guint rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, guint limit)
  1362. {
  1363. guint old;
  1364. g_assert(cache != NULL);
  1365. old = cache->max_re_data;
  1366. cache->max_re_data = limit;
  1367. return old;
  1368. }
  1369. const gchar *
  1370. rspamd_re_cache_type_to_string(enum rspamd_re_type type)
  1371. {
  1372. const gchar *ret = "unknown";
  1373. switch (type) {
  1374. case RSPAMD_RE_HEADER:
  1375. ret = "header";
  1376. break;
  1377. case RSPAMD_RE_RAWHEADER:
  1378. ret = "raw header";
  1379. break;
  1380. case RSPAMD_RE_MIMEHEADER:
  1381. ret = "mime header";
  1382. break;
  1383. case RSPAMD_RE_ALLHEADER:
  1384. ret = "all headers";
  1385. break;
  1386. case RSPAMD_RE_MIME:
  1387. ret = "part";
  1388. break;
  1389. case RSPAMD_RE_RAWMIME:
  1390. ret = "raw part";
  1391. break;
  1392. case RSPAMD_RE_BODY:
  1393. ret = "rawbody";
  1394. break;
  1395. case RSPAMD_RE_URL:
  1396. ret = "url";
  1397. break;
  1398. case RSPAMD_RE_EMAIL:
  1399. ret = "email";
  1400. break;
  1401. case RSPAMD_RE_SABODY:
  1402. ret = "sa body";
  1403. break;
  1404. case RSPAMD_RE_SARAWBODY:
  1405. ret = "sa raw body";
  1406. break;
  1407. case RSPAMD_RE_SELECTOR:
  1408. ret = "selector";
  1409. break;
  1410. case RSPAMD_RE_WORDS:
  1411. ret = "words";
  1412. break;
  1413. case RSPAMD_RE_RAWWORDS:
  1414. ret = "raw_words";
  1415. break;
  1416. case RSPAMD_RE_STEMWORDS:
  1417. ret = "stem_words";
  1418. break;
  1419. case RSPAMD_RE_MAX:
  1420. default:
  1421. ret = "invalid class";
  1422. break;
  1423. }
  1424. return ret;
  1425. }
  1426. enum rspamd_re_type
  1427. rspamd_re_cache_type_from_string(const char *str)
  1428. {
  1429. enum rspamd_re_type ret;
  1430. guint64 h;
  1431. /*
  1432. * To optimize this function, we apply hash to input string and
  1433. * pre-select it from the values
  1434. */
  1435. if (str != NULL) {
  1436. h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
  1437. str, strlen(str), 0xdeadbabe);
  1438. switch (h) {
  1439. case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */
  1440. ret = RSPAMD_RE_HEADER;
  1441. break;
  1442. case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */
  1443. ret = RSPAMD_RE_RAWHEADER;
  1444. break;
  1445. case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */
  1446. ret = RSPAMD_RE_MIME;
  1447. break;
  1448. case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */
  1449. ret = RSPAMD_RE_RAWMIME;
  1450. break;
  1451. case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */
  1452. case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */
  1453. ret = RSPAMD_RE_BODY;
  1454. break;
  1455. case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */
  1456. case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */
  1457. ret = RSPAMD_RE_URL;
  1458. break;
  1459. case G_GUINT64_CONSTANT(0x7e232b0f60b571be): /* email */
  1460. ret = RSPAMD_RE_EMAIL;
  1461. break;
  1462. case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */
  1463. ret = RSPAMD_RE_ALLHEADER;
  1464. break;
  1465. case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */
  1466. ret = RSPAMD_RE_MIMEHEADER;
  1467. break;
  1468. case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */
  1469. ret = RSPAMD_RE_SABODY;
  1470. break;
  1471. case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */
  1472. ret = RSPAMD_RE_SARAWBODY;
  1473. break;
  1474. default:
  1475. ret = RSPAMD_RE_MAX;
  1476. break;
  1477. }
  1478. }
  1479. else {
  1480. ret = RSPAMD_RE_MAX;
  1481. }
  1482. return ret;
  1483. }
  1484. #ifdef WITH_HYPERSCAN
  1485. static gchar *
  1486. rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t *re)
  1487. {
  1488. /*
  1489. * Workaround for bug in ragel 7.0.0.11
  1490. * https://github.com/intel/hyperscan/issues/133
  1491. */
  1492. const gchar *pat = rspamd_regexp_get_pattern(re);
  1493. guint flags = rspamd_regexp_get_flags(re), esc_flags = RSPAMD_REGEXP_ESCAPE_RE;
  1494. gchar *escaped;
  1495. gsize esc_len;
  1496. if (flags & RSPAMD_REGEXP_FLAG_UTF) {
  1497. esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  1498. }
  1499. escaped = rspamd_str_regexp_escape(pat, strlen(pat), &esc_len, esc_flags);
  1500. return escaped;
  1501. }
  1502. static gboolean
  1503. rspamd_re_cache_is_finite(struct rspamd_re_cache *cache,
  1504. rspamd_regexp_t *re, gint flags, gdouble max_time)
  1505. {
  1506. pid_t cld;
  1507. gint status;
  1508. struct timespec ts;
  1509. hs_compile_error_t *hs_errors;
  1510. hs_database_t *test_db;
  1511. gdouble wait_time;
  1512. const gint max_tries = 10;
  1513. gint tries = 0, rc;
  1514. void (*old_hdl)(int);
  1515. wait_time = max_time / max_tries;
  1516. /* We need to restore SIGCHLD processing */
  1517. old_hdl = signal(SIGCHLD, SIG_DFL);
  1518. cld = fork();
  1519. if (cld == 0) {
  1520. /* Try to compile pattern */
  1521. gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re);
  1522. if (hs_compile(pat,
  1523. flags | HS_FLAG_PREFILTER,
  1524. HS_MODE_BLOCK,
  1525. &cache->plt,
  1526. &test_db,
  1527. &hs_errors) != HS_SUCCESS) {
  1528. msg_info_re_cache("cannot compile (prefilter mode) '%s' to hyperscan: '%s'",
  1529. pat,
  1530. hs_errors != NULL ? hs_errors->message : "unknown error");
  1531. hs_free_compile_error(hs_errors);
  1532. g_free(pat);
  1533. exit(EXIT_FAILURE);
  1534. }
  1535. g_free(pat);
  1536. exit(EXIT_SUCCESS);
  1537. }
  1538. else if (cld > 0) {
  1539. double_to_ts(wait_time, &ts);
  1540. while ((rc = waitpid(cld, &status, WNOHANG)) == 0 && tries++ < max_tries) {
  1541. (void) nanosleep(&ts, NULL);
  1542. }
  1543. /* Child has been terminated */
  1544. if (rc > 0) {
  1545. /* Forget about SIGCHLD after this point */
  1546. signal(SIGCHLD, old_hdl);
  1547. if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS) {
  1548. return TRUE;
  1549. }
  1550. else {
  1551. msg_err_re_cache(
  1552. "cannot approximate %s to hyperscan",
  1553. rspamd_regexp_get_pattern(re));
  1554. return FALSE;
  1555. }
  1556. }
  1557. else {
  1558. /* We consider that as timeout */
  1559. kill(cld, SIGKILL);
  1560. g_assert(waitpid(cld, &status, 0) != -1);
  1561. msg_err_re_cache(
  1562. "cannot approximate %s to hyperscan: timeout waiting",
  1563. rspamd_regexp_get_pattern(re));
  1564. signal(SIGCHLD, old_hdl);
  1565. }
  1566. }
  1567. else {
  1568. msg_err_re_cache(
  1569. "cannot approximate %s to hyperscan: fork failed: %s",
  1570. rspamd_regexp_get_pattern(re), strerror(errno));
  1571. signal(SIGCHLD, old_hdl);
  1572. }
  1573. return FALSE;
  1574. }
  1575. #endif
  1576. #ifdef WITH_HYPERSCAN
/*
 * State carried by the timer-driven hyperscan compilation: each timer tick
 * takes the next regexp class from the iterator and processes it.
 */
struct rspamd_re_cache_hs_compile_cbdata {
	GHashTableIter it;             /* iterator over the regexp classes still to process */
	struct rspamd_re_cache *cache; /* cache whose classes are being compiled */
	const char *cache_dir;         /* directory where the per-class files are placed */
	gdouble max_time;              /* presumably the per-regexp compile time limit - TODO confirm */
	gboolean silent;               /* suppresses the informational "skip already valid class" messages */
	guint total;                   /* counter reported to `cb` as its first argument */
	void (*cb)(guint ncompiled, GError *err, void *cbd); /* invoked on errors and on completion */
	void *cbd;                     /* opaque user data passed back to `cb` */
};
  1587. static void
  1588. rspamd_re_cache_compile_err(EV_P_ ev_timer *w, GError *err,
  1589. struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal)
  1590. {
  1591. cbdata->cb(cbdata->total, err, cbdata->cbd);
  1592. if (is_fatal) {
  1593. ev_timer_stop(EV_A_ w);
  1594. g_free(w);
  1595. g_free(cbdata);
  1596. }
  1597. else {
  1598. /* Continue compilation */
  1599. ev_timer_again(EV_A_ w);
  1600. }
  1601. g_error_free(err);
  1602. }
  1603. static void
  1604. rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents)
  1605. {
  1606. struct rspamd_re_cache_hs_compile_cbdata *cbdata =
  1607. (struct rspamd_re_cache_hs_compile_cbdata *) w->data;
  1608. GHashTableIter cit;
  1609. gpointer k, v;
  1610. struct rspamd_re_class *re_class;
  1611. gchar path[PATH_MAX], npath[PATH_MAX];
  1612. hs_database_t *test_db;
  1613. gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
  1614. rspamd_cryptobox_fast_hash_state_t crc_st;
  1615. guint64 crc;
  1616. rspamd_regexp_t *re;
  1617. hs_compile_error_t *hs_errors = NULL;
  1618. guint *hs_flags = NULL;
  1619. const hs_expr_ext_t **hs_exts = NULL;
  1620. gchar **hs_pats = NULL;
  1621. gchar *hs_serialized = NULL;
  1622. gsize serialized_len;
  1623. struct iovec iov[7];
  1624. struct rspamd_re_cache *cache;
  1625. GError *err;
  1626. pid_t our_pid = getpid();
  1627. cache = cbdata->cache;
  1628. if (!g_hash_table_iter_next(&cbdata->it, &k, &v)) {
  1629. /* All done */
  1630. ev_timer_stop(EV_A_ w);
  1631. cbdata->cb(cbdata->total, NULL, cbdata->cbd);
  1632. g_free(w);
  1633. g_free(cbdata);
  1634. return;
  1635. }
  1636. re_class = v;
  1637. rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cbdata->cache_dir,
  1638. G_DIR_SEPARATOR, re_class->hash);
  1639. if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, TRUE, TRUE, NULL)) {
  1640. fd = open(path, O_RDONLY, 00600);
  1641. /* Read number of regexps */
  1642. g_assert(fd != -1);
  1643. g_assert(lseek(fd, RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt), SEEK_SET) != -1);
  1644. g_assert(read(fd, &n, sizeof(n)) == sizeof(n));
  1645. close(fd);
  1646. if (re_class->type_len > 0) {
  1647. if (!cbdata->silent) {
  1648. msg_info_re_cache(
  1649. "skip already valid class %s(%*s) to cache %6s, %d regexps",
  1650. rspamd_re_cache_type_to_string(re_class->type),
  1651. (gint) re_class->type_len - 1,
  1652. re_class->type_data,
  1653. re_class->hash,
  1654. n);
  1655. }
  1656. }
  1657. else {
  1658. if (!cbdata->silent) {
  1659. msg_info_re_cache(
  1660. "skip already valid class %s to cache %6s, %d regexps",
  1661. rspamd_re_cache_type_to_string(re_class->type),
  1662. re_class->hash,
  1663. n);
  1664. }
  1665. }
  1666. ev_timer_again(EV_A_ w);
  1667. return;
  1668. }
  1669. rspamd_snprintf(path, sizeof(path), "%s%c%s%P-XXXXXXXXXX", cbdata->cache_dir,
  1670. G_DIR_SEPARATOR, re_class->hash, our_pid);
  1671. fd = g_mkstemp_full(path, O_CREAT | O_TRUNC | O_EXCL | O_WRONLY, 00600);
  1672. if (fd == -1) {
  1673. err = g_error_new(rspamd_re_cache_quark(), errno,
  1674. "cannot open file %s: %s", path, strerror(errno));
  1675. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1676. return;
  1677. }
  1678. g_hash_table_iter_init(&cit, re_class->re);
  1679. n = g_hash_table_size(re_class->re);
  1680. hs_flags = g_new0(guint, n);
  1681. hs_ids = g_new0(guint, n);
  1682. hs_pats = g_new0(char *, n);
  1683. hs_exts = g_new0(const hs_expr_ext_t *, n);
  1684. i = 0;
  1685. while (g_hash_table_iter_next(&cit, &k, &v)) {
  1686. re = v;
  1687. pcre_flags = rspamd_regexp_get_pcre_flags(re);
  1688. re_flags = rspamd_regexp_get_flags(re);
  1689. if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
  1690. /* Do not try to compile bad regexp */
  1691. msg_info_re_cache(
  1692. "do not try compile %s to hyperscan as it is PCRE only",
  1693. rspamd_regexp_get_pattern(re));
  1694. continue;
  1695. }
  1696. hs_flags[i] = 0;
  1697. hs_exts[i] = NULL;
  1698. #ifndef WITH_PCRE2
  1699. if (pcre_flags & PCRE_FLAG(UTF8)) {
  1700. hs_flags[i] |= HS_FLAG_UTF8;
  1701. }
  1702. #else
  1703. if (pcre_flags & PCRE_FLAG(UTF)) {
  1704. hs_flags[i] |= HS_FLAG_UTF8;
  1705. }
  1706. #endif
  1707. if (pcre_flags & PCRE_FLAG(CASELESS)) {
  1708. hs_flags[i] |= HS_FLAG_CASELESS;
  1709. }
  1710. if (pcre_flags & PCRE_FLAG(MULTILINE)) {
  1711. hs_flags[i] |= HS_FLAG_MULTILINE;
  1712. }
  1713. if (pcre_flags & PCRE_FLAG(DOTALL)) {
  1714. hs_flags[i] |= HS_FLAG_DOTALL;
  1715. }
  1716. if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) {
  1717. hs_flags[i] |= HS_FLAG_SOM_LEFTMOST;
  1718. }
  1719. else if (rspamd_regexp_get_maxhits(re) == 1) {
  1720. hs_flags[i] |= HS_FLAG_SINGLEMATCH;
  1721. }
  1722. gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re);
  1723. if (hs_compile(pat,
  1724. hs_flags[i],
  1725. HS_MODE_BLOCK,
  1726. &cache->plt,
  1727. &test_db,
  1728. &hs_errors) != HS_SUCCESS) {
  1729. msg_info_re_cache("cannot compile '%s' to hyperscan: '%s', try prefilter match",
  1730. pat,
  1731. hs_errors != NULL ? hs_errors->message : "unknown error");
  1732. hs_free_compile_error(hs_errors);
  1733. /* The approximation operation might take a significant
  1734. * amount of time, so we need to check if it's finite
  1735. */
  1736. if (rspamd_re_cache_is_finite(cache, re, hs_flags[i], cbdata->max_time)) {
  1737. hs_flags[i] |= HS_FLAG_PREFILTER;
  1738. hs_ids[i] = rspamd_regexp_get_cache_id(re);
  1739. hs_pats[i] = pat;
  1740. i++;
  1741. }
  1742. else {
  1743. g_free(pat); /* Avoid leak */
  1744. }
  1745. }
  1746. else {
  1747. hs_ids[i] = rspamd_regexp_get_cache_id(re);
  1748. hs_pats[i] = pat;
  1749. i++;
  1750. hs_free_database(test_db);
  1751. }
  1752. }
  1753. /* Adjust real re number */
  1754. n = i;
  1755. #define CLEANUP_ALLOCATED(is_err) \
  1756. do { \
  1757. g_free(hs_flags); \
  1758. g_free(hs_ids); \
  1759. for (guint j = 0; j < i; j++) { \
  1760. g_free(hs_pats[j]); \
  1761. } \
  1762. g_free(hs_pats); \
  1763. g_free(hs_exts); \
  1764. if (is_err) { \
  1765. close(fd); \
  1766. unlink(path); \
  1767. if (hs_errors) hs_free_compile_error(hs_errors); \
  1768. } \
  1769. } while (0)
  1770. if (n > 0) {
  1771. /* Create the hs tree */
  1772. hs_errors = NULL;
  1773. if (hs_compile_ext_multi((const char **) hs_pats,
  1774. hs_flags,
  1775. hs_ids,
  1776. hs_exts,
  1777. n,
  1778. HS_MODE_BLOCK,
  1779. &cache->plt,
  1780. &test_db,
  1781. &hs_errors) != HS_SUCCESS) {
  1782. err = g_error_new(rspamd_re_cache_quark(), EINVAL,
  1783. "cannot create tree of regexp when processing '%s': %s",
  1784. hs_pats[hs_errors->expression], hs_errors->message);
  1785. CLEANUP_ALLOCATED(true);
  1786. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1787. return;
  1788. }
  1789. if (hs_serialize_database(test_db, &hs_serialized,
  1790. &serialized_len) != HS_SUCCESS) {
  1791. err = g_error_new(rspamd_re_cache_quark(),
  1792. errno,
  1793. "cannot serialize tree of regexp for %s",
  1794. re_class->hash);
  1795. CLEANUP_ALLOCATED(true);
  1796. hs_free_database(test_db);
  1797. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1798. return;
  1799. }
  1800. hs_free_database(test_db);
  1801. /*
  1802. * Magic - 8 bytes
  1803. * Platform - sizeof (platform)
  1804. * n - number of regexps
  1805. * n * <regexp ids>
  1806. * n * <regexp flags>
  1807. * crc - 8 bytes checksum
  1808. * <hyperscan blob>
  1809. */
  1810. rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
  1811. /* IDs -> Flags -> Hs blob */
  1812. rspamd_cryptobox_fast_hash_update(&crc_st,
  1813. hs_ids, sizeof(*hs_ids) * n);
  1814. rspamd_cryptobox_fast_hash_update(&crc_st,
  1815. hs_flags, sizeof(*hs_flags) * n);
  1816. rspamd_cryptobox_fast_hash_update(&crc_st,
  1817. hs_serialized, serialized_len);
  1818. crc = rspamd_cryptobox_fast_hash_final(&crc_st);
  1819. iov[0].iov_base = (void *) rspamd_hs_magic;
  1820. iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
  1821. iov[1].iov_base = &cache->plt;
  1822. iov[1].iov_len = sizeof(cache->plt);
  1823. iov[2].iov_base = &n;
  1824. iov[2].iov_len = sizeof(n);
  1825. iov[3].iov_base = hs_ids;
  1826. iov[3].iov_len = sizeof(*hs_ids) * n;
  1827. iov[4].iov_base = hs_flags;
  1828. iov[4].iov_len = sizeof(*hs_flags) * n;
  1829. iov[5].iov_base = &crc;
  1830. iov[5].iov_len = sizeof(crc);
  1831. iov[6].iov_base = hs_serialized;
  1832. iov[6].iov_len = serialized_len;
  1833. if (writev(fd, iov, G_N_ELEMENTS(iov)) == -1) {
  1834. err = g_error_new(rspamd_re_cache_quark(),
  1835. errno,
  1836. "cannot serialize tree of regexp to %s: %s",
  1837. path, strerror(errno));
  1838. CLEANUP_ALLOCATED(true);
  1839. g_free(hs_serialized);
  1840. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1841. return;
  1842. }
  1843. if (re_class->type_len > 0) {
  1844. msg_info_re_cache(
  1845. "compiled class %s(%*s) to cache %6s, %d/%d regexps",
  1846. rspamd_re_cache_type_to_string(re_class->type),
  1847. (gint) re_class->type_len - 1,
  1848. re_class->type_data,
  1849. re_class->hash,
  1850. n,
  1851. (gint) g_hash_table_size(re_class->re));
  1852. }
  1853. else {
  1854. msg_info_re_cache(
  1855. "compiled class %s to cache %6s, %d/%d regexps",
  1856. rspamd_re_cache_type_to_string(re_class->type),
  1857. re_class->hash,
  1858. n,
  1859. (gint) g_hash_table_size(re_class->re));
  1860. }
  1861. cbdata->total += n;
  1862. CLEANUP_ALLOCATED(false);
  1863. /* Now rename temporary file to the new .hs file */
  1864. rspamd_snprintf(npath, sizeof(npath), "%s%c%s.hs", cbdata->cache_dir,
  1865. G_DIR_SEPARATOR, re_class->hash);
  1866. if (rename(path, npath) == -1) {
  1867. err = g_error_new(rspamd_re_cache_quark(),
  1868. errno,
  1869. "cannot rename %s to %s: %s",
  1870. path, npath, strerror(errno));
  1871. unlink(path);
  1872. close(fd);
  1873. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1874. return;
  1875. }
  1876. close(fd);
  1877. }
  1878. else {
  1879. err = g_error_new(rspamd_re_cache_quark(),
  1880. errno,
  1881. "no suitable regular expressions %s (%d original): "
  1882. "remove temporary file %s",
  1883. rspamd_re_cache_type_to_string(re_class->type),
  1884. (gint) g_hash_table_size(re_class->re),
  1885. path);
  1886. CLEANUP_ALLOCATED(true);
  1887. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1888. return;
  1889. }
  1890. /* Continue process */
  1891. ev_timer_again(EV_A_ w);
  1892. }
  1893. #endif
  1894. gint rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache,
  1895. const char *cache_dir,
  1896. gdouble max_time,
  1897. gboolean silent,
  1898. struct ev_loop *event_loop,
  1899. void (*cb)(guint ncompiled, GError *err, void *cbd),
  1900. void *cbd)
  1901. {
  1902. g_assert(cache != NULL);
  1903. g_assert(cache_dir != NULL);
  1904. #ifndef WITH_HYPERSCAN
  1905. return -1;
  1906. #else
  1907. static ev_timer *timer;
  1908. static const ev_tstamp timer_interval = 0.1;
  1909. struct rspamd_re_cache_hs_compile_cbdata *cbdata;
  1910. cbdata = g_malloc0(sizeof(*cbdata));
  1911. g_hash_table_iter_init(&cbdata->it, cache->re_classes);
  1912. cbdata->cache = cache;
  1913. cbdata->cache_dir = cache_dir;
  1914. cbdata->cb = cb;
  1915. cbdata->cbd = cbd;
  1916. cbdata->max_time = max_time;
  1917. cbdata->silent = silent;
  1918. cbdata->total = 0;
  1919. timer = g_malloc0(sizeof(*timer));
  1920. timer->data = (void *) cbdata; /* static */
  1921. ev_timer_init(timer, rspamd_re_cache_compile_timer_cb,
  1922. timer_interval, timer_interval);
  1923. ev_timer_start(event_loop, timer);
  1924. return 0;
  1925. #endif
  1926. }
  1927. gboolean
  1928. rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache,
  1929. const char *path, gboolean silent, gboolean try_load, GError **err)
  1930. {
  1931. g_assert(cache != NULL);
  1932. g_assert(path != NULL);
  1933. #ifndef WITH_HYPERSCAN
  1934. return FALSE;
  1935. #else
  1936. gint fd, n, ret;
  1937. guchar magicbuf[RSPAMD_HS_MAGIC_LEN];
  1938. const guchar *mb;
  1939. GHashTableIter it;
  1940. gpointer k, v;
  1941. struct rspamd_re_class *re_class;
  1942. gsize len;
  1943. const gchar *hash_pos;
  1944. hs_platform_info_t test_plt;
  1945. hs_database_t *test_db = NULL;
  1946. guchar *map, *p, *end;
  1947. rspamd_cryptobox_fast_hash_state_t crc_st;
  1948. guint64 crc, valid_crc;
  1949. len = strlen(path);
  1950. if (len < sizeof(rspamd_cryptobox_HASHBYTES + 3)) {
  1951. if (!silent) {
  1952. msg_err_re_cache("cannot open hyperscan cache file %s: too short filename",
  1953. path);
  1954. }
  1955. g_set_error(err, rspamd_re_cache_quark(), 0,
  1956. "too short filename");
  1957. return FALSE;
  1958. }
  1959. if (memcmp(path + len - 3, ".hs", 3) != 0) {
  1960. if (!silent) {
  1961. msg_err_re_cache("cannot open hyperscan cache file %s: not ending with .hs",
  1962. path);
  1963. }
  1964. g_set_error(err, rspamd_re_cache_quark(), 0,
  1965. "not ending with .hs");
  1966. return FALSE;
  1967. }
  1968. hash_pos = path + len - 3 - (sizeof(re_class->hash) - 1);
  1969. g_hash_table_iter_init(&it, cache->re_classes);
  1970. while (g_hash_table_iter_next(&it, &k, &v)) {
  1971. re_class = v;
  1972. if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) {
  1973. /* Open file and check magic */
  1974. gssize r;
  1975. fd = open(path, O_RDONLY);
  1976. if (fd == -1) {
  1977. if (errno != ENOENT || !silent) {
  1978. msg_err_re_cache("cannot open hyperscan cache file %s: %s",
  1979. path, strerror(errno));
  1980. }
  1981. g_set_error(err, rspamd_re_cache_quark(), 0,
  1982. "%s",
  1983. strerror(errno));
  1984. return FALSE;
  1985. }
  1986. if ((r = read(fd, magicbuf, sizeof(magicbuf))) != sizeof(magicbuf)) {
  1987. if (r == -1) {
  1988. msg_err_re_cache("cannot read magic from hyperscan "
  1989. "cache file %s: %s",
  1990. path, strerror(errno));
  1991. g_set_error(err, rspamd_re_cache_quark(), 0,
  1992. "cannot read magic: %s",
  1993. strerror(errno));
  1994. }
  1995. else {
  1996. msg_err_re_cache("truncated read magic from hyperscan "
  1997. "cache file %s: %z, %z wanted",
  1998. path, r, (gsize) sizeof(magicbuf));
  1999. g_set_error(err, rspamd_re_cache_quark(), 0,
  2000. "truncated read magic %zd, %zd wanted",
  2001. r, (gsize) sizeof(magicbuf));
  2002. }
  2003. close(fd);
  2004. return FALSE;
  2005. }
  2006. mb = rspamd_hs_magic;
  2007. if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) {
  2008. msg_err_re_cache("cannot open hyperscan cache file %s: "
  2009. "bad magic ('%*xs', '%*xs' expected)",
  2010. path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf,
  2011. (int) RSPAMD_HS_MAGIC_LEN, mb);
  2012. close(fd);
  2013. g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic");
  2014. return FALSE;
  2015. }
  2016. if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) {
  2017. if (r == -1) {
  2018. msg_err_re_cache("cannot read platform data from hyperscan "
  2019. "cache file %s: %s",
  2020. path, strerror(errno));
  2021. }
  2022. else {
  2023. msg_err_re_cache("truncated read platform data from hyperscan "
  2024. "cache file %s: %z, %z wanted",
  2025. path, r, (gsize) sizeof(magicbuf));
  2026. }
  2027. g_set_error(err, rspamd_re_cache_quark(), 0,
  2028. "cannot read platform data: %s", strerror(errno));
  2029. close(fd);
  2030. return FALSE;
  2031. }
  2032. if (test_plt.cpu_features != cache->plt.cpu_features) {
  2033. msg_err_re_cache("cannot open hyperscan cache file %s: "
  2034. "compiled for a different platform",
  2035. path);
  2036. g_set_error(err, rspamd_re_cache_quark(), 0,
  2037. "compiled for a different platform");
  2038. close(fd);
  2039. return FALSE;
  2040. }
  2041. close(fd);
  2042. if (try_load) {
  2043. map = rspamd_file_xmap(path, PROT_READ, &len, TRUE);
  2044. if (map == NULL) {
  2045. msg_err_re_cache("cannot mmap hyperscan cache file %s: "
  2046. "%s",
  2047. path, strerror(errno));
  2048. g_set_error(err, rspamd_re_cache_quark(), 0,
  2049. "mmap error: %s", strerror(errno));
  2050. return FALSE;
  2051. }
  2052. p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt);
  2053. end = map + len;
  2054. memcpy(&n, p, sizeof(n));
  2055. p += sizeof(gint);
  2056. if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */
  2057. sizeof(guint64) + /* crc */
  2058. RSPAMD_HS_MAGIC_LEN + /* header */
  2059. sizeof(cache->plt) >
  2060. len) {
  2061. /* Some wrong amount of regexps */
  2062. msg_err_re_cache("bad number of expressions in %s: %d",
  2063. path, n);
  2064. g_set_error(err, rspamd_re_cache_quark(), 0,
  2065. "bad number of expressions: %d", n);
  2066. munmap(map, len);
  2067. return FALSE;
  2068. }
  2069. /*
  2070. * Magic - 8 bytes
  2071. * Platform - sizeof (platform)
  2072. * n - number of regexps
  2073. * n * <regexp ids>
  2074. * n * <regexp flags>
  2075. * crc - 8 bytes checksum
  2076. * <hyperscan blob>
  2077. */
  2078. memcpy(&crc, p + n * 2 * sizeof(gint), sizeof(crc));
  2079. rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
  2080. /* IDs */
  2081. rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(gint));
  2082. /* Flags */
  2083. rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(gint),
  2084. n * sizeof(gint));
  2085. /* HS database */
  2086. p += n * sizeof(gint) * 2 + sizeof(guint64);
  2087. rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p);
  2088. valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
  2089. if (crc != valid_crc) {
  2090. msg_warn_re_cache("outdated or invalid hs database in %s: "
  2091. "crc read %xL, crc expected %xL",
  2092. path, crc, valid_crc);
  2093. g_set_error(err, rspamd_re_cache_quark(), 0,
  2094. "outdated or invalid hs database, crc check failure");
  2095. munmap(map, len);
  2096. return FALSE;
  2097. }
  2098. if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) {
  2099. msg_err_re_cache("bad hs database in %s: %d", path, ret);
  2100. g_set_error(err, rspamd_re_cache_quark(), 0,
  2101. "deserialize error: %d", ret);
  2102. munmap(map, len);
  2103. return FALSE;
  2104. }
  2105. hs_free_database(test_db);
  2106. munmap(map, len);
  2107. }
  2108. /* XXX: add crc check */
  2109. return TRUE;
  2110. }
  2111. }
  2112. if (!silent) {
  2113. msg_warn_re_cache("unknown hyperscan cache file %s", path);
  2114. }
  2115. g_set_error(err, rspamd_re_cache_quark(), 0,
  2116. "unknown hyperscan file");
  2117. return FALSE;
  2118. #endif
  2119. }
  2120. enum rspamd_hyperscan_status
  2121. rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache,
  2122. const char *cache_dir, bool try_load)
  2123. {
  2124. g_assert(cache != NULL);
  2125. g_assert(cache_dir != NULL);
  2126. #ifndef WITH_HYPERSCAN
  2127. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  2128. #else
  2129. gchar path[PATH_MAX];
  2130. gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret;
  2131. GHashTableIter it;
  2132. gpointer k, v;
  2133. guint8 *map, *p;
  2134. struct rspamd_re_class *re_class;
  2135. struct rspamd_re_cache_elt *elt;
  2136. struct stat st;
  2137. gboolean has_valid = FALSE, all_valid = FALSE;
  2138. g_hash_table_iter_init(&it, cache->re_classes);
  2139. while (g_hash_table_iter_next(&it, &k, &v)) {
  2140. re_class = v;
  2141. rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cache_dir,
  2142. G_DIR_SEPARATOR, re_class->hash);
  2143. if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, try_load, FALSE, NULL)) {
  2144. msg_debug_re_cache("load hyperscan database from '%s'",
  2145. re_class->hash);
  2146. fd = open(path, O_RDONLY);
  2147. /* Read number of regexps */
  2148. g_assert(fd != -1);
  2149. fstat(fd, &st);
  2150. map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
  2151. if (map == MAP_FAILED) {
  2152. if (!try_load) {
  2153. msg_err_re_cache("cannot mmap %s: %s", path, strerror(errno));
  2154. }
  2155. else {
  2156. msg_debug_re_cache("cannot mmap %s: %s", path, strerror(errno));
  2157. }
  2158. close(fd);
  2159. all_valid = FALSE;
  2160. continue;
  2161. }
  2162. close(fd);
  2163. p = map + RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt);
  2164. n = *(gint *) p;
  2165. if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */
  2166. sizeof(guint64) + /* crc */
  2167. RSPAMD_HS_MAGIC_LEN + /* header */
  2168. sizeof(cache->plt) >
  2169. (gsize) st.st_size) {
  2170. /* Some wrong amount of regexps */
  2171. if (!try_load) {
  2172. msg_err_re_cache("bad number of expressions in %s: %d",
  2173. path, n);
  2174. }
  2175. else {
  2176. msg_debug_re_cache("bad number of expressions in %s: %d",
  2177. path, n);
  2178. }
  2179. munmap(map, st.st_size);
  2180. all_valid = FALSE;
  2181. continue;
  2182. }
  2183. total += n;
  2184. p += sizeof(n);
  2185. hs_ids = g_malloc(n * sizeof(*hs_ids));
  2186. memcpy(hs_ids, p, n * sizeof(*hs_ids));
  2187. p += n * sizeof(*hs_ids);
  2188. hs_flags = g_malloc(n * sizeof(*hs_flags));
  2189. memcpy(hs_flags, p, n * sizeof(*hs_flags));
  2190. /* Skip crc */
  2191. p += n * sizeof(*hs_ids) + sizeof(guint64);
  2192. /* Cleanup */
  2193. if (re_class->hs_scratch != NULL) {
  2194. hs_free_scratch(re_class->hs_scratch);
  2195. }
  2196. if (re_class->hs_db != NULL) {
  2197. rspamd_hyperscan_free(re_class->hs_db, false);
  2198. }
  2199. if (re_class->hs_ids) {
  2200. g_free(re_class->hs_ids);
  2201. }
  2202. re_class->hs_ids = NULL;
  2203. re_class->hs_scratch = NULL;
  2204. re_class->hs_db = NULL;
  2205. munmap(map, st.st_size);
  2206. re_class->hs_db = rspamd_hyperscan_maybe_load(path, p - map);
  2207. if (re_class->hs_db == NULL) {
  2208. if (!try_load) {
  2209. msg_err_re_cache("bad hs database in %s", path);
  2210. }
  2211. else {
  2212. msg_debug_re_cache("bad hs database in %s", path);
  2213. }
  2214. g_free(hs_ids);
  2215. g_free(hs_flags);
  2216. re_class->hs_ids = NULL;
  2217. re_class->hs_scratch = NULL;
  2218. re_class->hs_db = NULL;
  2219. all_valid = FALSE;
  2220. continue;
  2221. }
  2222. if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(re_class->hs_db),
  2223. &re_class->hs_scratch)) != HS_SUCCESS) {
  2224. if (!try_load) {
  2225. msg_err_re_cache("bad hs database in %s; error code: %d", path, ret);
  2226. }
  2227. else {
  2228. msg_debug_re_cache("bad hs database in %s; error code: %d", path, ret);
  2229. }
  2230. g_free(hs_ids);
  2231. g_free(hs_flags);
  2232. rspamd_hyperscan_free(re_class->hs_db, true);
  2233. re_class->hs_ids = NULL;
  2234. re_class->hs_scratch = NULL;
  2235. re_class->hs_db = NULL;
  2236. all_valid = FALSE;
  2237. continue;
  2238. }
  2239. /*
  2240. * Now find hyperscan elts that are successfully compiled and
  2241. * specify that they should be matched using hyperscan
  2242. */
  2243. for (i = 0; i < n; i++) {
  2244. g_assert((gint) cache->re->len > hs_ids[i] && hs_ids[i] >= 0);
  2245. elt = g_ptr_array_index(cache->re, hs_ids[i]);
  2246. if (hs_flags[i] & HS_FLAG_PREFILTER) {
  2247. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE;
  2248. }
  2249. else {
  2250. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN;
  2251. }
  2252. }
  2253. re_class->hs_ids = hs_ids;
  2254. g_free(hs_flags);
  2255. re_class->nhs = n;
  2256. if (!has_valid) {
  2257. has_valid = TRUE;
  2258. all_valid = TRUE;
  2259. }
  2260. }
  2261. else {
  2262. if (!try_load) {
  2263. msg_err_re_cache("invalid hyperscan hash file '%s'",
  2264. path);
  2265. }
  2266. else {
  2267. msg_debug_re_cache("invalid hyperscan hash file '%s'",
  2268. path);
  2269. }
  2270. all_valid = FALSE;
  2271. continue;
  2272. }
  2273. }
  2274. if (has_valid) {
  2275. if (all_valid) {
  2276. msg_info_re_cache("full hyperscan database of %d regexps has been loaded", total);
  2277. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL;
  2278. }
  2279. else {
  2280. msg_info_re_cache("partial hyperscan database of %d regexps has been loaded", total);
  2281. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL;
  2282. }
  2283. }
  2284. else {
  2285. msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions");
  2286. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR;
  2287. }
  2288. return cache->hyperscan_loaded;
  2289. #endif
  2290. }
  2291. void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache,
  2292. const gchar *sname,
  2293. gint ref)
  2294. {
  2295. khiter_t k;
  2296. k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) sname);
  2297. if (k == kh_end(cache->selectors)) {
  2298. gchar *cpy = g_strdup(sname);
  2299. gint res;
  2300. k = kh_put(lua_selectors_hash, cache->selectors, cpy, &res);
  2301. kh_value(cache->selectors, k) = ref;
  2302. }
  2303. else {
  2304. msg_warn_re_cache("replacing selector with name %s", sname);
  2305. if (cache->L) {
  2306. luaL_unref(cache->L, LUA_REGISTRYINDEX, kh_value(cache->selectors, k));
  2307. }
  2308. kh_value(cache->selectors, k) = ref;
  2309. }
  2310. }