You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

re_cache.c 68KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "libmime/message.h"
  17. #include "re_cache.h"
  18. #include "cryptobox.h"
  19. #include "ref.h"
  20. #include "libserver/url.h"
  21. #include "libserver/task.h"
  22. #include "libserver/cfg_file.h"
  23. #include "libutil/util.h"
  24. #include "libutil/regexp.h"
  25. #include "lua/lua_common.h"
  26. #include "libstat/stat_api.h"
  27. #include "contrib/uthash/utlist.h"
  28. #include "lua/lua_classnames.h"
  29. #include "khash.h"
  30. #ifdef WITH_HYPERSCAN
  31. #include "hs.h"
  32. #include "hyperscan_tools.h"
  33. #endif
  34. #include "unix-std.h"
  35. #include <signal.h>
  36. #include <stdalign.h>
  37. #include <math.h>
  38. #include "contrib/libev/ev.h"
  39. #ifndef WITH_PCRE2
  40. #include <pcre.h>
  41. #else
  42. #include <pcre2.h>
  43. #endif
  44. #include "contrib/fastutf8/fastutf8.h"
  45. #ifdef HAVE_SYS_WAIT_H
  46. #include <sys/wait.h>
  47. #endif
  /*
   * Module logging helpers.
   *
   * The msg_{err,warn,info}_re_cache() macros forward to
   * rspamd_default_log_function() with the "re_cache" module name and
   * cache->hash as the log tag, so a variable named `cache` must be in scope
   * wherever they are expanded.
   */
  48. #define msg_err_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
  49. "re_cache", cache->hash, \
  50. RSPAMD_LOG_FUNC, \
  51. __VA_ARGS__)
  52. #define msg_warn_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
  53. "re_cache", cache->hash, \
  54. RSPAMD_LOG_FUNC, \
  55. __VA_ARGS__)
  56. #define msg_info_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
  57. "re_cache", cache->hash, \
  58. RSPAMD_LOG_FUNC, \
  59. __VA_ARGS__)
  /*
   * Debug variants go through rspamd_conditional_debug_fast() using
   * rspamd_re_cache_log_id. msg_debug_re_task() tags records with the task
   * pool uid and therefore needs a variable named `task` in scope;
   * msg_debug_re_cache() tags them with cache->hash.
   */
  60. #define msg_debug_re_task(...) rspamd_conditional_debug_fast(NULL, NULL, \
  61. rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \
  62. RSPAMD_LOG_FUNC, \
  63. __VA_ARGS__)
  64. #define msg_debug_re_cache(...) rspamd_conditional_debug_fast(NULL, NULL, \
  65. rspamd_re_cache_log_id, "re_cache", cache->hash, \
  66. RSPAMD_LOG_FUNC, \
  67. __VA_ARGS__)
  /* Presumably defines rspamd_re_cache_log_id used above -- TODO confirm against the logger header */
  68. INIT_LOG_MODULE(re_cache)
  69. #ifdef WITH_HYPERSCAN
  /* Length in bytes of the rspamd_hs_magic marker (8 bytes as defined below) */
  70. #define RSPAMD_HS_MAGIC_LEN (sizeof(rspamd_hs_magic))
  /*
   * Two 8-byte markers that differ only in the sixth byte ('e' vs 'v').
   * NOTE(review): presumably these prefix serialized hyperscan databases
   * (plain and vectored) -- their use is not visible in this part of the
   * file, confirm before relying on it.
   */
  71. static const unsigned char rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'},
  72. rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'};
  73. #endif
  /*
   * A group of regexps that share the same type and type-specific data.
   * Instances are created lazily by rspamd_re_cache_add() and released by
   * rspamd_re_cache_destroy().
   */
  74. struct rspamd_re_class {
  75. uint64_t id; /* fast hash of type + type_data; also the key in cache->re_classes */
  76. enum rspamd_re_type type; /* regexp type shared by all members */
  77. gboolean has_utf8; /* if there are any utf8 regexps */
  78. gpointer type_data; /* heap copy of the type-specific data, NULL when type_len is 0 */
  79. gsize type_len; /* size of type_data in bytes */
  80. GHashTable *re; /* regexp id -> regexp; the table owns one reference per value */
  81. rspamd_cryptobox_hash_state_t *st; /* transient hash state, allocated and freed inside rspamd_re_cache_init() */
  82. char hash[rspamd_cryptobox_HASHBYTES + 1]; /* textual class digest written by rspamd_re_cache_init() */
  83. #ifdef WITH_HYPERSCAN
  84. rspamd_hyperscan_t *hs_db; /* compiled database handed to hs_scan() */
  85. hs_scratch_t *hs_scratch; /* scratch space handed to hs_scan() */
  86. int *hs_ids; /* g_free()'d on destroy; presumably ids of regexps in hs_db -- TODO confirm */
  87. unsigned int nhs; /* presumably the number of entries in hs_ids -- TODO confirm */
  88. #endif
  89. };
  /*
   * How a cached regexp is matched. Elements are allocated zeroed, so
   * RSPAMD_RE_CACHE_PCRE (0) is the default.
   */
  90. enum rspamd_re_cache_elt_match_type {
  91. RSPAMD_RE_CACHE_PCRE = 0, /* always matched through the PCRE path */
  92. RSPAMD_RE_CACHE_HYPERSCAN, /* the hyperscan callback counts hits directly */
  93. RSPAMD_RE_CACHE_HYPERSCAN_PRE /* presumably: hyperscan match triggers a PCRE re-check -- TODO confirm */
  94. };
  /*
   * One slot of the cache->re array; the slot index is the regexp cache id.
   */
  95. struct rspamd_re_cache_elt {
  96. rspamd_regexp_t *re; /* referenced regexp, released by rspamd_re_cache_elt_dtor() */
  97. int lua_cbref; /* Lua registry reference of the condition callback, -1 when absent */
  98. enum rspamd_re_cache_elt_match_type match_type; /* matching strategy for this regexp */
  99. };
  /* khash map from a selector name (char *) to an int Lua registry reference */
  100. KHASH_INIT(lua_selectors_hash, char *, int, 1, kh_str_hash_func, kh_str_hash_equal);
  /*
   * The regexp cache itself: reference counted, destroyed by
   * rspamd_re_cache_destroy() when the last reference is dropped.
   */
  101. struct rspamd_re_cache {
  102. GHashTable *re_classes; /* class id (uint64_t *) -> struct rspamd_re_class */
  103. GPtrArray *re; /* struct rspamd_re_cache_elt *, indexed by regexp cache id */
  104. khash_t(lua_selectors_hash) * selectors; /* selector name -> Lua ref; keys are g_free()'d on destroy */
  105. ref_entry_t ref; /* reference counter set up by REF_INIT_RETAIN() */
  106. unsigned int nre; /* number of regexps added; also the next cache id to hand out */
  107. unsigned int max_re_data; /* PCRE scan length limit in bytes, 0 means unlimited */
  108. char hash[rspamd_cryptobox_HASHBYTES + 1]; /* textual digest over all regexps, written by rspamd_re_cache_init() */
  109. lua_State *L; /* Lua state taken from the config in rspamd_re_cache_init(); NULL before that */
  110. #ifdef WITH_HYPERSCAN
  111. enum rspamd_hyperscan_status hyperscan_loaded; /* starts as RSPAMD_HYPERSCAN_UNKNOWN */
  112. gboolean disable_hyperscan; /* copied from cfg->disable_hyperscan */
  113. hs_platform_info_t plt; /* filled by hs_populate_platform() */
  114. #endif
  115. };
  /*
   * Result of evaluating one selector.
   * NOTE(review): the producer is not visible in this part of the file;
   * scvec/lenvec look like parallel arrays of cnt entries -- confirm.
   */
  116. struct rspamd_re_selector_result {
  117. unsigned char **scvec; /* presumably the extracted byte strings */
  118. unsigned int *lenvec; /* presumably the length of each scvec entry */
  119. unsigned int cnt; /* presumably the number of entries in both arrays */
  120. };
  /* khash map from an int key to a struct rspamd_re_selector_result stored by value */
  121. KHASH_INIT(selectors_results_hash, int, struct rspamd_re_selector_result, 1,
  122. kh_int_hash_func, kh_int_hash_equal);
  /*
   * Per-scan state. rspamd_re_cache_runtime_new() allocates this structure
   * together with the checked bitmap and the results array in one block.
   */
  123. struct rspamd_re_runtime {
  124. unsigned char *checked; /* bitmap, one bit per regexp cache id */
  125. unsigned char *results; /* one byte per regexp cache id holding its hit count */
  126. khash_t(selectors_results_hash) * sel_cache; /* selector results; populated outside the visible code */
  127. struct rspamd_re_cache *cache; /* owning cache, retained by the runtime */
  128. struct rspamd_re_cache_stat stat; /* scan statistics exposed by rspamd_re_cache_get_stat() */
  129. gboolean has_hs; /* copied from cache->hyperscan_loaded when built with hyperscan */
  130. };
  131. static GQuark
  132. rspamd_re_cache_quark(void)
  133. {
  134. return g_quark_from_static_string("re_cache");
  135. }
  136. static uint64_t
  137. rspamd_re_cache_class_id(enum rspamd_re_type type,
  138. gconstpointer type_data,
  139. gsize datalen)
  140. {
  141. rspamd_cryptobox_fast_hash_state_t st;
  142. rspamd_cryptobox_fast_hash_init(&st, 0xdeadbabe);
  143. rspamd_cryptobox_fast_hash_update(&st, &type, sizeof(type));
  144. if (datalen > 0) {
  145. rspamd_cryptobox_fast_hash_update(&st, type_data, datalen);
  146. }
  147. return rspamd_cryptobox_fast_hash_final(&st);
  148. }
  149. static void
  150. rspamd_re_cache_destroy(struct rspamd_re_cache *cache)
  151. {
  152. GHashTableIter it;
  153. gpointer k, v;
  154. struct rspamd_re_class *re_class;
  155. char *skey;
  156. int sref;
  157. g_assert(cache != NULL);
  158. g_hash_table_iter_init(&it, cache->re_classes);
  159. while (g_hash_table_iter_next(&it, &k, &v)) {
  160. re_class = v;
  161. g_hash_table_iter_steal(&it);
  162. g_hash_table_unref(re_class->re);
  163. if (re_class->type_data) {
  164. g_free(re_class->type_data);
  165. }
  166. #ifdef WITH_HYPERSCAN
  167. if (re_class->hs_db) {
  168. rspamd_hyperscan_free(re_class->hs_db, false);
  169. }
  170. if (re_class->hs_scratch) {
  171. hs_free_scratch(re_class->hs_scratch);
  172. }
  173. if (re_class->hs_ids) {
  174. g_free(re_class->hs_ids);
  175. }
  176. #endif
  177. g_free(re_class);
  178. }
  179. if (cache->L) {
  180. kh_foreach(cache->selectors, skey, sref, {
  181. luaL_unref(cache->L, LUA_REGISTRYINDEX, sref);
  182. g_free(skey);
  183. });
  184. struct rspamd_re_cache_elt *elt;
  185. unsigned int i;
  186. PTR_ARRAY_FOREACH(cache->re, i, elt)
  187. {
  188. if (elt->lua_cbref != -1) {
  189. luaL_unref(cache->L, LUA_REGISTRYINDEX, elt->lua_cbref);
  190. }
  191. }
  192. }
  193. kh_destroy(lua_selectors_hash, cache->selectors);
  194. g_hash_table_unref(cache->re_classes);
  195. g_ptr_array_free(cache->re, TRUE);
  196. g_free(cache);
  197. }
  198. static void
  199. rspamd_re_cache_elt_dtor(gpointer e)
  200. {
  201. struct rspamd_re_cache_elt *elt = e;
  202. rspamd_regexp_unref(elt->re);
  203. g_free(elt);
  204. }
  205. struct rspamd_re_cache *
  206. rspamd_re_cache_new(void)
  207. {
  208. struct rspamd_re_cache *cache;
  209. cache = g_malloc0(sizeof(*cache));
  210. cache->re_classes = g_hash_table_new(g_int64_hash, g_int64_equal);
  211. cache->nre = 0;
  212. cache->re = g_ptr_array_new_full(256, rspamd_re_cache_elt_dtor);
  213. cache->selectors = kh_init(lua_selectors_hash);
  214. #ifdef WITH_HYPERSCAN
  215. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN;
  216. #endif
  217. REF_INIT_RETAIN(cache, rspamd_re_cache_destroy);
  218. return cache;
  219. }
  220. enum rspamd_hyperscan_status
  221. rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache)
  222. {
  223. g_assert(cache != NULL);
  224. #ifdef WITH_HYPERSCAN
  225. return cache->hyperscan_loaded;
  226. #else
  227. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  228. #endif
  229. }
  230. rspamd_regexp_t *
  231. rspamd_re_cache_add(struct rspamd_re_cache *cache,
  232. rspamd_regexp_t *re,
  233. enum rspamd_re_type type,
  234. gconstpointer type_data, gsize datalen,
  235. int lua_cbref)
  236. {
  237. uint64_t class_id;
  238. struct rspamd_re_class *re_class;
  239. rspamd_regexp_t *nre;
  240. struct rspamd_re_cache_elt *elt;
  241. g_assert(cache != NULL);
  242. g_assert(re != NULL);
  243. class_id = rspamd_re_cache_class_id(type, type_data, datalen);
  244. re_class = g_hash_table_lookup(cache->re_classes, &class_id);
  245. if (re_class == NULL) {
  246. re_class = g_malloc0(sizeof(*re_class));
  247. re_class->id = class_id;
  248. re_class->type_len = datalen;
  249. re_class->type = type;
  250. re_class->re = g_hash_table_new_full(rspamd_regexp_hash,
  251. rspamd_regexp_equal, NULL, (GDestroyNotify) rspamd_regexp_unref);
  252. if (datalen > 0) {
  253. re_class->type_data = g_malloc0(datalen);
  254. memcpy(re_class->type_data, type_data, datalen);
  255. }
  256. g_hash_table_insert(cache->re_classes, &re_class->id, re_class);
  257. }
  258. if ((nre = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(re))) == NULL) {
  259. /*
  260. * We set re id based on the global position in the cache
  261. */
  262. elt = g_malloc0(sizeof(*elt));
  263. /* One ref for re_class */
  264. nre = rspamd_regexp_ref(re);
  265. rspamd_regexp_set_cache_id(re, cache->nre++);
  266. /* One ref for cache */
  267. elt->re = rspamd_regexp_ref(re);
  268. g_ptr_array_add(cache->re, elt);
  269. rspamd_regexp_set_class(re, re_class);
  270. elt->lua_cbref = lua_cbref;
  271. g_hash_table_insert(re_class->re, rspamd_regexp_get_id(nre), nre);
  272. }
  273. if (rspamd_regexp_get_flags(re) & RSPAMD_REGEXP_FLAG_UTF) {
  274. re_class->has_utf8 = TRUE;
  275. }
  276. return nre;
  277. }
  /*
   * Replaces the cached regexp `what` with `with`, keeping the cache id and
   * the class of the original. Does nothing when `what` is not bound to a
   * class. All three arguments must be non-NULL.
   */
  278. void rspamd_re_cache_replace(struct rspamd_re_cache *cache,
  279. rspamd_regexp_t *what,
  280. rspamd_regexp_t *with)
  281. {
  282. uint64_t re_id;
  283. struct rspamd_re_class *re_class;
  284. rspamd_regexp_t *src;
  285. struct rspamd_re_cache_elt *elt;
  286. g_assert(cache != NULL);
  287. g_assert(what != NULL);
  288. g_assert(with != NULL);
  289. re_class = rspamd_regexp_get_class(what);
  290. if (re_class != NULL) {
  291. re_id = rspamd_regexp_get_cache_id(what);
  292. g_assert(re_id != RSPAMD_INVALID_ID);
  /* src is looked up only to assert that `what` is really registered in its class */
  293. src = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(what));
  294. elt = g_ptr_array_index(cache->re, re_id);
  295. g_assert(elt != NULL);
  296. g_assert(src != NULL);
  /* Move the cache id and the class binding from the old regexp to the new one */
  297. rspamd_regexp_set_cache_id(what, RSPAMD_INVALID_ID);
  298. rspamd_regexp_set_class(what, NULL);
  299. rspamd_regexp_set_cache_id(with, re_id);
  300. rspamd_regexp_set_class(with, re_class);
  301. /*
  302. * On calling of this function, we actually unref old re (what)
  303. */
  /*
   * NOTE(review): the new value is stored under the id of `what`, not of
   * `with`. The class table has no key destructor, so with GLib's insert
   * semantics the previously stored key pointer stays in the table while the
   * old value is released through rspamd_regexp_unref(). If that key points
   * into the old regexp, this relies on the old regexp outliving the table
   * entry -- confirm with the callers.
   */
  304. g_hash_table_insert(re_class->re,
  305. rspamd_regexp_get_id(what),
  306. rspamd_regexp_ref(with));
  /* Swap the reference held by the cache array slot */
  307. rspamd_regexp_unref(elt->re);
  308. elt->re = rspamd_regexp_ref(with);
  309. /* XXX: do not touch match type here */
  310. }
  311. }
  312. static int
  313. rspamd_re_cache_sort_func(gconstpointer a, gconstpointer b)
  314. {
  315. struct rspamd_re_cache_elt *const *re1 = a, *const *re2 = b;
  316. return rspamd_regexp_cmp(rspamd_regexp_get_id((*re1)->re),
  317. rspamd_regexp_get_id((*re2)->re));
  318. }
  319. void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *cfg)
  320. {
  321. unsigned int i, fl;
  322. GHashTableIter it;
  323. gpointer k, v;
  324. struct rspamd_re_class *re_class;
  325. rspamd_cryptobox_hash_state_t st_global;
  326. rspamd_regexp_t *re;
  327. struct rspamd_re_cache_elt *elt;
  328. unsigned char hash_out[rspamd_cryptobox_HASHBYTES];
  329. g_assert(cache != NULL);
  330. rspamd_cryptobox_hash_init(&st_global, NULL, 0);
  331. /* Resort all regexps */
  332. g_ptr_array_sort(cache->re, rspamd_re_cache_sort_func);
  333. for (i = 0; i < cache->re->len; i++) {
  334. elt = g_ptr_array_index(cache->re, i);
  335. re = elt->re;
  336. re_class = rspamd_regexp_get_class(re);
  337. g_assert(re_class != NULL);
  338. rspamd_regexp_set_cache_id(re, i);
  339. if (re_class->st == NULL) {
  340. (void) !posix_memalign((void **) &re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t),
  341. sizeof(*re_class->st));
  342. g_assert(re_class->st != NULL);
  343. rspamd_cryptobox_hash_init(re_class->st, NULL, 0);
  344. }
  345. /* Update hashes */
  346. /* Id of re class */
  347. rspamd_cryptobox_hash_update(re_class->st, (gpointer) &re_class->id,
  348. sizeof(re_class->id));
  349. rspamd_cryptobox_hash_update(&st_global, (gpointer) &re_class->id,
  350. sizeof(re_class->id));
  351. /* Id of re expression */
  352. rspamd_cryptobox_hash_update(re_class->st, rspamd_regexp_get_id(re),
  353. rspamd_cryptobox_HASHBYTES);
  354. rspamd_cryptobox_hash_update(&st_global, rspamd_regexp_get_id(re),
  355. rspamd_cryptobox_HASHBYTES);
  356. /* PCRE flags */
  357. fl = rspamd_regexp_get_pcre_flags(re);
  358. rspamd_cryptobox_hash_update(re_class->st, (const unsigned char *) &fl,
  359. sizeof(fl));
  360. rspamd_cryptobox_hash_update(&st_global, (const unsigned char *) &fl,
  361. sizeof(fl));
  362. /* Rspamd flags */
  363. fl = rspamd_regexp_get_flags(re);
  364. rspamd_cryptobox_hash_update(re_class->st, (const unsigned char *) &fl,
  365. sizeof(fl));
  366. rspamd_cryptobox_hash_update(&st_global, (const unsigned char *) &fl,
  367. sizeof(fl));
  368. /* Limit of hits */
  369. fl = rspamd_regexp_get_maxhits(re);
  370. rspamd_cryptobox_hash_update(re_class->st, (const unsigned char *) &fl,
  371. sizeof(fl));
  372. rspamd_cryptobox_hash_update(&st_global, (const unsigned char *) &fl,
  373. sizeof(fl));
  374. /* Numeric order */
  375. rspamd_cryptobox_hash_update(re_class->st, (const unsigned char *) &i,
  376. sizeof(i));
  377. rspamd_cryptobox_hash_update(&st_global, (const unsigned char *) &i,
  378. sizeof(i));
  379. }
  380. rspamd_cryptobox_hash_final(&st_global, hash_out);
  381. rspamd_snprintf(cache->hash, sizeof(cache->hash), "%*xs",
  382. (int) rspamd_cryptobox_HASHBYTES, hash_out);
  383. /* Now finalize all classes */
  384. g_hash_table_iter_init(&it, cache->re_classes);
  385. while (g_hash_table_iter_next(&it, &k, &v)) {
  386. re_class = v;
  387. if (re_class->st) {
  388. /*
  389. * We finally update all classes with the number of expressions
  390. * in the cache to ensure that if even a single re has been changed
  391. * we won't be broken due to id mismatch
  392. */
  393. rspamd_cryptobox_hash_update(re_class->st,
  394. (gpointer) &cache->re->len,
  395. sizeof(cache->re->len));
  396. rspamd_cryptobox_hash_final(re_class->st, hash_out);
  397. rspamd_snprintf(re_class->hash, sizeof(re_class->hash), "%*xs",
  398. (int) rspamd_cryptobox_HASHBYTES, hash_out);
  399. free(re_class->st); /* Due to posix_memalign */
  400. re_class->st = NULL;
  401. }
  402. }
  403. cache->L = cfg->lua_state;
  404. #ifdef WITH_HYPERSCAN
  405. const char *platform = "generic";
  406. rspamd_fstring_t *features = rspamd_fstring_new();
  407. cache->disable_hyperscan = cfg->disable_hyperscan;
  408. g_assert(hs_populate_platform(&cache->plt) == HS_SUCCESS);
  409. /* Now decode what we do have */
  410. switch (cache->plt.tune) {
  411. case HS_TUNE_FAMILY_HSW:
  412. platform = "haswell";
  413. break;
  414. case HS_TUNE_FAMILY_SNB:
  415. platform = "sandy";
  416. break;
  417. case HS_TUNE_FAMILY_BDW:
  418. platform = "broadwell";
  419. break;
  420. case HS_TUNE_FAMILY_IVB:
  421. platform = "ivy";
  422. break;
  423. default:
  424. break;
  425. }
  426. if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) {
  427. features = rspamd_fstring_append(features, "AVX2", 4);
  428. }
  429. hs_set_allocator(g_malloc, g_free);
  430. msg_info_re_cache("loaded hyperscan engine with cpu tune '%s' and features '%V'",
  431. platform, features);
  432. rspamd_fstring_free(features);
  433. #endif
  434. }
  435. struct rspamd_re_runtime *
  436. rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache)
  437. {
  438. struct rspamd_re_runtime *rt;
  439. g_assert(cache != NULL);
  440. rt = g_malloc0(sizeof(*rt) + NBYTES(cache->nre) + cache->nre);
  441. rt->cache = cache;
  442. REF_RETAIN(cache);
  443. rt->checked = ((unsigned char *) rt) + sizeof(*rt);
  444. rt->results = rt->checked + NBYTES(cache->nre);
  445. rt->stat.regexp_total = cache->nre;
  446. #ifdef WITH_HYPERSCAN
  447. rt->has_hs = cache->hyperscan_loaded;
  448. #endif
  449. return rt;
  450. }
  451. const struct rspamd_re_cache_stat *
  452. rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt)
  453. {
  454. g_assert(rt != NULL);
  455. return &rt->stat;
  456. }
  457. static gboolean
  458. rspamd_re_cache_check_lua_condition(struct rspamd_task *task,
  459. rspamd_regexp_t *re,
  460. const unsigned char *in, gsize len,
  461. goffset start, goffset end,
  462. int lua_cbref)
  463. {
  464. lua_State *L = (lua_State *) task->cfg->lua_state;
  465. GError *err = NULL;
  466. struct rspamd_lua_text __attribute__((unused)) * t;
  467. int text_pos;
  468. if (G_LIKELY(lua_cbref == -1)) {
  469. return TRUE;
  470. }
  471. t = lua_new_text(L, in, len, FALSE);
  472. text_pos = lua_gettop(L);
  473. if (!rspamd_lua_universal_pcall(L, lua_cbref,
  474. G_STRLOC, 1, "utii", &err,
  475. rspamd_task_classname, task,
  476. text_pos, start, end)) {
  477. msg_warn_task("cannot call for re_cache_check_lua_condition for re %s: %e",
  478. rspamd_regexp_get_pattern(re), err);
  479. g_error_free(err);
  480. lua_settop(L, text_pos - 1);
  481. return TRUE;
  482. }
  483. gboolean res = lua_toboolean(L, -1);
  484. lua_settop(L, text_pos - 1);
  485. return res;
  486. }
  487. static unsigned int
  488. rspamd_re_cache_process_pcre(struct rspamd_re_runtime *rt,
  489. rspamd_regexp_t *re, struct rspamd_task *task,
  490. const unsigned char *in, gsize len,
  491. gboolean is_raw,
  492. int lua_cbref)
  493. {
  494. unsigned int r = 0;
  495. const char *start = NULL, *end = NULL;
  496. unsigned int max_hits = rspamd_regexp_get_maxhits(re);
  497. uint64_t id = rspamd_regexp_get_cache_id(re);
  498. double t1 = NAN, t2, pr;
  499. const double slow_time = 1e8;
  500. if (in == NULL) {
  501. return rt->results[id];
  502. }
  503. if (len == 0) {
  504. return rt->results[id];
  505. }
  506. if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) {
  507. len = rt->cache->max_re_data;
  508. }
  509. r = rt->results[id];
  510. if (max_hits == 0 || r < max_hits) {
  511. pr = rspamd_random_double_fast();
  512. if (pr > 0.9) {
  513. t1 = rspamd_get_ticks(TRUE);
  514. }
  515. while (rspamd_regexp_search(re,
  516. in,
  517. len,
  518. &start,
  519. &end,
  520. is_raw,
  521. NULL)) {
  522. if (rspamd_re_cache_check_lua_condition(task, re, in, len,
  523. start - (const char *) in, end - (const char *) in, lua_cbref)) {
  524. r++;
  525. msg_debug_re_task("found regexp /%s/, total hits: %d",
  526. rspamd_regexp_get_pattern(re), r);
  527. }
  528. if (max_hits > 0 && r >= max_hits) {
  529. break;
  530. }
  531. if (start >= end) {
  532. /* We found all matches, so no more hits are possible (protect from empty patterns) */
  533. break;
  534. }
  535. }
  536. rt->results[id] += r;
  537. rt->stat.regexp_checked++;
  538. rt->stat.bytes_scanned_pcre += len;
  539. rt->stat.bytes_scanned += len;
  540. if (r > 0) {
  541. rt->stat.regexp_matched += r;
  542. }
  543. if (!isnan(t1)) {
  544. t2 = rspamd_get_ticks(TRUE);
  545. if (t2 - t1 > slow_time) {
  546. rspamd_symcache_enable_profile(task);
  547. msg_info_task("regexp '%16s' took %.0f ticks to execute",
  548. rspamd_regexp_get_pattern(re), t2 - t1);
  549. }
  550. }
  551. }
  552. return r;
  553. }
  554. #ifdef WITH_HYPERSCAN
  /*
   * User data passed to rspamd_re_cache_hyperscan_cb() through hs_scan().
   */
  555. struct rspamd_re_hyperscan_cbdata {
  556. struct rspamd_re_runtime *rt; /* runtime receiving results and statistics */
  557. const unsigned char **ins; /* input buffers being scanned */
  558. const unsigned int *lens; /* length of each entry of ins */
  559. unsigned int count; /* number of entries in ins/lens */
  560. rspamd_regexp_t *re; /* regexp that triggered the scan; not read by the callback */
  561. struct rspamd_task *task; /* current task */
  562. };
  563. static int
  564. rspamd_re_cache_hyperscan_cb(unsigned int id,
  565. unsigned long long from,
  566. unsigned long long to,
  567. unsigned int flags,
  568. void *ud)
  569. {
  570. struct rspamd_re_hyperscan_cbdata *cbdata = ud;
  571. struct rspamd_re_runtime *rt;
  572. struct rspamd_re_cache_elt *cache_elt;
  573. unsigned int ret, maxhits, i, processed;
  574. struct rspamd_task *task;
  575. rt = cbdata->rt;
  576. task = cbdata->task;
  577. cache_elt = g_ptr_array_index(rt->cache->re, id);
  578. maxhits = rspamd_regexp_get_maxhits(cache_elt->re);
  579. if (cache_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) {
  580. if (rspamd_re_cache_check_lua_condition(task, cache_elt->re,
  581. cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) {
  582. ret = 1;
  583. setbit(rt->checked, id);
  584. if (maxhits == 0 || rt->results[id] < maxhits) {
  585. rt->results[id] += ret;
  586. rt->stat.regexp_matched++;
  587. }
  588. msg_debug_re_task("found regexp /%s/ using hyperscan only, total hits: %d",
  589. rspamd_regexp_get_pattern(cache_elt->re), rt->results[id]);
  590. }
  591. }
  592. else {
  593. if (!isset(rt->checked, id)) {
  594. processed = 0;
  595. for (i = 0; i < cbdata->count; i++) {
  596. rspamd_re_cache_process_pcre(rt,
  597. cache_elt->re,
  598. cbdata->task,
  599. cbdata->ins[i],
  600. cbdata->lens[i],
  601. FALSE,
  602. cache_elt->lua_cbref);
  603. setbit(rt->checked, id);
  604. processed += cbdata->lens[i];
  605. if (processed >= to) {
  606. break;
  607. }
  608. }
  609. }
  610. }
  611. return 0;
  612. }
  613. #endif
  614. static unsigned int
  615. rspamd_re_cache_process_regexp_data(struct rspamd_re_runtime *rt,
  616. rspamd_regexp_t *re, struct rspamd_task *task,
  617. const unsigned char **in, unsigned int *lens,
  618. unsigned int count,
  619. gboolean is_raw,
  620. gboolean *processed_hyperscan)
  621. {
  622. uint64_t re_id;
  623. unsigned int ret = 0;
  624. unsigned int i;
  625. struct rspamd_re_cache_elt *cache_elt;
  626. re_id = rspamd_regexp_get_cache_id(re);
  627. if (count == 0 || in == NULL) {
  628. /* We assume this as absence of the specified data */
  629. setbit(rt->checked, re_id);
  630. rt->results[re_id] = ret;
  631. return ret;
  632. }
  633. cache_elt = (struct rspamd_re_cache_elt *) g_ptr_array_index(rt->cache->re, re_id);
  634. #ifndef WITH_HYPERSCAN
  635. for (i = 0; i < count; i++) {
  636. ret = rspamd_re_cache_process_pcre(rt,
  637. re,
  638. task,
  639. in[i],
  640. lens[i],
  641. is_raw,
  642. cache_elt->lua_cbref);
  643. rt->results[re_id] = ret;
  644. }
  645. setbit(rt->checked, re_id);
  646. #else
  647. struct rspamd_re_class *re_class;
  648. struct rspamd_re_hyperscan_cbdata cbdata;
  649. cache_elt = g_ptr_array_index(rt->cache->re, re_id);
  650. re_class = rspamd_regexp_get_class(re);
  651. if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE ||
  652. !rt->has_hs || (is_raw && re_class->has_utf8)) {
  653. for (i = 0; i < count; i++) {
  654. ret = rspamd_re_cache_process_pcre(rt,
  655. re,
  656. task,
  657. in[i],
  658. lens[i],
  659. is_raw,
  660. cache_elt->lua_cbref);
  661. }
  662. setbit(rt->checked, re_id);
  663. }
  664. else {
  665. for (i = 0; i < count; i++) {
  666. /* For Hyperscan we can probably safely disable all those limits */
  667. #if 0
  668. if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) {
  669. lens[i] = rt->cache->max_re_data;
  670. }
  671. #endif
  672. rt->stat.bytes_scanned += lens[i];
  673. }
  674. g_assert(re_class->hs_scratch != NULL);
  675. g_assert(re_class->hs_db != NULL);
  676. /* Go through hyperscan API */
  677. for (i = 0; i < count; i++) {
  678. cbdata.ins = &in[i];
  679. cbdata.re = re;
  680. cbdata.rt = rt;
  681. cbdata.lens = &lens[i];
  682. cbdata.count = 1;
  683. cbdata.task = task;
  684. if ((hs_scan(rspamd_hyperscan_get_database(re_class->hs_db),
  685. in[i], lens[i], 0,
  686. re_class->hs_scratch,
  687. rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
  688. ret = 0;
  689. }
  690. else {
  691. ret = rt->results[re_id];
  692. *processed_hyperscan = TRUE;
  693. }
  694. }
  695. }
  696. #endif
  697. return ret;
  698. }
  /*
   * Finalises a hyperscan pass over a regexp class: every regexp id listed in
   * `re_class->hs_ids` that has not been marked as checked yet gets a zero
   * result and is marked as checked. Ids that were already checked are only
   * counted for the debug message. Compiles to an empty body without hyperscan.
   */
  static void
  rspamd_re_cache_finish_class(struct rspamd_task *task,
                               struct rspamd_re_runtime *rt,
                               struct rspamd_re_class *re_class,
                               const char *class_name)
  {
  #ifdef WITH_HYPERSCAN
      unsigned int already_checked = 0;

      for (unsigned int pos = 0; pos < re_class->nhs; pos++) {
          const uint64_t id = re_class->hs_ids[pos];

          if (isset(rt->checked, id)) {
              already_checked++;
              continue;
          }

          /* Nothing has touched this regexp: its counter must still be zero */
          g_assert(rt->results[id] == 0);
          rt->results[id] = 0;
          setbit(rt->checked, id);
      }

      msg_debug_re_task("finished hyperscan for class %s; %d "
                        "matches found; %d hyperscan supported regexps; %d total regexps",
                        class_name, already_checked, re_class->nhs,
                        (int) g_hash_table_size(re_class->re));
  #endif
  }
  726. static gboolean
  727. rspamd_re_cache_process_selector(struct rspamd_task *task,
  728. struct rspamd_re_runtime *rt,
  729. const char *name,
  730. unsigned char ***svec,
  731. unsigned int **lenvec,
  732. unsigned int *n)
  733. {
  734. int ref;
  735. khiter_t k;
  736. lua_State *L;
  737. int err_idx, ret;
  738. struct rspamd_task **ptask;
  739. gboolean result = FALSE;
  740. struct rspamd_re_cache *cache = rt->cache;
  741. struct rspamd_re_selector_result *sr;
  742. L = cache->L;
  743. k = kh_get(lua_selectors_hash, cache->selectors, (char *) name);
  744. if (k == kh_end(cache->selectors)) {
  745. msg_err_task("cannot find selector %s, not registered", name);
  746. return FALSE;
  747. }
  748. ref = kh_value(cache->selectors, k);
  749. /* First, search for the cached result */
  750. if (rt->sel_cache) {
  751. k = kh_get(selectors_results_hash, rt->sel_cache, ref);
  752. if (k != kh_end(rt->sel_cache)) {
  753. sr = &kh_value(rt->sel_cache, k);
  754. *svec = sr->scvec;
  755. *lenvec = sr->lenvec;
  756. *n = sr->cnt;
  757. return TRUE;
  758. }
  759. }
  760. else {
  761. rt->sel_cache = kh_init(selectors_results_hash);
  762. }
  763. lua_pushcfunction(L, &rspamd_lua_traceback);
  764. err_idx = lua_gettop(L);
  765. lua_rawgeti(L, LUA_REGISTRYINDEX, ref);
  766. ptask = lua_newuserdata(L, sizeof(*ptask));
  767. *ptask = task;
  768. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  769. if ((ret = lua_pcall(L, 1, 1, err_idx)) != 0) {
  770. msg_err_task("call to selector %s "
  771. "failed (%d): %s",
  772. name, ret,
  773. lua_tostring(L, -1));
  774. }
  775. else {
  776. struct rspamd_lua_text *txt;
  777. gsize slen;
  778. const char *sel_data;
  779. if (lua_type(L, -1) != LUA_TTABLE) {
  780. txt = lua_check_text_or_string(L, -1);
  781. if (txt) {
  782. msg_debug_re_cache("re selector %s returned 1 element", name);
  783. sel_data = txt->start;
  784. slen = txt->len;
  785. *n = 1;
  786. *svec = g_malloc(sizeof(unsigned char *));
  787. *lenvec = g_malloc(sizeof(unsigned int));
  788. (*svec)[0] = g_malloc(slen);
  789. memcpy((*svec)[0], sel_data, slen);
  790. (*lenvec)[0] = slen;
  791. result = TRUE;
  792. }
  793. else {
  794. msg_debug_re_cache("re selector %s returned NULL", name);
  795. }
  796. }
  797. else {
  798. *n = rspamd_lua_table_size(L, -1);
  799. msg_debug_re_cache("re selector %s returned %d elements", name, *n);
  800. if (*n > 0) {
  801. *svec = g_malloc(sizeof(unsigned char *) * (*n));
  802. *lenvec = g_malloc(sizeof(unsigned int) * (*n));
  803. for (int i = 0; i < *n; i++) {
  804. lua_rawgeti(L, -1, i + 1);
  805. txt = lua_check_text_or_string(L, -1);
  806. if (txt && txt->len > 0) {
  807. sel_data = txt->start;
  808. slen = txt->len;
  809. (*svec)[i] = g_malloc(slen);
  810. memcpy((*svec)[i], sel_data, slen);
  811. }
  812. else {
  813. /* A hack to avoid malloc(0) */
  814. sel_data = "";
  815. slen = 0;
  816. (*svec)[i] = g_malloc(1);
  817. memcpy((*svec)[i], sel_data, 1);
  818. }
  819. (*lenvec)[i] = slen;
  820. lua_pop(L, 1);
  821. }
  822. }
  823. /* Empty table is also a valid result */
  824. result = TRUE;
  825. }
  826. }
  827. lua_settop(L, err_idx - 1);
  828. if (result) {
  829. k = kh_put(selectors_results_hash, rt->sel_cache, ref, &ret);
  830. sr = &kh_value(rt->sel_cache, k);
  831. sr->cnt = *n;
  832. sr->scvec = *svec;
  833. sr->lenvec = *lenvec;
  834. }
  835. return result;
  836. }
  837. static inline unsigned int
  838. rspamd_process_words_vector(GArray *words,
  839. const unsigned char **scvec,
  840. unsigned int *lenvec,
  841. struct rspamd_re_class *re_class,
  842. unsigned int cnt,
  843. gboolean *raw)
  844. {
  845. unsigned int j;
  846. rspamd_stat_token_t *tok;
  847. if (words) {
  848. for (j = 0; j < words->len; j++) {
  849. tok = &g_array_index(words, rspamd_stat_token_t, j);
  850. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
  851. if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
  852. if (!re_class->has_utf8) {
  853. *raw = TRUE;
  854. }
  855. else {
  856. continue; /* Skip */
  857. }
  858. }
  859. }
  860. else {
  861. continue; /* Skip non text */
  862. }
  863. if (re_class->type == RSPAMD_RE_RAWWORDS) {
  864. if (tok->original.len > 0) {
  865. scvec[cnt] = tok->original.begin;
  866. lenvec[cnt++] = tok->original.len;
  867. }
  868. }
  869. else if (re_class->type == RSPAMD_RE_WORDS) {
  870. if (tok->normalized.len > 0) {
  871. scvec[cnt] = tok->normalized.begin;
  872. lenvec[cnt++] = tok->normalized.len;
  873. }
  874. }
  875. else {
  876. /* Stemmed words */
  877. if (tok->stemmed.len > 0) {
  878. scvec[cnt] = tok->stemmed.begin;
  879. lenvec[cnt++] = tok->stemmed.len;
  880. }
  881. }
  882. }
  883. }
  884. return cnt;
  885. }
  886. static unsigned int
  887. rspamd_re_cache_process_headers_list(struct rspamd_task *task,
  888. struct rspamd_re_runtime *rt,
  889. rspamd_regexp_t *re,
  890. struct rspamd_re_class *re_class,
  891. struct rspamd_mime_header *rh,
  892. gboolean is_strong,
  893. gboolean *processed_hyperscan)
  894. {
  895. const unsigned char **scvec, *in;
  896. gboolean raw = FALSE;
  897. unsigned int *lenvec;
  898. struct rspamd_mime_header *cur;
  899. unsigned int cnt = 0, i = 0, ret = 0;
  900. DL_COUNT(rh, cur, cnt);
  901. scvec = g_malloc(sizeof(*scvec) * cnt);
  902. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  903. DL_FOREACH(rh, cur)
  904. {
  905. if (is_strong && strcmp(cur->name, re_class->type_data) != 0) {
  906. /* Skip a different case */
  907. continue;
  908. }
  909. if (re_class->type == RSPAMD_RE_RAWHEADER) {
  910. in = (const unsigned char *) cur->value;
  911. lenvec[i] = strlen(cur->value);
  912. if (rspamd_fast_utf8_validate(in, lenvec[i]) != 0) {
  913. raw = TRUE;
  914. }
  915. }
  916. else {
  917. in = (const unsigned char *) cur->decoded;
  918. /* Validate input^W^WNo need to validate as it is already valid */
  919. if (!in) {
  920. lenvec[i] = 0;
  921. scvec[i] = (unsigned char *) "";
  922. continue;
  923. }
  924. lenvec[i] = strlen(in);
  925. }
  926. scvec[i] = in;
  927. i++;
  928. }
  929. if (i > 0) {
  930. ret = rspamd_re_cache_process_regexp_data(rt, re,
  931. task, scvec, lenvec, i, raw, processed_hyperscan);
  932. msg_debug_re_task("checking header %s regexp: %s=%*s -> %d",
  933. re_class->type_data,
  934. rspamd_regexp_get_pattern(re),
  935. (int) lenvec[0], scvec[0], ret);
  936. }
  937. g_free(scvec);
  938. g_free(lenvec);
  939. return ret;
  940. }
  941. /*
  942. * Calculates the specified regexp for the specified class if it's not calculated
  943. */
  944. static unsigned int
  945. rspamd_re_cache_exec_re(struct rspamd_task *task,
  946. struct rspamd_re_runtime *rt,
  947. rspamd_regexp_t *re,
  948. struct rspamd_re_class *re_class,
  949. gboolean is_strong)
  950. {
  951. unsigned int ret = 0, i, re_id;
  952. struct rspamd_mime_header *rh;
  953. const char *in;
  954. const unsigned char **scvec = NULL;
  955. unsigned int *lenvec = NULL;
  956. gboolean raw = FALSE, processed_hyperscan = FALSE;
  957. struct rspamd_mime_text_part *text_part;
  958. struct rspamd_mime_part *mime_part;
  959. struct rspamd_url *url;
  960. unsigned int len = 0, cnt = 0;
  961. const char *class_name;
  962. class_name = rspamd_re_cache_type_to_string(re_class->type);
  963. msg_debug_re_task("start check re type: %s: /%s/",
  964. class_name,
  965. rspamd_regexp_get_pattern(re));
  966. re_id = rspamd_regexp_get_cache_id(re);
  967. switch (re_class->type) {
  968. case RSPAMD_RE_HEADER:
  969. case RSPAMD_RE_RAWHEADER:
  970. /* Get list of specified headers */
  971. rh = rspamd_message_get_header_array(task,
  972. re_class->type_data, FALSE);
  973. if (rh) {
  974. ret = rspamd_re_cache_process_headers_list(task, rt, re,
  975. re_class, rh, is_strong, &processed_hyperscan);
  976. msg_debug_re_task("checked header(%s) regexp: %s -> %d",
  977. (const char *) re_class->type_data,
  978. rspamd_regexp_get_pattern(re),
  979. ret);
  980. }
  981. break;
  982. case RSPAMD_RE_ALLHEADER:
  983. raw = TRUE;
  984. in = MESSAGE_FIELD(task, raw_headers_content).begin;
  985. len = MESSAGE_FIELD(task, raw_headers_content).len;
  986. ret = rspamd_re_cache_process_regexp_data(rt, re,
  987. task, (const unsigned char **) &in, &len, 1, raw, &processed_hyperscan);
  988. msg_debug_re_task("checked allheader regexp: %s -> %d",
  989. rspamd_regexp_get_pattern(re), ret);
  990. break;
  991. case RSPAMD_RE_MIMEHEADER:
  992. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, mime_part)
  993. {
  994. if (mime_part->parent_part == NULL ||
  995. !IS_PART_MULTIPART(mime_part->parent_part) ||
  996. IS_PART_MESSAGE(mime_part)) {
  997. /* We filter parts that have no multipart parent or are a messages here */
  998. continue;
  999. }
  1000. rh = rspamd_message_get_header_from_hash(mime_part->raw_headers,
  1001. re_class->type_data, FALSE);
  1002. if (rh) {
  1003. ret += rspamd_re_cache_process_headers_list(task, rt, re,
  1004. re_class, rh, is_strong, &processed_hyperscan);
  1005. }
  1006. msg_debug_re_task("checked mime header(%s) regexp: %s -> %d",
  1007. (const char *) re_class->type_data,
  1008. rspamd_regexp_get_pattern(re),
  1009. ret);
  1010. }
  1011. break;
  1012. case RSPAMD_RE_MIME:
  1013. case RSPAMD_RE_RAWMIME:
  1014. /* Iterate through text parts */
  1015. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1016. cnt = MESSAGE_FIELD(task, text_parts)->len;
  1017. scvec = g_malloc(sizeof(*scvec) * cnt);
  1018. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1019. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1020. {
  1021. /* Select data for regexp */
  1022. if (re_class->type == RSPAMD_RE_RAWMIME) {
  1023. if (text_part->raw.len == 0) {
  1024. len = 0;
  1025. in = "";
  1026. }
  1027. else {
  1028. in = text_part->raw.begin;
  1029. len = text_part->raw.len;
  1030. }
  1031. raw = TRUE;
  1032. }
  1033. else {
  1034. /* Skip empty parts */
  1035. if (IS_TEXT_PART_EMPTY(text_part)) {
  1036. len = 0;
  1037. in = "";
  1038. }
  1039. else {
  1040. /* Check raw flags */
  1041. if (!IS_TEXT_PART_UTF(text_part)) {
  1042. raw = TRUE;
  1043. }
  1044. in = text_part->utf_content.begin;
  1045. len = text_part->utf_content.len;
  1046. }
  1047. }
  1048. scvec[i] = (unsigned char *) in;
  1049. lenvec[i] = len;
  1050. }
  1051. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1052. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1053. msg_debug_re_task("checked mime regexp: %s -> %d",
  1054. rspamd_regexp_get_pattern(re), ret);
  1055. g_free(scvec);
  1056. g_free(lenvec);
  1057. }
  1058. break;
  1059. case RSPAMD_RE_URL:
  1060. cnt = kh_size(MESSAGE_FIELD(task, urls));
  1061. if (cnt > 0) {
  1062. scvec = g_malloc(sizeof(*scvec) * cnt);
  1063. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1064. i = 0;
  1065. raw = FALSE;
  1066. kh_foreach_key(MESSAGE_FIELD(task, urls), url, {
  1067. if ((url->protocol & PROTOCOL_MAILTO)) {
  1068. continue;
  1069. }
  1070. in = url->string;
  1071. len = url->urllen;
  1072. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1073. scvec[i] = (unsigned char *) in;
  1074. lenvec[i++] = len;
  1075. }
  1076. });
  1077. /* URL regexps do not include emails, that's why the code below is commented */
  1078. #if 0
  1079. g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
  1080. while (g_hash_table_iter_next (&it, &k, &v)) {
  1081. url = v;
  1082. in = url->string;
  1083. len = url->urllen;
  1084. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1085. scvec[i] = (unsigned char *) in;
  1086. lenvec[i++] = len;
  1087. }
  1088. }
  1089. #endif
  1090. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1091. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1092. msg_debug_re_task("checked url regexp: %s -> %d",
  1093. rspamd_regexp_get_pattern(re), ret);
  1094. g_free(scvec);
  1095. g_free(lenvec);
  1096. }
  1097. break;
  1098. case RSPAMD_RE_EMAIL:
  1099. cnt = kh_size(MESSAGE_FIELD(task, urls));
  1100. if (cnt > 0) {
  1101. scvec = g_malloc(sizeof(*scvec) * cnt);
  1102. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1103. i = 0;
  1104. raw = FALSE;
  1105. kh_foreach_key(MESSAGE_FIELD(task, urls), url, {
  1106. if (!(url->protocol & PROTOCOL_MAILTO)) {
  1107. continue;
  1108. }
  1109. if (url->userlen == 0 || url->hostlen == 0) {
  1110. continue;
  1111. }
  1112. in = rspamd_url_user_unsafe(url);
  1113. len = url->userlen + 1 + url->hostlen;
  1114. scvec[i] = (unsigned char *) in;
  1115. lenvec[i++] = len;
  1116. });
  1117. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1118. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1119. msg_debug_re_task("checked email regexp: %s -> %d",
  1120. rspamd_regexp_get_pattern(re), ret);
  1121. g_free(scvec);
  1122. g_free(lenvec);
  1123. }
  1124. break;
  1125. case RSPAMD_RE_BODY:
  1126. raw = TRUE;
  1127. in = task->msg.begin;
  1128. len = task->msg.len;
  1129. ret = rspamd_re_cache_process_regexp_data(rt, re, task,
  1130. (const unsigned char **) &in, &len, 1, raw, &processed_hyperscan);
  1131. msg_debug_re_task("checked rawbody regexp: %s -> %d",
  1132. rspamd_regexp_get_pattern(re), ret);
  1133. break;
  1134. case RSPAMD_RE_SABODY:
  1135. /* According to SA docs:
  1136. * The 'body' in this case is the textual parts of the message body;
  1137. * any non-text MIME parts are stripped, and the message decoded from
  1138. * Quoted-Printable or Base-64-encoded format if necessary. The message
  1139. * Subject header is considered part of the body and becomes the first
  1140. * paragraph when running the rules. All HTML tags and line breaks will
  1141. * be removed before matching.
  1142. */
  1143. cnt = MESSAGE_FIELD(task, text_parts)->len + 1;
  1144. scvec = g_malloc(sizeof(*scvec) * cnt);
  1145. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1146. /*
  1147. * Body rules also include the Subject as the first line
  1148. * of the body content.
  1149. */
  1150. rh = rspamd_message_get_header_array(task, "Subject", FALSE);
  1151. if (rh) {
  1152. scvec[0] = (unsigned char *) rh->decoded;
  1153. lenvec[0] = strlen(rh->decoded);
  1154. }
  1155. else {
  1156. scvec[0] = (unsigned char *) "";
  1157. lenvec[0] = 0;
  1158. }
  1159. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1160. {
  1161. if (text_part->utf_stripped_content) {
  1162. scvec[i + 1] = (unsigned char *) text_part->utf_stripped_content->data;
  1163. lenvec[i + 1] = text_part->utf_stripped_content->len;
  1164. if (!IS_TEXT_PART_UTF(text_part)) {
  1165. raw = TRUE;
  1166. }
  1167. }
  1168. else {
  1169. scvec[i + 1] = (unsigned char *) "";
  1170. lenvec[i + 1] = 0;
  1171. }
  1172. }
  1173. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1174. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1175. msg_debug_re_task("checked sa body regexp: %s -> %d",
  1176. rspamd_regexp_get_pattern(re), ret);
  1177. g_free(scvec);
  1178. g_free(lenvec);
  1179. break;
  1180. case RSPAMD_RE_SARAWBODY:
  1181. /* According to SA docs:
  1182. * The 'raw body' of a message is the raw data inside all textual
  1183. * parts. The text will be decoded from base64 or quoted-printable
  1184. * encoding, but HTML tags and line breaks will still be present.
  1185. * Multiline expressions will need to be used to match strings that are
  1186. * broken by line breaks.
  1187. */
  1188. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1189. cnt = MESSAGE_FIELD(task, text_parts)->len;
  1190. scvec = g_malloc(sizeof(*scvec) * cnt);
  1191. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1192. for (i = 0; i < cnt; i++) {
  1193. text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i);
  1194. if (text_part->parsed.len > 0) {
  1195. scvec[i] = (unsigned char *) text_part->parsed.begin;
  1196. lenvec[i] = text_part->parsed.len;
  1197. if (!IS_TEXT_PART_UTF(text_part)) {
  1198. raw = TRUE;
  1199. }
  1200. }
  1201. else {
  1202. scvec[i] = (unsigned char *) "";
  1203. lenvec[i] = 0;
  1204. }
  1205. }
  1206. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1207. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1208. msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
  1209. rspamd_regexp_get_pattern(re), ret);
  1210. g_free(scvec);
  1211. g_free(lenvec);
  1212. }
  1213. break;
  1214. case RSPAMD_RE_WORDS:
  1215. case RSPAMD_RE_STEMWORDS:
  1216. case RSPAMD_RE_RAWWORDS:
  1217. if (MESSAGE_FIELD(task, text_parts)->len > 0) {
  1218. cnt = 0;
  1219. raw = FALSE;
  1220. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1221. {
  1222. if (text_part->utf_words) {
  1223. cnt += text_part->utf_words->len;
  1224. }
  1225. }
  1226. if (task->meta_words && task->meta_words->len > 0) {
  1227. cnt += task->meta_words->len;
  1228. }
  1229. if (cnt > 0) {
  1230. scvec = g_malloc(sizeof(*scvec) * cnt);
  1231. lenvec = g_malloc(sizeof(*lenvec) * cnt);
  1232. cnt = 0;
  1233. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
  1234. {
  1235. if (text_part->utf_words) {
  1236. cnt = rspamd_process_words_vector(text_part->utf_words,
  1237. scvec, lenvec, re_class, cnt, &raw);
  1238. }
  1239. }
  1240. if (task->meta_words) {
  1241. cnt = rspamd_process_words_vector(task->meta_words,
  1242. scvec, lenvec, re_class, cnt, &raw);
  1243. }
  1244. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1245. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1246. msg_debug_re_task("checked sa words regexp: %s -> %d",
  1247. rspamd_regexp_get_pattern(re), ret);
  1248. g_free(scvec);
  1249. g_free(lenvec);
  1250. }
  1251. }
  1252. break;
  1253. case RSPAMD_RE_SELECTOR:
  1254. if (rspamd_re_cache_process_selector(task, rt,
  1255. re_class->type_data,
  1256. (unsigned char ***) &scvec,
  1257. &lenvec, &cnt)) {
  1258. ret = rspamd_re_cache_process_regexp_data(rt, re,
  1259. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1260. msg_debug_re_task("checked selector(%s) regexp: %s -> %d",
  1261. re_class->type_data,
  1262. rspamd_regexp_get_pattern(re), ret);
  1263. /* Do not free vectors as they are managed by rt->sel_cache */
  1264. }
  1265. break;
  1266. case RSPAMD_RE_MAX:
  1267. msg_err_task("regexp of class invalid has been called: %s",
  1268. rspamd_regexp_get_pattern(re));
  1269. break;
  1270. }
  1271. #if WITH_HYPERSCAN
  1272. if (processed_hyperscan) {
  1273. rspamd_re_cache_finish_class(task, rt, re_class, class_name);
  1274. }
  1275. #endif
  1276. setbit(rt->checked, re_id);
  1277. return rt->results[re_id];
  1278. }
  1279. int rspamd_re_cache_process(struct rspamd_task *task,
  1280. rspamd_regexp_t *re,
  1281. enum rspamd_re_type type,
  1282. gconstpointer type_data,
  1283. gsize datalen,
  1284. gboolean is_strong)
  1285. {
  1286. uint64_t re_id;
  1287. struct rspamd_re_class *re_class;
  1288. struct rspamd_re_cache *cache;
  1289. struct rspamd_re_runtime *rt;
  1290. g_assert(task != NULL);
  1291. rt = task->re_rt;
  1292. g_assert(rt != NULL);
  1293. g_assert(re != NULL);
  1294. cache = rt->cache;
  1295. re_id = rspamd_regexp_get_cache_id(re);
  1296. if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) {
  1297. msg_err_task("re '%s' has no valid id for the cache",
  1298. rspamd_regexp_get_pattern(re));
  1299. return 0;
  1300. }
  1301. if (isset(rt->checked, re_id)) {
  1302. /* Fast path */
  1303. rt->stat.regexp_fast_cached++;
  1304. return rt->results[re_id];
  1305. }
  1306. else {
  1307. /* Slow path */
  1308. re_class = rspamd_regexp_get_class(re);
  1309. if (re_class == NULL) {
  1310. msg_err_task("cannot find re class for regexp '%s'",
  1311. rspamd_regexp_get_pattern(re));
  1312. return 0;
  1313. }
  1314. return rspamd_re_cache_exec_re(task, rt, re, re_class,
  1315. is_strong);
  1316. }
  1317. return 0;
  1318. }
  1319. int rspamd_re_cache_process_ffi(void *ptask,
  1320. void *pre,
  1321. int type,
  1322. void *type_data,
  1323. int is_strong)
  1324. {
  1325. struct rspamd_lua_regexp **lua_re = pre;
  1326. struct rspamd_task **real_task = ptask;
  1327. gsize typelen = 0;
  1328. if (type_data) {
  1329. typelen = strlen(type_data);
  1330. }
  1331. return rspamd_re_cache_process(*real_task, (*lua_re)->re,
  1332. type, type_data, typelen, is_strong);
  1333. }
  1334. void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt)
  1335. {
  1336. g_assert(rt != NULL);
  1337. if (rt->sel_cache) {
  1338. struct rspamd_re_selector_result sr;
  1339. kh_foreach_value(rt->sel_cache, sr, {
  1340. for (unsigned int i = 0; i < sr.cnt; i++) {
  1341. g_free((gpointer) sr.scvec[i]);
  1342. }
  1343. g_free(sr.scvec);
  1344. g_free(sr.lenvec);
  1345. });
  1346. kh_destroy(selectors_results_hash, rt->sel_cache);
  1347. }
  1348. REF_RELEASE(rt->cache);
  1349. g_free(rt);
  1350. }
  1351. void rspamd_re_cache_unref(struct rspamd_re_cache *cache)
  1352. {
  1353. if (cache) {
  1354. REF_RELEASE(cache);
  1355. }
  1356. }
  1357. struct rspamd_re_cache *
  1358. rspamd_re_cache_ref(struct rspamd_re_cache *cache)
  1359. {
  1360. if (cache) {
  1361. REF_RETAIN(cache);
  1362. }
  1363. return cache;
  1364. }
  1365. unsigned int rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, unsigned int limit)
  1366. {
  1367. unsigned int old;
  1368. g_assert(cache != NULL);
  1369. old = cache->max_re_data;
  1370. cache->max_re_data = limit;
  1371. return old;
  1372. }
  1373. const char *
  1374. rspamd_re_cache_type_to_string(enum rspamd_re_type type)
  1375. {
  1376. const char *ret = "unknown";
  1377. switch (type) {
  1378. case RSPAMD_RE_HEADER:
  1379. ret = "header";
  1380. break;
  1381. case RSPAMD_RE_RAWHEADER:
  1382. ret = "raw header";
  1383. break;
  1384. case RSPAMD_RE_MIMEHEADER:
  1385. ret = "mime header";
  1386. break;
  1387. case RSPAMD_RE_ALLHEADER:
  1388. ret = "all headers";
  1389. break;
  1390. case RSPAMD_RE_MIME:
  1391. ret = "part";
  1392. break;
  1393. case RSPAMD_RE_RAWMIME:
  1394. ret = "raw part";
  1395. break;
  1396. case RSPAMD_RE_BODY:
  1397. ret = "rawbody";
  1398. break;
  1399. case RSPAMD_RE_URL:
  1400. ret = "url";
  1401. break;
  1402. case RSPAMD_RE_EMAIL:
  1403. ret = "email";
  1404. break;
  1405. case RSPAMD_RE_SABODY:
  1406. ret = "sa body";
  1407. break;
  1408. case RSPAMD_RE_SARAWBODY:
  1409. ret = "sa raw body";
  1410. break;
  1411. case RSPAMD_RE_SELECTOR:
  1412. ret = "selector";
  1413. break;
  1414. case RSPAMD_RE_WORDS:
  1415. ret = "words";
  1416. break;
  1417. case RSPAMD_RE_RAWWORDS:
  1418. ret = "raw_words";
  1419. break;
  1420. case RSPAMD_RE_STEMWORDS:
  1421. ret = "stem_words";
  1422. break;
  1423. case RSPAMD_RE_MAX:
  1424. default:
  1425. ret = "invalid class";
  1426. break;
  1427. }
  1428. return ret;
  1429. }
  1430. enum rspamd_re_type
  1431. rspamd_re_cache_type_from_string(const char *str)
  1432. {
  1433. enum rspamd_re_type ret;
  1434. uint64_t h;
  1435. /*
  1436. * To optimize this function, we apply hash to input string and
  1437. * pre-select it from the values
  1438. */
  1439. if (str != NULL) {
  1440. h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
  1441. str, strlen(str), 0xdeadbabe);
  1442. switch (h) {
  1443. case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */
  1444. ret = RSPAMD_RE_HEADER;
  1445. break;
  1446. case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */
  1447. ret = RSPAMD_RE_RAWHEADER;
  1448. break;
  1449. case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */
  1450. ret = RSPAMD_RE_MIME;
  1451. break;
  1452. case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */
  1453. ret = RSPAMD_RE_RAWMIME;
  1454. break;
  1455. case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */
  1456. case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */
  1457. ret = RSPAMD_RE_BODY;
  1458. break;
  1459. case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */
  1460. case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */
  1461. ret = RSPAMD_RE_URL;
  1462. break;
  1463. case G_GUINT64_CONSTANT(0x7e232b0f60b571be): /* email */
  1464. ret = RSPAMD_RE_EMAIL;
  1465. break;
  1466. case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */
  1467. ret = RSPAMD_RE_ALLHEADER;
  1468. break;
  1469. case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */
  1470. ret = RSPAMD_RE_MIMEHEADER;
  1471. break;
  1472. case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */
  1473. ret = RSPAMD_RE_SABODY;
  1474. break;
  1475. case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */
  1476. ret = RSPAMD_RE_SARAWBODY;
  1477. break;
  1478. default:
  1479. ret = RSPAMD_RE_MAX;
  1480. break;
  1481. }
  1482. }
  1483. else {
  1484. ret = RSPAMD_RE_MAX;
  1485. }
  1486. return ret;
  1487. }
  1488. #ifdef WITH_HYPERSCAN
  1489. static char *
  1490. rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t *re)
  1491. {
  1492. /*
  1493. * Workaround for bug in ragel 7.0.0.11
  1494. * https://github.com/intel/hyperscan/issues/133
  1495. */
  1496. const char *pat = rspamd_regexp_get_pattern(re);
  1497. unsigned int flags = rspamd_regexp_get_flags(re), esc_flags = RSPAMD_REGEXP_ESCAPE_RE;
  1498. char *escaped;
  1499. gsize esc_len;
  1500. if (flags & RSPAMD_REGEXP_FLAG_UTF) {
  1501. esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  1502. }
  1503. escaped = rspamd_str_regexp_escape(pat, strlen(pat), &esc_len, esc_flags);
  1504. return escaped;
  1505. }
  1506. static gboolean
  1507. rspamd_re_cache_is_finite(struct rspamd_re_cache *cache,
  1508. rspamd_regexp_t *re, int flags, double max_time)
  1509. {
  1510. pid_t cld;
  1511. int status;
  1512. struct timespec ts;
  1513. hs_compile_error_t *hs_errors;
  1514. hs_database_t *test_db;
  1515. double wait_time;
  1516. const int max_tries = 10;
  1517. int tries = 0, rc;
  1518. void (*old_hdl)(int);
  1519. wait_time = max_time / max_tries;
  1520. /* We need to restore SIGCHLD processing */
  1521. old_hdl = signal(SIGCHLD, SIG_DFL);
  1522. cld = fork();
  1523. if (cld == 0) {
  1524. /* Try to compile pattern */
  1525. char *pat = rspamd_re_cache_hs_pattern_from_pcre(re);
  1526. if (hs_compile(pat,
  1527. flags | HS_FLAG_PREFILTER,
  1528. HS_MODE_BLOCK,
  1529. &cache->plt,
  1530. &test_db,
  1531. &hs_errors) != HS_SUCCESS) {
  1532. msg_info_re_cache("cannot compile (prefilter mode) '%s' to hyperscan: '%s'",
  1533. pat,
  1534. hs_errors != NULL ? hs_errors->message : "unknown error");
  1535. hs_free_compile_error(hs_errors);
  1536. g_free(pat);
  1537. exit(EXIT_FAILURE);
  1538. }
  1539. g_free(pat);
  1540. exit(EXIT_SUCCESS);
  1541. }
  1542. else if (cld > 0) {
  1543. double_to_ts(wait_time, &ts);
  1544. while ((rc = waitpid(cld, &status, WNOHANG)) == 0 && tries++ < max_tries) {
  1545. (void) nanosleep(&ts, NULL);
  1546. }
  1547. /* Child has been terminated */
  1548. if (rc > 0) {
  1549. /* Forget about SIGCHLD after this point */
  1550. signal(SIGCHLD, old_hdl);
  1551. if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS) {
  1552. return TRUE;
  1553. }
  1554. else {
  1555. msg_err_re_cache(
  1556. "cannot approximate %s to hyperscan",
  1557. rspamd_regexp_get_pattern(re));
  1558. return FALSE;
  1559. }
  1560. }
  1561. else {
  1562. /* We consider that as timeout */
  1563. kill(cld, SIGKILL);
  1564. g_assert(waitpid(cld, &status, 0) != -1);
  1565. msg_err_re_cache(
  1566. "cannot approximate %s to hyperscan: timeout waiting",
  1567. rspamd_regexp_get_pattern(re));
  1568. signal(SIGCHLD, old_hdl);
  1569. }
  1570. }
  1571. else {
  1572. msg_err_re_cache(
  1573. "cannot approximate %s to hyperscan: fork failed: %s",
  1574. rspamd_regexp_get_pattern(re), strerror(errno));
  1575. signal(SIGCHLD, old_hdl);
  1576. }
  1577. return FALSE;
  1578. }
  1579. #endif
  1580. #ifdef WITH_HYPERSCAN
  1581. struct rspamd_re_cache_hs_compile_cbdata {
  1582. GHashTableIter it;
  1583. struct rspamd_re_cache *cache;
  1584. const char *cache_dir;
  1585. double max_time;
  1586. gboolean silent;
  1587. unsigned int total;
  1588. void (*cb)(unsigned int ncompiled, GError *err, void *cbd);
  1589. void *cbd;
  1590. };
  1591. static void
  1592. rspamd_re_cache_compile_err(EV_P_ ev_timer *w, GError *err,
  1593. struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal)
  1594. {
  1595. cbdata->cb(cbdata->total, err, cbdata->cbd);
  1596. if (is_fatal) {
  1597. ev_timer_stop(EV_A_ w);
  1598. g_free(w);
  1599. g_free(cbdata);
  1600. }
  1601. else {
  1602. /* Continue compilation */
  1603. ev_timer_again(EV_A_ w);
  1604. }
  1605. g_error_free(err);
  1606. }
  1607. static void
  1608. rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents)
  1609. {
  1610. struct rspamd_re_cache_hs_compile_cbdata *cbdata =
  1611. (struct rspamd_re_cache_hs_compile_cbdata *) w->data;
  1612. GHashTableIter cit;
  1613. gpointer k, v;
  1614. struct rspamd_re_class *re_class;
  1615. char path[PATH_MAX], npath[PATH_MAX];
  1616. hs_database_t *test_db;
  1617. int fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
  1618. rspamd_cryptobox_fast_hash_state_t crc_st;
  1619. uint64_t crc;
  1620. rspamd_regexp_t *re;
  1621. hs_compile_error_t *hs_errors = NULL;
  1622. unsigned int *hs_flags = NULL;
  1623. const hs_expr_ext_t **hs_exts = NULL;
  1624. char **hs_pats = NULL;
  1625. char *hs_serialized = NULL;
  1626. gsize serialized_len;
  1627. struct iovec iov[7];
  1628. struct rspamd_re_cache *cache;
  1629. GError *err;
  1630. pid_t our_pid = getpid();
  1631. cache = cbdata->cache;
  1632. if (!g_hash_table_iter_next(&cbdata->it, &k, &v)) {
  1633. /* All done */
  1634. ev_timer_stop(EV_A_ w);
  1635. cbdata->cb(cbdata->total, NULL, cbdata->cbd);
  1636. g_free(w);
  1637. g_free(cbdata);
  1638. return;
  1639. }
  1640. re_class = v;
  1641. rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cbdata->cache_dir,
  1642. G_DIR_SEPARATOR, re_class->hash);
  1643. if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, TRUE, TRUE, NULL)) {
  1644. fd = open(path, O_RDONLY, 00600);
  1645. /* Read number of regexps */
  1646. g_assert(fd != -1);
  1647. g_assert(lseek(fd, RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt), SEEK_SET) != -1);
  1648. g_assert(read(fd, &n, sizeof(n)) == sizeof(n));
  1649. close(fd);
  1650. if (re_class->type_len > 0) {
  1651. if (!cbdata->silent) {
  1652. msg_info_re_cache(
  1653. "skip already valid class %s(%*s) to cache %6s, %d regexps",
  1654. rspamd_re_cache_type_to_string(re_class->type),
  1655. (int) re_class->type_len - 1,
  1656. re_class->type_data,
  1657. re_class->hash,
  1658. n);
  1659. }
  1660. }
  1661. else {
  1662. if (!cbdata->silent) {
  1663. msg_info_re_cache(
  1664. "skip already valid class %s to cache %6s, %d regexps",
  1665. rspamd_re_cache_type_to_string(re_class->type),
  1666. re_class->hash,
  1667. n);
  1668. }
  1669. }
  1670. ev_timer_again(EV_A_ w);
  1671. return;
  1672. }
  1673. rspamd_snprintf(path, sizeof(path), "%s%c%s%P-XXXXXXXXXX", cbdata->cache_dir,
  1674. G_DIR_SEPARATOR, re_class->hash, our_pid);
  1675. fd = g_mkstemp_full(path, O_CREAT | O_TRUNC | O_EXCL | O_WRONLY, 00600);
  1676. if (fd == -1) {
  1677. err = g_error_new(rspamd_re_cache_quark(), errno,
  1678. "cannot open file %s: %s", path, strerror(errno));
  1679. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1680. return;
  1681. }
  1682. g_hash_table_iter_init(&cit, re_class->re);
  1683. n = g_hash_table_size(re_class->re);
  1684. hs_flags = g_new0(unsigned int, n);
  1685. hs_ids = g_new0(unsigned int, n);
  1686. hs_pats = g_new0(char *, n);
  1687. hs_exts = g_new0(const hs_expr_ext_t *, n);
  1688. i = 0;
  1689. while (g_hash_table_iter_next(&cit, &k, &v)) {
  1690. re = v;
  1691. pcre_flags = rspamd_regexp_get_pcre_flags(re);
  1692. re_flags = rspamd_regexp_get_flags(re);
  1693. if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
  1694. /* Do not try to compile bad regexp */
  1695. msg_info_re_cache(
  1696. "do not try compile %s to hyperscan as it is PCRE only",
  1697. rspamd_regexp_get_pattern(re));
  1698. continue;
  1699. }
  1700. hs_flags[i] = 0;
  1701. hs_exts[i] = NULL;
  1702. #ifndef WITH_PCRE2
  1703. if (pcre_flags & PCRE_FLAG(UTF8)) {
  1704. hs_flags[i] |= HS_FLAG_UTF8;
  1705. }
  1706. #else
  1707. if (pcre_flags & PCRE_FLAG(UTF)) {
  1708. hs_flags[i] |= HS_FLAG_UTF8;
  1709. }
  1710. #endif
  1711. if (pcre_flags & PCRE_FLAG(CASELESS)) {
  1712. hs_flags[i] |= HS_FLAG_CASELESS;
  1713. }
  1714. if (pcre_flags & PCRE_FLAG(MULTILINE)) {
  1715. hs_flags[i] |= HS_FLAG_MULTILINE;
  1716. }
  1717. if (pcre_flags & PCRE_FLAG(DOTALL)) {
  1718. hs_flags[i] |= HS_FLAG_DOTALL;
  1719. }
  1720. if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) {
  1721. hs_flags[i] |= HS_FLAG_SOM_LEFTMOST;
  1722. }
  1723. else if (rspamd_regexp_get_maxhits(re) == 1) {
  1724. hs_flags[i] |= HS_FLAG_SINGLEMATCH;
  1725. }
  1726. char *pat = rspamd_re_cache_hs_pattern_from_pcre(re);
  1727. if (hs_compile(pat,
  1728. hs_flags[i],
  1729. HS_MODE_BLOCK,
  1730. &cache->plt,
  1731. &test_db,
  1732. &hs_errors) != HS_SUCCESS) {
  1733. msg_info_re_cache("cannot compile '%s' to hyperscan: '%s', try prefilter match",
  1734. pat,
  1735. hs_errors != NULL ? hs_errors->message : "unknown error");
  1736. hs_free_compile_error(hs_errors);
  1737. /* The approximation operation might take a significant
  1738. * amount of time, so we need to check if it's finite
  1739. */
  1740. if (rspamd_re_cache_is_finite(cache, re, hs_flags[i], cbdata->max_time)) {
  1741. hs_flags[i] |= HS_FLAG_PREFILTER;
  1742. hs_ids[i] = rspamd_regexp_get_cache_id(re);
  1743. hs_pats[i] = pat;
  1744. i++;
  1745. }
  1746. else {
  1747. g_free(pat); /* Avoid leak */
  1748. }
  1749. }
  1750. else {
  1751. hs_ids[i] = rspamd_regexp_get_cache_id(re);
  1752. hs_pats[i] = pat;
  1753. i++;
  1754. hs_free_database(test_db);
  1755. }
  1756. }
  1757. /* Adjust real re number */
  1758. n = i;
  1759. #define CLEANUP_ALLOCATED(is_err) \
  1760. do { \
  1761. g_free(hs_flags); \
  1762. g_free(hs_ids); \
  1763. for (unsigned int j = 0; j < i; j++) { \
  1764. g_free(hs_pats[j]); \
  1765. } \
  1766. g_free(hs_pats); \
  1767. g_free(hs_exts); \
  1768. if (is_err) { \
  1769. close(fd); \
  1770. unlink(path); \
  1771. if (hs_errors) hs_free_compile_error(hs_errors); \
  1772. } \
  1773. } while (0)
  1774. if (n > 0) {
  1775. /* Create the hs tree */
  1776. hs_errors = NULL;
  1777. if (hs_compile_ext_multi((const char **) hs_pats,
  1778. hs_flags,
  1779. hs_ids,
  1780. hs_exts,
  1781. n,
  1782. HS_MODE_BLOCK,
  1783. &cache->plt,
  1784. &test_db,
  1785. &hs_errors) != HS_SUCCESS) {
  1786. err = g_error_new(rspamd_re_cache_quark(), EINVAL,
  1787. "cannot create tree of regexp when processing '%s': %s",
  1788. hs_pats[hs_errors->expression], hs_errors->message);
  1789. CLEANUP_ALLOCATED(true);
  1790. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1791. return;
  1792. }
  1793. if (hs_serialize_database(test_db, &hs_serialized,
  1794. &serialized_len) != HS_SUCCESS) {
  1795. err = g_error_new(rspamd_re_cache_quark(),
  1796. errno,
  1797. "cannot serialize tree of regexp for %s",
  1798. re_class->hash);
  1799. CLEANUP_ALLOCATED(true);
  1800. hs_free_database(test_db);
  1801. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1802. return;
  1803. }
  1804. hs_free_database(test_db);
  1805. /*
  1806. * Magic - 8 bytes
  1807. * Platform - sizeof (platform)
  1808. * n - number of regexps
  1809. * n * <regexp ids>
  1810. * n * <regexp flags>
  1811. * crc - 8 bytes checksum
  1812. * <hyperscan blob>
  1813. */
  1814. rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
  1815. /* IDs -> Flags -> Hs blob */
  1816. rspamd_cryptobox_fast_hash_update(&crc_st,
  1817. hs_ids, sizeof(*hs_ids) * n);
  1818. rspamd_cryptobox_fast_hash_update(&crc_st,
  1819. hs_flags, sizeof(*hs_flags) * n);
  1820. rspamd_cryptobox_fast_hash_update(&crc_st,
  1821. hs_serialized, serialized_len);
  1822. crc = rspamd_cryptobox_fast_hash_final(&crc_st);
  1823. iov[0].iov_base = (void *) rspamd_hs_magic;
  1824. iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
  1825. iov[1].iov_base = &cache->plt;
  1826. iov[1].iov_len = sizeof(cache->plt);
  1827. iov[2].iov_base = &n;
  1828. iov[2].iov_len = sizeof(n);
  1829. iov[3].iov_base = hs_ids;
  1830. iov[3].iov_len = sizeof(*hs_ids) * n;
  1831. iov[4].iov_base = hs_flags;
  1832. iov[4].iov_len = sizeof(*hs_flags) * n;
  1833. iov[5].iov_base = &crc;
  1834. iov[5].iov_len = sizeof(crc);
  1835. iov[6].iov_base = hs_serialized;
  1836. iov[6].iov_len = serialized_len;
  1837. if (writev(fd, iov, G_N_ELEMENTS(iov)) == -1) {
  1838. err = g_error_new(rspamd_re_cache_quark(),
  1839. errno,
  1840. "cannot serialize tree of regexp to %s: %s",
  1841. path, strerror(errno));
  1842. CLEANUP_ALLOCATED(true);
  1843. g_free(hs_serialized);
  1844. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1845. return;
  1846. }
  1847. if (re_class->type_len > 0) {
  1848. msg_info_re_cache(
  1849. "compiled class %s(%*s) to cache %6s, %d/%d regexps",
  1850. rspamd_re_cache_type_to_string(re_class->type),
  1851. (int) re_class->type_len - 1,
  1852. re_class->type_data,
  1853. re_class->hash,
  1854. n,
  1855. (int) g_hash_table_size(re_class->re));
  1856. }
  1857. else {
  1858. msg_info_re_cache(
  1859. "compiled class %s to cache %6s, %d/%d regexps",
  1860. rspamd_re_cache_type_to_string(re_class->type),
  1861. re_class->hash,
  1862. n,
  1863. (int) g_hash_table_size(re_class->re));
  1864. }
  1865. cbdata->total += n;
  1866. CLEANUP_ALLOCATED(false);
  1867. /* Now rename temporary file to the new .hs file */
  1868. rspamd_snprintf(npath, sizeof(npath), "%s%c%s.hs", cbdata->cache_dir,
  1869. G_DIR_SEPARATOR, re_class->hash);
  1870. if (rename(path, npath) == -1) {
  1871. err = g_error_new(rspamd_re_cache_quark(),
  1872. errno,
  1873. "cannot rename %s to %s: %s",
  1874. path, npath, strerror(errno));
  1875. unlink(path);
  1876. close(fd);
  1877. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1878. return;
  1879. }
  1880. close(fd);
  1881. }
  1882. else {
  1883. err = g_error_new(rspamd_re_cache_quark(),
  1884. errno,
  1885. "no suitable regular expressions %s (%d original): "
  1886. "remove temporary file %s",
  1887. rspamd_re_cache_type_to_string(re_class->type),
  1888. (int) g_hash_table_size(re_class->re),
  1889. path);
  1890. CLEANUP_ALLOCATED(true);
  1891. rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false);
  1892. return;
  1893. }
  1894. /* Continue process */
  1895. ev_timer_again(EV_A_ w);
  1896. }
  1897. #endif
  1898. int rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache,
  1899. const char *cache_dir,
  1900. double max_time,
  1901. gboolean silent,
  1902. struct ev_loop *event_loop,
  1903. void (*cb)(unsigned int ncompiled, GError *err, void *cbd),
  1904. void *cbd)
  1905. {
  1906. g_assert(cache != NULL);
  1907. g_assert(cache_dir != NULL);
  1908. #ifndef WITH_HYPERSCAN
  1909. return -1;
  1910. #else
  1911. static ev_timer *timer;
  1912. static const ev_tstamp timer_interval = 0.1;
  1913. struct rspamd_re_cache_hs_compile_cbdata *cbdata;
  1914. cbdata = g_malloc0(sizeof(*cbdata));
  1915. g_hash_table_iter_init(&cbdata->it, cache->re_classes);
  1916. cbdata->cache = cache;
  1917. cbdata->cache_dir = cache_dir;
  1918. cbdata->cb = cb;
  1919. cbdata->cbd = cbd;
  1920. cbdata->max_time = max_time;
  1921. cbdata->silent = silent;
  1922. cbdata->total = 0;
  1923. timer = g_malloc0(sizeof(*timer));
  1924. timer->data = (void *) cbdata; /* static */
  1925. ev_timer_init(timer, rspamd_re_cache_compile_timer_cb,
  1926. timer_interval, timer_interval);
  1927. ev_timer_start(event_loop, timer);
  1928. return 0;
  1929. #endif
  1930. }
  1931. gboolean
  1932. rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache,
  1933. const char *path, gboolean silent, gboolean try_load, GError **err)
  1934. {
  1935. g_assert(cache != NULL);
  1936. g_assert(path != NULL);
  1937. #ifndef WITH_HYPERSCAN
  1938. return FALSE;
  1939. #else
  1940. int fd, n, ret;
  1941. unsigned char magicbuf[RSPAMD_HS_MAGIC_LEN];
  1942. const unsigned char *mb;
  1943. GHashTableIter it;
  1944. gpointer k, v;
  1945. struct rspamd_re_class *re_class;
  1946. gsize len;
  1947. const char *hash_pos;
  1948. hs_platform_info_t test_plt;
  1949. hs_database_t *test_db = NULL;
  1950. unsigned char *map, *p, *end;
  1951. rspamd_cryptobox_fast_hash_state_t crc_st;
  1952. uint64_t crc, valid_crc;
  1953. len = strlen(path);
  1954. if (len < sizeof(rspamd_cryptobox_HASHBYTES + 3)) {
  1955. if (!silent) {
  1956. msg_err_re_cache("cannot open hyperscan cache file %s: too short filename",
  1957. path);
  1958. }
  1959. g_set_error(err, rspamd_re_cache_quark(), 0,
  1960. "too short filename");
  1961. return FALSE;
  1962. }
  1963. if (memcmp(path + len - 3, ".hs", 3) != 0) {
  1964. if (!silent) {
  1965. msg_err_re_cache("cannot open hyperscan cache file %s: not ending with .hs",
  1966. path);
  1967. }
  1968. g_set_error(err, rspamd_re_cache_quark(), 0,
  1969. "not ending with .hs");
  1970. return FALSE;
  1971. }
  1972. hash_pos = path + len - 3 - (sizeof(re_class->hash) - 1);
  1973. g_hash_table_iter_init(&it, cache->re_classes);
  1974. while (g_hash_table_iter_next(&it, &k, &v)) {
  1975. re_class = v;
  1976. if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) {
  1977. /* Open file and check magic */
  1978. gssize r;
  1979. fd = open(path, O_RDONLY);
  1980. if (fd == -1) {
  1981. if (errno != ENOENT || !silent) {
  1982. msg_err_re_cache("cannot open hyperscan cache file %s: %s",
  1983. path, strerror(errno));
  1984. }
  1985. g_set_error(err, rspamd_re_cache_quark(), 0,
  1986. "%s",
  1987. strerror(errno));
  1988. return FALSE;
  1989. }
  1990. if ((r = read(fd, magicbuf, sizeof(magicbuf))) != sizeof(magicbuf)) {
  1991. if (r == -1) {
  1992. msg_err_re_cache("cannot read magic from hyperscan "
  1993. "cache file %s: %s",
  1994. path, strerror(errno));
  1995. g_set_error(err, rspamd_re_cache_quark(), 0,
  1996. "cannot read magic: %s",
  1997. strerror(errno));
  1998. }
  1999. else {
  2000. msg_err_re_cache("truncated read magic from hyperscan "
  2001. "cache file %s: %z, %z wanted",
  2002. path, r, (gsize) sizeof(magicbuf));
  2003. g_set_error(err, rspamd_re_cache_quark(), 0,
  2004. "truncated read magic %zd, %zd wanted",
  2005. r, (gsize) sizeof(magicbuf));
  2006. }
  2007. close(fd);
  2008. return FALSE;
  2009. }
  2010. mb = rspamd_hs_magic;
  2011. if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) {
  2012. msg_err_re_cache("cannot open hyperscan cache file %s: "
  2013. "bad magic ('%*xs', '%*xs' expected)",
  2014. path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf,
  2015. (int) RSPAMD_HS_MAGIC_LEN, mb);
  2016. close(fd);
  2017. g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic");
  2018. return FALSE;
  2019. }
  2020. if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) {
  2021. if (r == -1) {
  2022. msg_err_re_cache("cannot read platform data from hyperscan "
  2023. "cache file %s: %s",
  2024. path, strerror(errno));
  2025. }
  2026. else {
  2027. msg_err_re_cache("truncated read platform data from hyperscan "
  2028. "cache file %s: %z, %z wanted",
  2029. path, r, (gsize) sizeof(magicbuf));
  2030. }
  2031. g_set_error(err, rspamd_re_cache_quark(), 0,
  2032. "cannot read platform data: %s", strerror(errno));
  2033. close(fd);
  2034. return FALSE;
  2035. }
  2036. if (test_plt.cpu_features != cache->plt.cpu_features) {
  2037. msg_err_re_cache("cannot open hyperscan cache file %s: "
  2038. "compiled for a different platform",
  2039. path);
  2040. g_set_error(err, rspamd_re_cache_quark(), 0,
  2041. "compiled for a different platform");
  2042. close(fd);
  2043. return FALSE;
  2044. }
  2045. close(fd);
  2046. if (try_load) {
  2047. map = rspamd_file_xmap(path, PROT_READ, &len, TRUE);
  2048. if (map == NULL) {
  2049. msg_err_re_cache("cannot mmap hyperscan cache file %s: "
  2050. "%s",
  2051. path, strerror(errno));
  2052. g_set_error(err, rspamd_re_cache_quark(), 0,
  2053. "mmap error: %s", strerror(errno));
  2054. return FALSE;
  2055. }
  2056. p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt);
  2057. end = map + len;
  2058. memcpy(&n, p, sizeof(n));
  2059. p += sizeof(int);
  2060. if (n <= 0 || 2 * n * sizeof(int) + /* IDs + flags */
  2061. sizeof(uint64_t) + /* crc */
  2062. RSPAMD_HS_MAGIC_LEN + /* header */
  2063. sizeof(cache->plt) >
  2064. len) {
  2065. /* Some wrong amount of regexps */
  2066. msg_err_re_cache("bad number of expressions in %s: %d",
  2067. path, n);
  2068. g_set_error(err, rspamd_re_cache_quark(), 0,
  2069. "bad number of expressions: %d", n);
  2070. munmap(map, len);
  2071. return FALSE;
  2072. }
  2073. /*
  2074. * Magic - 8 bytes
  2075. * Platform - sizeof (platform)
  2076. * n - number of regexps
  2077. * n * <regexp ids>
  2078. * n * <regexp flags>
  2079. * crc - 8 bytes checksum
  2080. * <hyperscan blob>
  2081. */
  2082. memcpy(&crc, p + n * 2 * sizeof(int), sizeof(crc));
  2083. rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
  2084. /* IDs */
  2085. rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(int));
  2086. /* Flags */
  2087. rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(int),
  2088. n * sizeof(int));
  2089. /* HS database */
  2090. p += n * sizeof(int) * 2 + sizeof(uint64_t);
  2091. rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p);
  2092. valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
  2093. if (crc != valid_crc) {
  2094. msg_warn_re_cache("outdated or invalid hs database in %s: "
  2095. "crc read %xL, crc expected %xL",
  2096. path, crc, valid_crc);
  2097. g_set_error(err, rspamd_re_cache_quark(), 0,
  2098. "outdated or invalid hs database, crc check failure");
  2099. munmap(map, len);
  2100. return FALSE;
  2101. }
  2102. if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) {
  2103. msg_err_re_cache("bad hs database in %s: %d", path, ret);
  2104. g_set_error(err, rspamd_re_cache_quark(), 0,
  2105. "deserialize error: %d", ret);
  2106. munmap(map, len);
  2107. return FALSE;
  2108. }
  2109. hs_free_database(test_db);
  2110. munmap(map, len);
  2111. }
  2112. /* XXX: add crc check */
  2113. return TRUE;
  2114. }
  2115. }
  2116. if (!silent) {
  2117. msg_warn_re_cache("unknown hyperscan cache file %s", path);
  2118. }
  2119. g_set_error(err, rspamd_re_cache_quark(), 0,
  2120. "unknown hyperscan file");
  2121. return FALSE;
  2122. #endif
  2123. }
  2124. enum rspamd_hyperscan_status
  2125. rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache,
  2126. const char *cache_dir, bool try_load)
  2127. {
  2128. g_assert(cache != NULL);
  2129. g_assert(cache_dir != NULL);
  2130. #ifndef WITH_HYPERSCAN
  2131. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  2132. #else
  2133. char path[PATH_MAX];
  2134. int fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret;
  2135. GHashTableIter it;
  2136. gpointer k, v;
  2137. uint8_t *map, *p;
  2138. struct rspamd_re_class *re_class;
  2139. struct rspamd_re_cache_elt *elt;
  2140. struct stat st;
  2141. gboolean has_valid = FALSE, all_valid = FALSE;
  2142. g_hash_table_iter_init(&it, cache->re_classes);
  2143. while (g_hash_table_iter_next(&it, &k, &v)) {
  2144. re_class = v;
  2145. rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cache_dir,
  2146. G_DIR_SEPARATOR, re_class->hash);
  2147. if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, try_load, FALSE, NULL)) {
  2148. msg_debug_re_cache("load hyperscan database from '%s'",
  2149. re_class->hash);
  2150. fd = open(path, O_RDONLY);
  2151. /* Read number of regexps */
  2152. g_assert(fd != -1);
  2153. fstat(fd, &st);
  2154. map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
  2155. if (map == MAP_FAILED) {
  2156. if (!try_load) {
  2157. msg_err_re_cache("cannot mmap %s: %s", path, strerror(errno));
  2158. }
  2159. else {
  2160. msg_debug_re_cache("cannot mmap %s: %s", path, strerror(errno));
  2161. }
  2162. close(fd);
  2163. all_valid = FALSE;
  2164. continue;
  2165. }
  2166. close(fd);
  2167. p = map + RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt);
  2168. n = *(int *) p;
  2169. if (n <= 0 || 2 * n * sizeof(int) + /* IDs + flags */
  2170. sizeof(uint64_t) + /* crc */
  2171. RSPAMD_HS_MAGIC_LEN + /* header */
  2172. sizeof(cache->plt) >
  2173. (gsize) st.st_size) {
  2174. /* Some wrong amount of regexps */
  2175. if (!try_load) {
  2176. msg_err_re_cache("bad number of expressions in %s: %d",
  2177. path, n);
  2178. }
  2179. else {
  2180. msg_debug_re_cache("bad number of expressions in %s: %d",
  2181. path, n);
  2182. }
  2183. munmap(map, st.st_size);
  2184. all_valid = FALSE;
  2185. continue;
  2186. }
  2187. total += n;
  2188. p += sizeof(n);
  2189. hs_ids = g_malloc(n * sizeof(*hs_ids));
  2190. memcpy(hs_ids, p, n * sizeof(*hs_ids));
  2191. p += n * sizeof(*hs_ids);
  2192. hs_flags = g_malloc(n * sizeof(*hs_flags));
  2193. memcpy(hs_flags, p, n * sizeof(*hs_flags));
  2194. /* Skip crc */
  2195. p += n * sizeof(*hs_ids) + sizeof(uint64_t);
  2196. /* Cleanup */
  2197. if (re_class->hs_scratch != NULL) {
  2198. hs_free_scratch(re_class->hs_scratch);
  2199. }
  2200. if (re_class->hs_db != NULL) {
  2201. rspamd_hyperscan_free(re_class->hs_db, false);
  2202. }
  2203. if (re_class->hs_ids) {
  2204. g_free(re_class->hs_ids);
  2205. }
  2206. re_class->hs_ids = NULL;
  2207. re_class->hs_scratch = NULL;
  2208. re_class->hs_db = NULL;
  2209. munmap(map, st.st_size);
  2210. re_class->hs_db = rspamd_hyperscan_maybe_load(path, p - map);
  2211. if (re_class->hs_db == NULL) {
  2212. if (!try_load) {
  2213. msg_err_re_cache("bad hs database in %s", path);
  2214. }
  2215. else {
  2216. msg_debug_re_cache("bad hs database in %s", path);
  2217. }
  2218. g_free(hs_ids);
  2219. g_free(hs_flags);
  2220. re_class->hs_ids = NULL;
  2221. re_class->hs_scratch = NULL;
  2222. re_class->hs_db = NULL;
  2223. all_valid = FALSE;
  2224. continue;
  2225. }
  2226. if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(re_class->hs_db),
  2227. &re_class->hs_scratch)) != HS_SUCCESS) {
  2228. if (!try_load) {
  2229. msg_err_re_cache("bad hs database in %s; error code: %d", path, ret);
  2230. }
  2231. else {
  2232. msg_debug_re_cache("bad hs database in %s; error code: %d", path, ret);
  2233. }
  2234. g_free(hs_ids);
  2235. g_free(hs_flags);
  2236. rspamd_hyperscan_free(re_class->hs_db, true);
  2237. re_class->hs_ids = NULL;
  2238. re_class->hs_scratch = NULL;
  2239. re_class->hs_db = NULL;
  2240. all_valid = FALSE;
  2241. continue;
  2242. }
  2243. /*
  2244. * Now find hyperscan elts that are successfully compiled and
  2245. * specify that they should be matched using hyperscan
  2246. */
  2247. for (i = 0; i < n; i++) {
  2248. g_assert((int) cache->re->len > hs_ids[i] && hs_ids[i] >= 0);
  2249. elt = g_ptr_array_index(cache->re, hs_ids[i]);
  2250. if (hs_flags[i] & HS_FLAG_PREFILTER) {
  2251. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE;
  2252. }
  2253. else {
  2254. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN;
  2255. }
  2256. }
  2257. re_class->hs_ids = hs_ids;
  2258. g_free(hs_flags);
  2259. re_class->nhs = n;
  2260. if (!has_valid) {
  2261. has_valid = TRUE;
  2262. all_valid = TRUE;
  2263. }
  2264. }
  2265. else {
  2266. if (!try_load) {
  2267. msg_err_re_cache("invalid hyperscan hash file '%s'",
  2268. path);
  2269. }
  2270. else {
  2271. msg_debug_re_cache("invalid hyperscan hash file '%s'",
  2272. path);
  2273. }
  2274. all_valid = FALSE;
  2275. continue;
  2276. }
  2277. }
  2278. if (has_valid) {
  2279. if (all_valid) {
  2280. msg_info_re_cache("full hyperscan database of %d regexps has been loaded", total);
  2281. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL;
  2282. }
  2283. else {
  2284. msg_info_re_cache("partial hyperscan database of %d regexps has been loaded", total);
  2285. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL;
  2286. }
  2287. }
  2288. else {
  2289. msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions");
  2290. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR;
  2291. }
  2292. return cache->hyperscan_loaded;
  2293. #endif
  2294. }
  2295. void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache,
  2296. const char *sname,
  2297. int ref)
  2298. {
  2299. khiter_t k;
  2300. k = kh_get(lua_selectors_hash, cache->selectors, (char *) sname);
  2301. if (k == kh_end(cache->selectors)) {
  2302. char *cpy = g_strdup(sname);
  2303. int res;
  2304. k = kh_put(lua_selectors_hash, cache->selectors, cpy, &res);
  2305. kh_value(cache->selectors, k) = ref;
  2306. }
  2307. else {
  2308. msg_warn_re_cache("replacing selector with name %s", sname);
  2309. if (cache->L) {
  2310. luaL_unref(cache->L, LUA_REGISTRYINDEX, kh_value(cache->selectors, k));
  2311. }
  2312. kh_value(cache->selectors, k) = ref;
  2313. }
  2314. }